Beispiel #1
0
    def fit(self, X, y=None, **fit_params):
        """Fit the self-organizing map and derive cluster labels.

        The input is validated as ``float32`` with ``check_array`` and
        rescaled with ``minmax_scale`` before being handed to Somoclu.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Keyword arguments forwarded to the ``train`` method of the
            Somoclu object.

        Returns
        -------
        self : object
            The estimator, with ``algorithm_``, ``labels_`` and
            ``neighbors_`` set.
        """

        # Validate first, then rescale
        validated = check_array(X, dtype=np.float32)
        X = minmax_scale(validated)

        # The Somoclu object is only built when none exists yet; an existing
        # ``algorithm_`` (and its grid dimensions) is reused as is.
        if not hasattr(self, 'algorithm_'):

            # Grid dimensions: square grid derived from ``n_clusters`` when
            # it is given, otherwise the user supplied columns and rows
            if self.n_clusters is None:
                self.n_columns_, self.n_rows_ = self.n_columns, self.n_rows
            else:
                side = int(self.n_clusters * (np.sqrt(len(X)) - 2) + 2)
                self.n_columns_ = self.n_rows_ = side

            somoclu_params = dict(
                n_columns=self.n_columns_,
                n_rows=self.n_rows_,
                initialcodebook=self.initialcodebook,
                kerneltype=self.kerneltype,
                maptype=self.maptype,
                gridtype=self.gridtype,
                compactsupport=self.compactsupport,
                neighborhood=self.neighborhood,
                std_coeff=self.std_coeff,
                initialization=self.initialization,
                data=None,
                verbose=self.verbose,
            )
            self.algorithm_ = Somoclu(**somoclu_params)

        # Train on the scaled data
        self.algorithm_.train(data=X, **fit_params)

        # One hashable grid label per row of ``bmus``
        grid_labels = list(map(tuple, self.algorithm_.bmus))

        # Map every distinct grid label to an integer cluster label
        labels_mapping = self._generate_labels_mapping(grid_labels)

        # Cluster label of each grid label, in the order of ``bmus``
        cluster_labels = [labels_mapping[label] for label in grid_labels]
        self.labels_ = np.array(cluster_labels)

        # Pairs of cluster labels whose neurons are adjacent on the grid
        self.neighbors_ = self._generate_neighbors(grid_labels, labels_mapping)

        return self
Beispiel #2
0
 def test_deterministic_codebook(self):
     """Check training from a fixed initial codebook against reference values.

     A 2x2 map is trained on two 2-feature samples, starting from an
     all-zero codebook, and the result is compared element by element with
     the expected codebook.
     """
     n_rows, n_columns = 2, 2
     n_features = 2
     codebook = np.zeros((n_rows * n_columns, n_features), dtype=np.float32)
     data = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
     som = Somoclu(n_columns, n_rows, data=data, initialcodebook=codebook,
                   compactsupport=False)
     som.train()
     correct_codebook = np.array([[[ 0.2       ,  0.30000001],
                                   [ 0.10359724,  0.20359723]],
                                  [[ 0.29640275,  0.39640275],
                                   [ 0.2       ,  0.30000001]]], dtype=np.float32)
     flat_size = n_rows * n_columns * n_features
     # NOTE(review): the comparison reads the array handed over as
     # ``initialcodebook``; this presumes Somoclu updates it in place --
     # confirm, or compare against the trained map's own codebook.
     # Compare element-wise with an absolute tolerance: a signed sum of the
     # differences lets errors of opposite sign cancel and accepts any
     # negative total (an untouched all-zero codebook would pass).
     self.assertTrue(np.allclose(codebook.reshape(flat_size),
                                 correct_codebook.reshape(flat_size),
                                 rtol=0, atol=10e-8))
Beispiel #3
0
 def test_deterministic_codebook(self):
     """Check training from a fixed initial codebook against reference values.

     A 2x2 map is created without data, trained on two 2-feature samples
     passed to ``train``, starting from an all-zero codebook, and the result
     is compared element by element with the expected codebook.
     """
     n_rows, n_columns = 2, 2
     n_features = 2
     codebook = np.zeros((n_rows * n_columns, n_features), dtype=np.float32)
     data = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
     som = Somoclu(n_columns, n_rows, initialcodebook=codebook,
                   compactsupport=False)
     som.train(data)
     correct_codebook = np.array([[[ 0.2       ,  0.30000001],
                                   [ 0.10359724,  0.20359723]],
                                  [[ 0.29640275,  0.39640275],
                                   [ 0.2       ,  0.30000001]]], dtype=np.float32)
     flat_size = n_rows * n_columns * n_features
     # NOTE(review): the comparison reads the array handed over as
     # ``initialcodebook``; this presumes Somoclu updates it in place --
     # confirm, or compare against the trained map's own codebook.
     # Compare element-wise with an absolute tolerance: a signed sum of the
     # differences lets errors of opposite sign cancel and accepts any
     # negative total (an untouched all-zero codebook would pass).
     self.assertTrue(np.allclose(codebook.reshape(flat_size),
                                 correct_codebook.reshape(flat_size),
                                 rtol=0, atol=10e-8))
Beispiel #4
0
class SOM(BaseEstimator, ClusterMixin):
    """Class for training and visualizing a self-organizing map.

    Parameters
    ----------

    n_columns : int, default: 5
        The number of columns in the map.

    n_rows : int, default: 5
        The number of rows in the map.

    n_clusters : float, default: None
        The proportion of clusters relative to the number of samples of the
        input space. If this is not None then `n_columns` and `n_rows` are
        ignored and a square grid is used instead.

    initialcodebook : 2D numpy.array of float32 or None, default: None
        Define the codebook to start the training.

    kerneltype : int, default: 0
        Specify which kernel to use.

        0 for dense CPU kernel.

        1 for dense GPU kernel if compiled with it.

    maptype : str, default: "planar"
        Specify the map topology.

        "planar" for planar map.

        "toroid" for toroid map.

    gridtype : str, default: "rectangular"
        Specify the grid form of the nodes.

        "rectangular" for rectangular neurons.

        "hexagonal" for hexagonal neurons.

    compactsupport : bool, default: True
        Cut off map updates beyond the training radius with the Gaussian
        neighborhood.

    neighborhood : str, default: "gaussian"
        Specify the neighborhood.

        "gaussian" for Gaussian neighborhood.

        "bubble" for bubble neighborhood function.

    std_coeff : float, default: 0.5
        Set the coefficient in the Gaussian neighborhood function
        exp(-||x-y||^2/(2*(coeff*radius)^2)).

    initialization : str or None, default: None
        Specify the codebook initialization.

        "random" for random weights in the codebook.

        "pca": codebook is initialized from the first subspace spanned by the
        first two eigenvectors of the correlation matrix.

    verbose : int, default: 0
        Specify verbosity level (0, 1, or 2).

    Attributes
    ----------

    algorithm_ : Somoclu
        The underlying Somoclu object, created on the first call to `fit`.

    n_columns_, n_rows_ : int
        The grid dimensions passed to Somoclu.

    labels_ : numpy.array
        Cluster label for each row of ``algorithm_.bmus``.

    neighbors_ : list of tuple
        Unordered pairs of cluster labels whose neurons are adjacent on the
        grid.
    """

    _attributes = ['train', 'codebook', 'bmus']

    def __init__(self,
                 n_columns=5,
                 n_rows=5,
                 n_clusters=None,
                 initialcodebook=None,
                 kerneltype=0,
                 maptype="planar",
                 gridtype="rectangular",
                 compactsupport=True,
                 neighborhood="gaussian",
                 std_coeff=0.5,
                 initialization=None,
                 verbose=0):

        self.n_columns = n_columns
        self.n_rows = n_rows
        self.n_clusters = n_clusters
        self.initialcodebook = initialcodebook
        self.kerneltype = kerneltype
        self.maptype = maptype
        self.gridtype = gridtype
        self.compactsupport = compactsupport
        self.neighborhood = neighborhood
        self.std_coeff = std_coeff
        self.initialization = initialization
        self.verbose = verbose

    @staticmethod
    def _generate_labels_mapping(grid_labels):
        """Generate a mapping between grid labels and cluster labels.

        The distinct grid labels, in the sorted order returned by
        ``np.unique``, are numbered consecutively starting from 0.
        """

        # Identify unique grid labels
        unique_labels = [
            tuple(grid_label) for grid_label in np.unique(grid_labels, axis=0)
        ]

        # Generate mapping
        return {
            grid_label: cluster_label
            for cluster_label, grid_label in enumerate(unique_labels)
        }

    def _return_topological_neighbors(self, col, row, occupied=None):
        """Return the topological neighbors of a neuron.

        Only neighbors that lie inside the grid and that appear in
        ``algorithm_.bmus`` are returned.

        Parameters
        ----------
        col, row : int
            Grid position of the neuron.

        occupied : set of tuple or None, default: None
            Grid positions present in ``algorithm_.bmus``. When None, the set
            is built from ``algorithm_.bmus``; callers that query many
            neurons can pass it to avoid rebuilding it on every call.

        Returns
        -------
        topological_neighbors : list of tuple
            The (col, row) positions of the neighbors.
        """

        # Build the lookup once per call instead of converting and scanning
        # the whole BMU array for every candidate
        if occupied is None:
            occupied = {tuple(bmu) for bmu in self.algorithm_.bmus.tolist()}

        # Return common topological neighbors for the two grid types
        candidates = [(col - 1, row), (col + 1, row),
                      (col, row - 1), (col, row + 1)]

        # Append extra topological neighbors for hexagonal grid type; the
        # side on which they lie alternates with the parity of the row
        if self.gridtype == 'hexagonal':
            offset = (-1)**row
            candidates += [(col - offset, row - offset),
                           (col - offset, row + offset)]

        # Apply constraints
        return [
            (n_col, n_row) for n_col, n_row in candidates
            if 0 <= n_col < self.n_columns_ and 0 <= n_row < self.n_rows_
            and (n_col, n_row) in occupied
        ]

    def _generate_neighbors(self, grid_labels, labels_mapping):
        """Generate pairs of neighboring labels.

        Parameters
        ----------
        grid_labels : iterable of (col, row)
            Grid labels to inspect; duplicates are processed once.

        labels_mapping : dict
            Mapping from grid label to cluster label.

        Returns
        -------
        neighbors : list of tuple
            Unordered pairs of cluster labels whose neurons are adjacent.
        """

        # Positions hit by at least one sample, shared by all lookups below
        occupied = {tuple(bmu) for bmu in self.algorithm_.bmus.tolist()}

        # Every sample mapped to the same neuron yields the same pairs, so
        # each distinct grid label is inspected once
        distinct_labels = {tuple(grid_label) for grid_label in grid_labels}

        # Generate cluster neighbors
        all_neighbors = [
            (labels_mapping[grid_label], labels_mapping[neighbor])
            for grid_label in distinct_labels
            for neighbor in self._return_topological_neighbors(
                *grid_label, occupied=occupied)
        ]
        if not all_neighbors:
            return []

        # Sort and deduplicate the ordered pairs
        all_neighbors = [
            tuple(pair) for pair in np.unique(all_neighbors, axis=0)
        ]

        # Keep unique unordered pairs
        seen = set()
        neighbors = []
        for pair in all_neighbors:
            if pair not in seen and pair[::-1] not in seen:
                seen.add(pair)
                neighbors.append(pair)

        return neighbors

    def fit(self, X, y=None, **fit_params):
        """Train the self-organizing map.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Keyword arguments forwarded to the ``train`` method of the
            Somoclu object.

        Returns
        -------
        self : object
            The estimator, with ``algorithm_``, ``labels_`` and
            ``neighbors_`` set.
        """

        # Check and normalize input data
        X = minmax_scale(check_array(X, dtype=np.float32))

        # Initialize Somoclu object. An existing ``algorithm_`` is reused, so
        # parameters changed after the first call to ``fit`` are not applied.
        if not hasattr(self, 'algorithm_'):

            # Set number of columns and rows from number of clusters
            if self.n_clusters is not None:
                self.n_columns_ = self.n_rows_ = int(self.n_clusters *
                                                     (np.sqrt(len(X)) - 2) + 2)
            else:
                self.n_columns_, self.n_rows_ = self.n_columns, self.n_rows

            # Create object
            self.algorithm_ = Somoclu(n_columns=self.n_columns_,
                                      n_rows=self.n_rows_,
                                      initialcodebook=self.initialcodebook,
                                      kerneltype=self.kerneltype,
                                      maptype=self.maptype,
                                      gridtype=self.gridtype,
                                      compactsupport=self.compactsupport,
                                      neighborhood=self.neighborhood,
                                      std_coeff=self.std_coeff,
                                      initialization=self.initialization,
                                      data=None,
                                      verbose=self.verbose)

        # Fit Somoclu
        self.algorithm_.train(data=X, **fit_params)

        # Grid labels
        grid_labels = [
            tuple(grid_label) for grid_label in self.algorithm_.bmus
        ]

        # Generate labels mapping
        labels_mapping = self._generate_labels_mapping(grid_labels)

        # Generate cluster labels
        self.labels_ = np.array(
            [labels_mapping[grid_label] for grid_label in grid_labels])

        # Generate labels neighbors
        self.neighbors_ = self._generate_neighbors(grid_labels, labels_mapping)

        return self

    def fit_predict(self, X, y=None, **fit_params):
        """Train the self-organizing map and assign a cluster label to each sample.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Keyword arguments forwarded to `fit`.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        return self.fit(X, **fit_params).labels_
Beispiel #5
0
    def fit(self, X, y=None, **fit_params):
        """Train the self-organizing map.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Keyword arguments forwarded to the ``train`` method of the
            Somoclu object.

        Returns
        -------
        self : object
            The estimator, with ``algorithm_``, ``labels_mapping_``,
            ``labels_`` and ``neighbors_`` set.
        """

        # Check and normalize input data
        X = minmax_scale(check_array(X, dtype=np.float32))

        # Check random_state
        self.random_state_ = check_random_state(self.random_state)

        # Initialize codebook
        if self.initialcodebook is None:
            if self.random_state is None:
                # No seed given: leave the random initialization to Somoclu
                initialcodebook = None
                initialization = 'random'
            else:
                # Seeded: draw the flat float32 codebook from the checked
                # random state so the starting point is reproducible
                codebook_size = self.n_columns * self.n_rows * X.shape[1]
                initialcodebook = self.random_state_.random_sample(
                    codebook_size).astype(np.float32)
                initialization = None
        elif (isinstance(self.initialcodebook, str)
              and self.initialcodebook == 'pca'):
            # The string check keeps array codebooks out of the comparison
            # (``array == 'pca'`` is not a plain boolean). 'pca' has to be
            # forwarded as the initialization method; passing 'random' here
            # would silently ignore the request.
            initialcodebook = None
            initialization = 'pca'
        else:
            initialcodebook = self.initialcodebook
            initialization = None

        # Create Somoclu object
        self.algorithm_ = Somoclu(
            n_columns=self.n_columns,
            n_rows=self.n_rows,
            initialcodebook=initialcodebook,
            kerneltype=self.kerneltype,
            maptype=self.maptype,
            gridtype=self.gridtype,
            compactsupport=self.compactsupport,
            neighborhood=self.neighborhood,
            std_coeff=self.std_coeff,
            initialization=initialization,
            data=None,
            verbose=self.verbose,
        )

        # Fit Somoclu
        self.algorithm_.train(data=X, **fit_params)

        # Grid labels
        grid_labels = [
            tuple(grid_label) for grid_label in self.algorithm_.bmus
        ]

        # Generate labels mapping
        self.labels_mapping_ = self._generate_labels_mapping(grid_labels)

        # Generate cluster labels
        self.labels_ = np.array(
            [self.labels_mapping_[grid_label] for grid_label in grid_labels])

        # Generate labels neighbors from the distinct grid labels only
        self.neighbors_ = self._generate_neighbors(
            np.unique(grid_labels, axis=0), self.labels_mapping_)

        return self
Beispiel #6
0
class SOM(BaseEstimator, ClusterMixin):
    """Class to fit and visualize a Self-Organizing Map (SOM).

    The implementation uses SOM from Somoclu.

    Read more in the :ref:`User Guide <user_guide>`.

    Parameters
    ----------

    n_columns : int, optional (default=5)
        The number of columns in the map.

    n_rows : int, optional (default=5)
        The number of rows in the map.

    initialcodebook : 2D numpy.array of float32, str or None, optional (default=None)
        Define the codebook to start the training. If ``initialcodebook='pca'`` then
        the codebook is initialized from the first subspace spanned by the first two
        eigenvectors of the correlation matrix.

    kerneltype : int, optional (default=0)
        Specify which kernel to use. If ``kerneltype=0`` use dense CPU kernel.
        Else if ``kerneltype=1`` use dense GPU kernel if compiled with it.

    maptype : str, optional (default='planar')
        Specify the map topology. If ``maptype='planar'`` use planar map.
        Else if ``maptype='toroid'`` use toroid map.

    gridtype : str, optional (default='rectangular')
        Specify the grid form of the nodes. If ``gridtype='rectangular'``
        use rectangular neurons. Else if ``gridtype='hexagonal'`` use
        hexagonal neurons.

    compactsupport : bool, optional (default=True)
        Cut off map updates beyond the training radius with the Gaussian neighborhood.

    neighborhood : str, optional (default='gaussian')
        Specify the neighborhood. If ``neighborhood='gaussian'`` use
        Gaussian neighborhood. Else if ``neighborhood='bubble'`` use
        bubble neighborhood function.

    std_coeff : float, optional (default=0.5)
        Set the coefficient in the Gaussian
        neighborhood :math:`exp(-||x-y||^2/(2*(coeff*radius)^2))`.

    random_state : int, RandomState instance or None, optional (default=None)
        Control the randomization of the algorithm by specifying the
        codebook initialization. It is ignored when ``initialcodebook`` is
        not ``None``.

        - If int, ``random_state`` is the seed used by the random number
          generator that draws the initial codebook.
        - If ``RandomState`` instance, random_state is the random number
          generator that draws the initial codebook.
        - If ``None``, no codebook is drawn here and Somoclu is asked for
          its own ``'random'`` initialization.

    verbose : int, optional (default=0)
        Specify verbosity level (0, 1, or 2).

    Attributes
    ----------

    algorithm_ : Somoclu
        The underlying Somoclu object, recreated on every call to ``fit``.

    random_state_ : RandomState
        The result of ``check_random_state(random_state)``.

    labels_mapping_ : dict
        Mapping from grid label ``(col, row)`` to cluster label.

    labels_ : numpy.array
        Cluster label for each row of ``algorithm_.bmus``.

    neighbors_ : numpy.array
        Unordered pairs of cluster labels whose neurons are adjacent on the
        grid.

    """

    _attributes = ['train', 'codebook', 'bmus']

    def __init__(
        self,
        n_columns=5,
        n_rows=5,
        initialcodebook=None,
        kerneltype=0,
        maptype="planar",
        gridtype="rectangular",
        compactsupport=True,
        neighborhood="gaussian",
        std_coeff=0.5,
        random_state=None,
        verbose=0,
    ):

        self.n_columns = n_columns
        self.n_rows = n_rows
        self.initialcodebook = initialcodebook
        self.kerneltype = kerneltype
        self.maptype = maptype
        self.gridtype = gridtype
        self.compactsupport = compactsupport
        self.neighborhood = neighborhood
        self.std_coeff = std_coeff
        self.random_state = random_state
        self.verbose = verbose

    @staticmethod
    def _generate_labels_mapping(grid_labels):
        """Generate a mapping between grid labels and cluster labels.

        The distinct grid labels, in the sorted order returned by
        ``np.unique``, are numbered consecutively starting from 0.
        """

        # Identify unique grid labels
        unique_labels = [
            tuple(grid_label) for grid_label in np.unique(grid_labels, axis=0)
        ]

        # Generate mapping
        return {
            grid_label: cluster_label
            for cluster_label, grid_label in enumerate(unique_labels)
        }

    def _return_topological_neighbors(self, col, row, occupied=None):
        """Return the topological neighbors of a neuron.

        Only neighbors that lie inside the grid and that appear in
        ``algorithm_.bmus`` are returned.

        Parameters
        ----------
        col, row : int
            Grid position of the neuron.

        occupied : set of tuple or None, optional (default=None)
            Grid positions present in ``algorithm_.bmus``. When ``None``, the
            set is built from ``algorithm_.bmus``; callers that query many
            neurons can pass it to avoid rebuilding it on every call.

        Returns
        -------
        topological_neighbors : list of tuple
            The (col, row) positions of the neighbors.
        """

        # Build the lookup once per call instead of converting and scanning
        # the whole BMU array for every candidate
        if occupied is None:
            occupied = {tuple(bmu) for bmu in self.algorithm_.bmus.tolist()}

        # Return common topological neighbors for the two grid types
        candidates = [
            (col - 1, row),
            (col + 1, row),
            (col, row - 1),
            (col, row + 1),
        ]

        # Append extra topological neighbors for hexagonal grid type; the
        # side on which they lie alternates with the parity of the row
        if self.gridtype == 'hexagonal':
            offset = (-1)**row
            candidates += [
                (col - offset, row - offset),
                (col - offset, row + offset),
            ]

        # Apply constraints
        return [
            (n_col, n_row) for n_col, n_row in candidates
            if 0 <= n_col < self.n_columns and 0 <= n_row < self.n_rows
            and (n_col, n_row) in occupied
        ]

    def _generate_neighbors(self, grid_labels, labels_mapping):
        """Generate pairs of neighboring labels.

        Parameters
        ----------
        grid_labels : iterable of (col, row)
            Grid labels to inspect; duplicates are processed once.

        labels_mapping : dict
            Mapping from grid label to cluster label.

        Returns
        -------
        neighbors : numpy.array
            Unordered pairs of cluster labels whose neurons are adjacent,
            one pair per row; empty when there are none.
        """

        # Positions hit by at least one sample, shared by all lookups below
        occupied = {tuple(bmu) for bmu in self.algorithm_.bmus.tolist()}

        # Rows of an array are not hashable, so turn every label into a tuple
        distinct_labels = {tuple(grid_label) for grid_label in grid_labels}

        # Generate cluster neighbors
        all_neighbors = [
            (labels_mapping[grid_label], labels_mapping[neighbor])
            for grid_label in distinct_labels
            for neighbor in self._return_topological_neighbors(
                *grid_label, occupied=occupied)
        ]
        if not all_neighbors:
            return np.array([])

        # Sort and deduplicate the ordered pairs
        all_neighbors = [
            tuple(pair) for pair in np.unique(all_neighbors, axis=0)
        ]

        # Keep unique unordered pairs
        seen = set()
        neighbors = []
        for pair in all_neighbors:
            if pair not in seen and pair[::-1] not in seen:
                seen.add(pair)
                neighbors.append(pair)

        return np.array(neighbors)

    def fit(self, X, y=None, **fit_params):
        """Train the self-organizing map.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Keyword arguments forwarded to the ``train`` method of the
            Somoclu object.

        Returns
        -------
        self : object
            The estimator, with ``algorithm_``, ``labels_mapping_``,
            ``labels_`` and ``neighbors_`` set.
        """

        # Check and normalize input data
        X = minmax_scale(check_array(X, dtype=np.float32))

        # Check random_state
        self.random_state_ = check_random_state(self.random_state)

        # Initialize codebook
        if self.initialcodebook is None:
            if self.random_state is None:
                # No seed given: leave the random initialization to Somoclu
                initialcodebook = None
                initialization = 'random'
            else:
                # Seeded: draw the flat float32 codebook from the checked
                # random state so the starting point is reproducible
                codebook_size = self.n_columns * self.n_rows * X.shape[1]
                initialcodebook = self.random_state_.random_sample(
                    codebook_size).astype(np.float32)
                initialization = None
        elif (isinstance(self.initialcodebook, str)
              and self.initialcodebook == 'pca'):
            # The string check keeps array codebooks out of the comparison
            # (``array == 'pca'`` is not a plain boolean). 'pca' has to be
            # forwarded as the initialization method; passing 'random' here
            # would silently ignore the documented option.
            initialcodebook = None
            initialization = 'pca'
        else:
            initialcodebook = self.initialcodebook
            initialization = None

        # Create Somoclu object
        self.algorithm_ = Somoclu(
            n_columns=self.n_columns,
            n_rows=self.n_rows,
            initialcodebook=initialcodebook,
            kerneltype=self.kerneltype,
            maptype=self.maptype,
            gridtype=self.gridtype,
            compactsupport=self.compactsupport,
            neighborhood=self.neighborhood,
            std_coeff=self.std_coeff,
            initialization=initialization,
            data=None,
            verbose=self.verbose,
        )

        # Fit Somoclu
        self.algorithm_.train(data=X, **fit_params)

        # Grid labels
        grid_labels = [
            tuple(grid_label) for grid_label in self.algorithm_.bmus
        ]

        # Generate labels mapping
        self.labels_mapping_ = self._generate_labels_mapping(grid_labels)

        # Generate cluster labels
        self.labels_ = np.array(
            [self.labels_mapping_[grid_label] for grid_label in grid_labels])

        # Generate labels neighbors from the distinct grid labels only
        self.neighbors_ = self._generate_neighbors(
            np.unique(grid_labels, axis=0), self.labels_mapping_)

        return self

    def fit_predict(self, X, y=None, **fit_params):
        """Train the self-organizing map and assign a cluster label to each sample.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training instances to cluster.

        y : Ignored

        **fit_params : dict
            Keyword arguments forwarded to ``fit``.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        return self.fit(X, **fit_params).labels_