class TrainTestSplitter(object):
    """
    A generic class for splitting data into (random) subsets.

    Parameters
    ----------
    shuffle : bool, optional
        Whether to shuffle the data.
    random_seed : None or int, optional
        Pseudo-random number generator seed used for random sampling.

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([1, 1, 2, 2, 3, 3, 3])

    >>> tts1 = TrainTestSplitter(shuffle=False)
    >>> train, test = tts1.split(y, train_ratio=0.5)
    >>> print(y[train], y[test])
    [1 1 2] [2 3 3 3]
    >>> train, test = tts1.split(y, train_ratio=0.5, stratify=True)
    >>> print(y[train], y[test])
    [1 2 3] [1 2 3 3]
    >>> for fold in tts1.make_k_folds(y, n_folds=3):
    ...     print(y[fold])
    [1 1 2]
    [2 3]
    [3 3]
    >>> for fold in tts1.make_k_folds(y, n_folds=3, stratify=True):
    ...     print(y[fold])
    [1 2 3]
    [1 2 3]
    [3]
    >>> for train, test in tts1.k_fold_split(y, n_splits=3):
    ...     print(y[train], y[test])
    [2 3 3 3] [1 1 2]
    [1 1 2 3 3] [2 3]
    [1 1 2 2 3] [3 3]
    >>> for train, test in tts1.k_fold_split(y, n_splits=3, stratify=True):
    ...     print(y[train], y[test])
    [1 2 3 3] [1 2 3]
    [1 2 3 3] [1 2 3]
    [1 2 3 1 2 3] [3]

    >>> tts2 = TrainTestSplitter(shuffle=True, random_seed=1337)
    >>> train, test = tts2.split(y, train_ratio=0.5)
    >>> print(y[train], y[test])
    [3 2 1] [2 1 3 3]
    >>> train, test = tts2.split(y, train_ratio=0.5, stratify=True)
    >>> print(y[train], y[test])
    [3 1 2] [3 3 2 1]
    >>> for fold in tts2.make_k_folds(y, n_folds=3):
    ...     print(y[fold])
    [3 2 1]
    [2 1]
    [3 3]
    >>> for fold in tts2.make_k_folds(y, n_folds=3, stratify=True):
    ...     print(y[fold])
    [3 1 2]
    [3 2 1]
    [3]
    """
    def __init__(self, shuffle=False, random_seed=None):
        self.shuffle = shuffle
        self.random_seed = random_seed
        self.rng = RNG(self.random_seed)

    def split(self, y, train_ratio=0.8, stratify=False):
        """
        Split data into train and test subsets.

        Parameters
        ----------
        y : (n_samples,) array-like
            The target variable for supervised learning problems.
        train_ratio : float, 0 < `train_ratio` < 1, optional
            The proportion of the dataset to include in the train split.
        stratify : bool, optional
            If True, the folds are made by preserving the percentage of samples
            for each class. Stratification is done based upon the `y` labels.

        Returns
        -------
        train : (n_train,) np.ndarray
            The training set indices for that split.
        test : (n_samples - n_train,) np.ndarray
            The testing set indices for that split.
        """
        self.rng.reseed()
        n = len(y)

        if not stratify:
            indices = (self.rng.permutation(n) if self.shuffle
                       else np.arange(n, dtype=int))
            train_size = int(train_ratio * n)
            return np.split(indices, (train_size,))

        # group indices by label
        labels_indices = {}
        for index, label in enumerate(y):
            if label not in labels_indices:
                labels_indices[label] = []
            labels_indices[label].append(index)

        train, test = np.array([], dtype=int), np.array([], dtype=int)
        for label, indices in sorted(labels_indices.items()):
            size = int(train_ratio * len(indices))
            train = np.concatenate((train, indices[:size]))
            test = np.concatenate((test, indices[size:]))

        if self.shuffle:
            self.rng.shuffle(train)
            self.rng.shuffle(test)

        return train, test

    def make_k_folds(self, y, n_folds=3, stratify=False):
        """
        Split data into folds of (approximately) equal size.

        Parameters
        ----------
        y : (n_samples,) array-like
            The target variable for supervised learning problems.
            Stratification is done based upon the `y` labels.
        n_folds : int, `n_folds` > 1, optional
            Number of folds.
        stratify : bool, optional
            If True, the folds are made by preserving the percentage of samples
            for each class. Stratification is done based upon the `y` labels.

        Yields
        ------
        fold : np.ndarray
            Indices for the current fold.
        """
        self.rng.reseed()
        n = len(y)

        if not stratify:
            indices = (self.rng.permutation(n) if self.shuffle
                       else np.arange(n, dtype=int))
            for fold in np.array_split(indices, n_folds):
                yield fold
            return

        # group indices by label (array labels are converted to tuples so
        # they can be used as dict keys)
        labels_indices = {}
        for index, label in enumerate(y):
            if isinstance(label, np.ndarray):
                label = tuple(label.tolist())
            if label not in labels_indices:
                labels_indices[label] = []
            labels_indices[label].append(index)

        # split the indices of each label into `n_folds` chunks
        for label, indices in sorted(labels_indices.items()):
            labels_indices[label] = np.array_split(indices, n_folds)

        # collect respective splits into folds and shuffle if needed
        for k in range(n_folds):
            fold = np.concatenate(
                [indices[k] for _, indices in sorted(labels_indices.items())])
            if self.shuffle:
                self.rng.shuffle(fold)
            yield fold

    def k_fold_split(self, y, n_splits=3, stratify=False):
        """
        Split data into train and test subsets for K-fold CV.

        Parameters
        ----------
        y : (n_samples,) array-like
            The target variable for supervised learning problems.
            Stratification is done based upon the `y` labels.
        n_splits : int, `n_splits` > 1, optional
            Number of folds.
        stratify : bool, optional
            If True, the folds are made by preserving the percentage of samples
            for each class. Stratification is done based upon the `y` labels.

        Yields
        ------
        train : (n_train,) np.ndarray
            The training set indices for the current split.
        test : (n_samples - n_train,) np.ndarray
            The testing set indices for the current split.
        """
        folds = list(self.make_k_folds(y, n_folds=n_splits, stratify=stratify))
        for i in range(n_splits):
            yield np.concatenate(folds[:i] + folds[(i + 1):]), folds[i]
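
# A minimal usage sketch (not part of the original snippet): wiring
# TrainTestSplitter into a cross-validation loop. `X` and `clf` (any
# estimator with fit/predict methods) are placeholder assumptions here.
#
#     tts = TrainTestSplitter(shuffle=True, random_seed=1337)
#     scores = []
#     for train, test in tts.k_fold_split(y, n_splits=5, stratify=True):
#         clf.fit(X[train], y[train])
#         scores.append(np.mean(clf.predict(X[test]) == y[test]))
#     print(np.mean(scores))
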
class RBM(BaseEstimator):
    """
    Examples
    --------
    >>> X = RNG(seed=1337).rand(32, 256)
    >>> rbm = RBM(n_hidden=100,
    ...           k=4,
    ...           batch_size=2,
    ...           n_epochs=50,
    ...           learning_rate='0.05->0.005',
    ...           momentum='0.5->0.9',
    ...           verbose=True,
    ...           early_stopping=5,
    ...           random_seed=1337)
    >>> rbm
    RBM(W=None, batch_size=2, best_W=None, best_epoch=None, best_hb=None,
      best_recon=inf, best_vb=None, early_stopping=5, epoch=0, hb=None, k=4,
      learning_rate='0.05->0.005', momentum='0.5->0.9', n_epochs=50,
      n_hidden=100, persistent=True, random_seed=1337, vb=None, verbose=True)
    """
    def __init__(self,
                 n_hidden=256,
                 persistent=True,
                 k=1,
                 batch_size=10,
                 n_epochs=10,
                 learning_rate=0.1,
                 momentum=0.9,
                 early_stopping=None,
                 verbose=False,
                 random_seed=None):
        self.n_hidden = n_hidden
        self.persistent = persistent
        self.k = k  # k in CD-k / PCD-k
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self._learning_rate = None
        self.momentum = momentum
        self._momentum = None
        self.early_stopping = early_stopping
        self._early_stopping = self.early_stopping
        self.verbose = verbose
        self.random_seed = random_seed

        self.W = None
        self.vb = None  # visible units bias
        self.hb = None  # hidden units bias
        self.epoch = 0

        self.best_W = None
        self.best_vb = None
        self.best_hb = None
        self.best_epoch = None
        self.best_recon = np.inf

        self._dW = None
        self._dvb = None
        self._dhb = None

        self._rng = None
        self._persistent = None
        self._initialized = False
        super(RBM, self).__init__(_y_required=False)

    def propup(self, v):
        """Propagate visible units activation upwards to the hidden units."""
        z = np.dot(v, self.W) + self.hb
        return sigmoid(z)

    def sample_h_given_v(self, v0_sample):
        """Infer state of hidden units given visible units."""
        h1_mean = self.propup(v0_sample)
        h1_sample = self._rng.binomial(size=h1_mean.shape, n=1, p=h1_mean)
        return h1_mean, h1_sample

    def propdown(self, h):
        """Propagate hidden units activation downwards to the visible units."""
        z = np.dot(h, self.W.T) + self.vb
        return sigmoid(z)

    def sample_v_given_h(self, h0_sample):
        """Infer state of visible units given hidden units."""
        v1_mean = self.propdown(h0_sample)
        v1_sample = self._rng.binomial(size=v1_mean.shape, n=1, p=v1_mean)
        return v1_mean, v1_sample

    def gibbs_hvh(self, h0_sample):
        """Performs a step of Gibbs sampling starting from the hidden units."""
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return v1_mean, v1_sample, h1_mean, h1_sample

    def gibbs_vhv(self, v0_sample):
        """Performs a step of Gibbs sampling starting from the visible units."""
        # a sketch mirroring `gibbs_hvh`; not used by `update`
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return h1_mean, h1_sample, v1_mean, v1_sample

    def free_energy(self, v_sample):
        """Compute the free energy, F(v) = -v.vb - sum(log(1 + exp(v.W + hb)))."""
        wx_b = np.dot(v_sample, self.W) + self.hb
        return (-np.dot(v_sample, self.vb)
                - np.sum(np.log(1. + np.exp(wx_b)), axis=1))

    def update(self, X_batch):
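        """Perform a single (P)CD-k update on a mini-batch and return its
        mean squared reconstruction error.

        This implements the usual contrastive-divergence gradient estimate,
        roughly dW ~ <v h>_data - <v h>_model, with the model statistics
        taken from the end of a k-step Gibbs chain.
        """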
        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(X_batch)

        # decide how to initialize chain
        if self._persistent is not None:
            chain_start = self._persistent
        else:
            chain_start = ph_sample

        # run k steps of Gibbs sampling, starting the chain from ph_sample
        # (CD-k) or from the persisted chain state (PCD-k)
        for step in range(self.k):
            nv_means, nv_samples, nh_means, nh_samples = self.gibbs_hvh(
                chain_start if step == 0 else nh_samples)

        # momentum-smoothed updates; note the weight statistics are summed
        # over the batch while the bias statistics are averaged
        self._dW = self._momentum * self._dW + \
                   np.dot(X_batch.T, ph_mean) - np.dot(nv_samples.T, nh_means)
        self._dvb = self._momentum * self._dvb + \
                    np.mean(X_batch - nv_samples, axis=0)
        self._dhb = self._momentum * self._dhb + \
                    np.mean(ph_mean - nh_means, axis=0)
        self.W += self._learning_rate * self._dW
        self.vb += self._learning_rate * self._dvb
        self.hb += self._learning_rate * self._dhb

        # persist the end of the Gibbs chain for PCD-k
        if self.persistent:
            self._persistent = nh_samples

        return np.mean(np.square(X_batch - nv_means))

    def batch_iter(self, X):
        n_batches = len(X) // self.batch_size
        for i in range(n_batches):
            start = i * self.batch_size
            end = start + self.batch_size
            yield X[start:end]
        # yield the smaller leftover batch, if any
        if n_batches * self.batch_size < len(X):
            yield X[n_batches * self.batch_size:]

    def train_epoch(self, X):
        mean_recons = []
        # print a progress dot roughly 16 times per epoch
        dot_every = max(1, len(X) // (self.batch_size * 16))
        for i, X_batch in enumerate(self.batch_iter(X)):
            mean_recons.append(self.update(X_batch))
            if self.verbose and i % dot_every == 0:
                print_inline('.')
        if self.verbose:
            print_inline(' ')
        return np.mean(mean_recons)

    def _fit(self, X):
        if not self._initialized:
            layer = FullyConnected(self.n_hidden,
                                   bias=0.,
                                   random_seed=self.random_seed)
            layer.setup_weights(X.shape)
            self.W = layer.W
            self.vb = np.zeros(X.shape[1])
            self.hb = layer.b
            self._dW = np.zeros_like(self.W)
            self._dvb = np.zeros_like(self.vb)
            self._dhb = np.zeros_like(self.hb)
            self._rng = RNG(self.random_seed)
            # otherwise every call to fit would re-initialize the weights
            self._initialized = True
        self._rng.reseed()
        timer = Stopwatch(verbose=False).start()
        for _ in range(self.n_epochs):
            self.epoch += 1
            if self.verbose:
                print_inline('Epoch {0:>{1}}/{2} '.format(
                    self.epoch, len(str(self.n_epochs)), self.n_epochs))

            # the learning rate may be given as an 'S->F' string: it is then
            # annealed from S to F along a saturating exponential curve
            if isinstance(self.learning_rate, str):
                S, F = map(float, self.learning_rate.split('->'))
                self._learning_rate = S + (F - S) * (
                    1. - np.exp(-(self.epoch - 1.) / 8.)) / (
                        1. - np.exp(-(self.n_epochs - 1.) / 8.))
            else:
                self._learning_rate = self.learning_rate

            # momentum is annealed the same way, just on a faster time scale
            if isinstance(self.momentum, str):
                S, F = map(float, self.momentum.split('->'))
                self._momentum = S + (F - S) * (
                    1. - np.exp(-(self.epoch - 1) / 4.)) / (
                        1. - np.exp(-(self.n_epochs - 1) / 4.))
            else:
                self._momentum = self.momentum

            mean_recon = self.train_epoch(X)
            if mean_recon < self.best_recon:
                self.best_recon = mean_recon
                self.best_epoch = self.epoch
                self.best_W = self.W.copy()
                self.best_vb = self.vb.copy()
                self.best_hb = self.hb.copy()
                self._early_stopping = self.early_stopping
            msg = 'elapsed: {0} sec'.format(
                width_format(timer.elapsed(), default_width=5,
                             max_precision=2))
            msg += ' - recon. mse: {0}'.format(
                width_format(mean_recon, default_width=6, max_precision=4))
            msg += ' - best r-mse: {0}'.format(
                width_format(self.best_recon, default_width=6,
                             max_precision=4))
            if self.early_stopping:
                msg += ' {0}*'.format(self._early_stopping)
            if self.verbose:
                print(msg)
            if self._early_stopping == 0:
                return
            if self.early_stopping:
                self._early_stopping -= 1

    def _serialize(self, params):
        for attr in ('W', 'best_W', 'vb', 'best_vb', 'hb', 'best_hb'):
            if attr in params and params[attr] is not None:
                params[attr] = params[attr].tolist()
        return params

    def _deserialize(self, params):
        for attr in ('W', 'best_W', 'vb', 'best_vb', 'hb', 'best_hb'):
            if attr in params and params[attr] is not None:
                params[attr] = np.asarray(params[attr])
        return params
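
# A minimal usage sketch (not part of the original snippet; it assumes RNG
# and RBM are importable from this project, and that BaseEstimator exposes a
# fit() that dispatches to the _fit hook defined above):
#
#     X = RNG(seed=1337).rand(32, 256)      # 32 samples, 256 visible units
#     rbm = RBM(n_hidden=100, k=4, batch_size=2, n_epochs=50,
#               learning_rate='0.05->0.005',  # annealed across the epochs
#               momentum='0.5->0.9',
#               random_seed=1337)
#     rbm.fit(X)
#     H = rbm.propup(X)                     # hidden activations in (0, 1)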