Example #1
File: core.py Project: torfjelde/ml
    def free_energy(self, v, beta=1.0, raw=False):
        if self.hidden_type == UnitType.BERNOULLI:
            hidden = self.h_bias + np.matmul((v / self.v_sigma), self.W)
            hidden *= beta
            hidden = -np.sum(np.log(1.0 + np.exp(np.clip(hidden, -30, 30))),
                             axis=1)
        elif self.hidden_type == UnitType.GAUSSIAN:
            # TODO: verify the Gaussian-hidden formula below;
            # implemented, but not yet thoroughly checked.
            hidden = np.sum(
                1 / (2 * self.h_sigma) *
                (self.h_bias**2 -
                 (self.h_bias +
                  self.h_sigma * np.matmul(v / self.v_sigma, self.W))**2),
                axis=1)
            hidden -= 0.5 * self.num_hidden * np.log(2 * np.pi) + np.sum(
                np.log(self.h_sigma))
            # raise NotImplementedError()
        else:
            raise ValueError(f"unknown type {self.hidden_type}")

        if self.visible_type == UnitType.BERNOULLI:
            visible = -np.matmul(v, self.v_bias)
            visible *= beta
        elif self.visible_type == UnitType.GAUSSIAN:
            visible = 0.5 * np.sum(
                ((v - self.v_bias)**2) /
                (self.v_sigma**2 / beta + np.finfo(np.float32).eps),
                axis=1)
        else:
            raise ValueError(f"unknown type {self.visible_type}")

        # return per-sample free energies if `raw`, otherwise the batch mean
        if raw:
            return hidden + visible
        else:
            return np.mean(hidden + visible)
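For reference, the Gaussian-visible / Bernoulli-hidden branches above appear to compute the standard RBM free energy at inverse temperature beta (a sketch in LaTeX; b, c and sigma denote `v_bias`, `h_bias` and `v_sigma`):

    F_\beta(v) = \frac{\beta}{2} \sum_i \frac{(v_i - b_i)^2}{\sigma_i^2}
                 - \sum_j \log\!\left(1 + \exp\!\left(\beta \Big(c_j + \sum_i \frac{v_i}{\sigma_i} W_{ij}\Big)\right)\right)

In the Bernoulli-visible case the quadratic term is replaced by -\beta \, v^\top b, matching the `visible` branch above.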
Example #2
def test_metropolis_hastings(samples, proba, proposal_proba, proposal_sample):
    # initialize kernel
    kernel = sampling.MetropolisHastingsKernel(proba, proposal_sample,
                                               proposal_proba)

    # test kernel
    state = 1.0
    state = kernel.sample(state)
    print(state)

    # test sampler
    sampler = sampling.Sampler(kernel, show_progress=True)
    trace = sampler.run(initial=state)

    # verify the sampler produces reasonable results
    target_mean = np.mean(samples)
    target_std = np.std(samples)
    assert np.abs(np.mean(trace) - target_mean) < 0.1
    assert np.abs(np.std(trace) - target_std) < 0.1
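The kernel internals are not shown in this listing; as a rough sketch of what a single Metropolis-Hastings transition does (the names `p`, `q_sample` and `q_proba` are placeholders here, not the project's API):

import numpy as np

def mh_step(x, p, q_sample, q_proba, rng=np.random):
    # propose a new state from the proposal distribution
    x_new = q_sample(x)
    # q_proba(a, b) is the density of proposing `b` when the chain is at `a`
    ratio = (p(x_new) * q_proba(x_new, x)) / (p(x) * q_proba(x, x_new))
    # accept the proposal with probability min(1, ratio), otherwise stay put
    return x_new if rng.uniform() < min(1.0, ratio) else x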
Example #3
def test_gaussian_rbm(mnist_data):
    np.random.seed(RANDOM_SEED)
    X, _, _, _ = mnist_data
    batch_size = 10
    visible_size = X.shape[1]

    # Gaussian RBMs are VERY sensitive to params on MNIST
    # and `hidden_size == 300` just happens to work.
    hidden_size = 300

    X = ((X - np.mean(X, axis=0)) /
         (np.std(X, axis=0) + np.finfo(np.float32).eps))
    X[np.isnan(X)] = 1.0
    v = X[:batch_size].astype(np.float64)
    model = GaussianRBM(visible_size,
                        hidden_size,
                        estimate_visible_sigma=False)

    # verify shapes
    rbm_verify_shapes(model, v, batch_size, visible_size, hidden_size)

    # train :)
    rbm_train_single_sample(model, v)
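A minimal sketch of running the same checks on synthetic data instead of the `mnist_data` fixture (the random input is a stand-in; `GaussianRBM`, `rbm_verify_shapes` and `rbm_train_single_sample` are the helpers used above):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 784))  # synthetic stand-in for MNIST features
X = (X - X.mean(axis=0)) / (X.std(axis=0) + np.finfo(np.float32).eps)
v = X[:10].astype(np.float64)

model = GaussianRBM(X.shape[1], 300, estimate_visible_sigma=False)
rbm_verify_shapes(model, v, 10, X.shape[1], 300)
rbm_train_single_sample(model, v)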
Example #4
File: core.py Project: torfjelde/ml
    def fit(self,
            train_data,
            k=1,
            learning_rate=0.01,
            num_epochs=5,
            batch_size=64,
            test_data=None,
            show_progress=True,
            weight_decay=0.0,
            early_stopping=-1,
            callbacks={},
            **sampler_kwargs):
        """
        Parameters
        ----------
        train_data: array-like
            Data to fit RBM on.
        k: int, default=1
            Number of sampling steps to perform. Used by CD-k, PCD-k and PT.
        learning_rate: float or array, default=0.01
            Learning rate used when updating the parameters.
            Can also be an array of the same length as `self.variables`,
            in which case the learning rate at index `i` will be used
            to update ``RBM.variables[i]``.
        num_epochs: int, default=5
            Number of epochs to train.
        batch_size: int, default=64
            Batch size to use within each epoch.
        test_data: array-like, default=None
            Data similar to ``train_data``, but this will only be used as
            validation data, not trained on.
            If specified, will compute and print the free energy / negative
            log-likelihood on this dataset after each epoch.
        show_progress: bool, default=True
            If true, will display progress bar for each epoch.
        weight_decay: float, default=0.0
            If greater than 0.0, weight decay will be applied to the
            parameter updates. See :func:`RBM.step` for more information.
        early_stopping: int, default=-1
            If ``test_data`` is given and ``early_stopping > 0``, training
            will terminate after an epoch if the free energy on
            ``test_data`` has not improved over the last ``early_stopping``
            epochs.
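        callbacks: dict, default={}
            Optional hooks run during training. Recognized keys are
            ``"pre_epoch"``, ``"post_step"`` and ``"post_epoch"``, each
            mapping to a list of callables; see the calls in the body
            below for the arguments they receive.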

        Returns
        -------
        nlls_train, nlls_test : array-like, array-like
            Returns the free energy of both ``train_data`` and ``test_data``
            as computed at each epoch.

        """
        num_samples = train_data.shape[0]
        indices = np.arange(num_samples)
        np.random.shuffle(indices)

        nlls_train = []
        nlls = []

        prev_best = None

        for epoch in range(1, num_epochs + 1):
            if "pre_epoch" in callbacks:
                for c in callbacks["pre_epoch"]:
                    c(self, epoch)

            # reset sampler at beginning of epoch
            # Used by methods such as PCD to reset the
            # initialization value.
            self.reset_sampler()

            # compute train & test negative log-likelihood
            # TODO: compute train- and test-nll in mini-batches
            # to avoid numerical problems
            nll_train = float(np.mean(self.free_energy(train_data)))
            nlls_train.append(nll_train)
            _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (train):"
                      f" {nll_train:>20.5f}")

            if test_data is not None:
                nll = float(np.mean(self.free_energy(test_data)))
                _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (test):"
                          f"  {nll:>20.5f}")
                nlls.append(nll)

                # stop early if all `early_stopping` previous
                # evaluations on `test_data` did not improve.
                if early_stopping > 0:
                    if epoch > early_stopping and \
                       np.all([a >= prev_best for a in nlls[epoch - early_stopping:]]):
                        _log.info(
                            f"Hasn't improved in {early_stopping} epochs;"
                            " stopping early"
                        )
                        break
                    else:
                        # update `prev_best`
                        if prev_best is None:
                            prev_best = nll
                        elif nll < prev_best:
                            prev_best = nll

            # iterate through dataset in batches
            if show_progress:
                bar = tqdm(total=num_samples)
            for start in range(0, num_samples, batch_size):
                # ensure we don't go out-of-bounds
                end = min(start + batch_size, num_samples)

                # take a gradient-step
                self.step(train_data[indices[start:end]],
                          k=k,
                          lr=learning_rate,
                          lmbda=weight_decay,
                          **sampler_kwargs)

                if "post_step" in callbacks:
                    for c in callbacks["post_step"]:
                        c(self, epoch, end)

                # update progress
                if show_progress:
                    bar.update(end - start)

            if show_progress:
                bar.close()

            # shuffle indices for next epoch
            np.random.shuffle(indices)

            if "post_epoch" in callbacks:
                for c in callbacks["post_epoch"]:
                    c(self, epoch)

        # compute train & test negative log-likelihood after the final epoch
        nll_train = float(np.mean(self.free_energy(train_data)))
        nlls_train.append(nll_train)
        _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (train): "
                  f"{nll_train:>20.5f}")

        if test_data is not None:
            nll = float(np.mean(self.free_energy(test_data)))
            _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (test):  "
                      f"{nll:>20.5f}")
            nlls.append(nll)

        return nlls_train, nlls
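A hedged usage sketch of `fit` (the data arrays and model sizes are placeholders, not taken from the project; the callback signatures follow the calls inside `fit` above):

model = GaussianRBM(visible_size, hidden_size, estimate_visible_sigma=False)

callbacks = {
    # "pre_epoch" / "post_epoch" callables receive (rbm, epoch),
    # "post_step" callables receive (rbm, epoch, end)
    "post_epoch": [lambda rbm, epoch: print(f"finished epoch {epoch}")],
}

nlls_train, nlls_test = model.fit(train_data,
                                  k=1,
                                  learning_rate=0.01,
                                  num_epochs=5,
                                  batch_size=64,
                                  test_data=test_data,
                                  early_stopping=3,
                                  callbacks=callbacks)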
Example #5
File: core.py Project: torfjelde/ml
    def grad(self, v, burnin=-1, persist=False, **sampler_kwargs):
        if self.sampler_method.lower() == 'cd':
            v_0, h_0, v_k, h_k = self.contrastive_divergence(
                v, **sampler_kwargs)
        elif self.sampler_method.lower() == 'pcd':
            # Persistent Contrastive Divergence
            if self._prev is not None:
                v_0, h_0 = self._prev
            else:
                # if ``burnin`` is specified, use it to initialize the chain
                if burnin > 0:
                    _log.info(
                        f"Performing burnin of {burnin} steps to initialize PCD"
                    )
                    _, _, v_0, h_0 = self.contrastive_divergence(
                        v, k=burnin, **sampler_kwargs)
                else:
                    h_0 = self.sample_hidden(v, **sampler_kwargs)
                    v_0 = v

            v_0, h_0, v_k, h_k = self.contrastive_divergence(v,
                                                             h_0=h_0,
                                                             **sampler_kwargs)

            # persist
            self._prev = (v_k, h_k)

        elif self.sampler_method.lower() == 'pt':
            h_0 = None
            if self._prev is not None:
                v_0, h_0 = self._prev
            else:
                _log.info("Initializing PT chain...")
                v_0 = self._init_parallel_tempering(v, **sampler_kwargs)

            # FIXME: make compatible with `parallel_tempering` returning
            # all the states
            if h_0 is None:
                v_0, h_0, v_k, h_k = self.parallel_tempering(
                    v_0, hs=h_0, include_negative_shift=True, **sampler_kwargs)
            elif sampler_kwargs.get("include_negative_shift", False):
                v_0, h_0, v_k, h_k = self.parallel_tempering(v_0,
                                                             hs=h_0,
                                                             **sampler_kwargs)
            else:
                # FIXME: make compatible with `parallel_tempering` returning
                # all the states
                v_k, h_k = self.parallel_tempering(v_0,
                                                   hs=h_0,
                                                   **sampler_kwargs)

            if persist:
                self._prev = (v_k, h_k)

            # take the first tempered distribution, i.e. the one corresponding
            # to the target distribution
            v_0 = v_0[0]
            h_0 = h_0[0]
            v_k = v_k[0]
            h_k = h_k[0]
        else:
            raise ValueError(f"{self.sampler_method} is not supported")

        # all expressions below using `v` or `mean_h` will contain
        # AT LEAST one factor of `1 / v_sigma` and `1 / h_sigma`, respectively
        # so we include those right away
        v_0 = v_0 / self.v_sigma
        v_k = v_k / self.v_sigma
        mean_h_0 = self.mean_hidden(v_0) / self.h_sigma
        mean_h_k = self.mean_hidden(v_k) / self.h_sigma

        # Recall: `v_sigma` and `h_sigma` have no effect if they are set to 1
        # v_0 / (v_sigma^2) - v_k / (v_sigma^2)
        delta_v_bias = (v_0 - v_k) / self.v_sigma
        # E[h_0 | v_0] / (h_sigma^2) - E[h_k | v_k] / (h_sigma^2)
        delta_h_bias = (mean_h_0 - mean_h_k) / self.h_sigma

        # Gradient wrt. W
        # (v_0 / v_sigma) (1 / h_sigma) E[h_0 | v_0] - (v_k / v_sigma) (1 / h_sigma) E[h_k | v_k]
        x = mean_h_0.reshape(mean_h_0.shape[0], 1, mean_h_0.shape[1])
        y = v_0.reshape(v_0.shape[0], v_0.shape[1], 1)
        z_0 = np.matmul(y, x)

        x = mean_h_k.reshape(mean_h_k.shape[0], 1, mean_h_k.shape[1])
        y = v_k.reshape(v_k.shape[0], v_k.shape[1], 1)
        z_k = np.matmul(y, x)

        delta_W = z_0 - z_k

        # average over the batch and take the negative
        delta_v_bias = -np.mean(delta_v_bias, axis=0)
        delta_h_bias = -np.mean(delta_h_bias, axis=0)
        delta_W = -np.mean(delta_W, axis=0)

        grads = [delta_v_bias, delta_h_bias, delta_W]

        # variances
        if self.visible_type == UnitType.GAUSSIAN \
           and self.estimate_visible_sigma:
            # in `GaussianRBM`, where only the VISIBLE units are Gaussian,
            # we only estimate `v_sigma`
            # (((v_0 - b)^2 / (v_sigma^2)) - (v / (v_sigma)) \sum_{\mu} E[h_{\mu} | v] / sigma_{\mu}) / v_sigma
            delta_v_sigma_data = (((v_0 - (self.v_bias / self.v_sigma))**2) -
                                  v_0 * (np.matmul(mean_h_0, self.W.T)))
            delta_v_sigma_model = (((v_k - (self.v_bias / self.v_sigma))**2) -
                                   v_k * (np.matmul(mean_h_k, self.W.T)))
            delta_v_sigma = (delta_v_sigma_data -
                             delta_v_sigma_model) / self.v_sigma
            # average over the batch and take the negative
            delta_v_sigma = -np.mean(delta_v_sigma, axis=0)

            grads.append(delta_v_sigma)

        if self.hidden_type == UnitType.GAUSSIAN \
           and self.estimate_hidden_sigma:
            # TODO: Implement
            raise NotImplementedError("gradients for gaussian hidden"
                                      " units not yet implemented")

            delta_h_sigma_data = (((h_0 - (self.h_bias / self.h_sigma))**2) -
                                  h_0 * (np.matmul(mean_h_0, self.W.T)))
            delta_h_sigma_model = (((h_k - (self.h_bias / self.h_sigma))**2) -
                                   h_k * (np.matmul(mean_h_k, self.W.T)))
            delta_h_sigma = delta_h_sigma_data - delta_h_sigma_model
            # average over the batch and take the negative
            delta_h_sigma = -np.mean(delta_h_sigma, axis=0)

            grads.append(delta_h_sigma)

        return grads
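For reference, the updates assembled above correspond to the usual contrastive-divergence expressions (a sketch; angle brackets denote the batch mean, E[h | v] the hidden means, and the subscripts 0/k the data and model samples):

    \Delta b_v = -\Big\langle \frac{v_0 - v_k}{\sigma_v^2} \Big\rangle, \qquad
    \Delta b_h = -\Big\langle \frac{\mathbb{E}[h \mid v_0] - \mathbb{E}[h \mid v_k]}{\sigma_h^2} \Big\rangle, \qquad
    \Delta W = -\Big\langle \frac{v_0}{\sigma_v} \, \frac{\mathbb{E}[h \mid v_0]^\top}{\sigma_h}
                            - \frac{v_k}{\sigma_v} \, \frac{\mathbb{E}[h \mid v_k]^\top}{\sigma_h} \Big\rangle

With `v_sigma = h_sigma = 1` these reduce to the familiar CD-k gradients of the negative log-likelihood.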