Example #1
    def __max_gaussians_1d(self, means1: torch.Tensor, vars1: torch.Tensor, means2: torch.Tensor, vars2: torch.Tensor):
        """
        Computes the max of Gaussians over a single dimension. Each element of the parameter tensors corresponds to
        one of the parameters of one of the Gaussians.
        :param means1: a vector of gaussian means
        :param vars1: a vector of gaussian variances
        :param means2: a vector of gaussian means
        :param vars2: a vector of gaussian variances
        :return: tuple (means, vars) with means and variances, respectively in tensors
        """
        alpha = torch.sqrt(vars1 + vars2 + self._epsilon)
        beta = (means1 - means2) / alpha
        n = Normal(0, 1)
        cdf_beta = n.cdf(beta)
        cdf_neg_beta = n.cdf(-beta)
        pdf_beta = torch.exp(n.log_prob(beta))

        mean_max = means1 * cdf_beta + means2 * cdf_neg_beta + alpha * pdf_beta

        # the way the variance is calculated here may cause it to become negative. Epsilon is added to try to avoid that
        var_max = (vars1 + torch.pow(means1, 2)) * cdf_beta + (vars2 + torch.pow(means2, 2)) * cdf_neg_beta + \
                  (means1 + means2) * alpha * pdf_beta - torch.pow(mean_max, 2) + self._epsilon
        if torch.any(var_max < 0):
            raise ValueError("Pooling layer: variance is negative. Epsilon should be increased")

        return mean_max, var_max
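For reference, the quantities above are the standard moment-matching identities for the maximum of two independent Gaussians: with `alpha = sqrt(var1 + var2)` (the code adds `_epsilon` inside the square root for stability) and `beta = (mu1 - mu2) / alpha`, `E[max] = mu1 * Phi(beta) + mu2 * Phi(-beta) + alpha * phi(beta)` and `E[max^2] = (var1 + mu1^2) * Phi(beta) + (var2 + mu2^2) * Phi(-beta) + (mu1 + mu2) * alpha * phi(beta)`; `var_max` is then `E[max^2] - E[max]^2` plus `_epsilon`.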
Example #2
 def log_prior(self, theta):
     theta = self.scale_theta(theta)
     if self.prior == 'uniform':
         return 0
     else:
         prior = Normal(loc=self.prior_loc, scale=self.prior_scale)
         if self.num_bits is None:
             return sum_except_batch(prior.log_prob(theta))
         else:
             return sum_except_batch(torch.log(prior.cdf(theta+1)-prior.cdf(theta)+1e-12))
Example #3
def sample_truncated_normal_perturbations(
    X: Tensor,
    n_discrete_points: int,
    sigma: float,
    bounds: Tensor,
    qmc: bool = True,
) -> Tensor:
    r"""Sample points around `X`.

    Sample perturbed points around `X` such that the added perturbations
    are sampled from N(0, sigma^2 I) and truncated to be within [0,1]^d.

    Args:
        X: A `n x d`-dim tensor of starting points.
        n_discrete_points: The number of points to sample.
        sigma: The standard deviation of the additive gaussian noise for
            perturbing the points.
        bounds: A `2 x d`-dim tensor containing the bounds.
        qmc: A boolean indicating whether to use qmc.

    Returns:
        A `n_discrete_points x d`-dim tensor containing the sampled points.
    """
    X = normalize(X, bounds=bounds)
    d = X.shape[1]
    # sample points from N(X_center, sigma^2 I), truncated to be within
    # [0, 1]^d.
    if X.shape[0] > 1:
        rand_indices = torch.randint(X.shape[0], (n_discrete_points, ),
                                     device=X.device)
        X = X[rand_indices]
    if qmc:
        std_bounds = torch.zeros(2, d, dtype=X.dtype, device=X.device)
        std_bounds[1] = 1
        u = draw_sobol_samples(bounds=std_bounds, n=n_discrete_points,
                               q=1).squeeze(1)
    else:
        u = torch.rand((n_discrete_points, d), dtype=X.dtype, device=X.device)
    # compute bounds to sample from
    a = -X
    b = 1 - X
    # compute z-score of bounds
    alpha = a / sigma
    beta = b / sigma
    normal = Normal(0, 1)
    cdf_alpha = normal.cdf(alpha)
    # use inverse transform
    perturbation = normal.icdf(cdf_alpha + u *
                               (normal.cdf(beta) - cdf_alpha)) * sigma
    # add perturbation and clip points that are still outside
    perturbed_X = (X + perturbation).clamp(0.0, 1.0)
    return unnormalize(perturbed_X, bounds=bounds)
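The `perturbation` above is a standard inverse-transform draw from a zero-mean Gaussian truncated to `[a, b]`: with `alpha = a / sigma` and `beta = b / sigma`, a uniform `u` in `[0, 1]` is mapped through `icdf(cdf(alpha) + u * (cdf(beta) - cdf(alpha)))` and scaled by `sigma`, so each perturbed coordinate lands in `[0, 1]` up to numerical error (hence the final clamp).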
Example #4
 def log_prob(self, x):
     log_prob = torch.zeros(x.shape[0], self.num_mix)
     for d in range(self.num_dims):
         xd_low = 2 * (x[:, d] / 2**self.num_bits) - 1
         xd_high = 2 * ((x[:, d] + 1.0) / 2**self.num_bits) - 1
         xd_low[x[:, d] == 0] = -1e16
         xd_high[x[:, d] == 2**self.num_bits - 1] = +1e16
         for m in range(self.num_mix):
             dm = Normal(self.loc[m, d], self.log_scale[m, d].exp())
             prob_dm = dm.cdf(xd_high) - dm.cdf(xd_low)
             log_prob[:, m] += torch.log(prob_dm + self.eps)
     return torch.logsumexp(log_prob +
                            torch.log_softmax(self.logit_pi, dim=-1),
                            dim=-1)
Example #5
def marginal_calibration(pred_y_mean, pred_y_var, y_true, ax):
    mean = torch.Tensor(pred_y_mean).cpu().squeeze()
    var = torch.Tensor(pred_y_var).cpu().squeeze()
    y = torch.Tensor(y_true).cpu().squeeze()

    dist = Normal(mean, var.sqrt())

    # calc and display the difference between the empirical cdf and the avg predictive cdf (as in
    # "Gneiting, T., Balabdaoui, F., & Raftery, A. E. (2007). Probabilistic forecasts, calibration and sharpness.
    # Journal of the Royal Statistical Society: Series B (Statistical Methodology), 69(2), 243-268.")

    emp_cdf = lambda x: (y <= x.unsqueeze(-1)).double().mean(-1)
    avg_pred_cdf = lambda x: dist.cdf(x.unsqueeze(-1)).mean(-1)

    min_x = y.min()
    max_x = y.max()
    eps = np.abs(max_x - min_x) * 0.05
    min_x = min_x - eps
    max_x = max_x + eps

    step = (max_x - min_x) / 1000
    plt_x = torch.arange(min_x, max_x, step)

    pcdf = avg_pred_cdf(plt_x)
    ecdf = emp_cdf(plt_x)

    dif = pcdf - ecdf

    ax.plot(plt_x, dif, color='lightblue')

    ax.plot(plt_x,
            np.repeat(0, plt_x.shape),
            color='lightgray',
            linestyle="--",
            alpha=0.75)
Example #6
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate Expected Improvement on the candidate set X.

        Args:
            X: A `b1 x ... bk x 1 x d`-dim batched tensor of `d`-dim design points.
                Expected Improvement is computed for each point individually,
                i.e., what is considered are the marginal posteriors, not the
                joint.

        Returns:
            A `b1 x ... bk`-dim tensor of Expected Improvement values at the
            given design points `X`.
        """
        self.best_f = self.best_f.to(X)
        posterior = self.model.posterior(X)
        self._validate_single_output_posterior(posterior)
        mean = posterior.mean
        # deal with batch evaluation and broadcasting
        view_shape = mean.shape[:-2] if mean.dim() >= X.dim() else X.shape[:-2]
        mean = mean.view(view_shape)
        sigma = posterior.variance.clamp_min(1e-9).sqrt().view(view_shape)
        u = (mean - self.best_f.expand_as(mean)) / sigma
        if not self.maximize:
            u = -u
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        ucdf = normal.cdf(u)
        updf = torch.exp(normal.log_prob(u))
        ei = sigma * (updf + u * ucdf)
        return ei
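For reference, the last three lines implement the closed form of analytic Expected Improvement: with `u = (mean - best_f) / sigma` (negated when minimizing), `EI = sigma * (phi(u) + u * Phi(u))`, where `phi` and `Phi` are the standard normal density and CDF. The same expression recurs in several of the examples below.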
Example #7
 def mean(self, context, feedback=None):
     # get mean of truncated normal
     mean, std = self.cond_dist_params(context, feedback=feedback)
     std_normal = Normal(torch.zeros(mean.shape, device=mean.device), torch.ones(std.shape, device=std.device))
     adjusted_a = (0 - mean) / std
     additional = std * std_normal.log_prob(adjusted_a).exp() / (1 - std_normal.cdf(adjusted_a))
     return mean + additional
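This is the standard mean of a normal distribution truncated below at zero: with `alpha = (0 - mean) / std`, `E[X | X > 0] = mean + std * phi(alpha) / (1 - Phi(alpha))`, which is exactly the `additional` term added above.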
Example #8
def EI_fstar_known(model, x, fstar, min=True):
    '''
    This function calculates the Expected Improvement (EI) acquisition function when f* is known

    INPUT :
    model   : GPY.model - A GPY model from which we will estimate.
    x       : Float     - an x value to evaluate
    fstar   : Float     - the known optimum value f*
    min     : Boolean   - determines if the function is a minimisation or maximisation problem

    OUTPUT :
    output  : Tuple     - returns the estimated improvement and the posterior mean
    '''

    x = torch.from_numpy(np.array([[x]]).reshape(-1, xdims))
    #x = x.float()
    x = x.double()

    model.eval()
    posterior = model.posterior(x)
    mean = posterior.mean
    sigma = posterior.variance.clamp_min(1e-9).sqrt()
    val = mean - fstar
    u = val / sigma
    if min == True:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    ei = sigma * (updf + u * ucdf)

    return (ei.item(), mean.item())
Example #9
    def generate_parameters(self, parameter_id, **kwargs):
        if not self._model_initialized:
            return _random_config(self.searchspace_json, self.random_state)
        else:
            # random samples and pick best with model
            candidate_x = [
                _random_config(self.searchspace_json, self.random_state)
                for _ in range(self.sample_size)
            ]

            x_test = np.array(
                [np.array(list(xi.values())) for xi in candidate_x])
            m, v = self.model.predict(x_test)
            mean = torch.Tensor(m)
            sigma = torch.Tensor(v)
            u = (mean - torch.Tensor([0.95]).expand_as(mean)) / sigma
            normal = Normal(torch.zeros_like(u), torch.ones_like(u))
            ucdf = normal.cdf(u)
            updf = torch.exp(normal.log_prob(u))
            ei = sigma * (updf + u * ucdf)

            if self.optimize_mode == 'maximize':
                ind = torch.argmax(ei)
            else:
                ind = torch.argmin(ei)
            new_x = candidate_x[ind]
            return new_x
Example #10
    def eval(self, x: torch.FloatTensor,
             xe: torch.LongTensor) -> torch.FloatTensor:
        """
        minimize (-1 * EI,  -1 * PI, lcb)
        """
        with torch.no_grad():
            py, ps2 = self.model.predict(x, xe)
            noise = np.sqrt(2.0) * self.model.noise.sqrt()
            ps = ps2.sqrt()
            lcb = (py + noise * torch.randn(py.shape)) - self.kappa * ps
            normed = (
                (self.tau - self.eps - py - noise * torch.randn(py.shape)) /
                ps)
            dist = Normal(0., 1.)
            log_phi = dist.log_prob(normed)
            Phi = dist.cdf(normed)
            PI = Phi
            EI = ps * (Phi * normed + log_phi.exp())
            logEIapp = ps.log() - 0.5 * normed**2 - (normed**2 - 1).log()
            logPIapp = -0.5 * normed**2 - torch.log(-1 * normed) - torch.log(
                torch.sqrt(torch.tensor(2 * np.pi)))

            use_app = ~((normed > -6) & torch.isfinite(EI.log())
                        & torch.isfinite(PI.log())).reshape(-1)
            out = torch.zeros(x.shape[0], 3)
            out[:, 0] = lcb.reshape(-1)
            out[:, 1][use_app] = -1 * logEIapp[use_app].reshape(-1)
            out[:, 2][use_app] = -1 * logPIapp[use_app].reshape(-1)
            out[:, 1][~use_app] = -1 * EI[~use_app].log().reshape(-1)
            out[:, 2][~use_app] = -1 * PI[~use_app].log().reshape(-1)
            return out
Example #11
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate Expected Improvement on the candidate set X.

        Args:
            X: A `(b1 x ... bk) x 1 x d`-dim batched tensor of `d`-dim design points.
                Expected Improvement is computed for each point individually,
                i.e., what is considered are the marginal posteriors, not the
                joint.

        Returns:
            A `(b1 x ... bk)`-dim tensor of Expected Improvement values at the
            given design points `X`.
        """
        self.best_f = self.best_f.to(X)
        posterior = self.model.posterior(
            X=X, posterior_transform=self.posterior_transform)
        mean = posterior.mean
        # deal with batch evaluation and broadcasting
        view_shape = mean.shape[:-2] if mean.shape[-2] == 1 else mean.shape[:-1]
        mean = mean.view(view_shape)
        sigma = posterior.variance.clamp_min(1e-9).sqrt().view(view_shape)
        u = (mean - self.best_f.expand_as(mean)) / sigma
        if not self.maximize:
            u = -u
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        ucdf = normal.cdf(u)
        updf = torch.exp(normal.log_prob(u))
        ei = sigma * (updf + u * ucdf)
        return ei
Example #12
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate Constrained Expected Improvement on the candidate set X.

        Args:
            X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
                points each.

        Returns:
            A `(b)`-dim Tensor of Expected Improvement values at the given
            design points `X`.
        """
        posterior = self.model.posterior(X)
        means = posterior.mean.squeeze(dim=-2)  # (b) x t
        sigmas = posterior.variance.squeeze(dim=-2).sqrt().clamp_min(1e-9)  # (b) x t

        # (b) x 1
        mean_obj = means[..., [self.objective_index]]
        sigma_obj = sigmas[..., [self.objective_index]]
        u = (mean_obj - self.best_f.expand_as(mean_obj)) / sigma_obj
        if not self.maximize:
            u = -u
        normal = Normal(
            torch.zeros(1, device=u.device, dtype=u.dtype),
            torch.ones(1, device=u.device, dtype=u.dtype),
        )
        ei_pdf = torch.exp(normal.log_prob(u))  # (b) x 1
        ei_cdf = normal.cdf(u)
        ei = sigma_obj * (ei_pdf + u * ei_cdf)
        prob_feas = self._compute_prob_feas(X=X, means=means, sigmas=sigmas)
        ei = ei.mul(prob_feas)
        return ei.squeeze(dim=-1)
Example #13
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate Constrained Expected Improvement on the candidate set X.

        Args:
            X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
                points each.

        Returns:
            A `(b)`-dim Tensor of Expected Improvement values at the given
            design points `X`.
        """
        self.best_f = self.best_f.to(X)
        posterior = self.model.posterior(
            X=X, posterior_transform=self.posterior_transform)
        means = posterior.mean.squeeze(dim=-2)  # (b) x m
        sigmas = posterior.variance.squeeze(dim=-2).sqrt().clamp_min(
            1e-9)  # (b) x m

        # (b) x 1
        oi = self.objective_index
        mean_obj = means[..., oi:oi + 1]
        sigma_obj = sigmas[..., oi:oi + 1]
        u = (mean_obj - self.best_f.expand_as(mean_obj)) / sigma_obj
        if not self.maximize:
            u = -u
        normal = Normal(
            torch.zeros(1, device=u.device, dtype=u.dtype),
            torch.ones(1, device=u.device, dtype=u.dtype),
        )
        ei_pdf = torch.exp(normal.log_prob(u))  # (b) x 1
        ei_cdf = normal.cdf(u)
        ei = sigma_obj * (ei_pdf + u * ei_cdf)
        prob_feas = self._compute_prob_feas(X=X, means=means, sigmas=sigmas)
        ei = ei.mul(prob_feas)
        return ei.squeeze(dim=-1)
Example #14
    def forward(self, X: Tensor) -> Tensor:
        r"""
        Approximates E_n[CVaR[F]] as described in ApxCVaRKG.
        :param X: The decision variable `x` and the `\beta` value.
            Shape: batch x num_fantasies x num_starting_sols x 1 x (dim_x + 1) (see below)
        :return: -E_n[CVaR[F(x, W)]].
            Shape: batch x num_fantasies x num_starting_sols
            Note that the return value is negated since the optimizers we use do
            maximization.
        """
        if X.requires_grad:
            torch.set_grad_enabled(True)
        # ensure X has the correct dtype and device
        X = X.to(self.w_samples)
        # make sure X has proper shape, 4 dimensional to match the batch shape of rhoKG
        assert X.shape[-1] == self.dim_x + 1
        if X.dim() < 4:
            X = X.reshape(-1, *self.model._input_batch_shape, 1,
                          self.dim_x + 1)

        X_fant = X[..., :self.dim_x]  # batch x num_fantasies x n x 1 x dim_x
        beta = X[..., -1:]  # batch x num_fantasies x n x 1 x 1

        # Join X_fant with w_samples
        z_fant = torch.cat(
            [
                X_fant.repeat(*[1] * (X_fant.dim() - 2), self.num_samples, 1),
                self.w_samples.repeat(*X_fant.shape[:-2], 1, 1),
            ],
            dim=-1,
        )
        # get posterior mean and std dev
        with settings.propagate_grads(True):
            posterior = self.model.posterior(z_fant)
            mu = posterior.mean
            sigma = torch.sqrt(posterior.variance)

        # Calculate `E_f[[f(x) - \beta]^+]`
        u = (mu - beta.expand_as(mu)) / sigma
        # this is from EI
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        ucdf = normal.cdf(u)
        updf = torch.exp(normal.log_prob(u))
        values = sigma * (updf + u * ucdf)
        # take the expectation over W
        if getattr(self, "weights", None) is None:
            values = torch.mean(values, dim=-2)
        else:
            # Get the expectation with weights
            values = values * self.weights.unsqueeze(-1)
            values = torch.sum(values, dim=-2)
        # add beta and divide by 1-alpha
        values = beta.view_as(values) + values / (1 - self.alpha)
        # return with last dim squeezed
        # negated since CVaR is being minimized
        return -values.squeeze(-1)
Example #15
 def sample(self, context, feedback=None):
     mean, std = self.cond_dist_params(context, feedback=feedback)
     std_normal = Normal(torch.zeros(mean.shape, device=mean.device), torch.ones(std.shape, device=std.device))
     mu = self.mean(context)
     alpha = (0 - mean) / std
     z = 1 - std_normal.cdf(alpha)
     phi_alpha = std_normal.log_prob(alpha).exp()
     sigma = std * torch.sqrt(1 + alpha * phi_alpha / z - (phi_alpha / z) ** 2)
     dist = Normal(mu, sigma)
     return dist.rsample().abs()
Example #16
def pit_calc(means, vars, targets):
    mean = torch.Tensor(means).cpu().squeeze()
    var = torch.Tensor(vars).cpu().squeeze()
    y = torch.Tensor(targets).cpu().squeeze()

    dist = Normal(mean, var.sqrt())
    pt = dist.cdf(y)

    pt = pt.squeeze().numpy()

    return pt
Example #17
def interval_coverage(pred_y_mean, pred_y_var, y_true, interval):
    # "the proportion of the time that the interval contains the true value of interest"
    mean = torch.Tensor(pred_y_mean).cpu()
    var = torch.Tensor(pred_y_var).cpu()
    y = torch.Tensor(y_true).cpu()

    dist = Normal(mean, var.sqrt())
    cov = dist.cdf(y) <= interval
    cov = cov.sum() / float(cov.shape[0])

    return cov.numpy()
Example #18
 def log_prob(self, x, context, should_sum=True, feedback=None):
     mean, std = self.cond_dist_params(context, feedback=feedback)
     dist = Normal(torch.zeros(mean.shape, device=mean.device), torch.ones(std.shape, device=std.device))
     adjusted_x = (x - mean) / std
     adjusted_a = (0 - mean) / std
     log_gx = dist.log_prob(adjusted_x)
     log_c = ((1 - dist.cdf(adjusted_a)) * std).log()
     log_prob = log_gx - log_c
     # return sum_except_batch(dist.log_prob((x - mean).abs()))
     '''
     # Folded normal distribution
     mean, std = self.cond_dist_params(context)
     dist1 = Normal(mean, std)
     dist2 = Normal(-mean, std)
     log_prob = (dist1.log_prob(x).exp() + dist2.log_prob(x).exp()).log()
     '''
     if should_sum:
         return sum_except_batch(log_prob)
     else:
         return log_prob
Example #19
def crps_torch(mean, std, target):
    # crps
    # Gneiting, T., Raftery, A. E., Westveld III, A. H., & Goldman, T. (2005).
    # Calibrated probabilistic forecasting using ensemble model output statistics and minimum CRPS estimation.
    # Monthly Weather Review, 133(5), 1098-1118.
    # Formula 5
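    # Note: crps_const is defined outside this snippet; per Formula 5 above it is presumably the constant 1/sqrt(pi)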
    sx = (target - mean) / std

    normal = Normal(torch.Tensor([0]).to(sx.device),
                    torch.Tensor([1]).to(sx.device))
    pdf = normal.log_prob(sx).exp()
    cdf = normal.cdf(sx)

    assert pdf.shape == cdf.shape == sx.shape == target.shape

    crps = std * (sx * (2 * cdf - 1) + 2 * pdf - crps_const.to(sx.device))

    assert crps.shape == target.shape

    return crps.mean(0)
Example #20
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate the Probability of Improvement on the candidate set X.

        Args:
            X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
                points each.

        Returns:
            A `(b)`-dim tensor of Probability of Improvement values at the given
            design points `X`.
        """
        self.best_f = self.best_f.to(X)
        posterior = self._get_posterior(X=X)
        batch_shape = X.shape[:-2]
        mean = posterior.mean.view(batch_shape)
        sigma = posterior.variance.sqrt().clamp_min(1e-9).view(batch_shape)
        u = (mean - self.best_f.expand_as(mean)) / sigma
        if not self.maximize:
            u = -u
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        return normal.cdf(u)
Example #21
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate the Probability of Improvement on the candidate set X.

        Args:
            X: A `(b1 x ... bk) x 1 x d`-dim batched tensor of `d`-dim design points.

        Returns:
            A `(b1 x ... bk)`-dim tensor of Probability of Improvement values at the
            given design points `X`.
        """
        self.best_f = self.best_f.to(X)
        posterior = self.model.posterior(
            X=X, posterior_transform=self.posterior_transform)
        mean, sigma = posterior.mean, posterior.variance.sqrt().clamp_min(1e-9)
        view_shape = mean.shape[:-2] if mean.shape[-2] == 1 else mean.shape[:-1]
        mean = mean.view(view_shape)
        sigma = sigma.view(view_shape)
        u = (mean - self.best_f.expand_as(mean)) / sigma
        if not self.maximize:
            u = -u
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        return normal.cdf(u)
Example #22
    def forward(self, X: Tensor) -> Tensor:
        """
        :param X: A (..., 1, input_dim) batched tensor of input_dim design points.
                Expected Improvement is computed for each point individually,
                i.e., what is considered are the marginal posteriors, not the
                joint.
        :return:  A (...) tensor of Expected Improvement values at the
            given design points `X`.
        """

        with torch.no_grad():
            # both (..., 1,)
            # (..., input_dim)
            X_features = X.detach().numpy().squeeze(1)
            mu_est, sigma_est = self.mean_std_predictor(X_features)

            # both (..., 1, 1)
            mu_est = torch.Tensor(mu_est).unsqueeze(1)
            sigma_est = torch.Tensor(sigma_est).unsqueeze(1)

        posterior = self._get_posterior(X=X)

        mean, sigma = scale_posterior(
            mu_posterior=posterior.mean,
            sigma_posterior=posterior.variance.clamp_min(1e-6).sqrt(),
            mu_est=mu_est,
            sigma_est=sigma_est,
        )

        u = (mean - self.best_f.expand_as(mean)) / sigma
        if not self.maximize:
            u = -u
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        ucdf = normal.cdf(u)
        updf = torch.exp(normal.log_prob(u))
        ei = sigma * (updf + u * ucdf)

        return ei.squeeze(dim=-1).squeeze(dim=-1)
Example #23
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluate the Probability of Improvement on the candidate set X.

        Args:
            X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
                points each.

        Returns:
            A `(b)`-dim tensor of Probability of Improvement values at the given
            design points `X`.
        """
        self.best_f = self.best_f.to(X)
        batch_shape = X.shape[:-2]
        posterior = self.model.posterior(X)
        self._validate_single_output_posterior(posterior)
        mean = posterior.mean.view(batch_shape)
        sigma = posterior.variance.sqrt().clamp_min(1e-9).view(batch_shape)
        u = (mean - self.best_f.expand_as(mean)) / sigma
        if not self.maximize:
            u = -u
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        return normal.cdf(u)
Example #24
    def generate_parameters(self, parameter_id, **kwargs):
        if not self._model_initialized:
            return _random_config(self.searchspace_json, self.random_state)
        else:
            # random samples and pick best with model
            candidate_x = [
                _random_config(self.searchspace_json, self.random_state)
                for _ in range(self.sample_size)
            ]

            # The model has NaN issue when all the candidates are same
            # Also we can save the predict time when this happens
            if all(x == candidate_x[0] for x in candidate_x):
                return candidate_x[0]

            x_test = np.array(
                [np.array(list(xi.values())) for xi in candidate_x])
            m, v = self.model.predict(x_test)

            # The model has NaN issue when all the candidates are very close
            if np.isnan(m).any() or np.isnan(v).any():
                return candidate_x[0]

            mean = torch.Tensor(m)
            sigma = torch.Tensor(v)
            u = (mean - torch.Tensor([0.95]).expand_as(mean)) / sigma
            normal = Normal(torch.zeros_like(u), torch.ones_like(u))
            ucdf = normal.cdf(u)
            updf = torch.exp(normal.log_prob(u))
            ei = sigma * (updf + u * ucdf)

            if self.optimize_mode == 'maximize':
                ind = torch.argmax(ei)
            else:
                ind = torch.argmin(ei)
            new_x = candidate_x[ind]
            return new_x
Example #25
 def eval(self, x: nx.Graph, asscalar=False):
     """
     Return the negative expected improvement at the query point x2
     """
     from torch.distributions import Normal
     try:
         mu, cov = self.gp.predict(x)
     except:
         return -1.  # in case of error. return ei of -1
     std = torch.sqrt(torch.diag(cov))
     mu_star = self._get_incumbent()
     gauss = Normal(torch.zeros(1, device=mu.device),
                    torch.ones(1, device=mu.device))
     u = (mu - mu_star - self.xi) / std
     ucdf = gauss.cdf(u)
     updf = torch.exp(gauss.log_prob(u))
     ei = std * updf + (mu - mu_star - self.xi) * ucdf
     if self.augmented_ei:
         sigma_n = self.gp.likelihood
         ei *= (1. - torch.sqrt(torch.tensor(sigma_n, device=mu.device)) /
                torch.sqrt(sigma_n + torch.diag(cov)))
     if asscalar:
         ei = ei.detach().numpy().item()
     return ei
Example #26
class ExpectedHypervolumeImprovement(MultiObjectiveAnalyticAcquisitionFunction):
    def __init__(
        self,
        model: Model,
        ref_point: List[float],
        partitioning: NondominatedPartitioning,
        objective: Optional[AnalyticMultiOutputObjective] = None,
    ) -> None:
        r"""Expected Hypervolume Improvement supporting m>=2 outcomes.

        This computes EHVI using the algorithm from [Yang2019]_, but
        additionally computes gradients via auto-differentiation as proposed by
        [Daulton2020]_.

        Note: this is currently inefficient in two ways due to the binary partitioning
        algorithm that we use for the box decomposition:

            - We have more boxes in our decomposition
            - If we used a box decomposition that used `inf` as the upper bound for
                the last dimension *in all hypercells*, then we could reduce the number
                of terms we need to compute from 2^m to 2^(m-1). [Yang2019]_ do this
                by using DKLV17 and LKF17 for the box decomposition.

        TODO: Use DKLV17 and LKF17 for the box decomposition as in [Yang2019]_ for
        greater efficiency.

        TODO: Add support for outcome constraints.

        Example:
            >>> model = SingleTaskGP(train_X, train_Y)
            >>> ref_point = [0.0, 0.0]
            >>> EHVI = ExpectedHypervolumeImprovement(model, ref_point, partitioning)
            >>> ehvi = EHVI(test_X)

        Args:
            model: A fitted model.
            ref_point: A list with `m` elements representing the reference point (in the
                outcome space) w.r.t. which the hypervolume is computed. This is a
                reference point for the objective values (i.e. after applying
                `objective` to the samples).
            partitioning: A `NondominatedPartitioning` module that provides the non-
                dominated front and a partitioning of the non-dominated space in hyper-
                rectangles.
            objective: An `AnalyticMultiOutputObjective`.
        """
        # TODO: we could refactor this __init__ logic into a
        # HypervolumeAcquisitionFunction Mixin
        if len(ref_point) != partitioning.num_outcomes:
            raise ValueError(
                "The length of the reference point must match the number of outcomes. "
                f"Got ref_point with {len(ref_point)} elements, but expected "
                f"{partitioning.num_outcomes}."
            )
        ref_point = torch.tensor(
            ref_point,
            dtype=partitioning.pareto_Y.dtype,
            device=partitioning.pareto_Y.device,
        )
        better_than_ref = (partitioning.pareto_Y > ref_point).all(dim=1)
        if not better_than_ref.any() and partitioning.pareto_Y.shape[0] > 0:
            raise ValueError(
                "At least one pareto point must be better than the reference point."
            )
        super().__init__(model=model, objective=objective)
        self.register_buffer("ref_point", ref_point)
        self.partitioning = partitioning
        cell_bounds = self.partitioning.get_hypercell_bounds(ref_point=self.ref_point)
        self.register_buffer("cell_lower_bounds", cell_bounds[0])
        self.register_buffer("cell_upper_bounds", cell_bounds[1])
        # create indexing tensor of shape `2^m x m`
        self._cross_product_indices = torch.tensor(
            list(product(*[[0, 1] for _ in range(ref_point.shape[0])])),
            dtype=torch.long,
            device=ref_point.device,
        )
        self.normal = Normal(0, 1)

    def psi(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> Tensor:
        r"""Compute Psi function.

        For each cell i and outcome k:

            Psi(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) =
                sigma_k * PDF((upper_{i,k} - mu_k) / sigma_k)
                + (mu_k - lower_{i,k}) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k))

        See Equation 19 in [Yang2019]_ for more details.

        Args:
            lower: A `num_cells x m`-dim tensor of lower cell bounds
            upper: A `num_cells x m`-dim tensor of upper cell bounds
            mu: A `batch_shape x 1 x m`-dim tensor of means
            sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped).

        Returns:
            A `batch_shape x num_cells x m`-dim tensor of values.
        """
        u = (upper - mu) / sigma
        return sigma * self.normal.log_prob(u).exp() + (mu - lower) * (
            1 - self.normal.cdf(u)
        )

    def nu(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> Tensor:
        r"""Compute Nu function.

        For each cell i and outcome k:

            nu(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = (
            upper_{i,k} - lower_{i,k}
            ) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k))

        See Equation 25 in [Yang2019]_ for more details.

        Args:
            lower: A `num_cells x m`-dim tensor of lower cell bounds
            upper: A `num_cells x m`-dim tensor of upper cell bounds
            mu: A `batch_shape x 1 x m`-dim tensor of means
            sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped).

        Returns:
            A `batch_shape x num_cells x m`-dim tensor of values.
        """
        return (upper - lower) * (1 - self.normal.cdf((upper - mu) / sigma))

    @t_batch_mode_transform()
    def forward(self, X: Tensor) -> Tensor:
        posterior = self.objective(self.model.posterior(X))
        mu = posterior.mean
        sigma = posterior.variance.clamp_min(1e-9).sqrt()
        # clamp here, since upper_bounds will contain `inf`s, which
        # are not differentiable
        cell_upper_bounds = self.cell_upper_bounds.clamp_max(
            1e10 if X.dtype == torch.double else 1e8
        )
        # Compute psi(lower_i, upper_i, mu_i, sigma_i) for i=0, ... m-2
        psi_lu = self.psi(
            lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma
        )
        # Compute psi(lower_m, lower_m, mu_m, sigma_m)
        psi_ll = self.psi(
            lower=self.cell_lower_bounds,
            upper=self.cell_lower_bounds,
            mu=mu,
            sigma=sigma,
        )
        # Compute nu(lower_m, upper_m, mu_m, sigma_m)
        nu = self.nu(
            lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma
        )
        # compute the difference psi_ll - psi_lu
        psi_diff = psi_ll - psi_lu

        # this is batch_shape x num_cells x 2 x m
        stacked_factors = torch.stack([psi_diff, nu], dim=-2)

        # Take the cross product of psi_diff and nu across all outcomes
        # e.g. for m = 2
        # for each batch and cell, compute
        # [psi_diff_0, psi_diff_1]
        # [nu_0, psi_diff_1]
        # [psi_diff_0, nu_1]
        # [nu_0, nu_1]
        # this tensor has shape: `batch_shape x num_cells x 2^m x m`
        all_factors_up_to_last = stacked_factors.gather(
            dim=-2,
            index=self._cross_product_indices.expand(
                stacked_factors.shape[:-2] + self._cross_product_indices.shape
            ),
        )
        # compute product for all 2^m terms,
        # sum across all terms and hypercells
        return all_factors_up_to_last.prod(dim=-1).sum(dim=-1).sum(dim=-1)
Example #27
print("log-likelihood given value with (3,):\n",
      normal.log_prob(value=torch.Tensor([-1, 0, .5])), "\n")
print("log-likelihood given value with (2,3):\n",
      normal.log_prob(value=torch.Tensor([[-1, 0, .5], [-2, 1, 3]])))

print("log-probability given value with shape ():\n",
      binomial.log_prob(value=torch.Tensor([5])), "\n")
print("log-probability given value with (3,):\n",
      binomial.log_prob(value=torch.Tensor([5, 3, 7])), "\n")
print("log-probability given value with (2,3):\n",
      binomial.log_prob(value=torch.Tensor([[5, 3, 7], [2, 0, 10]])))


Given an upper-bound value, the normal distribution's `.cdf()` method can be used to compute the cumulative probability at that value:

print("cumulative probability given value with shape ():\n",
      normal.cdf(value=torch.Tensor([0])), "\n")
print("cumulative probability given value with (3,):\n",
      normal.cdf(value=torch.Tensor([-1, 0, .5])), "\n")
print("cumulative probability given value with (2,3):\n",
      normal.cdf(value=torch.Tensor([[-1, 0, .5], [-2, 1, 3]])))

However, the binomial distribution does not provide a `cdf()` method for evaluating cumulative probabilities.

### Shapes of distribution objects
The design of `pytorch` distribution objects follows the `tensorflow_probability` package. A distribution object involves three kinds of shapes:

1. Sample shape: the shape of independent, identically distributed random draws; the `sample_shape` argument used earlier when generating samples is the sample shape.
2. Batch shape: the shape of independent but not identically distributed draws; it is determined by the shape of the distribution's parameters.
3. Event shape: the shape of a multivariate distribution, whose components need not be statistically independent.

For the normal distribution created earlier, both `batch_shape` and `event_shape` are scalar, i.e., empty shapes corresponding to 0-d tensors.
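A minimal sketch illustrating these three shapes (it constructs a fresh batched normal for illustration; `batched_normal` is a name introduced here, not one from the text above):

import torch
from torch.distributions import Normal

batched_normal = Normal(loc=torch.zeros(3), scale=torch.ones(3))
print(batched_normal.batch_shape)   # torch.Size([3]): three independent (batched) univariate normals
print(batched_normal.event_shape)   # torch.Size([]): each component is univariate, so the event shape is empty
draws = batched_normal.sample(sample_shape=(2,))
print(draws.shape)                  # torch.Size([2, 3]) = sample_shape + batch_shape + event_shape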
Example #28
def normal_cdf(loc, sd):
    """normal cdf(0)"""
    # it is not jit-able
    d = Normal(loc, sd)
    return d.cdf(0)
Example #29
    def plot(self,
             axes=None,
             block=False,
             Ndiv=100,
             legend=True,
             title="GPgrad",
             plotting=True,
             plotCDF=False,
             clear_axes=False,
             Nsamples=None,
             ylabel=None,
             ylim=None,
             pause=None,
             showtickslabels_x=True,
             xlabel=None,
             labelsize=None,
             showtickslabels=None,
             showticks=None,
             linewidth=None,
             color=None,
             prob=False):
        '''
        This function hardcodes the plotting limits between zero and one for now
        '''
        if not plotting or self.dim > 1:
            return

        pp = PlotProbability()
        xpred_vec = torch.linspace(0.0, 1.0, Ndiv)[:, None]
        # xpred_vec = xpred_vec.unsqueeze(0) # Ndiv batches of [q=1 x self.dim] dimensions each

        # Compute one by one:
        logger.info("Computing posterior while plotting ... (!!)")
        post_batch = False
        if post_batch:

            # Predict:
            posterior = self.posterior(
                X=xpred_vec, observation_noise=False
            )  # observation_noise MUST be always false; this class is not prepared otherwise
            # Internally, self.posterior(xpred_vec) calls self(xpred_vec), which calls self.predictive(xpred_vec)

            # pdb.set_trace()

            # Get upper and lower confidence bounds (2 standard deviations from the mean):
            lower_ci, upper_ci = posterior.mvn.confidence_region()

            # Posterior mean:
            mean_vec = posterior.mean
            std_vec = posterior.variance.sqrt()

        else:

            lower_ci = torch.zeros((Ndiv))
            upper_ci = torch.zeros((Ndiv))
            mean_vec = torch.zeros((Ndiv))
            std_vec = torch.zeros((Ndiv))
            for k in range(Ndiv):
                mvn = self.predictive(xpred_vec[k, :].view(-1, self.dim))
                lower_ci[k], upper_ci[k] = mvn.confidence_region()
                mean_vec[k] = mvn.mean
                std_vec[k] = mvn.variance.sqrt()

        if self.dim == 1:
            if prob == False:
                axes = pp.plot_GP_1D(
                    xpred_vec=xpred_vec.squeeze().cpu().numpy(),
                    fpred_mode_vec=mean_vec.squeeze().detach().cpu().numpy(),
                    fpred_quan_minus=lower_ci.squeeze().detach().cpu().numpy(),
                    fpred_quan_plus=upper_ci.squeeze().detach().cpu().numpy(),
                    X_uns=self.train_xu.detach().cpu().numpy(),
                    X_sta=self.train_xs.detach().cpu().numpy(),
                    Y_sta=self.train_ys.detach().cpu().numpy(),
                    title=title,
                    axes=axes,
                    block=block,
                    legend=legend,
                    clear_axes=True,
                    xlabel=None,
                    ylabel=ylabel,
                    xlim=np.array([0., 1.]),
                    ylim=ylim,
                    labelsize="x-large",
                    legend_loc="upper left",
                    colormap="paper",
                    showtickslabels_x=showtickslabels_x)
            else:
                normal = Normal(
                    loc=mean_vec.squeeze(),
                    # scale=posterior.variance.sqrt().squeeze())
                    scale=std_vec.squeeze())
                ei_cdf = normal.cdf(self.threshold)
                # pdb.set_trace()
                axes = pp.plot_acquisition_function(
                    var_vec=ei_cdf,
                    xpred_vec=xpred_vec.cpu().numpy(),
                    xlabel=xlabel,
                    ylabel=ylabel,
                    title=title,
                    legend=legend,
                    axes=axes,
                    clear_axes=True,
                    xlim=np.array([0., 1.]),
                    block=block,
                    labelsize=labelsize,
                    showtickslabels=showtickslabels,
                    showticks=showticks,
                    what2plot="",
                    color=color,
                    ylim=np.array([0., 1.1]),
                    linewidth=linewidth)

            if Nsamples is not None:
                f_sample = posterior.sample(
                    sample_shape=torch.Size([Nsamples]))
                for k in range(Nsamples):
                    axes.plot(xpred_vec.squeeze().detach().cpu().numpy(),
                              f_sample[k, :, 0],
                              linestyle="--",
                              linewidth=1.0,
                              color="sienna")

        elif self.dim == 2:
            pass

        plt.show(block=block)
        if pause is not None:
            plt.pause(pause)

        return axes
Example #30
ub = 4
lower_bound = torch.zeros(n_samples) + torch.Tensor([lb])
upper_bound = torch.zeros(n_samples) + torch.Tensor([ub])
samples = trandn((lower_bound - mus) / stds, (upper_bound - mus) / stds)
samples = samples * stds + mus
mean = samples.mean()

norm = Normal(0, 1)

alpha = -mus / stds
t = time.time()
for _ in range(300):
    alpha_log_pdf = norm.log_prob(alpha)
print("log_prob time:", time.time() - t)
alpha_pdf = torch.exp(alpha_log_pdf)
Z = 1 - norm.cdf(alpha)
theoretical_mean = mu + std * (alpha_pdf / Z)

t = time.time()
for _ in range(1):
    logZhat, Zhat, muHat, sigmaHat, entropy = moments(
        lower_bound, upper_bound,
        torch.Tensor([mu]).expand_as(lower_bound),
        torch.Tensor([std**2]).expand_as(lower_bound))
print("Robust time:", time.time() - t)
print("=============================\n\n")

print(
    f"Estimated mean: {mean}\nTheoretical mean: {theoretical_mean[0]}\nRobust evaluation of mean: {muHat.tolist()[0]}"
)
print("=============================\n\n")
Example #31
class NormalUniform(Distribution):
    """
    A mixture of a Normal distribution and a Uniform distribution, defined over
    the interval -1 to 1. Whatever probability mass left over from the Normal
    distribution (outside -1 to 1) is converted into a Uniform.
    """
    arg_constraints = {'loc': constraints.real, 'scale': constraints.positive}
    support = constraints.interval(-1., 1.)
    has_rsample = False

    def __init__(self, loc, scale, validate_args=None):
        loc = torch.tanh(loc)
        self.loc, self.scale = broadcast_all(loc, scale)
        if isinstance(loc, Number) and isinstance(scale, Number):
            batch_shape = torch.Size()
        else:
            batch_shape = self.loc.size()
        super(NormalUniform, self).__init__(batch_shape,
                                            validate_args=validate_args)
        self.normal = Normal(self.loc, self.scale)
        dev = self.normal.loc.device
        self.low = -torch.ones(batch_shape, device=dev)
        self.high = torch.ones(batch_shape, device=dev)
        self.uniform = Uniform(self.low, self.high)
        normal_prob = self.normal.cdf(torch.ones(
            batch_shape, device=dev)) - self.normal.cdf(
                -torch.ones(batch_shape, device=dev))
        self.uniform_factor = 1 - normal_prob

    def log_prob(self, value):
        normal_prob = self.normal.log_prob(value).exp()
        uniform_prob = self.uniform_factor * self.uniform.log_prob(value).exp()
        return (normal_prob + uniform_prob +
                torch.finfo(value.dtype).eps).log()

    def sample(self, sample_shape=torch.Size()):
        shape = self._extended_shape(sample_shape)
        normal_sample = self.normal.sample(sample_shape)
        uniform_sample = self.uniform.sample(sample_shape)
        # check for places where the normal sample is outside [-1, 1]
        # and replace with a uniform sample
        dist_flag = ((normal_sample > 1) + (normal_sample < -1)).float()
        sample = (1. - dist_flag) * normal_sample + dist_flag * uniform_sample
        return sample

    def expand(self, batch_shape, _instance=None):
        new = self._get_checked_instance(NormalUniform, _instance)
        batch_shape = torch.Size(batch_shape)
        new.loc = self.loc.expand(batch_shape)
        new.scale = self.scale.expand(batch_shape)
        new.normal = Normal(new.loc, new.scale)
        new.low = self.low.expand(batch_shape)
        new.high = self.high.expand(batch_shape)
        new.uniform = Uniform(new.low, new.high)
        dev = new.normal.loc.device
        normal_prob = new.normal.cdf(torch.ones(
            batch_shape,
            device=dev)) - new.normal.cdf(-torch.ones(batch_shape, device=dev))
        new.uniform_factor = 1 - normal_prob
        super(NormalUniform, new).__init__(batch_shape,
                                           validate_args=False)
        new._validate_args = self._validate_args
        return new

    def cdf(self, value):
        raise NotImplementedError

    def icdf(self, value):
        raise NotImplementedError

    def entropy(self):
        raise NotImplementedError
Example #32
class MoE(nn.Module):
    """Call a Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
    Args:
    input_size: integer - size of the input
    output_size: integer - size of the output
    num_experts: an integer - number of experts
    hidden_size: an integer - hidden size of the experts
    noisy_gating: a boolean
    k: an integer - how many experts to use for each batch element
    """
    def __init__(self,
                 input_size,
                 hidden_size,
                 latent_dim,
                 output_size,
                 num_experts,
                 num_blocks=3,
                 noisy_gating=True,
                 k=4):
        super(MoE, self).__init__()
        self.noisy_gating = noisy_gating
        self.num_experts = num_experts
        self.output_size = output_size
        self.input_size = input_size
        self.latent_size = latent_dim
        self.hidden_size = hidden_size
        self.k = k

        action_size = output_size

        input_size = input_size - action_size  # Remove the action masking from the input to match sizes properly

        self.encoder = ResNet(input_size=input_size,
                              hidden_size=hidden_size,
                              output_size=latent_dim,
                              num_blocks=num_blocks)
        # instantiate experts
        self.experts = nn.ModuleList([
            ResNet(input_size=latent_dim,
                   hidden_size=hidden_size,
                   output_size=output_size,
                   num_blocks=num_blocks) for i in range(self.num_experts)
        ])
        self.value = ResNet(input_size=input_size,
                            hidden_size=hidden_size,
                            output_size=1,
                            num_blocks=num_blocks)
        self.w_gate = nn.Parameter(torch.zeros(latent_dim, num_experts),
                                   requires_grad=True)
        self.w_noise = nn.Parameter(torch.zeros(latent_dim, num_experts),
                                    requires_grad=True)

        self.softplus = nn.Softplus()
        self.softmax = nn.Softmax(1)
        self.normal = Normal(torch.tensor([0.0]), torch.tensor([1.0]))

        assert (self.k <= self.num_experts)

    def cv_squared(self, x):
        """The squared coefficient of variation of a sample.
        Useful as a loss to encourage a positive distribution to be more uniform.
        Epsilons added for numerical stability.
        Returns 0 for an empty Tensor.
        Args:
        x: a `Tensor`.
        Returns:
        a `Scalar`.
        """
        eps = 1e-10
        # if only num_experts = 1
        if x.shape[0] == 1:
            return torch.Tensor([0])
        return x.float().var() / (x.float().mean()**2 + eps)

    def _gates_to_load(self, gates):
        """Compute the true load per expert, given the gates.
        The load is the number of examples for which the corresponding gate is >0.
        Args:
        gates: a `Tensor` of shape [batch_size, n]
        Returns:
        a float32 `Tensor` of shape [n]
        """
        return (gates > 0).sum(0)

    def _prob_in_top_k(self, clean_values, noisy_values, noise_stddev,
                       noisy_top_values):
        """Helper function to NoisyTopKGating.
        Computes the probability that value is in top k, given different random noise.
        This gives us a way of backpropagating from a loss that balances the number
        of times each expert is in the top k experts per example.
        In the case of no noise, pass in None for noise_stddev, and the result will
        not be differentiable.
        Args:
        clean_values: a `Tensor` of shape [batch, n].
        noisy_values: a `Tensor` of shape [batch, n].  Equal to clean values plus
          normally distributed noise with standard deviation noise_stddev.
        noise_stddev: a `Tensor` of shape [batch, n], or None
        noisy_top_values: a `Tensor` of shape [batch, m].
           "values" Output of tf.top_k(noisy_top_values, m).  m >= k+1
        Returns:
        a `Tensor` of shape [batch, n].
        """
        batch = clean_values.size(0)
        m = noisy_top_values.size(1)
        top_values_flat = noisy_top_values.flatten()
        threshold_positions_if_in = torch.arange(batch) * m + self.k
        threshold_if_in = torch.unsqueeze(
            torch.gather(top_values_flat, 0, threshold_positions_if_in), 1)
        is_in = torch.gt(noisy_values, threshold_if_in)
        threshold_positions_if_out = threshold_positions_if_in - 1
        threshold_if_out = torch.unsqueeze(
            torch.gather(top_values_flat, 0, threshold_positions_if_out), 1)
        # is each value currently in the top k.
        prob_if_in = self.normal.cdf(
            (clean_values - threshold_if_in) / noise_stddev)
        prob_if_out = self.normal.cdf(
            (clean_values - threshold_if_out) / noise_stddev)
        prob = torch.where(is_in, prob_if_in, prob_if_out)
        return prob

    def noisy_top_k_gating(self, x, train, noise_epsilon=1e-2):
        """Noisy top-k gating.
          See paper: https://arxiv.org/abs/1701.06538.
          Args:
            x: input Tensor with shape [batch_size, input_size]
            train: a boolean - we only add noise at training time.
            noise_epsilon: a float
          Returns:
            gates: a Tensor with shape [batch_size, num_experts]
            load: a Tensor with shape [num_experts]
        """
        clean_logits = x @ self.w_gate
        if self.noisy_gating:
            raw_noise_stddev = x @ self.w_noise
            noise_stddev = ((self.softplus(raw_noise_stddev) + noise_epsilon) *
                            train)
            noisy_logits = clean_logits + (torch.randn_like(clean_logits) *
                                           noise_stddev)
            logits = noisy_logits
        else:
            logits = clean_logits

        # calculate topk + 1 that will be needed for the noisy gates
        top_logits, top_indices = logits.topk(min(self.k + 1,
                                                  self.num_experts),
                                              dim=1)
        top_k_logits = top_logits[:, :self.k]
        top_k_indices = top_indices[:, :self.k]
        top_k_gates = self.softmax(top_k_logits)

        zeros = torch.zeros_like(logits, requires_grad=True)
        gates = zeros.scatter(1, top_k_indices, top_k_gates)

        if self.noisy_gating and self.k < self.num_experts:
            load = (self._prob_in_top_k(clean_logits, noisy_logits,
                                        noise_stddev, top_logits)).sum(0)
        else:
            load = self._gates_to_load(gates)
        return gates, load

    def forward(self, observation, prev_action, prev_reward):
        """Args:
        x: tensor shape [batch_size, input_size]
        train: a boolean scalar.
        loss_coef: a scalar - multiplier on load-balancing losses
        Returns:
        y: a tensor with shape [batch_size, output_size].
        extra_training_loss: a scalar.  This should be added into the overall
        training loss of the model.  The backpropagation of this loss
        encourages all experts to be approximately equally used across a batch.
        """
        train = self.training
        observation = observation.float()

        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, obs_shape = infer_leading_dims(observation, 1)
        observation = observation.view(T * B, *obs_shape)
        action_mask = observation[:, -19:].type(torch.bool)
        observation = observation[:, :-19]

        z = self.encoder(observation)
        gates, load = self.noisy_top_k_gating(z, train)

        dispatcher = SparseDispatcher(self.num_experts, gates)
        expert_inputs = dispatcher.dispatch(z)
        gates = dispatcher.expert_to_gates()
        expert_outputs = [
            self.experts[i](expert_inputs[i]) for i in range(self.num_experts)
        ]
        y = dispatcher.combine(expert_outputs)
        value = self.value(observation).squeeze(-1)
        y[~action_mask] = -1e24
        y = nn.functional.softmax(y, dim=-1)
        y, value = restore_leading_dims((y, value), lead_dim, T, B)
        return y, value

    def loss(self, observation, prev_action, prev_reward, loss_coef=1e-1):
        train = self.training
        observation = observation.float()

        lead_dim, T, B, obs_shape = infer_leading_dims(observation, 1)
        observation = observation.view(T * B, *obs_shape)
        action_mask = observation[:, -19:].type(torch.bool)
        observation = observation[:, :-19]

        z = self.encoder(observation)
        gates, load = self.noisy_top_k_gating(z, train)
        # calculate importance loss
        importance = gates.sum(0)
        loss = self.cv_squared(importance) + self.cv_squared(load)
        loss *= loss_coef
        return loss
Example #33
class TanhNormal(Distribution):
    """
    Represent distribution of X where
        X ~ tanh(Z)
        Z ~ N(mean, std)

    Note: this is not very numerically stable.
    """
    def __init__(self, normal_mean, normal_std, epsilon=1e-6):
        """

        Args:
            normal_mean (Tensor): Mean of the normal distribution
            normal_std (Tensor): Std of the normal distribution
            epsilon (Double): Numerical stability epsilon when computing
                log-prob.
        """
        super(TanhNormal, self).__init__()
        self._normal_mean = normal_mean
        self._normal_std = normal_std
        self._normal = Normal(normal_mean, normal_std)
        self._epsilon = epsilon

    @property
    def mean(self):
        return self._normal.mean

    @property
    def variance(self):
        return self._normal.variance

    @property
    def stddev(self):
        return self._normal.stddev

    @property
    def epsilon(self):
        return self._epsilon

    def sample(self, return_pretanh_value=False):
        # z = self._normal.sample()
        z = self._normal.sample().detach()
        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def rsample(self, return_pretanh_value=False):
        z = self._normal.rsample()
        # z = (
        #     self._normal_mean +
        #     self._normal_std *
        #     Normal(
        #         ptu.zeros(self._normal_mean.size()),
        #         ptu.ones(self._normal_std.size()),
        #     ).sample()
        # )
        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def sample_n(self, n, return_pre_tanh_value=False):
        z = self._normal.sample_n(n)
        if return_pre_tanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def log_prob(self, value, pre_tanh_value=None):
        """
        Returns the log of the probability density function evaluated at
        `value`.

        Args:
            value (Tensor):
            pre_tanh_value (Tensor): arctanh(value)

        Returns:
            log_prob (Tensor)

        """
        if pre_tanh_value is None:
            pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2

        return self._normal.log_prob(pre_tanh_value) - \
            torch.log(1. - value * value + self._epsilon)
        # return self.normal.log_prob(pre_tanh_value) - \
        #     torch.log(1. - torch.tanh(pre_tanh_value)**2 + self._epsilon)

    def cdf(self, value, pre_tanh_value=None):
        if pre_tanh_value is None:
            pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
        return self._normal.cdf(pre_tanh_value)