Example #1
    def __init__(self, tensor, gradient_update="S", rank=10):
        super().__init__()

        self.tensor = tensor
        self.num_train = len(tensor.train_vals)
        self.dims = tensor.dims
        self.ndim = len(self.dims)
        self.rank = rank
        self.datatype = tensor.datatype
        self.gradient_update = gradient_update

        self.means = ModuleList()
        self.chols = ModuleList()

        for dim, ncol in enumerate(self.dims):
            mean_list = ParameterList()
            cov_list  = ParameterList()
            for _ in range(ncol):
                mean_list.append(Parameter(torch.randn(rank), requires_grad=True))
                cov_list.append(Parameter(torch.ones(rank) + 1/4 * torch.randn(rank), requires_grad=True))

            self.means.append(mean_list)
            self.chols.append(cov_list)

        self.standard_multi_normal = MultivariateNormal(torch.zeros(rank), torch.eye(rank))
        self.sigma = 1
        self.batch_size = 64
        self.lambd = 1/self.batch_size
        self.round_robins_indices = [0 for _ in self.dims]
        self.k1 = 128
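Below is a minimal, self-contained sketch of the parameter layout this __init__ builds (the enclosing class is not shown in the example, so the class name, dimensions, and usage here are hypothetical): every tensor mode gets one rank-sized mean vector and one Cholesky-diagonal vector per index, and nesting ParameterList objects inside a ModuleList registers all of them as model parameters.

import torch
from torch.nn import Module, ModuleList, Parameter, ParameterList

class FactorParams(Module):
    # hypothetical stand-in for the unnamed class above
    def __init__(self, dims=(5, 6, 7), rank=10):
        super().__init__()
        self.means = ModuleList()
        self.chols = ModuleList()
        for ncol in dims:
            self.means.append(ParameterList(
                [Parameter(torch.randn(rank)) for _ in range(ncol)]))
            self.chols.append(ParameterList(
                [Parameter(torch.ones(rank) + 0.25 * torch.randn(rank)) for _ in range(ncol)]))

params = FactorParams()
print(sum(p.numel() for p in params.parameters()))  # (5 + 6 + 7) * 2 * 10 = 360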
Example #2
class TTConv2d(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 rank,
                 stride=1,
                 padding=0,
                 dilation=1,
                 alpha=1,
                 beta=0.1,
                 **kwargs):
        # increase beta to decrease rank
        super(TTConv2d, self).__init__()
        assert (len(in_channels) == len(out_channels))
        assert (len(rank) == len(in_channels) - 1)
        self.in_channels = list(in_channels)
        self.out_channels = list(out_channels)
        self.rank = list(rank)
        self.factors = ParameterList()

        r1 = [1] + self.rank[:-1]
        r2 = self.rank
        for ri, ro, si, so in zip(r1, r2, in_channels[:-1], out_channels[:-1]):
            p = Parameter(torch.Tensor(ri, si, so, ro))
            self.factors.append(p)
        self.bias = Parameter(torch.Tensor(np.prod(out_channels)))

        self.conv = nn.Conv2d(in_channels=self.rank[-1] * in_channels[-1],
                              out_channels=out_channels[-1],
                              kernel_size=kernel_size,
                              stride=stride,
                              padding=padding,
                              dilation=dilation,
                              bias=False)

        self.lamb = ParameterList([Parameter(torch.ones(r)) for r in rank])
        self.alpha = Parameter(torch.tensor(alpha), requires_grad=False)
        self.beta = Parameter(torch.tensor(beta), requires_grad=False)

        self._initialize_weights()

    def forward(self, x):
        def mode2_dot(tensor, matrix, mode):
            ms = matrix.shape
            matrix = matrix.reshape(ms[0] * ms[1], ms[2] * ms[3])

            sp = list(tensor.shape)
            sp[mode:mode + 2] = [sp[mode] * sp[mode + 1], 1]

            sn = list(tensor.shape)
            sn[mode:mode + 2] = ms[2:4]

            tensor = tensor.reshape(sp)
            tensor = tl.tenalg.mode_dot(tensor, matrix.t(), mode)
            return tensor.reshape(sn)

        (b, c, h, w) = x.shape
        x = x.reshape((x.shape[0], 1, *self.in_channels, h, w))
        for (i, f) in enumerate(self.factors):
            x = mode2_dot(x, f, i + 1)
        x = x.reshape((b * np.prod(self.out_channels[:-1]),
                       self.rank[-1] * self.in_channels[-1], h, w))
        x = self.conv(x)
        x = x.reshape((b, np.prod(self.out_channels), h, w))
        x = x + self.bias.reshape((1, -1, 1, 1))
        return x

    def _initialize_weights(self):
        for f in self.factors:
            nn.init.kaiming_uniform_(f)
        nn.init.constant_(self.bias, 0)

    def regularizer(self, exp=True):
        ret = 0
        if exp:
            for i in range(len(self.rank)):
                # ret += torch.sum(torch.sum(self.factors[i]**2, dim=[0, 1, 2])
                #     * torch.exp(self.lamb[i]) / 2)
                m = torch.sum(self.factors[i]**2, dim=[1, 2])
                if i > 0:
                    m = m * torch.exp(self.lamb[i-1]).reshape([-1, 1]) \
                        / np.exp(self.get_lamb_ths(exp)[i-1])
                m = m * torch.exp(self.lamb[i]).reshape([1, -1])
                ret += torch.sum(m, dim=[0, 1]) / 2
                ret -= np.prod(self.factors[i].shape[:-1]) \
                    * torch.sum(self.lamb[i]) / 2
                if i != len(self.rank) - 1:
                    # ret += torch.sum(torch.sum(self.factors[i+1]**2, dim=[1, 2, 3])
                    #     * torch.exp(self.lamb[i] / 2))
                    ret -= np.prod(self.factors[i+1].shape[1:]) \
                        * torch.sum(self.lamb[i]) / 2
                else:
                    w = self.conv.weight.transpose(0, 1)
                    w = w.reshape(self.rank[i], -1)
                    ret += torch.sum(
                        torch.sum(w**2, dim=1) * torch.exp(self.lamb[i]) / 2)
                    ret -= w.shape[1] * torch.sum(self.lamb[i]) / 2

                ret -= torch.sum(self.alpha * self.lamb[i])
                ret += torch.sum(torch.exp(self.lamb[i])) / self.beta

        else:
            for i in range(len(self.rank)):  # loop over all ranks so the conv-weight term below is reached
                self.lamb[i].data.clamp_min_(1e-6)
                ret += torch.sum(
                    torch.sum(self.factors[i]**2, dim=[0, 1, 2]) /
                    self.lamb[i] / 2)
                ret += np.prod(self.factors[i].shape[:-1]) \
                    * torch.sum(torch.log(self.lamb[i])) / 2
                if i != len(self.rank) - 1:
                    ret += torch.sum(
                        torch.sum(self.factors[i + 1]**2, dim=[1, 2, 3]) /
                        self.lamb[i] / 2)
                    ret += np.prod(self.factors[i+1].shape[1:]) \
                        * torch.sum(torch.log(self.lamb[i])) / 2
                else:
                    w = self.conv.weight.transpose(0, 1)
                    w = w.reshape(self.rank[i], -1)
                    ret += torch.sum(torch.sum(w**2, dim=1) / self.lamb[i] / 2)
                    ret += w.shape[1] * torch.sum(torch.log(self.lamb[i])) / 2

                ret += torch.sum(self.beta / self.lamb[i])
                ret += (self.alpha + 1) * torch.sum(torch.log(self.lamb[i]))

        return ret

    def get_lamb_ths(self, exp=True):
        if (exp):
            lamb_ths = [
                np.log((np.prod(self.factors[i].shape[:-1]) / 2 +
                        np.prod(self.factors[i + 1].shape[1:]) / 2 +
                        self.alpha.item()) * self.beta.item())
                for i in range(len(self.lamb) - 1)
            ]
            lamb_ths.append(
                np.log(
                    (np.prod(self.factors[-1].shape[:-1]) / 2 +
                     (self.out_channels[-1] * self.in_channels[-1] *
                      self.conv.weight.shape[2] * self.conv.weight.shape[3]) /
                     2 + self.alpha.item()) * self.beta.item()))
        else:
            lamb_ths = [
                self.beta.item() /
                (np.prod(self.factors[i].shape[:-1]) / 2 +
                 np.prod(self.factors[i + 1].shape[1:]) / 2 +
                 self.alpha.item() + 1) for i in range(len(self.lamb) - 1)
            ]
            lamb_ths.append(
                self.beta.item() /
                (np.prod(self.factors[-1].shape[:-1]) / 2 +
                 (self.out_channels[-1] * self.in_channels[-1] *
                  self.conv.weight.shape[2] * self.conv.weight.shape[3]) / 2 +
                 self.alpha.item() + 1))
        return lamb_ths
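A hedged usage sketch for TTConv2d (assuming the imports the class relies on: torch, torch.nn as nn, numpy as np, Parameter/ParameterList from torch.nn, and tensorly as tl with tl.set_backend('pytorch'); the shapes and the regularizer weight below are arbitrary illustrations):

# factorize a 32 -> 128 channel 3x3 convolution as (4*8) -> (8*16) with a single TT rank of 6
layer = TTConv2d(in_channels=(4, 8), out_channels=(8, 16),
                 kernel_size=3, rank=(6,), padding=1)
x = torch.randn(2, 32, 14, 14)     # input channels must equal prod(in_channels) = 32
y = layer(x)                       # -> (2, 128, 14, 14), since prod(out_channels) = 128
loss = y.pow(2).mean() + 1e-3 * layer.regularizer(exp=True)
loss.backward()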
Example #3
class TTlinear(nn.Module):
    def __init__(self, in_size, out_size, rank, alpha=1, beta=0.1, **kwargs):
        # increase beta to decrease rank
        super(TTlinear, self).__init__()
        assert (len(in_size) == len(out_size))
        assert (len(rank) == len(in_size) - 1)
        self.in_size = list(in_size)
        self.out_size = list(out_size)
        self.rank = list(rank)
        self.factors = ParameterList()
        r1 = [1] + list(rank)
        r2 = list(rank) + [1]
        for ri, ro, si, so in zip(r1, r2, in_size, out_size):
            p = Parameter(torch.Tensor(ri, si, so, ro))
            self.factors.append(p)
        self.bias = Parameter(torch.Tensor(np.prod(out_size)))
        self.lamb = ParameterList([Parameter(torch.ones(r)) for r in rank])
        self.alpha = Parameter(torch.tensor(alpha), requires_grad=False)
        self.beta = Parameter(torch.tensor(beta), requires_grad=False)

        self._initialize_weights()

    def forward(self, x):
        def mode2_dot(tensor, matrix, mode):
            ms = matrix.shape
            matrix = matrix.reshape(ms[0] * ms[1], ms[2] * ms[3])

            sp = list(tensor.shape)
            sp[mode:mode + 2] = [sp[mode] * sp[mode + 1], 1]

            sn = list(tensor.shape)
            sn[mode:mode + 2] = ms[2:4]

            tensor = tensor.reshape(sp)
            tensor = tl.tenalg.mode_dot(tensor, matrix.t(), mode)
            return tensor.reshape(sn)

        x = x.reshape((x.shape[0], 1, *self.in_size))
        for (i, f) in enumerate(self.factors):
            x = mode2_dot(x, f, i + 1)
        x = x.reshape((x.shape[0], -1))
        x = x + self.bias
        return x

    def _initialize_weights(self):
        for f in self.factors:
            nn.init.kaiming_uniform_(f)
        nn.init.constant_(self.bias, 0)

    def regularizer(self, exp=True):
        ret = 0
        if exp:
            for i in range(len(self.rank)):
                # ret += torch.sum(torch.sum(self.factors[i]**2, dim=[0, 1, 2])
                # * torch.exp(self.lamb[i]) / 2)
                ret -= np.prod(self.factors[i].shape[:-1]) \
                    * torch.sum(self.lamb[i]) / 2
                # ret += torch.sum(torch.sum(self.factors[i+1]**2, dim=[1, 2, 3])
                # * torch.exp(self.lamb[i] / 2))
                ret -= np.prod(self.factors[i+1].shape[1:]) \
                     * torch.sum(self.lamb[i]) / 2
                ret -= torch.sum(self.alpha * self.lamb[i])
                ret += torch.sum(torch.exp(self.lamb[i])) / self.beta

            for i in range(len(self.rank) + 1):
                m = torch.sum(self.factors[i]**2, dim=[1, 2])
                if i > 0:
                    m = m * torch.exp(self.lamb[i - 1]).reshape([-1, 1])
                if i < len(self.rank):
                    m = m * torch.exp(self.lamb[i]).reshape([1, -1])
                ret += torch.sum(m, dim=[0, 1]) / 2

        else:
            for i in range(len(self.rank)):
                self.lamb[i].data.clamp_min_(1e-6)
                ret += torch.sum(
                    torch.sum(self.factors[i]**2, dim=[0, 1, 2]) /
                    self.lamb[i] / 2)
                ret += np.prod(self.factors[i].shape[:-1]) \
                    * torch.sum(torch.log(self.lamb[i])) / 2
                ret += torch.sum(
                    torch.sum(self.factors[i + 1]**2, dim=[1, 2, 3]) /
                    self.lamb[i] / 2)
                ret += np.prod(self.factors[i+1].shape[1:]) \
                    * torch.sum(torch.log(self.lamb[i])) / 2

                ret += torch.sum(self.beta / self.lamb[i])
                ret += (self.alpha + 1) * torch.sum(torch.log(self.lamb[i]))

        return ret

    def get_lamb_ths(self, exp=True):
        if (exp):
            lamb_ths = [
                np.log((np.prod(self.factors[i].shape[:-1]) / 2 +
                        np.prod(self.factors[i + 1].shape[1:]) / 2 +
                        self.alpha.item()) * self.beta.item())
                for i in range(len(self.lamb))
            ]
        else:
            lamb_ths = [
                self.beta.item() /
                (np.prod(self.factors[i].shape[:-1]) / 2 +
                 np.prod(self.factors[i + 1].shape[1:]) / 2 +
                 self.alpha.item() + 1) for i in range(len(self.lamb))
            ]
        return lamb_ths
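A hedged usage sketch for this TTlinear (same import assumptions as above; shapes and the regularizer weight are arbitrary illustrations):

layer = TTlinear(in_size=(4, 8), out_size=(8, 16), rank=(6,))
x = torch.randn(2, 32)             # input features must equal prod(in_size) = 32
y = layer(x)                       # -> (2, 128), since prod(out_size) = 128
loss = y.pow(2).mean() + 1e-3 * layer.regularizer(exp=True)
loss.backward()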
Example #4
class LogReg(torch.nn.Module):
    """
    Logistic regression model that assembles its weight matrix from
    per-featurizer weight tensors, with optional tying of the init and DC weights.
    """
    def _setup_weights(self):
        """
        Initializes the weight tensors with random values and
        ties the init and DC weights if specified.
        :return: None
        """
        torch.manual_seed(42)
        # setup init
        self.weight_tensors = ParameterList()
        self.tensor_tuple = ()
        self.feature_id = []
        self.W = None
        for featurizer in self.featurizers:
            self.feature_id.append(featurizer.id)
            if featurizer.id == 'SignalInit':
                if self.tie_init:
                    signals_W = Parameter(
                        torch.randn(1).expand(1, self.output_dim))
                else:
                    signals_W = Parameter(torch.randn(1, self.output_dim))
            elif featurizer.id == 'SignalDC':
                if self.tie_dc:
                    signals_W = Parameter(
                        torch.randn(featurizer.count,
                                    1).expand(-1, self.output_dim))
                else:
                    signals_W = Parameter(
                        torch.randn(featurizer.count, self.output_dim))
            else:
                signals_W = Parameter(
                    torch.randn(featurizer.count,
                                1).expand(-1, self.output_dim))
            self.weight_tensors.append(signals_W)
        return

    def __init__(self, featurizers, output_dim, tie_init, tie_dc):
        """
        Constructor for our logistic regression
        :param featurizers: a list of featurizer modules
        :param output_dim: number of classes
        :param tie_init: boolean, determines weight tying for init features
        :param tie_dc: boolean, determines weight tying for dc features
        """
        super(LogReg, self).__init__()

        self.featurizers = featurizers

        self.output_dim = output_dim

        self.tie_init = tie_init
        self.tie_dc = tie_dc

        self._setup_weights()

    def forward(self, X, index, mask):
        """
        Runs the forward pass of our logreg.
        :param X: values of the features
        :param index: indices to mask at
        :param mask: tensor to remove possibility of choosing unused class
        :return: output - X * W after masking
        """

        # Reties the weights - need to do on every pass

        self.concat_weights()

        # Calculates n x l matrix output
        output = X.mul(self.W)
        output = output.sum(1)
        # Changes values to extremely negative at specified indices
        if index is not None and mask is not None:
            output.index_add_(0, index, mask)
        return output

    def concat_weights(self):
        """
        Reties the weight tensor
        """
        for feature_index in range(0, len(self.weight_tensors)):
            if self.feature_id[feature_index] == 'SignalInit':
                tensor = self.weight_tensors[feature_index].expand(
                    1, self.output_dim)
            elif self.feature_id[feature_index] == 'SignalDC':
                tensor = self.weight_tensors[feature_index].expand(
                    -1, self.output_dim)
            else:
                tensor = self.weight_tensors[feature_index].expand(
                    -1, self.output_dim)
            if feature_index == 0:
                self.W = tensor + 0
            else:
                self.W = torch.cat((self.W, tensor), 0)
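A hedged usage sketch for LogReg with hypothetical featurizer stubs (the real featurizer classes are not shown in the example; only the .id and .count attributes the code reads are assumed):

from collections import namedtuple

Feat = namedtuple('Feat', ['id', 'count'])   # hypothetical stand-in for a featurizer module
featurizers = [Feat('SignalInit', 1), Feat('SignalDC', 3), Feat('SignalCooccur', 5)]
model = LogReg(featurizers, output_dim=4, tie_init=True, tie_dc=True)

X = torch.randn(2, 9, 4)                     # (batch, total weight rows = 1 + 3 + 5, output_dim)
logits = model(X, index=None, mask=None)     # -> (2, 4); masking is skipped when index/mask are None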
Example #5
class HigherOrderGP(BatchedMultiOutputGPyTorchModel, ExactGP):
    r"""
    A Higher order Gaussian process model (HOGP) (predictions are matrices/tensors) as
    described in [Zhe2019hogp]_.
    """

    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        likelihood: Optional[Likelihood] = None,
        covar_modules: Optional[List[Kernel]] = None,
        num_latent_dims: Optional[List[int]] = None,
        learn_latent_pars: bool = True,
        latent_init: str = "default",
        outcome_transform: Optional[OutcomeTransform] = None,
        input_transform: Optional[InputTransform] = None,
    ):
        r"""A HigherOrderGP model for high-dim output regression.

        Args:
            train_X: A `batch_shape x n x d`-dim tensor of training inputs.
            train_Y: A `batch_shape x n x output_shape`-dim tensor of training targets.
            likelihood: Gaussian likelihood for the model.
            covar_modules: List of kernels for each output structure.
            num_latent_dims: Sizes for the latent dimensions.
            learn_latent_pars: If true, learn the latent parameters.
            latent_init: [default or gp] how to initialize the latent parameters.
        """

        if input_transform is not None:
            input_transform.to(train_X)

        # infer the dimension of `output_shape`.
        num_output_dims = train_Y.dim() - train_X.dim() + 1
        batch_shape = train_X.shape[:-2]
        if len(batch_shape) > 1:
            raise NotImplementedError(
                "HigherOrderGP currently only supports 1-dim `batch_shape`."
            )

        if outcome_transform is not None:
            if isinstance(outcome_transform, Standardize) and not isinstance(
                outcome_transform, FlattenedStandardize
            ):
                warnings.warn(
                    "HigherOrderGP does not support the outcome_transform "
                    "`Standardize`! Using `FlattenedStandardize` with `output_shape="
                    f"{train_Y.shape[- num_output_dims:]} and batch_shape="
                    f"{batch_shape} instead.",
                    RuntimeWarning,
                )
                outcome_transform = FlattenedStandardize(
                    output_shape=train_Y.shape[-num_output_dims:],
                    batch_shape=batch_shape,
                )
            train_Y, _ = outcome_transform(train_Y)

        self._aug_batch_shape = batch_shape
        self._num_dimensions = num_output_dims + 1
        self._num_outputs = train_Y.shape[0] if batch_shape else 1
        self.target_shape = train_Y.shape[-num_output_dims:]
        self._input_batch_shape = batch_shape

        if likelihood is None:

            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._is_custom_likelihood = True

        super().__init__(
            train_X,
            train_Y.view(*self._aug_batch_shape, -1),
            likelihood=likelihood,
        )

        if covar_modules is not None:
            self.covar_modules = ModuleList(covar_modules)
        else:
            self.covar_modules = ModuleList(
                [
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        batch_shape=self._aug_batch_shape,
                        ard_num_dims=1 if dim > 0 else train_X.shape[-1],
                    )
                    for dim in range(self._num_dimensions)
                ]
            )

        if num_latent_dims is None:
            num_latent_dims = [1] * (self._num_dimensions - 1)

        self.to(train_X.device)

        self._initialize_latents(
            latent_init=latent_init,
            num_latent_dims=num_latent_dims,
            learn_latent_pars=learn_latent_pars,
            device=train_Y.device,
            dtype=train_Y.dtype,
        )

        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        if input_transform is not None:
            self.input_transform = input_transform

    def _initialize_latents(
        self,
        latent_init: str,
        num_latent_dims: List[int],
        learn_latent_pars: bool,
        device: torch.device,
        dtype: torch.dtype,
    ):
        self.latent_parameters = ParameterList()
        if latent_init == "default":
            for dim_num in range(len(self.covar_modules) - 1):
                self.latent_parameters.append(
                    Parameter(
                        torch.rand(
                            *self._aug_batch_shape,
                            self.target_shape[dim_num],
                            num_latent_dims[dim_num],
                            device=device,
                            dtype=dtype,
                        ),
                        requires_grad=learn_latent_pars,
                    )
                )
        elif latent_init == "gp":
            for dim_num, covar in enumerate(self.covar_modules[1:]):
                latent_covar = covar(
                    torch.linspace(
                        0.0,
                        1.0,
                        self.target_shape[dim_num],
                        device=device,
                        dtype=dtype,
                    )
                ).add_jitter(1e-4)
                latent_dist = MultivariateNormal(
                    torch.zeros(
                        self.target_shape[dim_num],
                        device=device,
                        dtype=dtype,
                    ),
                    latent_covar,
                )
                sample_shape = torch.Size(
                    (
                        *self._aug_batch_shape,
                        num_latent_dims[dim_num],
                    )
                )
                latent_sample = latent_dist.sample(sample_shape=sample_shape)
                latent_sample = latent_sample.reshape(
                    *self._aug_batch_shape,
                    self.target_shape[dim_num],
                    num_latent_dims[dim_num],
                )
                self.latent_parameters.append(
                    Parameter(
                        latent_sample,
                        requires_grad=learn_latent_pars,
                    )
                )
                self.register_prior(
                    "latent_parameters_" + str(dim_num),
                    MultivariateNormalPrior(
                        latent_dist.loc, latent_dist.covariance_matrix.detach().clone()
                    ),
                    lambda module, dim_num=dim_num: self.latent_parameters[dim_num],
                )

    def forward(self, X: Tensor) -> MultivariateNormal:
        X = self.transform_inputs(X)

        covariance_list = []
        covariance_list.append(self.covar_modules[0](X))

        for cm, param in zip(self.covar_modules[1:], self.latent_parameters):
            covariance_list.append(cm(param))

        # check batch_shapes
        if covariance_list[0].batch_shape != covariance_list[1].batch_shape:
            for i in range(1, len(covariance_list)):
                cm = covariance_list[i]
                covariance_list[i] = BatchRepeatLazyTensor(
                    cm, covariance_list[0].batch_shape
                )
        kronecker_covariance = KroneckerProductLazyTensor(*covariance_list)

        # TODO: expand options for the mean module via batch shaping?
        mean = torch.zeros(
            *covariance_list[0].batch_shape,
            kronecker_covariance.shape[-1],
            device=kronecker_covariance.device,
            dtype=kronecker_covariance.dtype,
        )
        return MultivariateNormal(mean, kronecker_covariance)

    def get_fantasy_model(self, inputs, targets, **kwargs):
        # we need to squeeze the targets in order to preserve the shaping
        inputs_batch_dims = len(inputs.shape[:-2])
        target_shape = (*inputs.shape[:-2], -1)
        if (inputs_batch_dims + self._num_dimensions) < targets.ndim:
            target_shape = (targets.shape[0], *target_shape)
        reshaped_targets = targets.view(*target_shape)

        return super().get_fantasy_model(inputs, reshaped_targets, **kwargs)

    def condition_on_observations(
        self, X: Tensor, Y: Tensor, **kwargs: Any
    ) -> HigherOrderGP:
        r"""Condition the model on new observations.
        Args:
            X: A `batch_shape x n' x d`-dim Tensor, where `d` is the dimension of
            the feature space, `n'` is the number of points per batch, and
            `batch_shape` is the batch shape (must be compatible with the
            batch shape of the model).

            Y: A `batch_shape' x n' x m_d`-dim Tensor, where `m_d` is the shaping
            of the model outputs, `n'` is the number of points per batch, and
            `batch_shape'` is the batch shape of the observations.
            `batch_shape'` must be broadcastable to `batch_shape` using
            standard broadcasting semantics. If `Y` has fewer batch dimensions
            than `X`, it is assumed that the missing batch dimensions are
            the same for all `Y`.
        Returns:
            A `BatchedMultiOutputGPyTorchModel` object of the same type with
            `n + n'` training examples, representing the original model
            conditioned on the new observations `(X, Y)` (and possibly noise
            observations passed in via kwargs).
        """
        noise = kwargs.get("noise")
        if hasattr(self, "outcome_transform"):
            # we need to apply transforms before shifting batch indices around
            Y, noise = self.outcome_transform(Y, noise)
        self._validate_tensor_args(X=X, Y=Y, Yvar=noise, strict=False)

        # we don't need to do un-squeezing because Y already is batched
        # we don't support fixed noise here yet
        # if noise is not None:
        #     kwargs.update({"noise": noise})
        fantasy_model = super(
            BatchedMultiOutputGPyTorchModel, self
        ).condition_on_observations(X=X, Y=Y, **kwargs)
        fantasy_model._input_batch_shape = fantasy_model.train_targets.shape[
            : (-1 if self._num_outputs == 1 else -2)
        ]
        fantasy_model._aug_batch_shape = fantasy_model.train_targets.shape[:-1]
        return fantasy_model

    def posterior(
        self,
        X: Tensor,
        output_indices: Optional[List[int]] = None,
        observation_noise: Union[bool, Tensor] = False,
        **kwargs: Any,
    ) -> GPyTorchPosterior:
        self.eval()  # make sure we're calling a posterior

        no_pred_variance = skip_posterior_variances._state

        with ExitStack() as es:
            es.enter_context(gpt_posterior_settings())
            es.enter_context(fast_pred_var(True))

            # we need to skip posterior variances here
            es.enter_context(skip_posterior_variances(True))
            mvn = self(X)
            if observation_noise is not False:
                # TODO: implement Kronecker + diagonal solves so that this is possible.
                # if torch.is_tensor(observation_noise):
                #     # TODO: Validate noise shape
                #     # make observation_noise `batch_shape x q x n`
                #     obs_noise = observation_noise.transpose(-1, -2)
                #     mvn = self.likelihood(mvn, X, noise=obs_noise)
                # elif isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
                #     noise = self.likelihood.noise.mean().expand(X.shape[:-1])
                #     mvn = self.likelihood(mvn, X, noise=noise)
                # else:
                mvn = self.likelihood(mvn, X)

            # lazy covariance matrix includes the interpolated version of the full
            # covariance matrix so we can actually grab that instead.
            if X.ndimension() > self.train_inputs[0].ndimension():
                X_batch_shape = X.shape[:-2]
                train_inputs = self.train_inputs[0].reshape(
                    *[1] * len(X_batch_shape), *self.train_inputs[0].shape
                )
                train_inputs = train_inputs.repeat(
                    *X_batch_shape, *[1] * self.train_inputs[0].ndimension()
                )
            else:
                train_inputs = self.train_inputs[0]
            full_covar = self.covar_modules[0](torch.cat((train_inputs, X), dim=-2))

            if no_pred_variance:
                pred_variance = mvn.variance
            else:
                joint_covar = self._get_joint_covariance([X])
                pred_variance = self.make_posterior_variances(joint_covar)

                full_covar = KroneckerProductLazyTensor(
                    full_covar, *joint_covar.lazy_tensors[1:]
                )

            joint_covar_list = [self.covar_modules[0](X, train_inputs)]
            batch_shape = joint_covar_list[0].batch_shape
            for cm, param in zip(self.covar_modules[1:], self.latent_parameters):
                covar = cm(param)
                if covar.batch_shape != batch_shape:
                    covar = BatchRepeatLazyTensor(covar, batch_shape)
                joint_covar_list.append(covar)

            test_train_covar = KroneckerProductLazyTensor(*joint_covar_list)

            # mean and variance get reshaped into the target shape
            new_mean = mvn.mean.reshape(*X.shape[:-1], *self.target_shape)
            if not no_pred_variance:
                new_variance = pred_variance.reshape(*X.shape[:-1], *self.target_shape)
                new_variance = DiagLazyTensor(new_variance)
            else:
                new_variance = ZeroLazyTensor(
                    *X.shape[:-1], *self.target_shape, self.target_shape[-1]
                )

            mvn = MultivariateNormal(new_mean, new_variance)

            # return a specialized Posterior to allow for sampling
            posterior = HigherOrderGPPosterior(
                mvn=mvn,
                train_targets=self.train_targets.unsqueeze(-1),
                train_train_covar=self.prediction_strategy.lik_train_train_covar,
                test_train_covar=test_train_covar,
                joint_covariance_matrix=full_covar,
                output_shape=Size(
                    (
                        *X.shape[:-1],
                        *self.target_shape,
                    )
                ),
                num_outputs=self._num_outputs,
            )
            if hasattr(self, "outcome_transform"):
                posterior = self.outcome_transform.untransform_posterior(posterior)

            return posterior

    # TODO: remove when this gets exposed in gpytorch
    def _get_joint_covariance(self, inputs):
        """
        Internal method to expose the joint test train covariance.
        """

        from gpytorch.models import ExactGP
        from gpytorch.utils.broadcasting import _mul_broadcast_shape

        train_inputs = self.train_inputs
        # Concatenate the input to the training input
        full_inputs = []
        batch_shape = train_inputs[0].shape[:-2]
        for train_input, input in zip(train_inputs, inputs):
            # Make sure the batch shapes agree for training/test data
            # This seems to be deprecated
            # if batch_shape != train_input.shape[:-2]:
            #     batch_shape = _mul_broadcast_shape(
            #         batch_shape, train_input.shape[:-2]
            #     )
            #     train_input = train_input.expand(
            #         *batch_shape, *train_input.shape[-2:]
            #     )
            if batch_shape != input.shape[:-2]:
                batch_shape = _mul_broadcast_shape(batch_shape, input.shape[:-2])
                train_input = train_input.expand(*batch_shape, *train_input.shape[-2:])
                input = input.expand(*batch_shape, *input.shape[-2:])
            full_inputs.append(torch.cat([train_input, input], dim=-2))

        # Get the joint distribution for training/test data
        full_output = super(ExactGP, self).__call__(*full_inputs)
        return full_output.lazy_covariance_matrix

    def make_posterior_variances(self, joint_covariance_matrix: LazyTensor) -> Tensor:
        r"""
        Computes the posterior variances given the data points X. As currently
        implemented, it computes another forwards call with the stacked data to get out
        the joint covariance across all data points.
        """
        # TODO: use the exposed joint covariances from the prediction strategy
        data_joint_covariance = joint_covariance_matrix.lazy_tensors[
            0
        ].evaluate_kernel()
        num_train = self.train_inputs[0].shape[-2]
        test_train_covar = data_joint_covariance[..., num_train:, :num_train]
        train_train_covar = data_joint_covariance[..., :num_train, :num_train]
        test_test_covar = data_joint_covariance[..., num_train:, num_train:]

        full_train_train_covar = KroneckerProductLazyTensor(
            train_train_covar, *joint_covariance_matrix.lazy_tensors[1:]
        )
        full_test_test_covar = KroneckerProductLazyTensor(
            test_test_covar, *joint_covariance_matrix.lazy_tensors[1:]
        )
        full_test_train_covar_list = [test_train_covar] + [
            *joint_covariance_matrix.lazy_tensors[1:]
        ]

        train_evals, train_evecs = full_train_train_covar.symeig(eigenvectors=True)
        # (\kron \Lambda_i + \sigma^2 I)^{-1}
        train_inv_evals = DiagLazyTensor(1.0 / (train_evals + self.likelihood.noise))

        # compute K_i S_i \hadamard K_i S_i
        test_train_hadamard = KroneckerProductLazyTensor(
            *[
                lt1.matmul(lt2).evaluate() ** 2
                for lt1, lt2 in zip(
                    full_test_train_covar_list, train_evecs.lazy_tensors
                )
            ]
        )

        # and compute the column sums of
        #  (\kron K_i S_i * K_i S_i) \tilde{\Lambda}^{-1}
        test_train_pred_covar = test_train_hadamard.matmul(train_inv_evals).sum(dim=-1)

        pred_variances = full_test_test_covar.diag() - test_train_pred_covar
        return pred_variances
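A hedged usage sketch for HigherOrderGP, assuming the BoTorch/GPyTorch environment this class is written against (HigherOrderGP as defined above, ExactMarginalLogLikelihood from gpytorch.mlls); matrix-valued targets of shape 3 x 4 over 2-d inputs:

from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(10, 2, dtype=torch.float64)
train_Y = torch.randn(10, 3, 4, dtype=torch.float64)
model = HigherOrderGP(train_X, train_Y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
# ... fit mll with BoTorch's fitting utilities or a plain torch optimizer ...
posterior = model.posterior(torch.rand(5, 2, dtype=torch.float64))
samples = posterior.rsample(torch.Size([16]))   # draws over the 5 x 3 x 4 prediction block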
Example #6
class TiedLinear(torch.nn.Module):
    """
    TiedLinear is a linear layer with shared parameters for features between
    (output) classes that takes as input a tensor X with dimensions
        (batch size) X (max_domain) X (total # of features)
        where:
            max_domain is the desired output dimension/# of classes
    """

    def __init__(self, env, feat_info, max_domain, layer_sizes, bias=False):
        """
        feat_info (list[FeatInfo]): list of FeatInfo namedtuples for each
            featurizer
        max_domain (int): number of domain values (e.g. max domain)
        bias (bool): use bias on the first layer per feature.
        layer_sizes (list[int]): Output size of each linear layer. Last layer
            should have an output size of 1. E.g. [200, 1].
        """
        assert layer_sizes and layer_sizes[-1] == 1

        super(TiedLinear, self).__init__()
        self.env = env
        self.act = ReLU()

        self.max_domain = max_domain
        self.bias_flag = bias

        # Create first layer: this layer is special since some weights
        # cannot be learned.
        self.first_layer_weights = ParameterList()
        for feat in feat_info:
            weight = Parameter(feat.init_weight*torch.ones(feat.size,
                                                           layer_sizes[0]),
                               requires_grad=feat.learnable)
            if feat.learnable:
                torch.nn.init.xavier_uniform_(weight)
            self.first_layer_weights.append(weight)

        if self.bias_flag:
            self.first_layer_bias = Parameter(torch.zeros(1, sum(f.size for f in feat_info)))
            torch.nn.init.xavier_uniform_(self.first_layer_bias)

        # Create subsequent layers.
        self.other_weights = ParameterList()
        self.other_biases = ParameterList()
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            weight = Parameter(torch.zeros(in_dim, out_dim))
            # (max_domain, out_dim)
            bias = Parameter(torch.zeros(1, out_dim))

            # Randomly initialize weights.
            torch.nn.init.xavier_uniform_(weight)
            torch.nn.init.xavier_uniform_(bias)

            self.other_weights.append(weight)
            self.other_biases.append(bias)

        logging.debug("training model with first layer size: %s",
                      list(map(str, [w.shape for w in self.first_layer_weights])))
        if len(self.other_weights):
            logging.debug("training model with additional hidden layers of size: %s",
                          list(map(str, [w.shape for w in self.other_weights])))

    def forward(self, X, index, mask):
        """
        Performs one forward pass and outputs the logits of size
        (batch, max_domain)

        X: (batch, # of classes, total # of features)
        index: (batch)
        mask: (batch, # of classes)
        """
        if X.shape[0] == 0:
            logging.warning("performing forward pass with no samples")
            return torch.zeros(0, X.shape[1])

        # Multiply through the first layer.
        # (batch, # of classes, layers_size[0])
        output = X.matmul(torch.cat([t for t in self.first_layer_weights],
                                    dim=0))
        if self.bias_flag:
            output.add_(self.first_layer_bias.expand(self.max_domain, -1))

        for idx, (weight, bias) in enumerate(zip(self.other_weights,
                                                 self.other_biases)):
            # Apply activation on all but last layer.
            output = self.act(output)
            # (batch, # of classes, in_dim) --> (batch, # of classes, out_dim)
            output = output.matmul(weight) + bias.expand(self.max_domain, -1)
        # output should now be (batch, # of classes, 1)

        # (batch, # of classes)
        output = output.squeeze(-1)

        # Add our mask so that invalid domain classes for a given variable/VID
        # have a large negative value, resulting in a softmax probability
        # of de facto 0.
        # (batch, # of classes)
        output.index_add_(0, index, mask)
        return output
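A hedged usage sketch for TiedLinear with hypothetical FeatInfo stubs (the example only reads .init_weight, .size, and .learnable; env is unused in the code shown, and the logging/ReLU/Parameter imports of the original module are assumed):

from collections import namedtuple

FeatInfo = namedtuple('FeatInfo', ['init_weight', 'size', 'learnable'])   # hypothetical stand-in
feat_info = [FeatInfo(1.0, 4, True), FeatInfo(1.0, 2, False)]
model = TiedLinear(env={}, feat_info=feat_info, max_domain=3, layer_sizes=[8, 1])

X = torch.randn(5, 3, 6)            # (batch, max_domain, total features = 4 + 2)
index = torch.arange(5)
mask = torch.zeros(5, 3)            # nothing masked out
logits = model(X, index, mask)      # -> (5, 3)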
Example #7
class TTlinear(nn.Module):
    def __init__(self,
                 in_size,
                 out_size,
                 rank,
                 prior_type=None,
                 prior_para=[],
                 em_stepsize=0.1,
                 **kwargs):
        # rank selection is controlled by prior_type and prior_para
        super(TTlinear, self).__init__()
        assert (len(in_size) == len(out_size))
        assert (len(rank) == len(in_size) - 1)
        self.in_size = list(in_size)
        self.out_size = list(out_size)
        self.rank = list(rank)
        self.factors = ParameterList()
        r1 = [1] + list(rank)
        r2 = list(rank) + [1]
        for ri, ro, si, so in zip(r1, r2, in_size, out_size):
            p = Parameter(torch.Tensor(ri, so, si, ro))
            self.factors.append(p)
        self.bias = Parameter(torch.Tensor(np.prod(out_size)))
        self._initialize_weights()

        self.prior_type = prior_type
        self.prior_para = prior_para.copy()
        self.rank_parameters = ParameterList(
            [Parameter(torch.ones(r), requires_grad=False) for r in rank])
        self.em_stepsize = em_stepsize
        self.mask = ParameterList(
            [Parameter(torch.ones(r), requires_grad=False) for r in rank])

        dim = len(in_size)
        self.register_buffer(
            'shift',
            torch.zeros(dim * 2 + dim * (dim + 1) // 2, requires_grad=True))
        self.shift_adj = torch.zeros_like(self.shift)

    def forward(self, x):
        return tt_nn_fcn.apply(x, self.bias, self.shift, *list(self.factors))

    def _initialize_weights(self):
        for f in self.factors:
            # scale = np.sqrt(3.0 / f.shape[3] / f.shape[2])
            scale = 0.7
            nn.init.uniform_(f, -scale, scale)
        # self.factors[-1].data.mul_(np.sqrt(2.0))
        nn.init.constant_(self.bias, 0)

    def adj_shift(self, ths):
        if self.shift.grad is None:
            return
        for i in range(len(self.factors) * 2 + 1):
            if self.shift.grad[i] > 0.9:
                self.shift_adj[i] = max(self.shift_adj[i], 0)
                self.shift_adj[i] += 1
            elif self.shift.grad[i] < 0.3:
                self.shift_adj[i] = min(self.shift_adj[i], 0)
                self.shift_adj[i] -= 1
            else:
                self.shift_adj[i] = 0
        for i in range(len(self.factors) * 2 + 1, len(self.shift)):
            if self.shift.grad[i] > 0.9 * 128:
                self.shift_adj[i] = max(self.shift_adj[i], 0)
                self.shift_adj[i] += 1
            elif self.shift.grad[i] < 0.3 * 128:
                self.shift_adj[i] = min(self.shift_adj[i], 0)
                self.shift_adj[i] -= 1
            else:
                self.shift_adj[i] = 0

        for i in range(len(self.factors)):
            if self.shift_adj[i] > 1:
                self.shift.data[i] += 1
                # print(f'pos {i} increase')
            if self.shift_adj[i] < -1:
                self.shift.data[i] -= 1
                # print(f'pos {i} decrease')

        for i in range(len(self.shift)):
            if self.shift_adj[i] > ths:
                self.shift.data[i] += 1
                # print(f'pos {i} increase')
            if self.shift_adj[i] < -ths:
                self.shift.data[i] -= 1
                # print(f'pos {i} decrease')

        self.shift.grad.zero_()

    def get_rank_parameters_update(self):
        updates = []
        realrank = self.report_rank()
        realrank = [1] + realrank + [1]

        for i in range(len(self.rank)):

            M = torch.sum(self.factors[i]**2, dim=[0, 1, 2])
            D = self.in_size[i] * self.out_size[i] * realrank[i]
            N = self.in_size[i] * self.out_size[i] * realrank[i] \
                    + self.in_size[i+1] * self.out_size[i+1] * realrank[i+2]
            if self.prior_type == 'log_uniform':
                update = M / (D + 1)
            elif self.prior_type == 'gamma':
                update = (2 * self.prior_para[1] +
                          M) / (D + 2 + 2 * self.prior_para[0])

            elif self.prior_type == 'half_cauchy':
                update = (M - (self.prior_para[0]**2) * D +
                          torch.sqrt(M**2 + (M * self.prior_para[0]**2) *
                                     (2.0 * D + 8.0) + (D**2.0) *
                                     (self.prior_para[0]**4.0))) / (2 * D +
                                                                    4.0)

            elif self.prior_type == 'l2p':
                # get_bayes_loss() and update_rank_parameters() consume the
                # raw per-rank squared norms directly for the l2p prior
                update = M
            elif self.prior_type == 'l21_prox' \
                    or self.prior_type == 'l21_prox_gamma':

                update = (M, self.out_size[i], N)

            else:
                assert False, 'Unknown prior type'
            updates.append(update)

        return updates

    def update_rank_parameters(self, stepsize=None):
        if self.prior_type is None:
            return

        self.apply_mask()
        with torch.no_grad():
            rank_updates = self.get_rank_parameters_update()
            if self.prior_type == 'l21_prox_gamma' and len(
                    self.prior_para) == 2:
                self.prior_para.append(0)
            for rank_parameter, update, factor in zip(self.rank_parameters,
                                                      rank_updates,
                                                      self.factors[:-1]):

                if self.prior_type == 'l21_prox':
                    M, D, N = update
                    scale = torch.clamp_min(
                        1 - stepsize * self.prior_para[0] * N /
                        (torch.sqrt(M * D)), 0)
                    rank_parameter.data.copy_(M / D * scale**2)
                    factor.data.mul_(scale.view(1, 1, 1, -1))
                elif self.prior_type == 'l21_prox_gamma':
                    M, D, N = update
                    scale = torch.clamp_min(
                        1 - stepsize * self.prior_para[2] * N /
                        (torch.sqrt(M * D)), 0)
                    rank_parameter.data.copy_(M / D * scale**2)

                    factor.data.mul_(scale.view(1, 1, 1, -1))
                elif self.prior_type == 'l2p':
                    rank_parameter.data.copy_(update)
                else:
                    rank_parameter.data.mul_(1 - self.em_stepsize)
                    rank_parameter.data.add_(self.em_stepsize * update)

            if self.prior_type == 'l21_prox_gamma':
                T = 0
                theta = self.prior_para[0]
                k = self.prior_para[1]
                for (M, D, N) in rank_updates:
                    T += torch.sum(N * torch.sqrt(M / D)).item()

                self.prior_para[2] = (k - 1) / (1 / theta + T)

    def get_bayes_loss(self):
        if self.prior_type is None:
            return 0

        loss = 0
        if self.prior_type == 'l21_prox' or self.prior_type == 'l21_prox_gamma':
            return 0
        elif self.prior_type == 'l2p':
            updates = self.get_rank_parameters_update()
            p = self.prior_para[1] if len(self.prior_para) >= 2 else 1.0
            for update in updates:
                loss += torch.sum(torch.sqrt(update)**
                                  p)**(1 / p) * self.prior_para[0]

        else:
            for [factor, rank_parameter] in zip(self.factors[:-1],
                                                self.rank_parameters):
                loss += torch.sum(
                    torch.sum(factor**2, dim=[0, 1, 2]) / rank_parameter)

        return loss

    def update_rank_mask(self, ths=1e-3):
        for (mask, rank_parameter) in zip(self.mask, self.rank_parameters):
            mask.copy_(
                torch.where(rank_parameter > ths,
                            torch.ones_like(rank_parameter),
                            torch.zeros_like(rank_parameter)))

    def apply_mask(self):
        with torch.no_grad():
            for i in range(len(self.rank)):
                self.factors[i].data.mul_(self.mask[i].view(1, 1, 1, -1))
                self.factors[i + 1].data.mul_(self.mask[i].view(-1, 1, 1, 1))

    def report_rank(self):
        return [torch.sum(m).item() for m in self.mask]
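A hedged sketch of the rank-update bookkeeping for this TTlinear variant (the forward pass depends on the custom tt_nn_fcn autograd function, which is not shown in the example, so only construction and the EM-style rank updates are exercised here; shapes and thresholds are arbitrary illustrations):

layer = TTlinear(in_size=(4, 8), out_size=(8, 16), rank=(6,),
                 prior_type='log_uniform', em_stepsize=0.1)
for step in range(10):
    # ... normally: data loss from layer(x) plus layer.get_bayes_loss(), then optimizer.step() ...
    layer.update_rank_parameters()    # EM update of the per-rank variance parameters
    layer.update_rank_mask(ths=1e-3)  # zero the mask for rank components that fell below ths
print(layer.report_rank())            # surviving rank at each TT core boundary, e.g. [6.0]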
Example #8
    def collect_now(self, feature_ids, input_shape=None):
        if not self.collecting:
            return

        # Collect old inputs
        if self.in_features is None: # Only process dynamic input models
            new_feature_ids = []
            new_weight_blocs = ParameterList()
            for old_features, weights in zip(self.in_features_map, self.weight_blocks):
                patch = compute_patch(old_features, feature_ids)
                empty = Parameter(torch.zeros(0))
                if len(patch) == 0: # dead block
                    new_feature_ids.append(empty.data.long())
                    new_weights = self.wrap(empty)
                    remove_parameter(self.optimizer, weights)
                else:
                    new_feature_ids.append(old_features[patch])
                    patch = self.wrap(patch)
                    last_dim = 1
                    new_weights = weights.data.transpose(0, last_dim)[patch].transpose(0, last_dim)
                    new_weights = Parameter(new_weights)
                    apply_patch(self.optimizer, weights, patch, (0, last_dim))
                    update_reference(self.optimizer, weights, new_weights)
                new_weight_blocs.append(new_weights)
            self.weight_blocks = new_weight_blocs
            self.in_features_map = new_feature_ids

        # Collect unused features
        if self.out_features is None: # Only process dynamic output models
            new_feature_ids = []
            new_weight_blocs = ParameterList()
            new_filters_blocks = ParameterList()
            new_bias_blocks = ParameterList()
            new_jcs = []
            new_pfss = []
            source = zip(self.out_features_map,
                self.weight_blocks, self.filters_blocks,
                self.jump_counts, self.previous_filter_sign)
            for i, (old_features, weights, filter, jc, pfs) in enumerate(source):
                patch = torch.nonzero(torch.abs(filter.data) > EPSILON).squeeze()
                new_bias = None
                if len(patch) == 0 or len(weights) == 0:
                    empty = Parameter(torch.zeros(0))
                    new_weights = empty
                    new_filter = empty
                    new_jc = empty.data.int()
                    new_pfs = empty.data.int()
                    new_feature_ids.append(empty.data.long())
                    remove_parameter(self.optimizer, weights)
                    remove_parameter(self.optimizer, filter)
                    if self.has_bias:
                        new_bias = empty
                        remove_parameter(self.optimizer, self.bias_blocks[i])
                else:
                    new_feature_ids.append(old_features[patch.cpu()])
                    new_weights = Parameter(weights.data[patch])
                    new_filter = Parameter(filter.data[patch])
                    new_jc = jc[patch]
                    new_pfs = pfs[patch]
                    apply_patch(self.optimizer, weights, patch)
                    update_reference(self.optimizer, weights, new_weights)
                    apply_patch(self.optimizer, filter, patch)
                    update_reference(self.optimizer, filter, new_filter)
                    if self.has_bias:
                        bias = self.bias_blocks[i]
                        new_bias = Parameter(bias.data[patch])
                        apply_patch(self.optimizer, bias, patch)
                        update_reference(self.optimizer, bias, new_bias)
                new_weight_blocs.append(new_weights)
                new_jcs.append(new_jc)
                new_pfss.append(new_pfs)
                new_filters_blocks.append(new_filter)
                if new_bias is not None:
                    new_bias_blocks.append(new_bias)

            self.out_features_map = new_feature_ids
            self.filters_blocks = new_filters_blocks
            self.weight_blocks = new_weight_blocs
            self.jump_counts = new_jcs
            self.previous_filter_sign = new_pfss
            if hasattr(self, 'bias_blocks'):
                self.bias_blocks = new_bias_blocks

        self.regenerate_out_feature_ids()
        self.collecting = False
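A hedged, self-contained illustration of the pruning pattern collect_now applies per block (hypothetical tensors; the compute_patch / apply_patch / remove_parameter / update_reference optimizer helpers from the surrounding code are not shown): indices whose filter magnitude exceeds a threshold survive, and smaller Parameters are rebuilt from them.

import torch
from torch.nn import Parameter

EPSILON = 1e-3                                    # assumed threshold, mirroring the example
weights = Parameter(torch.randn(8, 4))            # 8 output features
filt = Parameter(torch.tensor([0.5, 0.0, 0.2, 0.0, 0.9, 0.0, 0.0, 0.3]))
patch = torch.nonzero(torch.abs(filt.data) > EPSILON).squeeze()
new_weights = Parameter(weights.data[patch])      # keep only the surviving rows
new_filt = Parameter(filt.data[patch])
print(new_weights.shape)                          # torch.Size([4, 4])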
Example #9
class KalmanFilter(torch.nn.Module):
    def __init__(self,
                 measures: Sequence[str],
                 processes: Sequence[Process],
                 device: Optional[torch.device] = None,
                 **kwargs):

        super().__init__()
        self.design: Design = None
        self._init_design(measures=measures,
                          processes=processes,
                          device=device,
                          **kwargs)

        # parameters from design:
        self.design_parameters = ParameterList()
        for param in self.design.parameters():
            self.design_parameters.append(param)

        # the StateBelief family, implemented by property (default gaussian)
        self._family = None

        self.to(device=self.design.device)

    def _init_design(self, *args, **kwargs) -> None:
        self.design = Design(*args, **kwargs)

    @property
    def measure_size(self) -> int:
        return self.design.measure_size

    @property
    def family(self) -> TypeVar('Gaussian'):
        if self._family is None:
            self._family = Gaussian
        return self._family

    def predict_initial_state(self,
                              design_for_batch: DesignForBatch) -> 'Gaussian':
        return self.family(
            means=design_for_batch.initial_mean,
            covs=design_for_batch.initial_covariance,
            # we consider this a one-step-ahead prediction, so last measured one step ago:
            last_measured=torch.ones(design_for_batch.num_groups,
                                     dtype=torch.int))

    def design_for_batch(self, num_groups: int, num_timesteps: int,
                         **kwargs) -> DesignForBatch:
        return self.design.for_batch(num_groups=num_groups,
                                     num_timesteps=num_timesteps,
                                     **kwargs)

    # noinspection PyShadowingBuiltins
    def forward(self,
                input: Tensor,
                initial_state: Optional[StateBelief] = None,
                progress: Union[tqdm, bool] = False,
                **kwargs) -> StateBeliefOverTime:
        """
        :param input: The multivariate time-series to be fit by the kalman-filter. A Tensor where the first dimension
        represents the groups, the second dimension represents the time-points, and the third dimension represents the
        measures.
        :param initial_state: If a StateBelief, this is used as the prediction for time=0; if None then each process
        generates initial values.
        :param progress: Should progress-bar be generated?
        :param kwargs: Other kwargs that will be passed to the `design_for_batch` method.
        :return: A StateBeliefOverTime consisting of one-step-ahead predictions.
        """

        num_groups, num_timesteps, num_measures = input.shape
        if num_measures != self.measure_size:
            raise ValueError(
                f"This KalmanFilter has {self.measure_size} measurement-dimensions; but the input shape is "
                f"{(num_groups, num_timesteps, num_measures)} (last dim should == measure-size)."
            )

        design_for_batch = self.design_for_batch(num_groups=num_groups,
                                                 num_timesteps=num_timesteps,
                                                 **kwargs)

        # initial state of the system:
        if initial_state is None:
            state_prediction = self.predict_initial_state(design_for_batch)
        else:
            state_prediction = initial_state

        progress = progress or identity
        if progress is True:
            progress = tqdm
        iterator = progress(range(num_timesteps))

        # generate one-step-ahead predictions:
        state_predictions = []
        for t in iterator:
            if t > 0:
                # take state-prediction of previous t (now t-1), correct it according to what was actually measured at t-1
                state_belief = state_prediction.update(obs=input[:, t - 1, :])

                # predict the state for t, from information from t-1
                # F at t-1 is transition *from* t-1 *to* t
                F = design_for_batch.F(t - 1)
                Q = design_for_batch.Q(t - 1)
                state_prediction = state_belief.predict(F=F, Q=Q)

            # compute how state-prediction at t translates into measurement-prediction at t
            H = design_for_batch.H(t)
            R = design_for_batch.R(t)
            state_prediction.compute_measurement(H=H, R=R)

            # append to output:
            state_predictions.append(state_prediction)

        return self.family.concatenate_over_time(
            state_beliefs=state_predictions, design=self.design)

    def smooth(self, states: StateBeliefOverTime):
        raise NotImplementedError

    def simulate(self,
                 states: Union[StateBeliefOverTime, StateBelief],
                 horizon: int,
                 num_iter: int,
                 progress: bool = False,
                 from_times: Sequence[int] = None,
                 state_to_measured: Optional[Callable] = None,
                 white_noise: Optional[Tuple[Tensor, Tensor]] = None,
                 ntry_diag_incr: int = 1000,
                 **kwargs) -> List[Tensor]:

        assert horizon > 0

        # forecast-from time:
        if from_times is None:
            if isinstance(states, StateBelief):
                initial_state = states
            else:
                # a StateBeliefOverTime was passed, but no from_times, so just pick the last one
                initial_state = states.last_prediction()
        else:
            # from_times will be used to pick the slice
            initial_state = states.get_state_belief(from_times)

        initial_state = initial_state.__class__(
            means=initial_state.means.repeat((num_iter, 1)),
            covs=initial_state.covs.repeat((num_iter, 1, 1)),
            last_measured=initial_state.last_measured.repeat(num_iter))

        design_for_batch = self.design_for_batch(
            num_groups=initial_state.num_groups,
            num_timesteps=horizon,
            **kwargs)

        if white_noise is None:
            process_wn, measure_wn = None, None
        else:
            process_wn, measure_wn = white_noise
        trajectories = initial_state.simulate_state_trajectories(
            design_for_batch=design_for_batch,
            progress=progress,
            ntry_diag_incr=ntry_diag_incr,
            eps=process_wn)
        if state_to_measured is None:
            sim = trajectories.measurement_distribution.deterministic_sample(
                eps=measure_wn)
        else:
            sim = state_to_measured(trajectories)

        return torch.chunk(sim, num_iter)

    def forecast(self,
                 states: Union[StateBeliefOverTime, StateBelief],
                 horizon: int,
                 from_times: Optional[Sequence[int]] = None,
                 progress: bool = False,
                 **kwargs) -> StateBeliefOverTime:

        assert horizon > 0

        # forecast-from time:
        if from_times is None:
            if isinstance(states, StateBelief):
                state_prediction = states
            else:
                # a StateBeliefOverTime was passed, but no from_times, so just pick the last one
                state_prediction = states.last_prediction()
        else:
            # from_times will be used to pick the slice
            state_prediction = states.get_state_belief(from_times)

        design_for_batch = self.design_for_batch(
            num_groups=state_prediction.num_groups,
            num_timesteps=horizon,
            **kwargs)

        progress = progress or identity
        if progress is True:
            progress = tqdm
        iterator = progress(range(design_for_batch.num_timesteps))

        forecasts = []
        for t in iterator:
            if t > 0:
                # predict the state for t, from information from t-1
                # F at t-1 is transition *from* t-1 *to* t
                F = design_for_batch.F(t - 1)
                Q = design_for_batch.Q(t - 1)
                state_prediction = state_prediction.predict(F=F, Q=Q)

            # compute how state-prediction at t translates into measurement-prediction at t
            H = design_for_batch.H(t)
            R = design_for_batch.R(t)
            state_prediction.compute_measurement(H=H, R=R)

            # append to output:
            forecasts.append(state_prediction)

        return self.family.concatenate_over_time(state_beliefs=forecasts,
                                                 design=self.design)
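
A minimal usage sketch for the filter/forecast entry points above. This is hedged: `kf` is assumed to be an already-configured instance of this class, `train_input` is made-up data, and `log_prob` on the returned StateBeliefOverTime is an assumed API of the surrounding library, not something shown in this listing.

import torch

# assumed: `kf` is a configured instance of the KalmanFilter class above
train_input = torch.randn(5, 20, kf.measure_size)      # (groups, timesteps, measures)

predictions = kf(train_input)                           # calls the forward() shown above
loss = -predictions.log_prob(train_input).mean()        # assumed API: fit by maximum likelihood
loss.backward()

# forecast 10 steps ahead, starting from the last filtered state:
forecast = kf.forecast(states=predictions, horizon=10)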
Beispiel #10
0
class MultiInputLayer(InputLayer):
    metadata: Metadata
    has_categorical: bool
    output_size: int
    embeddings: ParameterList
    embedding_by_variable: Dict[str, Parameter]

    def __init__(self,
                 metadata: Metadata,
                 min_embedding_size: int = 2,
                 max_embedding_size: int = 50) -> None:
        super(MultiInputLayer, self).__init__()

        self.metadata = metadata

        self.has_categorical = False
        self.output_size = 0

        # our embeddings need to be referenced like this to be considered in the parameters of this model
        self.embeddings = ParameterList()
        # this reference is for using the embeddings during the forward pass
        self.embedding_by_variable = {}

        for i, variable_metadata in enumerate(
                self.metadata.get_by_independent_variable()):
            # if it is a numerical variable
            if variable_metadata.is_binary() or variable_metadata.is_numerical(
            ):
                assert variable_metadata.get_size() == 1
                self.output_size += 1

            # if it is a categorical variable
            elif variable_metadata.is_categorical():
                variable_size = variable_metadata.get_size()

                # this is an arbitrary rule of thumb taken from several blog posts
                embedding_size = compute_embedding_size(
                    variable_size, min_embedding_size, max_embedding_size)

                # the embedding is implemented manually so that one-hot encoded input can be used
                # (PyTorch's nn.Embedding only accepts label-encoded input)
                embedding = Parameter(data=torch.Tensor(
                    variable_size, embedding_size).normal_(),
                                      requires_grad=True)

                self.embeddings.append(embedding)
                self.embedding_by_variable[
                    variable_metadata.get_name()] = embedding

                self.output_size += embedding_size
                self.has_categorical = True

            # if it is another type
            else:
                raise Exception(
                    "Unexpected variable type '{}' for variable '{}'.".format(
                        variable_metadata.get_type(),
                        variable_metadata.get_name()))

    def forward(self, inputs: Tensor, **additional_inputs: Tensor) -> Tensor:
        # this is a "leaf" input layer (no child input layers)
        # so no additional inputs should remain
        for additional_inputs_name, additional_inputs_value in additional_inputs.items(
        ):
            if additional_inputs_value is not None:  # sometimes it makes things easier if I pass None
                raise Exception(
                    "Unexpected additional inputs received: {}.".format(
                        additional_inputs_name))

        if self.has_categorical:
            outputs = []
            start = 0
            for variable_metadata in self.metadata.get_by_independent_variable(
            ):
                # extract the variable
                end = start + variable_metadata.get_size()
                variable = inputs[:, start:end]

                # if it is a binary or numerical variable leave the input as it is
                if variable_metadata.is_binary(
                ) or variable_metadata.is_numerical():
                    outputs.append(variable)
                # if it is a categorical variable use the embedding
                elif variable_metadata.is_categorical():
                    embedding = self.embedding_by_variable[
                        variable_metadata.get_name()]
                    output = torch.matmul(variable, embedding).squeeze(1)
                    outputs.append(output)
                # it should never get to this part
                else:
                    raise Exception(
                        "Unexpected variable type '{}' for variable '{}'.".
                        format(variable_metadata.get_type(),
                               variable_metadata.get_name()))

                # move the variable limits
                start = end

            # concatenate all the variable outputs
            return torch.cat(outputs, dim=1)
        else:
            return inputs

    def get_output_size(self) -> int:
        return self.output_size
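
The categorical branch above multiplies a one-hot encoded variable by a learned embedding matrix instead of using `nn.Embedding`. A small self-contained check (all values chosen arbitrarily) showing that the two are equivalent:

import torch

variable_size, embedding_size = 4, 3
embedding = torch.randn(variable_size, embedding_size)

labels = torch.tensor([2, 0])                                         # label encoding
one_hot = torch.nn.functional.one_hot(labels, variable_size).float()  # one-hot encoding

via_matmul = torch.matmul(one_hot, embedding)   # what MultiInputLayer.forward does
via_lookup = embedding[labels]                  # what nn.Embedding would do
assert torch.allclose(via_matmul, via_lookup)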
Beispiel #11
0
class WeightedDynamicModule(DynamicModule):

    def __init__(
        self,
        in_features=None,
        out_features=None,
        weight_allocation=(),
        weight_initializer=default_initializer,
        bias_initializer=default_initializer,
        reuse_features=True,
        bias=True,
        k=0
    ):
        super(WeightedDynamicModule, self).__init__()

        # Saving parameters
        self.in_features = in_features
        self.out_features = out_features
        self.weight_allocation = weight_allocation
        self.weight_initializer = weight_initializer
        self.bias_initializer = bias_initializer
        self.reuse_features = reuse_features
        self.has_bias = bias
        self.k = k

        if self.out_features is not None:
            # We have a fixed number of features out so we precompute it
            self._out_feature_ids = torch.arange(0, self.out_features).long()
        else:  # We will need to define it later
            self._out_feature_ids = None

        if self.in_features is not None:
            self._in_feature_ids = torch.arange(0, self.in_features).long()
        else:
            self._in_feature_ids = None

        # Weight Parameters
        self.weight_blocks = ParameterList()

        # Bias parameters
        if self.has_bias:
            self.bias_blocks = ParameterList()

        # Filter Parameter
        if self.out_features is None:
            self.filters_blocks = ParameterList()
            self.jump_counts = ModuleList()
            self.previous_filter_sign = []

        # Device information
        self.device_id = -1 # CPU

    def set_device_id(self, device_id):
        self.device_id = device_id

    def wrap(self, tensor):
        if self.device_id == -1:
            return tensor.cpu()
        else:
            return tensor.cuda(self.device_id)

    def regenerate_out_feature_ids(self):
        if self.out_features is None:
            non_empty_features = [x for x in self.out_features_map if len(x) > 0]
            if len(non_empty_features) == 0:
                self._out_feature_ids = self.wrap(torch.zeros(0).long())
            else:
                self._out_feature_ids = torch.cat(non_empty_features)

    def block_parameters(self, block_id=-1):  # By default the last block
        result = []
        if len(self.weight_blocks) == 0:
            return result
        result.append(self.weight_blocks[block_id])
        if hasattr(self,'bias_blocks'):
            result.append(self.bias_blocks[block_id])
        if hasattr(self, 'filters_blocks'):
            result.append(self.filters_blocks[block_id])
        return (x for x in result if len(x) > 0)  # remove dead parameters

    def grow_now(self, feature_ids, input_shape=None):
        # Compute the number of rows (outputs) in the weight matrix
        if self.out_features is None:
            rows = self.growing
            out_feature_ids = torch.arange(self.max_feature, self.max_feature + rows)
            self.out_features_map.append(out_feature_ids)
            self.max_feature += rows
        else:
            rows = self.out_features

        # Compute the number of columns (inputs) in the weight matrix
        if self.in_features is not None:
            cols = self.in_features
        else:
            if self.reuse_features or len(self.in_features_map) == 0:
                new_features = feature_ids.clone()
            else:
                m = self.in_features_map[-1].max()
                new_features = feature_ids[feature_ids > m]
            cols = len(new_features)
            self.in_features_map.append(new_features)

        weights = self.wrap(self.weight_initializer(
            torch.zeros((rows, cols) + self.weight_allocation)
        ))
        self.weight_blocks.append(Parameter(weights))

        if hasattr(self, 'bias_blocks'):
            bias = self.wrap(self.bias_initializer(torch.zeros(rows)))
            self.bias_blocks.append(Parameter(bias))

        if hasattr(self, 'filters_blocks'):
            filter = self.wrap(torch.ones(rows))
            self.filters_blocks.append(Parameter(filter))
            self.jump_counts.append(self.wrap(nn.BatchNorm1d(rows)))
            self.previous_filter_sign.append(self.wrap(torch.ones(rows).int()))

        if self.out_features is None:
            new_feature_ids = []

        self.growing = False
        self.regenerate_out_feature_ids()

    def collect_now(self, feature_ids, input_shape=None):
        if not self.collecting:
            return

        # Collect old inputs
        if self.in_features is None: # Only process dynamic input models
            new_feature_ids = []
            new_weight_blocs = ParameterList()
            for old_features, weights in zip(self.in_features_map, self.weight_blocks):
                patch = compute_patch(old_features, feature_ids)
                empty = Parameter(torch.zeros(0))
                if len(patch) == 0: # dead block
                    new_feature_ids.append(empty.data.long())
                    new_weights = self.wrap(empty)
                    remove_parameter(self.optimizer, weights)
                else:
                    new_feature_ids.append(old_features[patch])
                    patch = self.wrap(patch)
                    last_dim = 1
                    new_weights = weights.data.transpose(0, last_dim)[patch].transpose(0, last_dim)
                    new_weights = Parameter(new_weights)
                    apply_patch(self.optimizer, weights, patch, (0, last_dim))
                    update_reference(self.optimizer, weights, new_weights)
                new_weight_blocs.append(new_weights)
            self.weight_blocks = new_weight_blocs
            self.in_features_map = new_feature_ids

        # Collect unused features
        if self.out_features is None: # Only process dynamic-output models
            new_feature_ids = []
            new_weight_blocs = ParameterList()
            new_filters_blocks = ParameterList()
            new_bias_blocks = ParameterList()
            new_jcs = []
            new_pfss = []
            source = zip(self.out_features_map,
                self.weight_blocks, self.filters_blocks,
                self.jump_counts, self.previous_filter_sign)
            for i, (old_features, weights, filter, jc, pfs) in enumerate(source):
                patch = torch.nonzero(torch.abs(filter.data) > EPSILON).squeeze()
                new_bias = None
                if len(patch) == 0 or len(weights) == 0:
                    empty = Parameter(torch.zeros(0))
                    new_weights = empty
                    new_filter = empty
                    new_jc = empty.data.int()
                    new_pfs = empty.data.int()
                    new_feature_ids.append(empty.data.long())
                    remove_parameter(self.optimizer, weights)
                    remove_parameter(self.optimizer, filter)
                    if self.has_bias:
                        new_bias = empty
                        remove_parameter(self.optimizer, self.bias_blocks[i])
                else:
                    new_feature_ids.append(old_features[patch.cpu()])
                    new_weights = Parameter(weights.data[patch])
                    new_filter = Parameter(filter.data[patch])
                    new_jc = jc[patch]
                    new_pfs = pfs[patch]
                    apply_patch(self.optimizer, weights, patch)
                    update_reference(self.optimizer, weights, new_weights)
                    apply_patch(self.optimizer, filter, patch)
                    update_reference(self.optimizer, filter, new_filter)
                    if self.has_bias:
                        bias = self.bias_blocks[i]
                        new_bias = Parameter(bias.data[patch])
                        apply_patch(self.optimizer, bias, patch)
                        update_reference(self.optimizer, bias, new_bias)
                new_weight_blocs.append(new_weights)
                new_jcs.append(new_jc)
                new_pfss.append(new_pfs)
                new_filters_blocks.append(new_filter)
                if new_bias is not None:
                    new_bias_blocks.append(new_bias)

            self.out_features_map = new_feature_ids
            self.filters_blocks = new_filters_blocks
            self.weight_blocks = new_weight_blocs
            self.jump_counts = new_jcs
            self.previous_filter_sign = new_pfss
            if hasattr(self, 'bias_blocks'):
                self.bias_blocks = new_bias_blocks

        self.regenerate_out_feature_ids()
        self.collecting = False

    def compute(self, x):
        if len(self.weight_blocks) == 0:
            raise AssertionError('Empty Model, call model.grow(size)')

        # Generate input tensors
        inputs = []
        if len(self.in_features_map) == 0:
            inputs = [x] * len(self.weight_blocks)
        else:
            start_idx = 0
            for features in self.in_features_map:
                if len(features) == 0:
                    inputs.append(self.wrap(torch.zeros(0)))
                else:
                    end_idx = start_idx + len(features)
                    inputs.append(x[:, start_idx:end_idx])
                    if not self.reuse_features:
                        start_idx += len(features)

        # Process the inputs
        results = []
        for i, (inp, weights) in enumerate(zip(inputs, self.weight_blocks)):
            bias = None
            if self.has_bias:
                bias = self.bias_blocks[i]
            if len(weights) != 0:
                result = self.compute_block(inp, weights, bias)
                if hasattr(self, 'filters_blocks'): # Apply filter if needed
                    filter = self.filters_blocks[i]
                    bn = self.jump_counts[i]
                    last_dim = len(result.size()) - 1
                    result = bn(result)
                    result = result.transpose(1, last_dim)
                    result = result * filter
                    result = result.transpose(1, last_dim)
                results.append(result)

        # Merge the results properly
        if len(results) == 0:
            raise AssertionError('Empty Model, call model.grow(size)')
        elif self.out_features is None:
            return torch.cat(results, dim=1)
        else:
            return torch.stack(results, dim=0).sum(0)

    def compute_block(self, x, weights, bias=None):
        raise NotImplementedError('Should be implemented by subclasses')

    def loss_factor(self):
        return float(np.array(self.weight_allocation).prod())

    @property
    def individual_filters(self):
        return [x * Variable(torch.abs(x.data) > EPSILON).float() for x in self.filters_blocks if len(x) > 0]

    def full_filter(self):
        individual_filters = self.individual_filters
        if len(individual_filters) == 0:
            return Variable(self.wrap(torch.zeros(0)))
        return torch.cat(individual_filters)

    def l1_loss(self, last_block=False):
        if hasattr(self, 'filters_blocks') and len(self.filters_blocks) > 0:
            if last_block:
                filter = self.filters_blocks[-1]
            else:
                filter = self.full_filter()
            return torch.abs(filter).sum() * self.loss_factor()
        return Variable(self.wrap(torch.zeros(1)), requires_grad=False)

    @property
    def block_features(self):
        return [(x.data > 0).long().sum() for x in self.individual_filters]

    @property
    def used_neurons(self):
        jc = torch.cat(self.jump_counts)
        return (torch.pow(float(self.k), jc.float()) > 1e-3).sum()

    @property
    def num_output_features(self):
        if self.out_features is not None:
            return self.out_features
        if len(self.filters_blocks) == 0:
            return 0
        return (self.full_filter().data >= 0).long().sum()


    @property
    def num_input_features(self):
        if self.in_features is not None:
            return self.in_features
        if len(self.in_features_map) == 0:
            return 0
        return len(set().union(*(set(x.numpy()) for x in self.in_features_map)))

    def generate_input(self):
        if self.in_features is None:
            raise ValueError('fake pass needs input or in_feature defined')
        size = (5, self.in_features) + self.additional_dims
        x = torch.rand(*size)
        x = torch.autograd.Variable(x, requires_grad=False)
        return self.wrap(x)

    @property
    def current_dimension_repr(self):
        return " [%s -> %s]" % (self.num_input_features, self.num_output_features)
Beispiel #12
0
class Hbnn(nn.Module):
    def __init__(self, out_cls=10):
        # these are two useless prior
        super(Hbnn, self).__init__()
        self.prior_v = 100
        self.prior_tau_0_reciprocal = 1000
        self.num_net = 0
        self.out_cls = out_cls
        self.w0 = Net(out_cls)  # this is the network of w0
        self.hbnn = ModuleList()  # this is the network of all the classes
        self.mu_gamma_g = ParameterList()
        self.sigma_gamma_g = ParameterList()
        self.mu_gamma = Parameter(torch.ones(1))
        self.sigma_gamma = Parameter(torch.ones(1))

        if torch.cuda.is_available():
            self.w0 = self.w0.cuda()

    def params(self):
        """self implemented method to check the number of all the parameters"""
        parameters = list()
        [parameters.append(i) for i in self.w0.params()]
        [parameters.append(i) for i in self.mu_gamma_g]
        [parameters.append(i) for i in self.sigma_gamma_g]
        for j in self.hbnn:
            [parameters.append(i) for i in j.params()]

        return parameters

    def params_net(self, num):
        """self implemented method to check the number of parameters of single class"""
        parameters = list()
        parameters.append(self.mu_gamma_g[num])
        parameters.append(self.sigma_gamma_g[num])
        [parameters.append(i) for i in self.hbnn[num].params()]
        [parameters.append(i) for i in self.w0.params()]

        return parameters

    def resume_cuda(self):
        """make every parameter on cuda after if_resume"""
        for p in self.mu_gamma_g:
            p = p.cuda()
        for p in self.sigma_gamma_g:
            p = p.cuda()

    def addnet(self):
        self.hbnn.append(Net(self.out_cls).cuda())
        self.num_net += 1
        self.mu_gamma = Parameter(torch.ones(1))
        self.sigma_gamma = Parameter(torch.ones(1))
        self.mu_gamma_g.append(self.mu_gamma)
        self.sigma_gamma_g.append(self.sigma_gamma)

    def forward_single_net(self, x, num, mc_times=100):
        """forward through the current net"""
        net = self.hbnn[num]
        output = 0
        for i in range(mc_times):
            out = net.forward(x)
            output += out
        return output / mc_times

    def forward(self, x, num, mc_times=100):
        """forward through the whole net"""
        # assert (self.num_net - 1 == num), 'the number of testing classes is {0}, while the net has {1}'.format(num, self.num_net)
        output = self.forward_single_net(x, num=0, mc_times=mc_times)
        # if num != 0:
        #     output = output[:, 0].unsqueeze(1)
        # output = self.forward_single_net(x, num=0, mc_times=mc_times).t()[0, :]
        for n in range(1, num + 1):
            # _ = self.forward_single_net(x, num=num, mc_times=mc_times).t()[0, :]
            _ = self.forward_single_net(x, num=n, mc_times=mc_times)
            # _ = _.view(1, -1)
            # _ = _[:, 0].unsqueeze(1)
            # if num == 1:
            #     output = torch.stack((output, _))
            # else:
            #     _ = _.unsqueeze(0)
            #     output = torch.cat((output, _), dim=0)
            output = torch.cat((output, _), dim=1)
        return output

    def elbo(self,
             x,
             y,
             batchs,
             train_net,
             minus=True,
             factor=2,
             a=10e1,
             b=10e1,
             c=10e2,
             mc_times=100,
             if_debug=False):
        """this calculates the elbo of current network, which is unrelated to the data"""
        d = self._data_term(
            x, y, train_net=train_net, factor=factor, mc_times=mc_times) * a
        # d.backward()
        e = self._ent_term(num=train_net) * b
        # e.backward()
        c = self._cross_ent_term(num=train_net) * c
        if_debug = False
        if if_debug:
            print('data :{0}\t entro :{1}\t cross :{2}'.format(
                d.data.cpu().numpy()[0],
                e.data.cpu().numpy()[0],
                c.data.cpu().numpy()[0]))
            # print('d is: {0}').format(d.data.cpu().numpy())
            # print('e is: {0}').format(e.data.cpu().numpy())
            # print('c is: {0}').format(c.data.cpu().numpy())
        # c.backward()
        elbo = d + 1.0 / batchs * (c + e)
        # elbo = elbo * 100
        if minus:
            return -elbo, -d, -e, -c
        else:
            return elbo, d, e, c
        # return d + 1.0 / batchs * (e)
        # return d + 1.0 / batchs * (c)
        # return c

    def _data_term(self,
                   x,
                   y,
                   train_net,
                   factor=1,
                   mc_times=100,
                   if_debug=False):
        # Fixed: the target is one-hot encoded below so the inner product is computed correctly
        inner_product = 0
        num_weights = self.w0.nelements
        net = self.hbnn[train_net]
        # process the target data
        col = y.cpu().data.numpy()
        row = np.arange(len(col))
        target = np.zeros((y.size(0), self.out_cls))
        target[row, col] = 1

        # target = np.zeros((y.size(0), self.out_cls))
        # target[:, col] = 1
        target = Variable(torch.Tensor(target)).cuda()

        for i in range(mc_times):
            output = net.forward(x)
            if if_debug:
                print('y is {0}'.format(y.cpu().data.numpy()))
            batch_size = y.size(0)
            inner_product += (output * target).sum()
            # len = y.data.shape[0]
            # index = -1 * y.eq(self.num_net - 1).long() + 1
            # index = index.data.cpu().numpy().reshape(len, )
            # y_reshape = np.zeros((len, 2))
            # update_values = np.ones((len))
            # y_reshape[np.arange(0, len), index] = update_values
            # y_reshape[:, 0] = y_reshape[:, 0] * factor
            # target = Variable(torch.Tensor(y_reshape)).cuda()
            # inner_product += (output * target).sum()
            # inner_product += output.gather(1, y.view(-1, 1)).sum()
            # correct = pred.eq(y.view(-1, 1).expand_as(pred)).float()
            # if if_debug:
            #     print('output is {0}'.format(output.cpu().data.numpy()))
            #     print('pred is {0}'.format(pred.t().cpu().data.numpy()))
            #     print('prob is {0}'.format(prob.t().cpu().data.numpy()))
            #     print('y is {0}'.format(y.cpu().data.numpy()))
            #     print('correct is {0}'.format(correct.t().cpu().data.numpy()))
            #     print('inner product is {0}'.format((prob * correct).sum()))
            # inner_product += (prob * correct).sum()

        # return inner_product / mc_times / num_weights
        return inner_product / mc_times

    def _cross_ent_term(self,
                        num,
                        batch_size=64,
                        feature_size=456,
                        mc_times=1000,
                        if_debug=False):
        """Here er've got monte carlo for gamma_g"""
        num_weights = self.w0.nelements
        _1 = - 0.5 * 1 / self.prior_tau_0_reciprocal * (self.w0.mu_square().sum() + self.w0.sigma_square().sum()) \
             - 0.5 * num_weights * math.log(self.prior_tau_0_reciprocal) - 0.5 * num_weights * math.log(2 * math.pi)
        _1 = _1 / num_weights
        _2 = 0
        _3 = 0
        for g in range(num + 1):
            net = self.hbnn[g]
            epsilon_mc = self.mu_gamma_g[g] + log(
                1 + torch.exp(self.sigma_gamma_g[g])) * Variable(
                    torch.randn(mc_times).cuda(), requires_grad=False)
            epsilon_mc = log(epsilon_mc**2).mean()
            _4 = -0.5 * net.nelements * epsilon_mc
            _4 = _4 / num_weights
            # _4.backward()
            _5 = -0.5 * (self.mu_gamma_g[g].pow(2) +
                         log(1 + torch.exp(self.sigma_gamma_g[g])).pow(2))
            _5 = _5 / num_weights
            # _5.backward()
            # _6 = net.mu_square + net.sigma_square + self.w0.mu_square + self.w0.sigma_square - 0.5 * net.nelements * CONS
            _6 = net.mu_square() + net.sigma_square() + self.w0.mu_square(
            ) + self.w0.sigma_square()
            _6 = _6 / num_weights
            # _6.backward()
            _7 = - 2 * (torch.sum(net.fc1.mu * self.w0.fc1.mu) +
                        torch.sum(net.fc3.mu * self.w0.fc3.mu)) \
                # +  torch.sum(net.fc2.mu * self.w0.fc2.mu)

            # _7.backward()
            _7 = _7 / num_weights
            _9 = -0.5 * net.nelements * CONS
            _9 = _9 / num_weights
            _8 = _5 * (_6 + _7) * num_weights
            # _9 = _4 + _8
            _2 = _2 + _4 + _8 + _9
            # _2 = _2 + (
            #     - 0.5 * net.nelements * torch.mean(log(epsilon_mc.pow(2)))
            #     - 0.5 * (self.mu_gamma_g[g].pow(2) + self.sigma_gamma_g[g].pow(2))
            #     * (
            #         net.mu_square() + net.sigma_square() + self.w0.mu_square() + self.w0.sigma_square() \
            #         - 0.5 * net.nelements * CONS - 2 * (torch.sum(net.fc1.mu * self.w0.fc1.mu) + \
            #                                             torch.sum(net.fc2.mu * self.w0.fc2.mu) + \
            #                                             torch.sum(net.fc3.mu * self.w0.fc3.mu))
            #     )
            # )
            _3 = _3 - 0.5 * CONS - 0.5 * math.log(
                self.prior_v) - 0.5 * 1.0 / self.prior_v * (
                    self.mu_gamma_g[g].pow(2) + log(
                        (1 + torch.exp(self.sigma_gamma_g[g]).pow(2))))
            _3 = _3 / num_weights

        # if_debug = True
        if if_debug == True:
            print('_1 is {0}'.format(_1.cpu().data.numpy()))
            print('_2 is {0}'.format(_2.cpu().data.numpy()))
            print('_3 is {0}'.format(_3.cpu().data.numpy()))
            print('_4 is {0}'.format(_4.cpu().data.numpy()))
            print('_5 is {0}'.format(_5.cpu().data.numpy()))
            print('_6 is {0}'.format(_6.cpu().data.numpy()))
            print('_7 is {0}'.format(_7.cpu().data.numpy()))
            print('_8 is {0}'.format(_8.cpu().data.numpy()))
            # print('_9 is {0}'.format(_9.cpu().data.numpy()))
            print('_9 is {0}'.format(_9))
        return _1 + (_2 + _3) / (num + 1)
        # return (_1 + _2 + _3) * num_weights

    def _ent_term(self, num, if_debug=False):
        num_weights = self.w0.nelements
        _1 = 0.5 * num_weights * (CONS + 1) + log(self.w0.fc1.sigma_repara()).sum() + log(
            self.w0.fc3.sigma_repara()).sum() \
            # + log(self.w0.fc3.sigma_repara()).sum()

        _1 = _1 / num_weights
        _2 = 0
        _3 = 0
        for g in range(num + 1):
            net = self.hbnn[g]
            _2 = _2 + ((0.5 * net.nelements *
                        (CONS + 1)) + log(net.fc1.sigma_repara()).sum() +
                       log(net.fc3.sigma_repara()).sum()) / num_weights
            _3 = _3 + (0.5 * (CONS + 1)) + log(
                log(1 + torch.exp(self.sigma_gamma_g[g]))) / num_weights

        # if_debug = False
        if if_debug == True:
            print('_1 is {0}'.format(_1.cpu().data.numpy()))
            print('_2 is {0}'.format(_2.cpu().data.numpy()))
            print('_3 is {0}'.format(_3.cpu().data.numpy()))
        return _1 + (_2 + _3) / (num + 1)
        # return (_1 + _2 + _3) * num_weights

    def reset(self, net):
        network = self.hbnn[net]
        network.reset()
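
The `log(1 + torch.exp(sigma_gamma_g))` pattern used throughout `_cross_ent_term` and `_ent_term` is a softplus reparameterization that keeps the standard deviation positive while sampling stays differentiable. A tiny standalone illustration (values arbitrary, not taken from the listing):

import torch

rho = torch.zeros(1, requires_grad=True)     # unconstrained parameter (sigma_gamma_g above)
mu = torch.zeros(1, requires_grad=True)      # mean parameter (mu_gamma_g above)

sigma = torch.log1p(torch.exp(rho))          # softplus: always > 0
eps = torch.randn(1000)                      # standard normal noise
samples = mu + sigma * eps                   # reparameterized samples, differentiable w.r.t. mu and rho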
Beispiel #13
0
class DMM(Module):
    """Dirichlet Mixture Model

    Parameters
    ==========
    dim : ``int``
        Dimension of the observed data.
    n_components : ``int``
        Number of mixture components.
    """
    def __init__(self, dim: int, n_components: int) -> None:
        super(DMM, self).__init__()
        self._dim = dim
        self._n_components = n_components

        mixture_logits = torch.zeros((n_components, ), dtype=torch.float)
        self.mixture_logits = Parameter(mixture_logits)

        self.log_alphas = ParameterList()
        for _ in range(n_components):
            log_alpha = Parameter(torch.randn(dim, dtype=torch.float) / 3)
            self.log_alphas.append(log_alpha)

    @overrides
    def forward(
            self,  # pylint: disable=arguments-differ
            observed_data: torch.FloatTensor):
        """Computes the expected value of the log-likelihood function (e.g. the E-step)

        Parameters
        ==========
        observed_data : ``torch.FloatTensor(size=(batch_size, dim))``
            The observed data.

        Returns
        =======
        nll : ``torch.FloatTensor(size=(batch_size,))``
            The expected negative log-likelihood of the observed data.
        membership_probs : ``torch.FloatTensor(size=(batch_size, n_components))``
            The membership probabilities.
        """
        batch_size = observed_data.size()[0]

        # Convert mixture logits to log probabilities
        prior_log_probs = F.log_softmax(self.mixture_logits, dim=-1)

        # Compute membership probabilities.
        # NOTE: the result is detached below so that torch does not differentiate
        # through this step.
        membership_log_probs = torch.empty(size=(batch_size,
                                                 self._n_components),
                                           requires_grad=False)
        for i in range(self._n_components):
            membership_log_probs[:, i] = prior_log_probs[i] + log_p(
                observed_data, self.log_alphas[i])
        denom = torch.logsumexp(membership_log_probs, dim=1)
        denom = denom.unsqueeze(1)
        membership_log_probs -= denom
        # Need to detach since gradient does not propagate through membership probabilities in EM.
        membership_probs = membership_log_probs.exp().detach()

        # Compute expected negative log-likelihood w.r.t. membership probabilities.
        # Start from zeros (not torch.empty) since the loop accumulates in-place.
        nll = torch.zeros(size=(batch_size, ))
        for i in range(self._n_components):
            log_likelihood = log_p(observed_data,
                                   self.log_alphas[i]) + prior_log_probs[i]
            nll -= membership_probs[:, i] * log_likelihood

        return nll, membership_probs
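
The `log_p` helper used in `forward` is external to this listing (assumed to return the Dirichlet log-density given log-alphas). A hypothetical EM-style fitting loop, where the E-step happens inside `forward` and the M-step is a gradient step on the expected NLL; the data below is made up:

import torch

model = DMM(dim=5, n_components=3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# made-up observations on the probability simplex
data = torch.distributions.Dirichlet(torch.ones(5)).sample((256,))

for _ in range(100):
    nll, membership_probs = model(data)   # E-step inside forward()
    loss = nll.mean()                     # M-step: gradient step on the expected NLL
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()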
Beispiel #14
0
class ActionHelper(Module):
    """
    Implements a few pieces of functionality that the Action Enum itself lacks.
    """

    valid_actions = {
        # if the previous step generated a predicate, the next step decides whether to attach a left/right arc
        Action.PRED_GEN: [
            Action.LEFT_ARC, Action.NO_LEFT_ARC, Action.RIGHT_ARC,
            Action.NO_RIGHT_ARC
        ],

        # if the previous step was a shift, the next step may only decide whether the current token is a predicate
        Action.SHIFT: [Action.PRED_GEN, Action.NO_PRED],
    }

    def __init__(self):
        super().__init__()
        directional_actions = [
            Action.LEFT_ARC, Action.NO_LEFT_ARC, Action.RIGHT_ARC,
            Action.NO_RIGHT_ARC
        ]
        directional_and_shift = directional_actions + [Action.SHIFT]
        # if the previous step was directional, the next step may keep deciding arcs, or shift
        for action in directional_actions:
            self.valid_actions[action] = directional_and_shift

        common_actions = [
            Action.PRED_GEN, Action.NO_PRED, Action.NO_LEFT_ARC,
            Action.NO_RIGHT_ARC
        ]
        # if the previous step was NO_PRED, the valid actions are given by
        # whether the left/right sides are empty
        # 0. both sides non-empty
        self.valid_actions[(False, False)] = common_actions
        # 1. left empty, right non-empty
        self.valid_actions[(True, False)] = [Action.RIGHT_ARC] + common_actions
        # 2. left non-empty, right empty
        self.valid_actions[(False, True)] = [Action.LEFT_ARC] + common_actions
        # 3. both sides empty
        self.valid_actions[(True, True)] = [Action.SHIFT]

        self.masks = ParameterList()
        self.key_to_id = dict()
        for k, v in self.valid_actions.items():
            values = set(a.value for a in v)
            self.key_to_id[k] = len(self.masks)
            self.masks.append(
                Parameter(torch.tensor(
                    [1 if i in values else 0 for i in range(len(Action))]),
                          requires_grad=False))

    def get_valid_actions(
            self,
            action: Action = None,
            empty_left: bool = True,
            empty_right: bool = True) -> Tuple[List[Action], Tensor]:
        """
        Return the list of valid actions and its mask, given the previous action
        (or the left/right emptiness flags when the action has no entry).
        """
        if action in self.valid_actions:
            return self.valid_actions[action], self.masks[
                self.key_to_id[action]]
        else:
            key = (empty_left, empty_right)
            return self.valid_actions[key], self.masks[self.key_to_id[key]]

    @staticmethod
    def make_oracle(length: int,
                    relations: Dict[int, Dict[int, str]]) -> List[Action]:
        actions = [Action.SHIFT]

        for i in range(length):
            if i in relations:
                actions.append(Action.PRED_GEN)
                for j in range(1, max(i, length - i) + 1):
                    left = i - j
                    if left >= 0:
                        if left in relations[i]:
                            actions.append(Action.LEFT_ARC)
                        else:
                            actions.append(Action.NO_LEFT_ARC)
                    right = i + j
                    if right < length:
                        if right in relations[i]:
                            actions.append(Action.RIGHT_ARC)
                        else:
                            actions.append(Action.NO_RIGHT_ARC)
                actions.append(Action.SHIFT)
            else:
                actions.append(Action.NO_PRED)

        return actions
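
A small worked example of the oracle (illustrative values only): for a 3-token sentence where token 1 is a predicate with arcs to tokens 0 and 2,

relations = {1: {0: 'ARG0', 2: 'ARG1'}}            # hypothetical relations
actions = ActionHelper.make_oracle(length=3, relations=relations)
# actions == [SHIFT, NO_PRED, PRED_GEN, LEFT_ARC, RIGHT_ARC, SHIFT, NO_PRED]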
Beispiel #15
0
class GraphAttention(Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 activation=None,
                 attn_heads=8,
                 alpha=0.2,
                 reduction='concat',
                 dropout=0.6,
                 use_bias=False):
        super().__init__()

        if reduction not in {'concat', 'average'}:
            raise ValueError('Possible reduction methods: concat, average')

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation = get_activation(activation)

        self.dropout = dropout
        self.attn_heads = attn_heads
        self.reduction = reduction

        self.kernels = ParameterList()
        self.attn_kernel_self, self.attn_kernel_neighs = ParameterList(
        ), ParameterList()
        self.biases = ParameterList()
        self.use_bias = use_bias

        if not use_bias:
            self.register_parameter('bias', None)

        # Initialize weights for each attention head
        for head in range(self.attn_heads):
            W = Parameter(torch.FloatTensor(in_channels, out_channels),
                          requires_grad=True)
            self.kernels.append(W)
            a1 = Parameter(torch.FloatTensor(out_channels, 1),
                           requires_grad=True)
            self.attn_kernel_self.append(a1)
            a2 = Parameter(torch.FloatTensor(out_channels, 1),
                           requires_grad=True)
            self.attn_kernel_neighs.append(a2)

            if use_bias:
                bias = Parameter(torch.Tensor(out_channels))
                self.biases.append(bias)

        self.leakyrelu = LeakyReLU(alpha)

        self.reset_parameters()

    def reset_parameters(self):
        for head in range(self.attn_heads):
            W, a1, a2 = self.kernels[head], self.attn_kernel_self[
                head], self.attn_kernel_neighs[head]
            glorot_uniform(W)
            glorot_uniform(a1)
            glorot_uniform(a2)

            if self.use_bias:
                zeros(self.biases[head])

    def forward(self, inputs):
        x, adj = inputs

        outputs = []
        for head in range(self.attn_heads):
            W, a1, a2 = self.kernels[head], self.attn_kernel_self[
                head], self.attn_kernel_neighs[head]
            Wh = torch.mm(x, W)

            f_1 = Wh @ a1
            f_2 = Wh @ a2

            e = self.leakyrelu(f_1 + f_2.transpose(0, 1))

            zero_vec = -9e15 * torch.ones_like(e)
            attention = torch.where(adj.to_dense() > 0, e, zero_vec)
            attention = F.softmax(attention, dim=1)
            attention = F.dropout(attention,
                                  self.dropout,
                                  training=self.training)
            h_prime = torch.matmul(attention, Wh)

            if self.use_bias:
                h_prime += self.biases[head]

            outputs.append(h_prime)

        if self.reduction == 'concat':
            output = torch.cat(outputs, dim=1)
        else:
            output = torch.mean(torch.stack(outputs), 0)

        return self.activation(output)

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + str(self.in_channels) + ' -> ' \
            + str(self.out_channels) + ')'
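
A hypothetical forward pass for the layer above. The `get_activation`, `glorot_uniform`, and `zeros` helpers come from the surrounding project; the string `'elu'` and all shapes below are assumptions for illustration only.

import torch

x = torch.randn(5, 16)                                        # 5 nodes, 16 features
idx = torch.tensor([[0, 1, 2, 3, 4, 0], [1, 0, 3, 2, 0, 4]])  # edge index pairs
adj = torch.sparse_coo_tensor(idx, torch.ones(idx.size(1)), (5, 5))

layer = GraphAttention(in_channels=16, out_channels=8, activation='elu', attn_heads=4)
out = layer((x, adj))    # 'concat' reduction -> shape (5, 4 * 8)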
Beispiel #16
0
class CLBaseline(nn.Module, abc.ABC):
    def __len__(self):
        return len(self.tasks_replay_buffers)

    def __init__(self, base_model, h_dim, out_dim=1, device='cpu', seed=None):
        '''
        :Parameters:
        base_model: torch.nn.Module: task-agnostic model
        h_dim: int: dimension of base_model output
        out_dim: output dimension of task-specific tensor \omega
        (dimension of loss_function input)
        '''
        super().__init__()
        self.base = base_model
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.device = device
        #replay buffers for the previous tasks (includes torch.utils.data.Subsets)
        self.tasks_replay_buffers = []
        #task-specific tensors (which applied to base model outputs)
        self.tasks_omegas = ParameterList()

        if out_dim == 1:
            # computes loss
            self.loss_func = nn.BCEWithLogitsLoss()

            # predicts distribution over the classes
            def pred_func(input):
                pred = F.sigmoid(input)
                return torch.stack([1. - pred, pred], dim=-1).squeeze()

            self.pred_func = pred_func

        if out_dim > 1:
            self.loss_func = nn.CrossEntropyLoss()
            self.pred_func = nn.Softmax(dim=-1)

        self.torch_gen = create_torch_random_gen(seed)
        self.to(self.device)

    @abc.abstractmethod
    def forward(self, cl_batch):
        pass

    @abc.abstractmethod
    def create_replay_buffer(self, dataset):
        pass

    def create_new_task(self):
        '''
        Create a new task-specific tensor \omega for the next task
        (the method takes no arguments).
        '''
        # sample on CPU with the seeded generator, then move to the device before
        # wrapping in a Parameter (calling .to() on a Parameter would return a plain Tensor)
        w = Parameter(
            torch.randn((self.out_dim, self.h_dim),
                        generator=self.torch_gen).to(self.device))
        self.tasks_omegas.append(w)

    def _compute_task_loss(self, k, X, target):
        omega = self.tasks_omegas[k]
        return self.loss_func(
            torch.matmul(self.base(X), omega.T).squeeze(), target)

    @torch.no_grad()
    def predict(self, x, k, get_class=False):
        '''
        Compute the prediction for input x
        :Params:
        x: torch.tensor: input tensor
        k: int: task number
        get_class: bool: if False, returns the predictive probability
        distribution over the classes, otherwise returns the predicted class
        '''
        assert (k >= 0)
        assert (k < len(self.tasks_omegas))
        omega = self.tasks_omegas[k]
        distr = self.pred_func(torch.matmul(self.base(x), omega.T).squeeze())
        if get_class:
            if len(distr.shape) == 1:
                return torch.argmax(distr).cpu().item()
            else:
                return torch.argmax(distr, dim=-1).cpu().numpy()
        else:
            return distr.cpu()

    @torch.no_grad()
    def select_inducing(self, task_dataset, N=100, criterion='random'):
        '''
        Given the task dataset, compute N inducing points (a replay buffer)
        for the current task
        '''
        assert (len(self.tasks_omegas) == 1 + len(self.tasks_replay_buffers))

        if criterion == "random":
            indices = torch.randperm(len(task_dataset),
                                     generator=self.torch_gen).numpy()
            select_indices = indices[:N]
            self.create_replay_buffer(Subset(task_dataset, select_indices))
        else:
            raise Exception("Criterion {} not implemented".format(criterion))
Beispiel #17
0
class SparseGraphAttention(Module):
    """
    Sparse version GAT layer, similar to https://arxiv.org/abs/1710.10903
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 activation=None,
                 attn_heads=8,
                 alpha=0.2,
                 reduction='concat',
                 dropout=0.6,
                 use_bias=False):
        super().__init__()

        if reduction not in {'concat', 'average'}:
            raise ValueError('Possible reduction methods: concat, average')

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation = get_activation(activation)

        self.dropout = Dropout(dropout)
        self.attn_heads = attn_heads
        self.reduction = reduction

        self.kernels = ParameterList()
        self.att_kernels = ParameterList()
        self.biases = ParameterList()
        self.use_bias = use_bias

        if not use_bias:
            self.register_parameter('bias', None)

        # Initialize weights for each attention head
        for head in range(self.attn_heads):
            W = Parameter(torch.Tensor(in_channels, out_channels))
            self.kernels.append(W)
            a = Parameter(torch.Tensor(1, 2 * out_channels))
            self.att_kernels.append(a)

            if use_bias:
                bias = Parameter(torch.Tensor(out_channels))
                self.biases.append(bias)

        self.leakyrelu = LeakyReLU(alpha)
        self.special_spmm = SpecialSpmm()
        self.reset_parameters()

    def reset_parameters(self):
        for head in range(self.attn_heads):
            glorot_uniform(self.kernels[head])
            glorot_uniform(self.att_kernels[head])

            if self.use_bias:
                zeros(self.biases[head])

    def forward(self, inputs):
        x, adj = inputs

        dv = x.device
        N = x.size()[0]
        edge = adj._indices()

        outputs = []
        for head in range(self.attn_heads):
            W, a = self.kernels[head], self.att_kernels[head]
            h = torch.spmm(x, W)

            # Self-attention on the nodes - Shared attention mechanism
            edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t()
            # edge: 2*D x E

            edge_e = torch.exp(-self.leakyrelu(a.mm(edge_h).squeeze()))

            e_rowsum = self.special_spmm(edge, edge_e, torch.Size([N, N]),
                                         torch.ones(size=(N, 1), device=dv))
            edge_e = self.dropout(edge_e)
            h_prime = self.special_spmm(edge, edge_e, torch.Size([N, N]), h)
            h_prime = h_prime.div(e_rowsum)
            h_prime[torch.isnan(h_prime)] = 0.

            if self.use_bias:
                h_prime += self.biases[head]

            outputs.append(h_prime)

        if self.reduction == 'concat':
            output = torch.cat(outputs, dim=1)
        else:
            output = torch.mean(torch.stack(outputs), 0)

        return self.activation(output)

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(
            self.in_channels) + ' -> ' + str(self.out_channels) + ')'
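
`SpecialSpmm` is not defined in this listing. In pyGAT-style code it wraps a custom autograd function for sparse-dense matrix multiplication; the sketch below is an assumption about what the helper used here looks like, not the project's actual implementation.

import torch
import torch.nn as nn

class SpecialSpmmFunction(torch.autograd.Function):
    """Sparse-dense matmul with gradients for the sparse values and the dense matrix."""
    @staticmethod
    def forward(ctx, indices, values, shape, b):
        a = torch.sparse_coo_tensor(indices, values, shape)
        ctx.save_for_backward(a, b)
        ctx.n_cols = shape[1]
        return torch.sparse.mm(a, b)

    @staticmethod
    def backward(ctx, grad_output):
        a, b = ctx.saved_tensors
        grad_values = grad_b = None
        if ctx.needs_input_grad[1]:
            # gradient w.r.t. the non-zero values: pick the relevant entries of the dense gradient
            grad_a_dense = grad_output.matmul(b.t())
            flat_idx = a._indices()[0, :] * ctx.n_cols + a._indices()[1, :]
            grad_values = grad_a_dense.reshape(-1)[flat_idx]
        if ctx.needs_input_grad[3]:
            grad_b = torch.sparse.mm(a.t(), grad_output)
        return None, grad_values, None, grad_b

class SpecialSpmm(nn.Module):
    def forward(self, indices, values, shape, b):
        return SpecialSpmmFunction.apply(indices, values, shape, b)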
Beispiel #18
0
class TiedLinear(torch.nn.Module):
    """
    TiedLinear is a linear layer with shared parameters for features between
    (output) classes that takes as input a tensor X with dimensions
        (batch size) X (output_dim) X (in_features)
        where:
            output_dim is the desired output dimension/# of classes
            in_features are the features with shared weights across the classes
    """
    def __init__(self, env, feat_info, output_dim, bias=False):
        super(TiedLinear, self).__init__()
        self.env = env
        # Init parameters
        self.in_features = 0
        self.weight_list = ParameterList()
        if bias:
            self.bias_list = ParameterList()
        else:
            self.register_parameter('bias', None)
        self.output_dim = output_dim
        self.bias_flag = bias
        # Iterate over featurizer info list
        for feat_entry in feat_info:
            learnable = feat_entry.learnable
            feat_size = feat_entry.size
            init_weight = feat_entry.init_weight
            self.in_features += feat_size
            feat_weight = Parameter(init_weight * torch.ones(1, feat_size),
                                    requires_grad=learnable)
            if learnable:
                self.reset_parameters(feat_weight)
            self.weight_list.append(feat_weight)
            if bias:
                feat_bias = Parameter(torch.zeros(1, feat_size),
                                      requires_grad=learnable)
                if learnable:
                    self.reset_parameters(feat_bias)
                self.bias_list.append(feat_bias)

    def reset_parameters(self, tensor):
        stdv = 1. / math.sqrt(tensor.size(0))
        tensor.data.uniform_(-stdv, stdv)

    def concat_weights(self):
        self.W = torch.cat([t for t in self.weight_list], -1)
        # Normalize weights.
        if self.env['weight_norm']:
            self.W = self.W.div(self.W.norm(p=2))
        # expand so the shared weights can be applied to every output class (each cell's max # of domain values)
        self.W = self.W.expand(self.output_dim, -1)
        if self.bias_flag:
            self.B = torch.cat(
                [t.expand(self.output_dim, -1) for t in self.bias_list], -1)

    def forward(self, X, index, mask):
        # Concatenates different featurizer weights - need to call during every pass.
        self.concat_weights()
        output = X.mul(self.W)
        if self.bias_flag:
            output += self.B
        output = output.sum(2)
        # Add our mask so that invalid domain classes for a given variable/VID
        # has a large negative value, resulting in a softmax probability
        # of de facto 0.
        output.index_add_(0, index, mask)
        return output
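
A hypothetical call showing the expected shapes. The `FeatInfo` namedtuple below only mimics the `learnable` / `size` / `init_weight` attributes that `feat_info` entries are accessed with, and the `env` dict carries the `weight_norm` flag read in `concat_weights`; everything else is illustrative.

import collections
import torch

FeatInfo = collections.namedtuple('FeatInfo', ['learnable', 'size', 'init_weight'])
feat_info = [FeatInfo(True, 4, 1.0), FeatInfo(False, 2, 1.0)]   # two featurizers: 4 + 2 features
env = {'weight_norm': False}

layer = TiedLinear(env, feat_info, output_dim=3)

X = torch.randn(8, 3, 6)        # (batch, output_dim, in_features = 4 + 2)
index = torch.arange(8)         # which output row each mask row is added to
mask = torch.zeros(8, 3)        # 0 for valid classes, large negative for invalid ones
scores = layer(X, index, mask)  # -> shape (8, 3)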