Example #1
class LinearVarianceUnif(ModuleWrapper):

    def __init__(self, in_features, out_features, bias=True):
        super(LinearVarianceUnif, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.W = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.W.size(1))
        self.W.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x):
        if self.training:
            eps = Variable(self.W.data.new(self.W.size()).uniform_() - 0.5)
        else:
            eps = 0.0
        output = F.linear(x, self.W*eps)
        if self.bias is not None:
            output = output + self.bias
        return output

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features=' + str(self.in_features) \
               + ', out_features=' + str(self.out_features) \
               + ', bias=' + str(self.bias is not None) + ')'
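
The layer above keeps only a noise path: at train time the effective weight is W * eps with eps drawn uniformly from [-0.5, 0.5), and at eval time eps is 0.0, so the output reduces to the bias. A minimal plain-torch sketch of that scheme (illustrative only; it does not need ModuleWrapper):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
W = torch.randn(4, 3) * 0.1           # stands in for self.W (out_features x in_features)
b = torch.zeros(1, 4)                 # stands in for self.bias
x = torch.randn(2, 3)

eps = torch.rand_like(W) - 0.5        # train-time noise, uniform in [-0.5, 0.5)
train_out = F.linear(x, W * eps) + b  # stochastic, zero-mean contribution from W
eval_out = F.linear(x, W * 0.0) + b   # eval path: eps = 0.0, only the bias remains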
Example #2
 def __init__(self, in_channels, out_channels, kernel_size, alpha_shape, stride=1,
              padding=0, dilation=1, prior='loguni', bias=True):
     super(ConvVDO, self).__init__()
     self.in_channels = in_channels
     self.out_channels = out_channels
     self.kernel_size = (kernel_size, kernel_size)
     self.stride = stride
     self.padding = padding
     self.dilation = dilation
     self.alpha_shape = alpha_shape
     self.groups = 1
     self.weight = Parameter(torch.Tensor(
         out_channels, in_channels, *self.kernel_size))
     if bias:
         self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
     else:
         self.register_parameter('bias', None)
     self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride, self.padding, self.dilation, self.groups)
     self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride, self.padding, self.dilation, self.groups)
     self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
     self.reset_parameters()
     self.zero_mean = False
     self.permute_sigma = False
     self.prior = prior
     if prior == 'loguni':
         self.kl_fun = metrics.kl_loguni
     else:
         self.kl_fun = metrics.kl_ard
Example #3
class LinearVDO(ModuleWrapper):

    def __init__(self, in_features, out_features, prior='loguni', alpha_shape=(1, 1), bias=True):
        super(LinearVDO, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha_shape = alpha_shape
        self.W = Parameter(torch.Tensor(out_features, in_features))
        self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self.zero_mean = False
        self.permute_sigma = False
        self.prior = prior
        if prior == 'loguni':
            self.kl_fun = metrics.kl_loguni
        else:
            self.kl_fun = metrics.kl_ard

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.W.size(1))
        self.W.data.uniform_(-stdv, stdv)
        self.log_alpha.data.fill_(-5.0)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x):
        if self.zero_mean:
            lrt_mean = 0.0
        else:
            lrt_mean = F.linear(x, self.W)
        if self.bias is not None:
            lrt_mean = lrt_mean + self.bias

        sigma2 = Variable.exp(self.log_alpha) * self.W * self.W
        if self.permute_sigma:
            sigma2 = sigma2.view(-1)[torch.randperm(self.in_features * self.out_features).cuda()].view(self.out_features, self.in_features)

        lrt_std = Variable.sqrt(1e-16 + F.linear(x * x, sigma2))
        if self.training:
            eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
        else:
            eps = 0.0
        return lrt_mean + lrt_std * eps

    def kl_reg(self):
        return self.W.nelement() * self.kl_fun(self.log_alpha) / self.log_alpha.nelement()

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features=' + str(self.in_features) \
               + ', out_features=' + str(self.out_features) \
               + ', alpha_shape=' + str(self.alpha_shape) \
               + ', prior=' + self.prior \
               + ', bias=' + str(self.bias is not None) + ')'
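
A minimal sketch of the local reparameterization trick that forward() implements above, written with plain torch (the class itself additionally needs ModuleWrapper and metrics, which this example does not show):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(8, 5)                        # (batch, in_features)
W = torch.randn(3, 5) * 0.1                  # (out_features, in_features)
log_alpha = torch.full((1, 1), -5.0)         # matches reset_parameters()

mean = F.linear(x, W)                            # lrt_mean
var = F.linear(x * x, log_alpha.exp() * W * W)   # per-activation variance
std = torch.sqrt(1e-16 + var)                    # lrt_std
sample = mean + std * torch.randn_like(std)      # one stochastic forward pass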
Example #4
class RepNormal(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mu = Parameter(FloatTensor([0.0]))
        self.log_variance = Parameter(FloatTensor([0.0]))

    def __call__(self):
        z = Variable(torch.randn(1))
        return self.mu + self.log_variance.exp() * z

    def _repr_pretty_(self, p, cycle):
        p.text("mu = {}".format(self.mu))
        p.text("std = {}".format(self.log_variance.exp()))
Example #5
    def __init__(self, X, y, kernel, Xu, likelihood, mean_function=None,
                 latent_shape=None, num_data=None, whiten=False, jitter=1e-6,
                 name="SVGP"):
        super(VariationalSparseGP, self).__init__(X, y, kernel, mean_function, jitter,
                                                  name)
        self.likelihood = likelihood

        self.num_data = num_data if num_data is not None else self.X.shape[0]
        self.whiten = whiten

        self.Xu = Parameter(Xu)

        y_batch_shape = self.y.shape[:-1] if self.y is not None else torch.Size([])
        self.latent_shape = latent_shape if latent_shape is not None else y_batch_shape

        M = self.Xu.shape[0]
        u_loc_shape = self.latent_shape + (M,)
        u_loc = self.Xu.new_zeros(u_loc_shape)
        self.u_loc = Parameter(u_loc)

        u_scale_tril_shape = self.latent_shape + (M, M)
        Id = torch.eye(M, out=self.Xu.new_empty(M, M))
        u_scale_tril = Id.expand(u_scale_tril_shape)
        self.u_scale_tril = Parameter(u_scale_tril)
        self.set_constraint("u_scale_tril", constraints.lower_cholesky)

        self._sample_latent = True
Example #6
class LinearVariance(ModuleWrapper):
    def __init__(self, in_features, out_features, bias=True):
        super(LinearVariance, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.sigma.size(1))
        self.sigma.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x):
        lrt_mean = self.bias
        lrt_std = torch.sqrt_(1e-16 + F.linear(x * x, self.sigma * self.sigma))
        if self.training:
            eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
        else:
            eps = 0.0
        return lrt_mean + eps * lrt_std
Example #7
class LinearAttention(Attention):
    """
    This ``Attention`` module performs a dot product between a vector of weights and some
    combination of the two input vectors, followed by an (optional) activation function.  The
    combination used is configurable.

    If the two vectors are ``x`` and ``y``, we allow the following kinds of combinations: ``x``,
    ``y``, ``x*y``, ``x+y``, ``x-y``, ``x/y``, where each of those binary operations is performed
    elementwise.  You can list as many combinations as you want, comma separated.  For example, you
    might give ``x,y,x*y`` as the ``combination`` parameter to this class.  The computed similarity
    function would then be ``w^T [x; y; x*y] + b``, where ``w`` is a vector of weights, ``b`` is a
    bias parameter, and ``[;]`` is vector concatenation.

    Note that if you want a bilinear similarity function with a diagonal weight matrix W, where the
    similarity function is computed as `x * w * y + b` (with `w` the diagonal of `W`), you can
    accomplish that with this class by using "x*y" for `combination`.

    Parameters
    ----------
    tensor_1_dim : ``int``
        The dimension of the first tensor, ``x``, described above.  This is ``x.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    tensor_2_dim : ``int``
        The dimension of the second tensor, ``y``, described above.  This is ``y.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    combination : ``str``, optional (default="x,y")
        Described above.
    activation : ``Activation``, optional (default=linear (i.e. no activation))
        An activation function applied after the ``w^T * [x;y] + b`` calculation.  Default is no
        activation.
    """

    def __init__(self,
                 tensor_1_dim: int,
                 tensor_2_dim: int,
                 combination: str = 'x,y',
                 activation: Activation = None,
                 normalize: bool = True) -> None:
        super().__init__(normalize)
        self._combination = combination
        combined_dim = util.get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
        self._weight_vector = Parameter(torch.Tensor(combined_dim))
        self._bias = Parameter(torch.Tensor(1))
        self._activation = activation or Activation.by_name('linear')()
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(6 / (self._weight_vector.size(0) + 1))
        self._weight_vector.data.uniform_(-std, std)
        self._bias.data.fill_(0)

    @overrides
    def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor:
        combined_tensors = util.combine_tensors_and_multiply(self._combination,
                                                             [vector.unsqueeze(1), matrix],
                                                             self._weight_vector)
        return self._activation(combined_tensors.squeeze(1) + self._bias)
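
The similarity described in the docstring can be reproduced with plain torch; a small illustrative sketch for combination 'x,y,x*y' on a single pair of vectors (util.combine_tensors_and_multiply computes the same quantity in batched form):

import torch

torch.manual_seed(0)
x = torch.randn(4)                       # tensor_1_dim = 4
y = torch.randn(4)                       # tensor_2_dim = 4
w = torch.randn(12)                      # weight vector for [x; y; x*y]
b = torch.zeros(1)

combined = torch.cat([x, y, x * y])          # the 'x,y,x*y' combination
similarity = torch.dot(w, combined) + b      # w^T [x; y; x*y] + b, before the activation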
Example #8
    def _make_params(self):
        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #9
 def __init__(self, in_features, out_features, prior='loguni', alpha_shape=(1, 1), bias=True):
     super(LinearVDO, self).__init__()
     self.in_features = in_features
     self.out_features = out_features
     self.alpha_shape = alpha_shape
     self.W = Parameter(torch.Tensor(out_features, in_features))
     self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
     if bias:
         self.bias = Parameter(torch.Tensor(1, out_features))
     else:
         self.register_parameter('bias', None)
     self.reset_parameters()
     self.zero_mean = False
     self.permute_sigma = False
     self.prior = prior
     if prior == 'loguni':
         self.kl_fun = metrics.kl_loguni
     else:
         self.kl_fun = metrics.kl_ard
Example #10
 def __init__(self, in_features, out_features, bias=True):
     super(LinearVariance, self).__init__()
     self.in_features = in_features
     self.out_features = out_features
     self.sigma = Parameter(torch.Tensor(out_features, in_features))
     if bias:
         self.bias = Parameter(torch.Tensor(1, out_features))
     else:
         self.register_parameter('bias', None)
     self.reset_parameters()
Example #11
 def __init__(self, in_features, out_features, bias=True):
     super(LinearVarianceBe, self).__init__()
     self.in_features = in_features
     self.out_features = out_features
     self.probs = torch.ones([out_features, in_features]).cuda() * 0.5
     self.W = Parameter(torch.Tensor(out_features, in_features))
     if bias:
         self.bias = Parameter(torch.Tensor(1, out_features))
     else:
         self.register_parameter('bias', None)
     self.reset_parameters()
Example #12
 def __init__(self,
              tensor_1_dim: int,
              tensor_2_dim: int,
              combination: str = 'x,y',
              activation: Activation = None) -> None:
     super().__init__()
     self._combination = combination
     combined_dim = util.get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
     self._weight_vector = Parameter(torch.Tensor(combined_dim))
     self._bias = Parameter(torch.Tensor(1))
     self._activation = activation or Activation.by_name('linear')()
     self.reset_parameters()
Example #13
class ConvVarianceUnif(ModuleWrapper):

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, bias=True):
        super(ConvVarianceUnif, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = (kernel_size, kernel_size)
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = 1
        self.W = Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
        else:
            self.register_parameter('bias', None)
        self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride, self.padding, self.dilation, self.groups)
        self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride, self.padding, self.dilation, self.groups)
        self.reset_parameters()

    def reset_parameters(self):
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.W.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x):
        if self.training:
            eps = Variable(torch.rand(self.W.size()) - 0.5)
        else:
            eps = 0.0

        output = self.op_nobias(x, self.W*eps)
        if self.bias is not None:
            output = output + self.bias
        return output

    def __repr__(self):
        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        s += ', padding={padding}'
        s += ', dilation={dilation}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)
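
Note that forward() above draws eps with torch.rand(self.W.size()), which allocates on the CPU even when self.W lives on the GPU. A small sketch of the device-safe alternative used by LinearVarianceUnif in Example #1 (illustrative, not a change to the snippet itself):

import torch

W = torch.randn(8, 3, 3, 3)                      # stands in for self.W, on any device
eps = W.new_empty(W.size()).uniform_() - 0.5     # same dtype/device as W, uniform in [-0.5, 0.5)
noisy_kernel = W * eps                           # what op_nobias(x, self.W * eps) would convolve with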
Example #14
    def __init__(self, base_model, name="GPLVM"):
        super(GPLVM, self).__init__(name)
        if base_model.X.dim() != 2:
            raise ValueError("GPLVM model only works with 2D latent X, but got "
                             "X.dim() = {}.".format(base_model.X.dim()))
        self.base_model = base_model
        self.y = self.base_model.y

        self.X_loc = Parameter(self.base_model.X)

        C = self.X_loc.shape[1]
        X_scale_tril_shape = self.X_loc.shape + (C,)
        Id = torch.eye(C, out=self.X_loc.new_empty(C, C))
        X_scale_tril = Id.expand(X_scale_tril_shape)
        self.X_scale_tril = Parameter(X_scale_tril)
        self.set_constraint("X_scale_tril", constraints.lower_cholesky)

        self._call_base_model_guide = True
Example #15
 def __init__(self, in_channels, out_channels, kernel_size, stride=1,
              padding=0, dilation=1, bias=True):
     super(ConvVarianceUnif, self).__init__()
     self.in_channels = in_channels
     self.out_channels = out_channels
     self.kernel_size = (kernel_size, kernel_size)
     self.stride = stride
     self.padding = padding
     self.dilation = dilation
     self.groups = 1
     self.W = Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size))
     if bias:
         self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
     else:
         self.register_parameter('bias', None)
     self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride, self.padding, self.dilation, self.groups)
     self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride, self.padding, self.dilation, self.groups)
     self.reset_parameters()
Example #16
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int = 1,
                 recurrent_dropout_probability: float = 0) -> None:
        super(AlternatingHighwayLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.recurrent_dropout_probability = recurrent_dropout_probability
        self.training = True

        # Input dimensions consider the fact that we do
        # all of the LSTM projections (and highway parts)
        # in a single matrix multiplication.
        input_projection_size = 6 * hidden_size
        state_projection_size = 5 * hidden_size
        bias_size = 5 * hidden_size

        # Here we are creating a single weight and bias with the
        # parameters for all layers unfolded into it. This is necessary
        # because unpacking and re-packing the weights inside the
        # kernel would be slow, as it would happen every time it is called.
        total_weight_size = 0
        total_bias_size = 0
        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size

            input_weights = input_projection_size * layer_input_size
            state_weights = state_projection_size * hidden_size
            total_weight_size += input_weights + state_weights

            total_bias_size += bias_size

        self.weight = Parameter(torch.FloatTensor(total_weight_size))
        self.bias = Parameter(torch.FloatTensor(total_bias_size))
        self.reset_parameters()
Example #17
class LinearMatrixAttention(MatrixAttention):
    """
    This ``MatrixAttention`` takes two matrices as input and returns a matrix of attentions
    by performing a dot product between a vector of weights and some
    combination of the two input matrices, followed by an (optional) activation function.  The
    combination used is configurable.

    If the two vectors are ``x`` and ``y``, we allow the following kinds of combinations: ``x``,
    ``y``, ``x*y``, ``x+y``, ``x-y``, ``x/y``, where each of those binary operations is performed
    elementwise.  You can list as many combinations as you want, comma separated.  For example, you
    might give ``x,y,x*y`` as the ``combination`` parameter to this class.  The computed similarity
    function would then be ``w^T [x; y; x*y] + b``, where ``w`` is a vector of weights, ``b`` is a
    bias parameter, and ``[;]`` is vector concatenation.

    Note that if you want a bilinear similarity function with a diagonal weight matrix W, where the
    similarity function is computed as `x * w * y + b` (with `w` the diagonal of `W`), you can
    accomplish that with this class by using "x*y" for `combination`.

    Parameters
    ----------
    tensor_1_dim : ``int``
        The dimension of the first tensor, ``x``, described above.  This is ``x.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    tensor_2_dim : ``int``
        The dimension of the second tensor, ``y``, described above.  This is ``y.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    combination : ``str``, optional (default="x,y")
        Described above.
    activation : ``Activation``, optional (default=linear (i.e. no activation))
        An activation function applied after the ``w^T * [x;y] + b`` calculation.  Default is no
        activation.
    """

    def __init__(self,
                 tensor_1_dim: int,
                 tensor_2_dim: int,
                 combination: str = 'x,y',
                 activation: Activation = None) -> None:
        super().__init__()
        self._combination = combination
        combined_dim = util.get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
        self._weight_vector = Parameter(torch.Tensor(combined_dim))
        self._bias = Parameter(torch.Tensor(1))
        self._activation = activation or Activation.by_name('linear')()
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(6 / (self._weight_vector.size(0) + 1))
        self._weight_vector.data.uniform_(-std, std)
        self._bias.data.fill_(0)

    @overrides
    def forward(self,  # pylint: disable=arguments-differ
                matrix_1: torch.Tensor,
                matrix_2: torch.Tensor) -> torch.Tensor:
        # TODO(mattg): Remove the need for this tiling.
        # https://github.com/allenai/allennlp/pull/1235#issuecomment-391540133
        tiled_matrix_1 = matrix_1.unsqueeze(2).expand(matrix_1.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_1.size()[2])
        tiled_matrix_2 = matrix_2.unsqueeze(1).expand(matrix_2.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_2.size()[2])

        combined_tensors = util.combine_tensors(self._combination, [tiled_matrix_1, tiled_matrix_2])
        dot_product = torch.matmul(combined_tensors, self._weight_vector)
        return self._activation(dot_product + self._bias)
Example #18
class LSTMcell_untied(torch.nn.Module):
    def __init__(self,
                 *,
                 inputSize,
                 hiddenSize,
                 train=True,
                 dr=0.5,
                 drMethod='gal+sem',
                 gpu=0):
        super(LSTMcell_untied, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        self.w_xi = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xf = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xo = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xc = Parameter(torch.Tensor(hiddenSize, inputSize))

        self.w_hi = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hf = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_ho = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hc = Parameter(torch.Tensor(hiddenSize, hiddenSize))

        self.b_i = Parameter(torch.Tensor(hiddenSize))
        self.b_f = Parameter(torch.Tensor(hiddenSize))
        self.b_o = Parameter(torch.Tensor(hiddenSize))
        self.b_c = Parameter(torch.Tensor(hiddenSize))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        self.train = train
        if gpu >= 0:
            self = self.cuda(gpu)
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hiddenSize)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def init_mask(self, x, h, c):
        self.maskX_i = createMask(x, self.dr)
        self.maskX_f = createMask(x, self.dr)
        self.maskX_c = createMask(x, self.dr)
        self.maskX_o = createMask(x, self.dr)

        self.maskH_i = createMask(h, self.dr)
        self.maskH_f = createMask(h, self.dr)
        self.maskH_c = createMask(h, self.dr)
        self.maskH_o = createMask(h, self.dr)

        self.maskC = createMask(c, self.dr)

        self.maskW_xi = createMask(self.w_xi, self.dr)
        self.maskW_xf = createMask(self.w_xf, self.dr)
        self.maskW_xc = createMask(self.w_xc, self.dr)
        self.maskW_xo = createMask(self.w_xo, self.dr)
        self.maskW_hi = createMask(self.w_hi, self.dr)
        self.maskW_hf = createMask(self.w_hf, self.dr)
        self.maskW_hc = createMask(self.w_hc, self.dr)
        self.maskW_ho = createMask(self.w_ho, self.dr)

    def forward(self, x, hidden):
        h0, c0 = hidden
        doDrop = self.training and self.dr > 0.0

        if doDrop:
            self.init_mask(x, h0, c0)

        if doDrop and 'drH' in self.drMethod:
            h0_i = h0.mul(self.maskH_i)
            h0_f = h0.mul(self.maskH_f)
            h0_c = h0.mul(self.maskH_c)
            h0_o = h0.mul(self.maskH_o)
        else:
            h0_i = h0
            h0_f = h0
            h0_c = h0
            h0_o = h0

        if doDrop and 'drX' in self.drMethod:
            x_i = x.mul(self.maskX_i)
            x_f = x.mul(self.maskX_f)
            x_c = x.mul(self.maskX_c)
            x_o = x.mul(self.maskX_o)
        else:
            x_i = x
            x_f = x
            x_c = x
            x_o = x

        if doDrop and 'drW' in self.drMethod:
            w_xi = self.w_xi.mul(self.maskW_xi)
            w_xf = self.w_xf.mul(self.maskW_xf)
            w_xc = self.w_xc.mul(self.maskW_xc)
            w_xo = self.w_xo.mul(self.maskW_xo)
            w_hi = self.w_hi.mul(self.maskW_hi)
            w_hf = self.w_hf.mul(self.maskW_hf)
            w_hc = self.w_hc.mul(self.maskW_hc)
            w_ho = self.w_ho.mul(self.maskW_ho)
        else:
            w_xi = self.w_xi
            w_xf = self.w_xf
            w_xc = self.w_xc
            w_xo = self.w_xo
            w_hi = self.w_hi
            w_hf = self.w_hf
            w_hc = self.w_hc
            w_ho = self.w_ho

        gate_i = F.linear(x_i, w_xi) + F.linear(h0_i, w_hi) + self.b_i
        gate_f = F.linear(x_f, w_xf) + F.linear(h0_f, w_hf) + self.b_f
        gate_c = F.linear(x_c, w_xc) + F.linear(h0_c, w_hc) + self.b_c
        gate_o = F.linear(x_o, w_xo) + F.linear(h0_o, w_ho) + self.b_o

        gate_i = F.sigmoid(gate_i)
        gate_f = F.sigmoid(gate_f)
        gate_c = F.tanh(gate_c)
        gate_o = F.sigmoid(gate_o)

        if doDrop and 'drC' in self.drMethod:
            gate_c = gate_c.mul(self.maskC)

        c1 = (gate_f * c0) + (gate_i * gate_c)
        h1 = gate_o * F.tanh(c1)

        return h1, c1
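
A minimal usage sketch for a single step of the cell above (CPU only and with dropout disabled, so the undefined createMask helper is never reached; torch, math, Parameter and F are assumed to be imported as in the snippet's source module):

import torch

cell = LSTMcell_untied(inputSize=8, hiddenSize=16, dr=0.0, gpu=-1)
cell.training = False                 # keeps doDrop False inside forward()

x = torch.randn(5, 8)                 # (batch, inputSize)
h0 = torch.zeros(5, 16)               # (batch, hiddenSize)
c0 = torch.zeros(5, 16)
h1, c1 = cell(x, (h0, c0))            # both (batch, hiddenSize)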
Example #19
game = 'ipd'
param = 'flat'

if game == 'ipd':
    game = IteratedPrisonerDilemna()
elif game == 'mp':
    game = IteratedMatchingPennies()
else:
    raise ValueError()

n_players = game.n_players
dim_strategy = game.dim_strategy

if param == 'flat':
    strategies = [
        Parameter(torch.empty(dim_strategy)) for _ in range(n_players)
    ]
    for s in strategies:
        s.data.uniform_(0., 1.)
elif param == 'sigmoid':
    strategies = [
        Parameter(torch.empty(dim_strategy, 2)) for _ in range(n_players)
    ]
    for s in strategies:
        s.data.normal_(0., 1.)

strat_rec = [[] for _ in range(n_players)]
vs_rec = [[] for _ in range(n_players)]


def objective_fn(strategies):
Example #20
 def z(self, value):
     self._z = Parameter(torch.as_tensor(value))
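
This looks like the setter half of a property; a hedged sketch of the full pattern it presumably belongs to (the class and attribute names here are illustrative, not from the source):

import torch
from torch.nn import Parameter

class HasZ(torch.nn.Module):          # hypothetical host class for the setter above
    def __init__(self, value=0.0):
        super().__init__()
        self._z = Parameter(torch.as_tensor(value))

    @property
    def z(self):
        return self._z

    @z.setter
    def z(self, value):
        self._z = Parameter(torch.as_tensor(value))

m = HasZ()
m.z = 2.5                             # re-registers _z as a fresh Parameter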
Example #21
class BBBLinearFactorial(nn.Module):
    """
    Describes a Linear fully connected Bayesian layer with
    a distribution over each of the weights and biases
    in the layer.
    """
    def __init__(self, in_features, out_features, p_logvar_init=-3, p_pi=1.0, q_logvar_init=-5):
        # p_logvar_init, p_pi can be either
        # (list/tuples): prior model is a mixture of Gaussians components=len(p_pi)=len(p_logvar_init)
        # float: Gaussian distribution
        # q_logvar_init: float, the approximate posterior is currently always a factorized Gaussian
        super(BBBLinearFactorial, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.p_logvar_init = p_logvar_init
        self.q_logvar_init = q_logvar_init

        # Approximate posterior weights...
        self.qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.qw_logvar = Parameter(torch.Tensor(out_features, in_features))

        # optionally add bias
        # self.qb_mean = Parameter(torch.Tensor(out_features))
        # self.qb_logvar = Parameter(torch.Tensor(out_features))

        # ...and output...
        self.fc_qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.fc_qw_std = Parameter(torch.Tensor(out_features, in_features))

        # ...as normal distributions
        self.qw = Normal(mu=self.qw_mean, logvar=self.qw_logvar)
        # self.qb = Normal(mu=self.qb_mean, logvar=self.qb_logvar)
        self.fc_qw = Normalout(mu=self.fc_qw_mean, std=self.fc_qw_std)

        # initialise
        self.log_alpha = Parameter(torch.Tensor(1, 1))

        # prior model
        self.pw = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)
        # self.pb = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)

        # initialize all parameters
        self.reset_parameters()

    def reset_parameters(self):
        # initialize (trainable) approximate posterior parameters
        stdv = 10. / math.sqrt(self.in_features)
        self.qw_mean.data.uniform_(-stdv, stdv)
        self.qw_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        # self.qb_mean.data.uniform_(-stdv, stdv)
        # self.qb_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.fc_qw_mean.data.uniform_(-stdv, stdv)
        self.fc_qw_std.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.log_alpha.data.uniform_(-stdv, stdv)

    def forward(self, input):
        raise NotImplementedError()

    def fcprobforward(self, input):
        """
        Probabilistic forwarding method.
        :param input: data tensor
        :return: output, kl-divergence
        """

        fc_qw_mean = F.linear(input=input, weight=self.qw_mean)
        fc_qw_si = torch.sqrt(1e-8 + F.linear(input=input.pow(2), weight=torch.exp(self.log_alpha)*self.qw_mean.pow(2)))

        if cuda:
            fc_qw_mean.cuda()
            fc_qw_si.cuda()

        # sample from output
        if cuda:
            output = fc_qw_mean + fc_qw_si * (torch.randn(fc_qw_mean.size())).cuda()
        else:
            output = fc_qw_mean + fc_qw_si * (torch.randn(fc_qw_mean.size()))

        if cuda:
            output.cuda()

        w_sample = self.fc_qw.sample()

        # KL divergence
        qw_logpdf = self.fc_qw.logpdf(w_sample)

        kl = torch.sum(qw_logpdf - self.pw.logpdf(w_sample))

        return output, kl

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'
Example #22
0
class VariationalSparseGP(GPModel):
    r"""
    Variational Sparse Gaussian Process model.

    In :class:`.VariationalGP` model, when the number of input data :math:`X` is large,
    the covariance matrix :math:`k(X, X)` will require a lot of computational steps to
    compute its inverse (for log likelihood and for prediction). This model introduces
    an additional inducing-input parameter :math:`X_u` to solve that problem. Given
    inputs :math:`X`, their noisy observations :math:`y`, and the inducing-input
    parameters :math:`X_u`, the model takes the form:

    .. math::
        [f, u] &\sim \mathcal{GP}(0, k([X, X_u], [X, X_u])),\\
        y & \sim p(y) = p(y \mid f) p(f),

    where :math:`p(y \mid f)` is the likelihood.

    We will use a variational approach in this model by approximating :math:`q(f,u)`
    to the posterior :math:`p(f,u \mid y)`. Precisely, :math:`q(f) = p(f\mid u)q(u)`,
    where :math:`q(u)` is a multivariate normal distribution with two parameters
    ``u_loc`` and ``u_scale_tril``, which will be learned during a variational
    inference process.

    .. note:: This model can be learned using MCMC method as in reference [2]. See also
        :class:`.GPModel`.

    .. note:: This model has :math:`\mathcal{O}(NM^2)` complexity for training,
        :math:`\mathcal{O}(M^3)` complexity for testing. Here, :math:`N` is the number
        of train inputs, :math:`M` is the number of inducing inputs. Size of
        variational parameters is :math:`\mathcal{O}(M^2)`.

    References:

    [1] `Scalable variational Gaussian process classification`,
    James Hensman, Alexander G. de G. Matthews, Zoubin Ghahramani

    [2] `MCMC for Variationally Sparse Gaussian Processes`,
    James Hensman, Alexander G. de G. Matthews, Maurizio Filippone, Zoubin Ghahramani

    :param torch.Tensor X: Input data for training. Its first dimension is the number
        of data points.
    :param torch.Tensor y: Output data for training. Its last dimension is the
        number of data points.
    :param ~pyro.contrib.gp.kernels.kernel.Kernel kernel: A Pyro kernel object, which
        is the covariance function :math:`k`.
    :param torch.Tensor Xu: Initial values for inducing points, which are parameters
        of our model.
    :param ~pyro.contrib.gp.likelihoods.likelihood.Likelihood likelihood: A likelihood
        object.
    :param callable mean_function: An optional mean function :math:`m` of this Gaussian
        process. By default, we use zero mean.
    :param torch.Size latent_shape: Shape for latent processes (`batch_shape` of
        :math:`q(u)`). By default, it equals the output batch shape ``y.shape[:-1]``.
        For multi-class classification problems, ``latent_shape[-1]`` should
        correspond to the number of classes.
    :param int num_data: The size of the full training dataset. It is useful when training
        this model with mini-batches.
    :param bool whiten: A flag to tell if variational parameters ``u_loc`` and
        ``u_scale_tril`` are transformed by the inverse of ``Luu``, where ``Luu`` is
        the lower triangular decomposition of :math:`kernel(X_u, X_u)`. Enabling this
        flag will help optimization.
    :param float jitter: A small positive term which is added into the diagonal part of
        a covariance matrix to help stabilize its Cholesky decomposition.
    :param str name: Name of this model.
    """
    def __init__(self, X, y, kernel, Xu, likelihood, mean_function=None,
                 latent_shape=None, num_data=None, whiten=False, jitter=1e-6,
                 name="SVGP"):
        super(VariationalSparseGP, self).__init__(X, y, kernel, mean_function, jitter,
                                                  name)
        self.likelihood = likelihood

        self.num_data = num_data if num_data is not None else self.X.shape[0]
        self.whiten = whiten

        self.Xu = Parameter(Xu)

        y_batch_shape = self.y.shape[:-1] if self.y is not None else torch.Size([])
        self.latent_shape = latent_shape if latent_shape is not None else y_batch_shape

        M = self.Xu.shape[0]
        u_loc_shape = self.latent_shape + (M,)
        u_loc = self.Xu.new_zeros(u_loc_shape)
        self.u_loc = Parameter(u_loc)

        u_scale_tril_shape = self.latent_shape + (M, M)
        Id = torch.eye(M, out=self.Xu.new_empty(M, M))
        u_scale_tril = Id.expand(u_scale_tril_shape)
        self.u_scale_tril = Parameter(u_scale_tril)
        self.set_constraint("u_scale_tril", constraints.lower_cholesky)

        self._sample_latent = True

    def model(self):
        self.set_mode("model")

        Xu = self.get_param("Xu")
        u_loc = self.get_param("u_loc")
        u_scale_tril = self.get_param("u_scale_tril")

        M = Xu.shape[0]
        Kuu = self.kernel(Xu) + torch.eye(M, out=Xu.new_empty(M, M)) * self.jitter
        Luu = Kuu.potrf(upper=False)

        zero_loc = Xu.new_zeros(u_loc.shape)
        u_name = param_with_module_name(self.name, "u")
        if self.whiten:
            Id = torch.eye(M, out=Xu.new_empty(M, M))
            pyro.sample(u_name,
                        dist.MultivariateNormal(zero_loc, scale_tril=Id)
                            .independent(zero_loc.dim() - 1))
        else:
            pyro.sample(u_name,
                        dist.MultivariateNormal(zero_loc, scale_tril=Luu)
                            .independent(zero_loc.dim() - 1))

        f_loc, f_var = conditional(self.X, Xu, self.kernel, u_loc, u_scale_tril,
                                   Luu, full_cov=False, whiten=self.whiten,
                                   jitter=self.jitter)

        f_loc = f_loc + self.mean_function(self.X)
        if self.y is None:
            return f_loc, f_var
        else:
            with poutine.scale(None, self.num_data / self.X.shape[0]):
                return self.likelihood(f_loc, f_var, self.y)

    def guide(self):
        self.set_mode("guide")

        Xu = self.get_param("Xu")
        u_loc = self.get_param("u_loc")
        u_scale_tril = self.get_param("u_scale_tril")

        if self._sample_latent:
            u_name = param_with_module_name(self.name, "u")
            pyro.sample(u_name,
                        dist.MultivariateNormal(u_loc, scale_tril=u_scale_tril)
                            .independent(u_loc.dim()-1))
        return Xu, u_loc, u_scale_tril

    def forward(self, Xnew, full_cov=False):
        r"""
        Computes the mean and covariance matrix (or variance) of Gaussian Process
        posterior on a test input data :math:`X_{new}`:

        .. math:: p(f^* \mid X_{new}, X, y, k, X_u, u_{loc}, u_{scale\_tril})
            = \mathcal{N}(loc, cov).

        .. note:: Variational parameters ``u_loc``, ``u_scale_tril``, the
            inducing-point parameter ``Xu``, together with kernel's parameters have
            been learned from a training procedure (MCMC or SVI).

        :param torch.Tensor Xnew: Input data for testing. Note that
            ``Xnew.shape[1:]`` must be the same as ``self.X.shape[1:]``.
        :param bool full_cov: A flag to decide if we want to predict full covariance
            matrix or just variance.
        :returns: loc and covariance matrix (or variance) of :math:`p(f^*(X_{new}))`
        :rtype: tuple(torch.Tensor, torch.Tensor)
        """
        self._check_Xnew_shape(Xnew)
        # avoid sampling the unnecessary latent u
        self._sample_latent = False
        Xu, u_loc, u_scale_tril = self.guide()
        self._sample_latent = True

        loc, cov = conditional(Xnew, Xu, self.kernel, u_loc, u_scale_tril,
                               full_cov=full_cov, whiten=self.whiten,
                               jitter=self.jitter)
        return loc + self.mean_function(Xnew), cov
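
A minimal usage sketch, assuming the current pyro.contrib.gp API (the class above is from an older Pyro release that still used get_param/set_constraint; kernel, likelihood and optimizer choices here are illustrative):

import torch
import pyro.contrib.gp as gp
from pyro.infer import Trace_ELBO

torch.manual_seed(0)
X = torch.randn(100, 1)
y = 0.5 * torch.sin(3 * X[:, 0]) + 0.1 * torch.randn(100)
Xu = X[::10].clone()                               # 10 inducing inputs

vsgp = gp.models.VariationalSparseGP(
    X, y, gp.kernels.RBF(input_dim=1), Xu=Xu,
    likelihood=gp.likelihoods.Gaussian(), whiten=True)

optimizer = torch.optim.Adam(vsgp.parameters(), lr=0.01)
loss_fn = Trace_ELBO().differentiable_loss
for _ in range(100):
    optimizer.zero_grad()
    loss = loss_fn(vsgp.model, vsgp.guide)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    mean, var = vsgp(torch.linspace(-3, 3, 50).unsqueeze(-1), full_cov=False)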
Example #23
class cola_gnn(nn.Module):
    def __init__(self, args, data):
        super().__init__()
        self.x_h = 1
        self.f_h = data.m
        self.m = data.m
        self.d = data.d
        self.w = args.window
        self.h = args.horizon
        self.adj = data.adj
        self.o_adj = data.orig_adj
        if args.cuda:
            self.adj = sparse_mx_to_torch_sparse_tensor(
                normalize_adj2(data.orig_adj.cpu().numpy())).to_dense().cuda()
        else:
            self.adj = sparse_mx_to_torch_sparse_tensor(
                normalize_adj2(data.orig_adj.cpu().numpy())).to_dense()
        self.dropout = args.dropout
        self.n_hidden = args.n_hidden
        half_hid = int(self.n_hidden / 2)
        self.V = Parameter(torch.Tensor(half_hid))
        self.bv = Parameter(torch.Tensor(1))
        self.W1 = Parameter(torch.Tensor(half_hid, self.n_hidden))
        self.b1 = Parameter(torch.Tensor(half_hid))
        self.W2 = Parameter(torch.Tensor(half_hid, self.n_hidden))
        self.act = F.elu
        self.Wb = Parameter(torch.Tensor(self.m, self.m))
        self.wb = Parameter(torch.Tensor(1))
        self.k = args.k
        self.conv = nn.Conv1d(1, self.k, self.w)
        self.conv_long = nn.Conv1d(1, self.k, self.w - self.k, dilation=2)
        self.n_spatial = args.hidsp  #self.h  ####### check equal to k

        self.conv1 = GraphConvLayer(self.k * 3, self.n_hidden)  # self.k
        self.conv2 = GraphConvLayer(self.n_hidden, self.n_spatial)

        if args.rnn_model == 'LSTM':
            self.rnn = nn.LSTM(input_size=self.x_h,
                               hidden_size=self.n_hidden,
                               num_layers=args.n_layer,
                               dropout=args.dropout,
                               batch_first=True,
                               bidirectional=args.bi)
        elif args.rnn_model == 'GRU':
            self.rnn = nn.GRU(input_size=self.x_h,
                              hidden_size=self.n_hidden,
                              num_layers=args.n_layer,
                              dropout=args.dropout,
                              batch_first=True,
                              bidirectional=args.bi)
        elif args.rnn_model == 'RNN':
            self.rnn = nn.RNN(input_size=self.x_h,
                              hidden_size=self.n_hidden,
                              num_layers=args.n_layer,
                              dropout=args.dropout,
                              batch_first=True,
                              bidirectional=args.bi)
        else:
            raise LookupError('only supports LSTM, GRU and RNN')

        hidden_size = (int(args.bi) + 1) * self.n_hidden
        # self.n_hidden = hidden_size BIDIRECTIONAL BUG
        self.out = nn.Linear(hidden_size + self.n_spatial, 1)

        self.residual_window = 0
        self.ratio = 1.0
        if (self.residual_window > 0):
            self.residual_window = min(self.residual_window, args.window)
            self.residual = nn.Linear(self.residual_window, 1)
        self.init_weights()

    def init_weights(self):
        for p in self.parameters():
            if p.data.ndimension() >= 2:
                nn.init.xavier_uniform_(p.data)  # best
            else:
                stdv = 1. / math.sqrt(p.size(0))
                p.data.uniform_(-stdv, stdv)

    def forward(self, x, feat=None):
        '''
        Args:  x: (batch, time_step, m)  
            feat: [batch, window, dim, m]
        Returns: (batch, m)
        '''
        b, w, m = x.size()
        orig_x = x
        x = x.permute(0, 2, 1).contiguous().view(-1, x.size(1), 1)
        r_out, hc = self.rnn(x, None)
        last_hid = r_out[:, -1, :]
        last_hid = last_hid.view(-1, self.m, self.n_hidden)
        out_temporal = last_hid  # [b, m, 20]
        # print(last_hid.shape,'====')
        hid_rpt_m = last_hid.repeat(1, self.m, 1).view(
            b, self.m, self.m, self.n_hidden)  # b,m,m,w continuous m
        hid_rpt_w = last_hid.repeat(1, 1, self.m).view(
            b, self.m, self.m,
            self.n_hidden)  # b,m,m,w continuous w one window data
        a_mx = self.act(
            hid_rpt_m @ self.W1.t() + hid_rpt_w @ self.W2.t() +
            self.b1) @ self.V + self.bv  # row, all states influence one state
        before_norm = a_mx.cpu().detach().numpy()  ## save
        a_mx = F.normalize(a_mx, p=2, dim=1, eps=1e-12, out=None)
        after_norm = a_mx.cpu().detach().numpy()  ## save
        r_l = []
        r_long_l = []
        h_mids = orig_x
        for i in range(self.m):
            h_tmp = h_mids[:, :, i:i + 1].permute(0, 2, 1).contiguous()
            r = self.conv(h_tmp)  # [32, 10/k, 1]
            r_long = self.conv_long(h_tmp)
            r_l.append(r)
            r_long_l.append(r_long)
        r_l = torch.stack(r_l, dim=1)
        r_long_l = torch.stack(r_long_l, dim=1)
        r_l = torch.cat((r_l, r_long_l), -1)
        r_l = r_l.view(r_l.size(0), r_l.size(1), -1)
        r_l = torch.relu(r_l)
        adjs = self.adj.repeat(b, 1)
        adjs = adjs.view(b, self.m, self.m)
        c = torch.sigmoid(a_mx @ self.Wb + self.wb)
        a_mx = adjs * c + a_mx * (1 - c)
        after_norm2 = a_mx.cpu().detach().numpy()  ## save
        adj = a_mx
        x = r_l
        x = F.relu(self.conv1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        out_spatial = F.relu(self.conv2(x, adj))
        out = torch.cat((out_spatial, out_temporal), dim=-1)
        out = self.out(out)
        out = torch.squeeze(out)

        if (self.residual_window > 0):
            z = orig_x[:, -self.residual_window:, :]
            #Step backward # [batch, res_window, m]
            z = z.permute(0, 2, 1).contiguous().view(-1, self.residual_window)
            #[batch*m, res_window]
            z = self.residual(z)
            #[batch*m, 1]
            z = z.view(-1, self.m)
            #[batch, m]
            out = out * self.ratio + z
            #[batch, m]

        return out, None
Example #24
class Preprocessor(Module):
    def __init__(
        self,
        normalization_parameters: Dict[str, NormalizationParameters],
        use_gpu: bool,
        typed_output: bool = False,
    ) -> None:
        super(Preprocessor, self).__init__()
        self.normalization_parameters = normalization_parameters
        self.sorted_features, self.sorted_feature_boundaries = (
            self._sort_features_by_normalization())
        self.typed_output = typed_output

        cuda_available = torch.cuda.is_available()
        logger.info("CUDA availability: {}".format(cuda_available))
        if use_gpu and cuda_available:
            logger.info("Using GPU: GPU requested and available.")
            self.use_gpu = True
            self.dtype = torch.cuda.FloatTensor
        else:
            logger.info("NOT Using GPU: GPU not requested or not available.")
            self.use_gpu = False
            self.dtype = torch.FloatTensor

        # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net,
        # We need to make tensors for every numeric literal
        self.zero_tensor = Parameter(torch.tensor([0.0]).type(self.dtype),
                                     requires_grad=False)
        self.one_tensor = Parameter(torch.tensor([1.0]).type(self.dtype),
                                    requires_grad=False)
        self.one_half_tensor = Parameter(torch.tensor([0.5]).type(self.dtype),
                                         requires_grad=False)
        self.one_hundredth_tensor = Parameter(torch.tensor([0.01]).type(
            self.dtype),
                                              requires_grad=False)
        self.negative_one_tensor = Parameter(torch.tensor([-1.0
                                                           ]).type(self.dtype),
                                             requires_grad=False)
        self.missing_tensor = Parameter(torch.tensor([MISSING_VALUE
                                                      ]).type(self.dtype),
                                        requires_grad=False)
        self.min_tensor = Parameter(torch.tensor([-1e20]).type(self.dtype),
                                    requires_grad=False)
        self.max_tensor = Parameter(torch.tensor([1e20]).type(self.dtype),
                                    requires_grad=False)
        self.epsilon_tensor = Parameter(torch.tensor([EPS]).type(self.dtype),
                                        requires_grad=False)

        feature_starts = self._get_type_boundaries()
        for i, feature_type in enumerate(FEATURE_TYPES):
            begin_index = feature_starts[i]
            if (i + 1) == len(FEATURE_TYPES):
                end_index = len(self.normalization_parameters)
            else:
                end_index = feature_starts[i + 1]
            if begin_index == end_index:
                continue  # No features of this type
            if feature_type == ENUM:
                # Process one-at-a-time
                for j in range(begin_index, end_index):
                    norm_params = self.normalization_parameters[
                        self.sorted_features[j]]
                    func = getattr(self, "_create_parameters_" + feature_type)
                    func(j, norm_params)
            else:
                norm_params = []
                for f in self.sorted_features[begin_index:end_index]:
                    norm_params.append(self.normalization_parameters[f])
                func = getattr(self, "_create_parameters_" + feature_type)
                func(begin_index, norm_params)

    def input_prototype(self):
        return rlt.FeatureVector(
            float_features=torch.randn(1, len(self.normalization_parameters)))

    def forward(self, input) -> torch.FloatTensor:
        """ Preprocess the input matrix
        :param input tensor
        """
        if isinstance(input, np.ndarray):
            input = torch.from_numpy(input).type(self.dtype)
        if isinstance(input, rlt.FeatureVector):
            input = input.float_features.type(self.dtype)

        # ONNX doesn't support != yet
        not_missing_input = (self.one_tensor.float() -
                             (input == self.missing_tensor).float())
        feature_starts = self._get_type_boundaries()

        outputs = []
        for i, feature_type in enumerate(FEATURE_TYPES):
            begin_index = feature_starts[i]
            if (i + 1) == len(FEATURE_TYPES):
                end_index = len(self.normalization_parameters)
            else:
                end_index = feature_starts[i + 1]
            if begin_index == end_index:
                continue  # No features of this type
            if feature_type == ENUM:
                # Process one-at-a-time
                for j in range(begin_index, end_index):
                    norm_params = self.normalization_parameters[
                        self.sorted_features[j]]
                    new_output = self._preprocess_feature_single_column(
                        j, input[:, j:j + 1], norm_params)
                    new_output *= not_missing_input[:, j:j + 1]
                    self._check_preprocessing_output(new_output, [norm_params])
                    outputs.append(new_output)
            else:
                norm_params = []
                for f in self.sorted_features[begin_index:end_index]:
                    norm_params.append(self.normalization_parameters[f])
                new_output = self._preprocess_feature_multi_column(
                    begin_index, input[:, begin_index:end_index], norm_params)
                new_output *= not_missing_input[:, begin_index:end_index]
                self._check_preprocessing_output(new_output, norm_params)
                outputs.append(new_output)

        def wrap(output):
            if self.typed_output:
                return rlt.FeatureVector(float_features=output)
            else:
                return output

        if len(outputs) == 1:
            return wrap(
                torch.clamp(outputs[0], MIN_FEATURE_VALUE, MAX_FEATURE_VALUE))

        return wrap(
            torch.clamp(torch.cat(outputs, dim=1), MIN_FEATURE_VALUE,
                        MAX_FEATURE_VALUE))

    def _preprocess_feature_single_column(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: NormalizationParameters,
    ) -> torch.Tensor:
        if isinstance(input, np.ndarray):
            input = torch.from_numpy(input).type(self.dtype)

        feature_type = norm_params.feature_type
        func = getattr(self, "_preprocess_" + feature_type)
        return func(begin_index, input, norm_params)

    def _preprocess_feature_multi_column(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        if isinstance(input, np.ndarray):
            input = torch.from_numpy(input).type(self.dtype)

        feature_type = norm_params[0].feature_type
        func = getattr(self, "_preprocess_" + feature_type)
        return func(begin_index, input, norm_params)

    def _create_parameters_BINARY(self, begin_index: int,
                                  norm_params: List[NormalizationParameters]):
        pass

    def _preprocess_BINARY(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        # ONNX doesn't support != yet
        return self.one_tensor - (input == self.zero_tensor).float()

    def _create_parameters_PROBABILITY(
            self, begin_index: int,
            norm_params: List[NormalizationParameters]):
        pass

    def _preprocess_PROBABILITY(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        clamped_input = torch.clamp(input, 0.01, 0.99)
        return self.negative_one_tensor * ((
            (self.one_tensor / clamped_input) - self.one_tensor).log())

    def _create_parameters_CONTINUOUS_ACTION(
            self, begin_index: int,
            norm_params: List[NormalizationParameters]):
        self._create_parameter(
            begin_index,
            "min_serving_value",
            torch.Tensor([p.min_value for p in norm_params]).type(self.dtype),
        )
        self._create_parameter(
            begin_index,
            "min_training_value",
            torch.ones(len(norm_params)).type(self.dtype) * -1 + EPS,
        )
        self._create_parameter(
            begin_index,
            "scaling_factor",
            (torch.ones(len(norm_params)).type(self.dtype) - EPS) * 2 /
            torch.tensor([p.max_value - p.min_value
                          for p in norm_params]).type(self.dtype),
        )

    def _preprocess_CONTINUOUS_ACTION(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        min_serving_value = self._fetch_parameter(begin_index,
                                                  "min_serving_value")
        min_training_value = self._fetch_parameter(begin_index,
                                                   "min_training_value")
        scaling_factor = self._fetch_parameter(begin_index, "scaling_factor")
        continuous_action = (
            input - min_serving_value) * scaling_factor + min_training_value
        return torch.clamp(continuous_action, -1 + EPS, 1 - EPS)

    def _create_parameters_CONTINUOUS(
            self, begin_index: int,
            norm_params: List[NormalizationParameters]):
        self._create_parameter(
            begin_index,
            "means",
            torch.Tensor([p.mean for p in norm_params]).type(self.dtype),
        )
        self._create_parameter(
            begin_index,
            "stddevs",
            torch.Tensor([p.stddev for p in norm_params]).type(self.dtype),
        )

    def _preprocess_CONTINUOUS(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        means = self._fetch_parameter(begin_index, "means")
        stddevs = self._fetch_parameter(begin_index, "stddevs")
        continuous_output = (input - means) / stddevs
        return torch.clamp(continuous_output, MIN_FEATURE_VALUE,
                           MAX_FEATURE_VALUE)

    def _create_parameters_BOXCOX(self, begin_index: int,
                                  norm_params: List[NormalizationParameters]):
        self._create_parameter(
            begin_index,
            "shifts",
            torch.Tensor([p.boxcox_shift
                          for p in norm_params]).type(self.dtype),
        )
        for p in norm_params:
            assert (abs(p.boxcox_lambda) >
                    1e-6), "Invalid value for boxcox lambda: " + str(
                        p.boxcox_lambda)
        self._create_parameter(
            begin_index,
            "lambdas",
            torch.Tensor([p.boxcox_lambda
                          for p in norm_params]).type(self.dtype),
        )
        self._create_parameters_CONTINUOUS(begin_index, norm_params)

    def _preprocess_BOXCOX(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        shifts = self._fetch_parameter(begin_index, "shifts")
        lambdas = self._fetch_parameter(begin_index, "lambdas")
        boxcox_output = (
            # We can replace this with a normal pow() call after D8528654 lands
            self._manual_broadcast_matrix_scalar(
                torch.clamp(
                    input + shifts, 1e-6
                ),  # Clamp is necessary to prevent MISSING_VALUE from going to NaN
                lambdas,
                torch.pow,
            ) - self.one_tensor) / lambdas
        return self._preprocess_CONTINUOUS(begin_index, boxcox_output,
                                           norm_params)
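
For reference, a plain-tensor sketch of what `_preprocess_BOXCOX` computes before the final CONTINUOUS standardization and clamping; the helper below is written only for illustration and assumes |lambda| > 1e-6, as asserted above.

import torch

def boxcox_then_standardize(x, shift, lmbda, mean, stddev):
    # the clamp keeps shifted values away from zero so pow() cannot produce NaN
    shifted = torch.clamp(x + shift, min=1e-6)
    transformed = (shifted.pow(lmbda) - 1.0) / lmbda
    return (transformed - mean) / stddev

x = torch.tensor([0.5, 2.0, 10.0])
print(boxcox_then_standardize(x, shift=1.0, lmbda=0.5, mean=0.0, stddev=1.0))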

    def _create_parameters_QUANTILE(
            self, begin_index: int,
            norm_params: List[NormalizationParameters]):
        F = len(norm_params)

        num_quantiles = torch.tensor([[
            float(len(p.quantiles)) - 1 for p in norm_params
        ]]).type(self.dtype)
        self._create_parameter(begin_index, "num_quantiles", num_quantiles)

        max_num_quantile_boundaries = int(
            torch.max(torch.tensor([len(p.quantiles) for p in norm_params])))
        B = max_num_quantile_boundaries

        # The quantile boundaries form an FxB matrix, where B is the max # of boundaries.

        # Shorter boundary lists are padded by repeating their max boundary; this
        # is safe because any value >= the max boundary maps to 1.0 anyway, and
        # it guarantees a rectangular matrix.

        # We project the quantile boundaries to 3d and create a 1xFxB tensor
        quantile_boundaries = torch.zeros(
            [1, len(norm_params),
             max_num_quantile_boundaries]).type(self.dtype)
        max_quantile_boundaries = torch.zeros([1, len(norm_params)
                                               ]).type(self.dtype)
        min_quantile_boundaries = torch.zeros([1, len(norm_params)
                                               ]).type(self.dtype)
        for i, p in enumerate(norm_params):
            quantile_boundaries[0, i, :] = p.quantiles[-1]
            quantile_boundaries[0, i, 0:len(p.quantiles)] = torch.tensor(
                p.quantiles).type(self.dtype)
            max_quantile_boundaries[0, i] = max(p.quantiles)
            min_quantile_boundaries[0, i] = min(p.quantiles)

        quantile_boundaries = quantile_boundaries.type(self.dtype)
        max_quantile_boundaries = max_quantile_boundaries.type(self.dtype)
        min_quantile_boundaries = min_quantile_boundaries.type(self.dtype)

        self._create_parameter(begin_index, "quantile_boundaries",
                               quantile_boundaries)
        self._create_parameter(begin_index, "max_quantile_boundaries",
                               max_quantile_boundaries)
        self._create_parameter(begin_index, "min_quantile_boundaries",
                               min_quantile_boundaries)
        self._create_parameter(
            begin_index,
            "quantile_boundary_mask",
            torch.ones([1, F, B]).type(self.dtype),
        )

    def _preprocess_QUANTILE(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        """
        Replace the value with its percentile in the range [0, 1].

        This preprocesses several features in a single step by putting the
        quantile boundaries in the third dimension and broadcasting.

        The input is a JxF matrix where J is the batch size and F is the # of features.
        """

        # The number of quantiles is a 1xF matrix
        num_quantiles = self._fetch_parameter(begin_index, "num_quantiles")

        quantile_boundaries = self._fetch_parameter(begin_index,
                                                    "quantile_boundaries")
        max_quantile_boundaries = self._fetch_parameter(
            begin_index, "max_quantile_boundaries")
        min_quantile_boundaries = self._fetch_parameter(
            begin_index, "min_quantile_boundaries")

        # Add a third dimension and repeat to create a JxFxB matrix, where the
        # inputs are repeated B times in the third dimension.  We need to
        # do this because we can't broadcast both operands in different
        # dimensions in the same operation.

        # repeat() doesn't work here yet, so multiply by a mask instead
        mask = self._fetch_parameter(begin_index, "quantile_boundary_mask")
        expanded_inputs = input.unsqueeze(2) * mask

        input_greater_than_or_equal_to = (expanded_inputs >=
                                          quantile_boundaries).float()

        input_less_than = (expanded_inputs < quantile_boundaries).float()
        set_to_max = (input >= max_quantile_boundaries).float()
        set_to_min = (input <= min_quantile_boundaries).float()
        min_or_max = (set_to_min + set_to_max).float()
        interpolate = (min_or_max < self.one_hundredth_tensor).float()
        interpolate_left, _ = torch.max(
            (input_greater_than_or_equal_to * quantile_boundaries) +
            (input_less_than * self.min_tensor),
            dim=2,
        )
        interpolate_right, _ = torch.min(
            (input_less_than * quantile_boundaries) +
            (input_greater_than_or_equal_to * self.max_tensor),
            dim=2,
        )

        # This assumes that we need to interpolate and computes the value.
        # If we don't need to interpolate, this will be some bogus value, but it
        # will be multiplied by 0 so no big deal.
        left_start = torch.sum(input_greater_than_or_equal_to,
                               dim=2) - self.one_tensor
        interpolated_values = ((
            left_start + ((input - interpolate_left) / (
                (interpolate_right + self.epsilon_tensor) - interpolate_left
            )  # Add a small amount to interpolate_right to avoid div-0
                          )) / num_quantiles).float()
        return set_to_max + (interpolate * interpolated_values).float()
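
The broadcasted implementation above is equivalent, feature by feature, to the single-feature sketch below (plain PyTorch, written here only for illustration): locate the surrounding boundaries and linearly interpolate the rank into [0, 1].

import torch

def quantile_percentile(x, boundaries):
    # boundaries: sorted quantile boundaries for one feature
    b = torch.as_tensor(boundaries, dtype=torch.float)
    num_quantiles = b.numel() - 1
    if x <= b[0]:
        return 0.0
    if x >= b[-1]:
        return 1.0
    left = int(torch.sum(b <= x)) - 1        # index of the boundary just below x
    lo, hi = b[left], b[left + 1]
    return float((left + (x - lo) / (hi - lo)) / num_quantiles)

print(quantile_percentile(2.5, [0.0, 1.0, 2.0, 3.0, 4.0]))  # -> 0.625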

    def _create_parameters_ENUM(self, begin_index: int,
                                norm_params: NormalizationParameters):
        self._create_parameter(
            begin_index,
            "enum_values",
            torch.Tensor(norm_params.possible_values).unsqueeze(0).type(
                self.dtype),
        )

    def _preprocess_ENUM(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: NormalizationParameters,
    ) -> torch.Tensor:
        enum_values = self._fetch_parameter(begin_index, "enum_values")
        return (input == enum_values).float()

    def _sort_features_by_normalization(self):
        """
        Helper function to return a sorted list from a normalization map.
        Also returns the starting index of each feature type."""
        # Sort features by feature type
        sorted_features = []
        feature_starts = []
        assert isinstance(list(self.normalization_parameters.keys())[0],
                          int), "Normalization parameter keys must be int"
        for feature_type in FEATURE_TYPES:
            feature_starts.append(len(sorted_features))
            for feature in sorted(self.normalization_parameters.keys()):
                norm = self.normalization_parameters[feature]
                if norm.feature_type == feature_type:
                    sorted_features.append(feature)
        return sorted_features, feature_starts

    def _get_type_boundaries(self) -> List[int]:
        feature_starts = []
        on_feature_type = -1
        for i, feature in enumerate(self.sorted_features):
            feature_type = self.normalization_parameters[feature].feature_type
            feature_type_index = FEATURE_TYPES.index(feature_type)
            assert (feature_type_index >= on_feature_type
                    ), "Features are not sorted by feature type!"
            while feature_type_index > on_feature_type:
                feature_starts.append(i)
                on_feature_type += 1
        while on_feature_type < len(FEATURE_TYPES):
            feature_starts.append(len(self.sorted_features))
            on_feature_type += 1
        return feature_starts
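
A toy illustration of the two helpers above (the feature ids, types, and the FEATURE_TYPES ordering below are hypothetical): features are grouped by type in a fixed order, and the start index of each type's block is recorded.

from collections import namedtuple

FEATURE_TYPES = ["CONTINUOUS", "ENUM"]   # assumed ordering for this sketch
Norm = namedtuple("Norm", ["feature_type"])

normalization_parameters = {7: Norm("ENUM"), 3: Norm("CONTINUOUS"), 5: Norm("CONTINUOUS")}

sorted_features, feature_starts = [], []
for feature_type in FEATURE_TYPES:
    feature_starts.append(len(sorted_features))
    for feature in sorted(normalization_parameters):
        if normalization_parameters[feature].feature_type == feature_type:
            sorted_features.append(feature)

print(sorted_features)  # [3, 5, 7]
print(feature_starts)   # [0, 2] -> CONTINUOUS block starts at 0, ENUM block at 2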

    def _create_parameter(self, begin_index: int, name: str,
                          t: torch.Tensor) -> Parameter:
        p = Parameter(t, requires_grad=False)
        setattr(self, "_auto_parameter_" + str(begin_index) + "_" + name, p)
        return p

    def _fetch_parameter(self, begin_index: int, name: str) -> Parameter:
        return getattr(self,
                       "_auto_parameter_" + str(begin_index) + "_" + name)

    def _manual_broadcast_matrix_scalar(self, t1: torch.Tensor,
                                        s1: torch.Tensor, fn) -> torch.Tensor:
        # Some ONNX ops don't support broadcasting so we need to do some matrix magic
        return fn(t1, (t1 * self.zero_tensor) + s1).float()

    def _manual_broadcast_column_vec_row_vec(self, t1: torch.Tensor,
                                             t2: torch.Tensor,
                                             fn) -> torch.Tensor:
        # Some ONNX ops don't support broadcasting so we need to do some matrix magic
        t2_ones = t2 / t2
        t1_mask = t1.mm(t2_ones)

        return fn(t1_mask, t2).float()
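
The `t1 * self.zero_tensor + s1` pattern above simply materializes the scalar at the matrix's shape so that ops lacking broadcast support in the exported graph still work; a minimal sanity check of the equivalence:

import torch

t1 = torch.rand(2, 3)
s1 = torch.tensor([2.0])

expanded = (t1 * torch.zeros(1)) + s1   # same shape as t1, filled with s1
assert torch.allclose(torch.pow(t1, expanded), torch.pow(t1, s1))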

    def _check_preprocessing_output(self, batch, norm_params):
        """
        Check that preprocessed features fall within range of valid output.
        :param batch: torch tensor
        :param norm_params: list of normalization parameters
        """
        feature_type = norm_params[0].feature_type
        min_value, max_value = batch.min(), batch.max()
        if feature_type == "CONTINUOUS":
            # Continuous features may be in range (-inf, inf)
            pass
        elif float(max_value) > MAX_FEATURE_VALUE:
            raise Exception(
                "A {} feature type has max value {}, which exceeds the accepted post-preprocessing max of {}"
                .format(feature_type, max_value, MAX_FEATURE_VALUE))
        elif float(min_value) < MIN_FEATURE_VALUE:
            raise Exception(
                "A {} feature type has min value {}, which is below the accepted post-preprocessing min of {}"
                .format(feature_type, min_value, MIN_FEATURE_VALUE))
Example #25
    def __init__(
        self,
        normalization_parameters: Dict[int, NormalizationParameters],
        use_gpu: bool,
        typed_output: bool = False,
    ) -> None:
        super(Preprocessor, self).__init__()
        self.normalization_parameters = normalization_parameters
        self.sorted_features, self.sorted_feature_boundaries = (
            self._sort_features_by_normalization())
        self.typed_output = typed_output

        cuda_available = torch.cuda.is_available()
        logger.info("CUDA availability: {}".format(cuda_available))
        if use_gpu and cuda_available:
            logger.info("Using GPU: GPU requested and available.")
            self.use_gpu = True
            self.dtype = torch.cuda.FloatTensor
        else:
            logger.info("NOT Using GPU: GPU not requested or not available.")
            self.use_gpu = False
            self.dtype = torch.FloatTensor

        # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net,
        # we need to make tensors for every numeric literal
        self.zero_tensor = Parameter(torch.tensor([0.0]).type(self.dtype),
                                     requires_grad=False)
        self.one_tensor = Parameter(torch.tensor([1.0]).type(self.dtype),
                                    requires_grad=False)
        self.one_half_tensor = Parameter(torch.tensor([0.5]).type(self.dtype),
                                         requires_grad=False)
        self.one_hundredth_tensor = Parameter(torch.tensor([0.01]).type(
            self.dtype),
                                              requires_grad=False)
        self.negative_one_tensor = Parameter(torch.tensor([-1.0
                                                           ]).type(self.dtype),
                                             requires_grad=False)
        self.missing_tensor = Parameter(torch.tensor([MISSING_VALUE
                                                      ]).type(self.dtype),
                                        requires_grad=False)
        self.min_tensor = Parameter(torch.tensor([-1e20]).type(self.dtype),
                                    requires_grad=False)
        self.max_tensor = Parameter(torch.tensor([1e20]).type(self.dtype),
                                    requires_grad=False)
        self.epsilon_tensor = Parameter(torch.tensor([EPS]).type(self.dtype),
                                        requires_grad=False)

        feature_starts = self._get_type_boundaries()
        for i, feature_type in enumerate(FEATURE_TYPES):
            begin_index = feature_starts[i]
            if (i + 1) == len(FEATURE_TYPES):
                end_index = len(self.normalization_parameters)
            else:
                end_index = feature_starts[i + 1]
            if begin_index == end_index:
                continue  # No features of this type
            if feature_type == ENUM:
                # Process one-at-a-time
                for j in range(begin_index, end_index):
                    norm_params = self.normalization_parameters[
                        self.sorted_features[j]]
                    func = getattr(self, "_create_parameters_" + feature_type)
                    func(j, norm_params)
            else:
                norm_params = []
                for f in self.sorted_features[begin_index:end_index]:
                    norm_params.append(self.normalization_parameters[f])
                func = getattr(self, "_create_parameters_" + feature_type)
                func(begin_index, norm_params)
Example #26
class TopkHierarchicalMultiheadAttention(nn.Module):
    """Multi-headed attention.

    See "Attention Is All You Need" for more details.
    """
    def __init__(self,
                 embed_dim,
                 num_heads,
                 kdim=None,
                 vdim=None,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        if self.qkv_same_dim:
            self.in_proj_weight = Parameter(
                torch.Tensor(3 * embed_dim, embed_dim))
        else:
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))

        if bias:
            self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)

        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def reset_parameters(self):
        if self.qkv_same_dim:
            nn.init.xavier_uniform_(self.in_proj_weight)
        else:
            nn.init.xavier_uniform_(self.k_proj_weight)
            nn.init.xavier_uniform_(self.v_proj_weight)
            nn.init.xavier_uniform_(self.q_proj_weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.in_proj_bias is not None:
            nn.init.constant_(self.in_proj_bias, 0.)
            nn.init.constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(self,
                query,
                key,
                value,
                hierarchical_attn,
                key_padding_mask=None,
                incremental_state=None,
                need_weights=True,
                static_kv=False,
                attn_mask=None):
        """Input shape: Time x Batch x Channel

        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Timesteps can be masked by supplying a T x T mask in the
        `attn_mask` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """

        qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
        kv_same = key.data_ptr() == value.data_ptr()

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if 'prev_key' in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert kv_same and not qkv_same
                    key = value = None
        else:
            saved_state = None

        if qkv_same:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif kv_same:
            # encoder-decoder attention
            q = self.in_proj_q(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.in_proj_k(key)
                v = self.in_proj_v(key)

        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask,
                     attn_mask.new_zeros(attn_mask.size(0), 1)],
                    dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat([
                    key_padding_mask,
                    key_padding_mask.new_zeros(key_padding_mask.size(0), 1)
                ],
                                             dim=1)

        q = q.contiguous().view(tgt_len, bsz * self.num_heads,
                                self.head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads,
                                    self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads,
                                    self.head_dim).transpose(0, 1)

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if 'prev_key' in saved_state:
                prev_key = saved_state['prev_key'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    k = torch.cat((prev_key, k), dim=1)
            if 'prev_value' in saved_state:
                prev_value = saved_state['prev_value'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    v = torch.cat((prev_value, v), dim=1)
            saved_state['prev_key'] = k.view(bsz, self.num_heads, -1,
                                             self.head_dim)
            saved_state['prev_value'] = v.view(bsz, self.num_heads, -1,
                                               self.head_dim)

            self._set_input_buffer(incremental_state, saved_state)

        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.shape == torch.Size(
            []):
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])],
                          dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])],
                          dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask,
                     attn_mask.new_zeros(attn_mask.size(0), 1)],
                    dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat([
                    key_padding_mask,
                    torch.zeros(key_padding_mask.size(0),
                                1).type_as(key_padding_mask)
                ],
                                             dim=1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        assert list(
            attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            if self.onnx_trace:
                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            if self.onnx_trace:
                attn_weights = torch.where(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    torch.Tensor([-2**32 + 1]),
                    attn_weights.float()).type_as(attn_weights)
            else:
                attn_weights = attn_weights.float().masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    -2**32 + 1,
                ).type_as(attn_weights)  # FP16 support: cast to float and back
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
                                             src_len)

        if hierarchical_attn is not None:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            attn_weights = attn_weights * hierarchical_attn.unsqueeze(1)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
                                             src_len)

        attn_weights = utils.softmax(
            attn_weights,
            dim=-1,
            onnx_trace=self.onnx_trace,
        ).type_as(attn_weights)
        attn_weights = F.dropout(
            attn_weights, p=self.dropout,
            training=self.training)  # (bsz * self.num_heads, tgt_len, src_len)

        # select topk
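        # kthvalue() returns the (src_len - 10)-th smallest weight per query,
        # i.e. the 11th-largest value; ge() then keeps roughly the top-11
        # weights and zeroes out the rest (assumes the attention dimension > 10)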
        topk_value, topk_indice = torch.kthvalue(attn_weights,
                                                 attn_weights.size(-1) - 10,
                                                 dim=-1)
        topk_mask = torch.ge(
            attn_weights,
            topk_value.unsqueeze(-1).repeat(1, 1, attn_weights.size(-1)))
        attn = torch.bmm(attn_weights * topk_mask.float(), v)

        # attn = torch.bmm(attn_weights, v)
        assert list(
            attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        if (self.onnx_trace and attn.size(1) == 1):
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
        else:
            attn = attn.transpose(0,
                                  1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        if need_weights:
            # average attention weights over heads
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            attn_weights = attn_weights.sum(dim=1) / self.num_heads
        else:
            attn_weights = None

        return attn, attn_weights

    def in_proj_qkv(self, query):
        return self._in_proj(query).chunk(3, dim=-1)

    def in_proj_q(self, query):
        if self.qkv_same_dim:
            return self._in_proj(query, end=self.embed_dim)
        else:
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[:self.embed_dim]
            return F.linear(query, self.q_proj_weight, bias)

    def in_proj_k(self, key):
        if self.qkv_same_dim:
            return self._in_proj(key,
                                 start=self.embed_dim,
                                 end=2 * self.embed_dim)
        else:
            weight = self.k_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[self.embed_dim:2 * self.embed_dim]
            return F.linear(key, weight, bias)

    def in_proj_v(self, value):
        if self.qkv_same_dim:
            return self._in_proj(value, start=2 * self.embed_dim)
        else:
            weight = self.v_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[2 * self.embed_dim:]
            return F.linear(value, weight, bias)

    def _in_proj(self, input, start=0, end=None):
        weight = self.in_proj_weight
        bias = self.in_proj_bias
        weight = weight[start:end, :]
        if bias is not None:
            bias = bias[start:end]
        return F.linear(input, weight, bias)

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder buffered internal state (for incremental generation)."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            for k in input_buffer.keys():
                input_buffer[k] = input_buffer[k].index_select(0, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        return utils.get_incremental_state(
            self,
            incremental_state,
            'attn_state',
        ) or {}

    def _set_input_buffer(self, incremental_state, buffer):
        utils.set_incremental_state(
            self,
            incremental_state,
            'attn_state',
            buffer,
        )
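
The distinctive step in this variant is the hard top-k cut applied to the attention weights before the value aggregation. A self-contained sketch of that masking trick on random weights (the value of k and the tensor sizes are illustrative):

import torch

bsz_heads, tgt_len, src_len = 2, 4, 16
attn_weights = torch.softmax(torch.randn(bsz_heads, tgt_len, src_len), dim=-1)

k = 11  # the module above uses kthvalue at size(-1) - 10, keeping ~11 weights
threshold, _ = torch.kthvalue(attn_weights, src_len - (k - 1), dim=-1)
topk_mask = attn_weights >= threshold.unsqueeze(-1)
pruned = attn_weights * topk_mask.float()

assert int(topk_mask.sum(dim=-1).min()) >= k   # at least k survivors per query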
Example #27
class MyGRU(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size,
                 layers=1,
                 bidirectional=False,
                 initpara=True,
                 attn_decode=False,
                 post_size=None):
        super(MyGRU, self).__init__()

        self.input_size, self.hidden_size, self.layers, self.bidirectional = \
                input_size, hidden_size, layers, bidirectional
        self.GRU = GRU(input_size,
                       hidden_size,
                       layers,
                       bidirectional=bidirectional)
        self.initpara = initpara
        if initpara:
            if bidirectional:
                self.h_init = Parameter(
                    torch.Tensor(2 * layers, 1, hidden_size))
            else:
                self.h_init = Parameter(torch.Tensor(layers, 1, hidden_size))
        self.reset_parameters()

        if attn_decode:
            self.attn_query = nn.Linear(hidden_size, post_size)

    def reset_parameters(self):
        if self.initpara:
            stdv = 1.0 / math.sqrt(self.hidden_size)
            self.h_init.data.uniform_(-stdv, stdv)

    def getInitialParameter(self, batch_size):
        return self.h_init.repeat(1, batch_size, 1)

    def forward(self,
                incoming,
                length,
                h_init=None,
                need_h=False,
                attn_decode=False,
                post=None,
                post_length=None):
        if not attn_decode:
            sen_sorted, length_sorted, memo = sortSequence(incoming, length)
            left_batch_size = sen_sorted.shape[-2]
            sen_packed = pack_padded_sequence(sen_sorted, length_sorted)
            if h_init is None:
                h_init = self.getInitialParameter(left_batch_size)
            else:
                h_shape = h_init.size()
                h_init = sortSequenceByMemo(h_init, memo)
                h_init = h_init.reshape(h_shape)
                if h_init.dim() < 3:
                    h_init = torch.unsqueeze(h_init, 0)

            h, h_n = self.GRU(sen_packed, h_init)
            h_n = h_n.transpose(0, 1).reshape(left_batch_size, -1)
            h_n = revertSequence(h_n, memo)
            if need_h:
                h = pad_packed_sequence(h)[0]
                h = revertSequence(h, memo, True)
                return h, h_n
            else:
                return h_n

        else:
            batch_size = incoming.shape[1]
            seqlen = incoming.shape[0]
            if h_init is None:
                h_init = self.getInitialParameter(batch_size)
            else:
                h_init = torch.unsqueeze(h_init, 0)
            h_now = h_init[0]
            hs = []
            attn_weights = []

            for i in range(seqlen):
                query = self.attn_query(h_now)
                attn_weight = maskedSoftmax(
                    (query.unsqueeze(0) * post).sum(-1), post_length)
                context = (attn_weight.unsqueeze(-1) * post).sum(0)
                h_now = self.cell_forward(
                    torch.cat([incoming[i], context], dim=-1), h_now) * Tensor(
                        (length >
                         np.ones(batch_size) * i).astype(float)).unsqueeze(-1)

                hs.append(h_now)
                attn_weights.append(attn_weight)

            return torch.stack(hs), h_now

    def cell_forward(self, incoming, h):
        return F_GRUCell(incoming, h, self.GRU.weight_ih_l0,
                         self.GRU.weight_hh_l0, self.GRU.bias_ih_l0,
                         self.GRU.bias_hh_l0)
Example #28
    def __init__(
        self,
        in_channels: Union[int, Tuple[int, int]],
        out_channels: Optional[int],
        in_edge_channels: int = None,
        aggr: str = "add",
        skip_linear: bool = False,
        directed_msg: bool = True,
        heads: int = 1,
        attention: bool = False,
        attention_type: str = "additive",
        l2_normalize: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        kwargs.setdefault('aggr', aggr)
        super().__init__(node_dim=0, **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.in_edge_channels = in_edge_channels
        self.aggr = aggr
        self.skip_linear = skip_linear
        self.directed_msg = directed_msg
        self.heads = heads
        self.attention = attention
        self.attention_type = attention_type
        self.normalize_l2 = l2_normalize

        if isinstance(in_channels, int):
            in_channels = (in_channels, in_channels)

        if self.directed_msg:
            self.lin_msg = Linear(in_channels[0],
                                  out_channels * self.heads,
                                  bias=bias)
        else:
            self.lin_msg = Linear(in_channels[0],
                                  out_channels * self.heads,
                                  bias=bias)
            self.lin_msg_i = Linear(in_channels[0],
                                    out_channels * self.heads,
                                    bias=bias)

        if self.skip_linear or self.in_channels != self.out_channels:
            self.lin_self = Linear(in_channels[1], out_channels, bias=bias)
        else:
            self.lin_self = torch.nn.Identity()

        if self.in_edge_channels is not None:
            self.lin_edge = Linear(in_edge_channels,
                                   out_channels * self.heads,
                                   bias=bias)

        # TODO: A general torch_geometric.nn.AttentionLayer
        if self.attention:
            if self.attention_type == 'additive':
                self.att_msg = Parameter(
                    torch.Tensor(1, self.heads, self.out_channels))
            elif self.attention_type == 'dot_product':
                self.scaler = torch.sqrt(
                    torch.tensor(out_channels, dtype=torch.float))
            else:
                raise ValueError(
                    f"Attention type '{self.attention_type}' not supported")

        self.reset_parameters()
Example #29
class MultiheadAttention(nn.Module):
    """Multi-headed attention.

    See "Attention Is All You Need" for more details.
    """
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def reset_parameters(self):
        if self.qkv_same_dim:
            # Empirically observed the convergence to be much better with
            # the scaled initialization
            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
        else:
            nn.init.xavier_uniform_(self.k_proj.weight)
            nn.init.xavier_uniform_(self.v_proj.weight)
            nn.init.xavier_uniform_(self.q_proj.weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.out_proj.bias is not None:
            nn.init.constant_(self.out_proj.bias, 0.0)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        value: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        incremental_state: Optional[Dict[str, Dict[str,
                                                   Optional[Tensor]]]] = None,
        need_weights: bool = True,
        static_kv: bool = False,
        attn_mask: Optional[Tensor] = None,
        before_softmax: bool = False,
        need_head_weights: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: True).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.
        """
        if need_head_weights:
            need_weights = True

        is_tpu = query.device.type == "xla"

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if (not self.onnx_trace
                and not is_tpu  # don't use PyTorch version on TPUs
                and incremental_state is None and not static_kv
                # A workaround for quantization to work. Otherwise JIT compilation
                # treats bias in linear module as method.
                and not torch.jit.is_scripting()):
            assert key is not None and value is not None
            return F.multi_head_attention_forward(
                query,
                key,
                value,
                self.embed_dim,
                self.num_heads,
                torch.empty([0]),
                torch.cat(
                    (self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
                self.bias_k,
                self.bias_v,
                self.add_zero_attn,
                self.dropout_module.p,
                self.out_proj.weight,
                self.out_proj.bias,
                self.training or self.dropout_module.apply_during_inference,
                key_padding_mask,
                need_weights,
                attn_mask,
                use_separate_proj_weight=True,
                q_proj_weight=self.q_proj.weight,
                k_proj_weight=self.k_proj.weight,
                v_proj_weight=self.v_proj.weight,
            )

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            q = self.q_proj(query)
            k = self.k_proj(query)
            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)

        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask,
                     attn_mask.new_zeros(attn_mask.size(0), 1)],
                    dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        key_padding_mask.new_zeros(key_padding_mask.size(0),
                                                   1),
                    ],
                    dim=1,
                )

        q = (q.contiguous().view(tgt_len, bsz * self.num_heads,
                                 self.head_dim).transpose(0, 1))
        if k is not None:
            k = (k.contiguous().view(-1, bsz * self.num_heads,
                                     self.head_dim).transpose(0, 1))
        if v is not None:
            v = (v.contiguous().view(-1, bsz * self.num_heads,
                                     self.head_dim).transpose(0, 1))

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if "prev_key" in saved_state:
                _prev_key = saved_state["prev_key"]
                assert _prev_key is not None
                prev_key = _prev_key.view(bsz * self.num_heads, -1,
                                          self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    assert k is not None
                    k = torch.cat([prev_key, k], dim=1)
            if "prev_value" in saved_state:
                _prev_value = saved_state["prev_value"]
                assert _prev_value is not None
                prev_value = _prev_value.view(bsz * self.num_heads, -1,
                                              self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    assert v is not None
                    v = torch.cat([prev_value, v], dim=1)
            prev_key_padding_mask: Optional[Tensor] = None
            if "prev_key_padding_mask" in saved_state:
                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
            assert k is not None and v is not None
            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
                key_padding_mask=key_padding_mask,
                prev_key_padding_mask=prev_key_padding_mask,
                batch_size=bsz,
                src_len=k.size(1),
                static_kv=static_kv,
            )

            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1,
                                             self.head_dim)
            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1,
                                               self.head_dim)
            saved_state["prev_key_padding_mask"] = key_padding_mask
            # In this branch incremental_state is never None
            assert incremental_state is not None
            incremental_state = self._set_input_buffer(incremental_state,
                                                       saved_state)
        assert k is not None
        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            assert v is not None
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])],
                          dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])],
                          dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask,
                     attn_mask.new_zeros(attn_mask.size(0), 1)],
                    dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        torch.zeros(key_padding_mask.size(0),
                                    1).type_as(key_padding_mask),
                    ],
                    dim=1,
                )

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
                                              bsz)

        assert list(
            attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            if self.onnx_trace:
                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            if not is_tpu:
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                    float("-inf"),
                )
            else:
                attn_weights = attn_weights.transpose(0, 2)
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask, float("-inf"))
                attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
                                             src_len)

        if before_softmax:
            return attn_weights, v

        attn_weights_float = utils.softmax(attn_weights,
                                           dim=-1,
                                           onnx_trace=self.onnx_trace)
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = self.dropout_module(attn_weights)

        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(
            attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        if self.onnx_trace and attn.size(1) == 1:
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
        else:
            attn = attn.transpose(0,
                                  1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)
        attn_weights: Optional[Tensor] = None
        if need_weights:
            attn_weights = attn_weights_float.view(bsz, self.num_heads,
                                                   tgt_len,
                                                   src_len).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

        return attn, attn_weights

    @staticmethod
    def _append_prev_key_padding_mask(
        key_padding_mask: Optional[Tensor],
        prev_key_padding_mask: Optional[Tensor],
        batch_size: int,
        src_len: int,
        static_kv: bool,
    ) -> Optional[Tensor]:
        # saved key padding masks have shape (bsz, seq_len)
        if prev_key_padding_mask is not None and static_kv:
            new_key_padding_mask = prev_key_padding_mask
        elif prev_key_padding_mask is not None and key_padding_mask is not None:
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(),
                 key_padding_mask.float()],
                dim=1)
        # During incremental decoding, as the padding token enters and
        # leaves the frame, there will be a time when prev or current
        # is None
        elif prev_key_padding_mask is not None:
            filler = torch.zeros(
                (batch_size, src_len - prev_key_padding_mask.size(1)),
                device=prev_key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(),
                 filler.float()], dim=1)
        elif key_padding_mask is not None:
            filler = torch.zeros(
                (batch_size, src_len - key_padding_mask.size(1)),
                device=key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat(
                [filler.float(), key_padding_mask.float()], dim=1)
        else:
            new_key_padding_mask = prev_key_padding_mask
        return new_key_padding_mask

    @torch.jit.export
    def reorder_incremental_state(
        self,
        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
        new_order: Tensor,
    ):
        """Reorder buffered internal state (for incremental generation)."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            for k in input_buffer.keys():
                input_buffer_k = input_buffer[k]
                if input_buffer_k is not None:
                    if self.encoder_decoder_attention and input_buffer_k.size(
                            0) == new_order.size(0):
                        break
                    input_buffer[k] = input_buffer_k.index_select(0, new_order)
            incremental_state = self._set_input_buffer(incremental_state,
                                                       input_buffer)
        return incremental_state

    def _get_input_buffer(
        self, incremental_state: Optional[Dict[str, Dict[str,
                                                         Optional[Tensor]]]]
    ) -> Dict[str, Optional[Tensor]]:
        result = self.get_incremental_state(incremental_state, "attn_state")
        if result is not None:
            return result
        else:
            empty_result: Dict[str, Optional[Tensor]] = {}
            return empty_result

    def _set_input_buffer(
        self,
        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
        buffer: Dict[str, Optional[Tensor]],
    ):
        return self.set_incremental_state(incremental_state, "attn_state",
                                          buffer)

    def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int,
                          bsz: int):
        return attn_weights

    def upgrade_state_dict_named(self, state_dict, name):
        prefix = name + "." if name != "" else ""
        items_to_add = {}
        keys_to_remove = []
        for k in state_dict.keys():
            if k.endswith(prefix + "in_proj_weight"):
                # in_proj_weight used to be q + k + v with same dimensions
                dim = int(state_dict[k].shape[0] / 3)
                items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
                items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim:2 *
                                                                       dim]
                items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 *
                                                                       dim:]

                keys_to_remove.append(k)

                k_bias = prefix + "in_proj_bias"
                if k_bias in state_dict.keys():
                    dim = int(state_dict[k].shape[0] / 3)
                    items_to_add[prefix +
                                 "q_proj.bias"] = state_dict[k_bias][:dim]
                    items_to_add[prefix +
                                 "k_proj.bias"] = state_dict[k_bias][dim:2 *
                                                                     dim]
                    items_to_add[prefix +
                                 "v_proj.bias"] = state_dict[k_bias][2 * dim:]

                    keys_to_remove.append(prefix + "in_proj_bias")

        for k in keys_to_remove:
            del state_dict[k]

        for key, value in items_to_add.items():
            state_dict[key] = value
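
`upgrade_state_dict_named` above migrates old checkpoints whose fused `in_proj_weight` stored q/k/v stacked along dim 0 into the separate per-projection keys; a minimal standalone sketch of that split (sizes are illustrative):

import torch

embed_dim = 8
in_proj_weight = torch.randn(3 * embed_dim, embed_dim)  # old fused layout: [q; k; v]

q_w = in_proj_weight[:embed_dim]
k_w = in_proj_weight[embed_dim:2 * embed_dim]
v_w = in_proj_weight[2 * embed_dim:]

assert q_w.shape == k_w.shape == v_w.shape == (embed_dim, embed_dim)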
Example #30
class TopKPooling(torch.nn.Module):
    r""":math:`\mathrm{top}_k` pooling operator from the `"Graph U-Nets"
    <https://arxiv.org/abs/1905.05178>`_, `"Towards Sparse
    Hierarchical Graph Classifiers" <https://arxiv.org/abs/1811.01287>`_
    and `"Understanding Attention and Generalization in Graph Neural
    Networks" <https://arxiv.org/abs/1905.02850>`_ papers

    if min_score :math:`\tilde{\alpha}` is None:

        .. math::
            \mathbf{y} &= \frac{\mathbf{X}\mathbf{p}}{\| \mathbf{p} \|}

            \mathbf{i} &= \mathrm{top}_k(\mathbf{y})

            \mathbf{X}^{\prime} &= (\mathbf{X} \odot
            \mathrm{tanh}(\mathbf{y}))_{\mathbf{i}}

            \mathbf{A}^{\prime} &= \mathbf{A}_{\mathbf{i},\mathbf{i}}

    if min_score :math:`\tilde{\alpha}` is a value in [0, 1]:

        .. math::
            \mathbf{y} &= \mathrm{softmax}(\mathbf{X}\mathbf{p})

            \mathbf{i} &= \mathbf{y}_i > \tilde{\alpha}

            \mathbf{X}^{\prime} &= (\mathbf{X} \odot \mathbf{y})_{\mathbf{i}}

            \mathbf{A}^{\prime} &= \mathbf{A}_{\mathbf{i},\mathbf{i}},
    where nodes are dropped based on a learnable projection score
    :math:`\mathbf{p}`.
    Args:
        in_channels (int): Size of each input sample.
        ratio (float): Graph pooling ratio, which is used to compute
            :math:`k = \lceil \mathrm{ratio} \cdot N \rceil`.
            This value is ignored if min_score is not None.
            (default: :obj:`0.5`)
        min_score (float, optional): Minimal node score :math:`\tilde{\alpha}`
            which is used to compute indices of pooled nodes
            :math:`\mathbf{i} = \mathbf{y}_i > \tilde{\alpha}`.
            When this value is not :obj:`None`, the :obj:`ratio` argument is
            ignored. (default: :obj:`None`)
        multiplier (float, optional): Coefficient by which features gets
            multiplied after pooling. This can be useful for large graphs and
            when :obj:`min_score` is used. (default: :obj:`1`)
        nonlinearity (torch.nn.functional, optional): The nonlinearity to use.
            (default: :obj:`torch.tanh`)
    """
    def __init__(self, in_channels, ratio=0.5, min_score=None, multiplier=1,
                 nonlinearity=torch.tanh):
        super(TopKPooling, self).__init__()

        self.in_channels = in_channels
        self.ratio = ratio
        self.min_score = min_score
        self.multiplier = multiplier
        self.nonlinearity = nonlinearity

        self.weight = Parameter(torch.Tensor(1, in_channels))

        self.reset_parameters()

    def reset_parameters(self):
        size = self.in_channels
        uniform(size, self.weight)

    def forward(self, x, edge_index, edge_attr=None, batch=None, attn=None):
        """"""

        if batch is None:
            batch = edge_index.new_zeros(x.size(0))

        attn = x if attn is None else attn
        attn = attn.unsqueeze(-1) if attn.dim() == 1 else attn
        score = (attn * self.weight).sum(dim=-1)

        # zero-mean the scores within each instance (assumes every graph in the
        # batch has the same number of nodes, so the view below is rectangular)
        score = score.view(batch.max() + 1, -1)
        score = score - score.mean(1, keepdim=True)
        score = score.view(-1)

        if self.min_score is None:
            score = self.nonlinearity(score / self.weight.norm(p=2, dim=-1))
        else:
            score = softmax(score, batch)

        perm = topk(score, self.ratio, batch, self.min_score)
        x = x[perm] * score[perm].view(-1, 1)
        x = self.multiplier * x if self.multiplier != 1 else x

        batch = batch[perm]
        edge_index, edge_attr = filter_adj(edge_index, edge_attr, perm,
                                           num_nodes=score.size(0))

        # Changed from the stock TopKPooling: the last returned value is the score for
        # *all* nodes (not just the kept ones), reshaped to (num_graphs, nodes_per_graph).
        return x, edge_index, edge_attr, batch, perm, score.view(batch.max() + 1, -1)


    def __repr__(self):
        return '{}({}, {}={}, multiplier={})'.format(
            self.__class__.__name__, self.in_channels,
            'ratio' if self.min_score is None else 'min_score',
            self.ratio if self.min_score is None else self.min_score,
            self.multiplier)
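A short, self-contained sketch (plain PyTorch, not the torch_geometric API) may help here: it re-implements only the projection-score selection from the docstring above, y = Xp / ||p||, i = top_k(y), X' = (X odot tanh(y))_i, on a toy tensor. Edge filtering, batching, and the min_score path are omitted, and every name below is illustrative.

import math
import torch

def topk_pool_sketch(x: torch.Tensor, proj: torch.Tensor, ratio: float = 0.5):
    # x: (N, C) node features, proj: (C,) learnable projection vector p
    score = (x @ proj) / proj.norm(p=2)                        # y = Xp / ||p||
    k = math.ceil(ratio * x.size(0))                           # k = ceil(ratio * N)
    perm = score.topk(k).indices                               # i = top_k(y)
    x_out = x[perm] * torch.tanh(score[perm]).unsqueeze(-1)    # X' = (X . tanh(y))_i
    return x_out, perm, score

x = torch.randn(10, 16)
proj = torch.randn(16)
x_pooled, perm, score = topk_pool_sketch(x, proj, ratio=0.5)
print(x_pooled.shape, perm.tolist())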
Example #31
0
class PositionWiseFeedForward(nn.Module):
    """Two-layer Feed-forward neural network"""
    def __init__(self,
                 model_size,
                 inner_size,
                 dropout=0.,
                 variational=False,
                 activation='relu',
                 glu=False,
                 weight_drop=0.0):
        super().__init__()
        self.model_size = model_size
        self.inner_size = inner_size
        self.dropout = dropout
        self.bias = True
        self.variational = variational
        self.activation = activation
        self.glu = glu
        self.weight_drop = weight_drop
        self.autograd = False

        if self.activation == 'relu':
            if self.glu:
                self.act = nn.ReLU(inplace=True)
            else:
                self.act = ReLUDropout(p=self.dropout,
                                       variational=self.variational,
                                       batch_first=False)
        elif self.activation == 'gelu':
            self.act = nn.GELU()
        elif self.activation == 'agelu':
            self.act = AGELU()
        elif self.activation in ['silu', 'swish']:
            self.act = SiLU()
        elif self.activation in ['sigmoid']:
            if self.glu:
                self.act = nn.functional.glu
            else:
                print(
                    "Sigmoid activation function is recommended to be used with -glu"
                )
                raise NotImplementedError

        self.in_proj_weight = Parameter(
            torch.Tensor(inner_size * (2 if glu else 1), model_size))
        self.out_proj_weight = Parameter(torch.Tensor(model_size, inner_size))

        self.in_proj_bias = Parameter(
            torch.Tensor(inner_size * (2 if glu else 1)))
        self.out_proj_bias = Parameter(torch.Tensor(model_size))

        self.reset_parameters()
        self.optimized = 2

        self.fused = False

        # At the moment fused mlp is supported for RELU, SiLU, Swish, GELU and AGELU (approximated GELU)
        if not self.glu and \
                self.activation in ['relu', 'silu', 'swish', 'gelu', 'agelu'] and not self.variational:
            if self.activation == 'relu':
                from onmt.modules.mlp.mlp import mlp_relu_function
                if mlp_relu_function is not None:
                    self.fused_function = mlp_relu_function
                    self.fused = True
            elif self.activation in ['silu', 'swish']:
                from onmt.modules.mlp.mlp import mlp_silu_function
                if mlp_silu_function is not None:
                    self.fused_function = mlp_silu_function
                    self.fused = True
            elif self.activation == 'gelu':
                from onmt.modules.mlp.mlp import mlp_gelu_function
                if mlp_gelu_function is not None:
                    self.fused_function = mlp_gelu_function
                    self.fused = True
            elif self.activation == 'agelu':
                from onmt.modules.mlp.mlp import mlp_agelu_function
                if mlp_agelu_function is not None:
                    self.fused_function = mlp_agelu_function
                    self.fused = True

    def reset_parameters(self, init='normal'):
        if init == 'normal':
            std_ = math.sqrt(2.0 / (self.model_size + self.inner_size))
            nn.init.normal_(self.in_proj_weight, 0.0, std_)
            nn.init.normal_(self.out_proj_weight, 0.0, std_)
        else:
            std_ = math.sqrt(6.0 / (self.model_size + self.inner_size))
            nn.init.uniform_(self.in_proj_weight, -std_, std_)
            nn.init.uniform_(self.out_proj_weight, -std_, std_)

        nn.init.constant_(self.in_proj_bias, 0.0)
        nn.init.constant_(self.out_proj_bias, 0.0)

    def convert_autograd(self):

        if self.autograd:
            return

        with torch.no_grad():
            self.autograd = True
            self.linear_in = torch.nn.Linear(self.model_size, self.inner_size)
            self.linear_out = torch.nn.Linear(self.inner_size, self.model_size)

            self.linear_in.weight.copy_(self.in_proj_weight)
            self.linear_in.bias.copy_(self.in_proj_bias)
            self.linear_out.weight.copy_(self.out_proj_weight)
            self.linear_out.bias.copy_(self.out_proj_bias)

            del self.in_proj_weight
            del self.in_proj_bias
            del self.out_proj_weight
            del self.out_proj_bias

    def forward(self, input, *args):

        if self.fused and input.is_cuda:

            # If autocast is enabled, cast the function arguments to half precision
            # manually; for some reason custom_fwd(...) doesn't work here.
            with autocast(enabled=False):
                weights = [
                    self.in_proj_weight.half(),
                    self.out_proj_weight.half()
                ]
                biases = [self.in_proj_bias.half(), self.out_proj_bias.half()]

                seq_len, bsz, hidden_size = input.size(0), input.size(
                    1), input.size(2)

                dropout = self.dropout if self.training else 0.0

                hidden = self.fused_function(
                    dropout,
                    input.half().view(seq_len * bsz, -1), *weights,
                    *biases).type_as(input)
                hidden = hidden.view(seq_len, bsz, hidden_size)

            # verification code (only with dropout = 0.0)
            # with torch.no_grad():
            #     hidden_ = F.linear(self.act(F.linear(input, self.in_proj_weight, self.in_proj_bias)),
            #                        self.out_proj_weight, self.out_proj_bias).type_as(hidden)
            #
            #     comp = torch.allclose(hidden, hidden_, rtol=1e-03, atol=1e-04)
            #     print(comp)

        else:
            if self.autograd:
                hidden = self.linear_in(input)
            else:
                hidden = F.linear(input, self.in_proj_weight,
                                  self.in_proj_bias)

            if self.glu and self.activation != 'sigmoid':
                hidden, gate = hidden.chunk(2, dim=-1)
                hidden = self.act(hidden) * gate
            else:  # plain activation, or F.glu when activation == 'sigmoid'
                hidden = self.act(hidden)

            if not (not self.glu and self.activation == 'relu'):
                if self.variational:
                    hidden = variational_dropout(
                        hidden,
                        p=self.dropout,
                        training=self.training,
                        inplace=self.activation
                        in ['silu', 'relu', 'swish', 'gelu'])
                else:
                    hidden = F.dropout(hidden,
                                       p=self.dropout,
                                       training=self.training,
                                       inplace=self.activation
                                       in ['silu', 'relu', 'swish', 'gelu'])

            if self.autograd:
                hidden = self.linear_out(hidden)
            else:
                hidden = F.linear(hidden, self.out_proj_weight,
                                  self.out_proj_bias)

        return hidden
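For reference, here is a hedged sketch of what the non-fused, non-GLU branch of forward() above computes: hidden = Linear_out(dropout(act(Linear_in(x)))). The fused kernels, GLU gating, and variational dropout are deliberately left out, and the shapes below are illustrative.

import torch
import torch.nn.functional as F

def ffn_sketch(x, in_w, in_b, out_w, out_b, p=0.1, training=True):
    h = F.linear(x, in_w, in_b)                 # (seq, batch, inner_size)
    h = F.relu(h)                               # the 'relu' activation branch
    h = F.dropout(h, p=p, training=training)    # standard (non-variational) dropout
    return F.linear(h, out_w, out_b)            # back to (seq, batch, model_size)

model_size, inner_size = 8, 32
x = torch.randn(5, 2, model_size)
in_w, in_b = torch.randn(inner_size, model_size), torch.zeros(inner_size)
out_w, out_b = torch.randn(model_size, inner_size), torch.zeros(model_size)
print(ffn_sketch(x, in_w, in_b, out_w, out_b).shape)  # torch.Size([5, 2, 8])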
Example #32
0
class Preprocessor(Module):
    def __init__(
        self,
        normalization_parameters: Dict[str, NormalizationParameters],
        use_gpu: bool,
        typed_output: bool = False,
    ) -> None:
        super(Preprocessor, self).__init__()
        self.normalization_parameters = normalization_parameters
        self.sorted_features, self.sorted_feature_boundaries = (
            self._sort_features_by_normalization()
        )
        self.typed_output = typed_output

        cuda_available = torch.cuda.is_available()
        logger.info("CUDA availability: {}".format(cuda_available))
        if use_gpu and cuda_available:
            logger.info("Using GPU: GPU requested and available.")
            self.use_gpu = True
            self.dtype = torch.cuda.FloatTensor
        else:
            logger.info("NOT Using GPU: GPU not requested or not available.")
            self.use_gpu = False
            self.dtype = torch.FloatTensor

        # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net,
        # We need to make tensors for every numeric literal
        self.zero_tensor = Parameter(
            torch.tensor([0.0]).type(self.dtype), requires_grad=False
        )
        self.one_tensor = Parameter(
            torch.tensor([1.0]).type(self.dtype), requires_grad=False
        )
        self.one_half_tensor = Parameter(
            torch.tensor([0.5]).type(self.dtype), requires_grad=False
        )
        self.one_hundredth_tensor = Parameter(
            torch.tensor([0.01]).type(self.dtype), requires_grad=False
        )
        self.negative_one_tensor = Parameter(
            torch.tensor([-1.0]).type(self.dtype), requires_grad=False
        )
        self.missing_tensor = Parameter(
            torch.tensor([MISSING_VALUE]).type(self.dtype), requires_grad=False
        )
        self.min_tensor = Parameter(
            torch.tensor([-1e20]).type(self.dtype), requires_grad=False
        )
        self.max_tensor = Parameter(
            torch.tensor([1e20]).type(self.dtype), requires_grad=False
        )
        self.epsilon_tensor = Parameter(
            torch.tensor([EPS]).type(self.dtype), requires_grad=False
        )

        feature_starts = self._get_type_boundaries()
        for i, feature_type in enumerate(FEATURE_TYPES):
            begin_index = feature_starts[i]
            if (i + 1) == len(FEATURE_TYPES):
                end_index = len(self.normalization_parameters)
            else:
                end_index = feature_starts[i + 1]
            if begin_index == end_index:
                continue  # No features of this type
            if feature_type == ENUM:
                # Process one-at-a-time
                for j in range(begin_index, end_index):
                    norm_params = self.normalization_parameters[self.sorted_features[j]]
                    func = getattr(self, "_create_parameters_" + feature_type)
                    func(j, norm_params)
            else:
                norm_params = []
                for f in self.sorted_features[begin_index:end_index]:
                    norm_params.append(self.normalization_parameters[f])
                func = getattr(self, "_create_parameters_" + feature_type)
                func(begin_index, norm_params)

    def input_prototype(self):
        return rlt.FeatureVector(
            float_features=torch.randn(1, len(self.normalization_parameters))
        )

    def forward(self, input) -> torch.FloatTensor:
        """ Preprocess the input matrix
        :param input tensor
        """
        if isinstance(input, np.ndarray):
            input = torch.from_numpy(input).type(self.dtype)
        if isinstance(input, rlt.FeatureVector):
            input = input.float_features.type(self.dtype)

        # ONNX doesn't support != yet
        not_missing_input = (
            self.one_tensor.float() - (input == self.missing_tensor).float()
        )
        feature_starts = self._get_type_boundaries()

        outputs = []
        for i, feature_type in enumerate(FEATURE_TYPES):
            begin_index = feature_starts[i]
            if (i + 1) == len(FEATURE_TYPES):
                end_index = len(self.normalization_parameters)
            else:
                end_index = feature_starts[i + 1]
            if begin_index == end_index:
                continue  # No features of this type
            if feature_type == ENUM:
                # Process one-at-a-time
                for j in range(begin_index, end_index):
                    norm_params = self.normalization_parameters[self.sorted_features[j]]
                    new_output = self._preprocess_feature_single_column(
                        j, input[:, j : j + 1], norm_params
                    )
                    new_output *= not_missing_input[:, j : j + 1]
                    self._check_preprocessing_output(new_output, [norm_params])
                    outputs.append(new_output)
            else:
                norm_params = []
                for f in self.sorted_features[begin_index:end_index]:
                    norm_params.append(self.normalization_parameters[f])
                new_output = self._preprocess_feature_multi_column(
                    begin_index, input[:, begin_index:end_index], norm_params
                )
                new_output *= not_missing_input[:, begin_index:end_index]
                self._check_preprocessing_output(new_output, norm_params)
                outputs.append(new_output)

        def wrap(output):
            if self.typed_output:
                return rlt.FeatureVector(float_features=output)
            else:
                return output

        if len(outputs) == 1:
            return wrap(torch.clamp(outputs[0], MIN_FEATURE_VALUE, MAX_FEATURE_VALUE))

        return wrap(
            torch.clamp(torch.cat(outputs, dim=1), MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)
        )

    def _preprocess_feature_single_column(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: NormalizationParameters,
    ) -> torch.Tensor:
        if isinstance(input, np.ndarray):
            input = torch.from_numpy(input).type(self.dtype)

        feature_type = norm_params.feature_type
        func = getattr(self, "_preprocess_" + feature_type)
        return func(begin_index, input, norm_params)

    def _preprocess_feature_multi_column(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        if isinstance(input, np.ndarray):
            input = torch.from_numpy(input).type(self.dtype)

        feature_type = norm_params[0].feature_type
        func = getattr(self, "_preprocess_" + feature_type)
        return func(begin_index, input, norm_params)

    def _create_parameters_BINARY(
        self, begin_index: int, norm_params: List[NormalizationParameters]
    ):
        pass

    def _preprocess_BINARY(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        # ONNX doesn't support != yet
        return self.one_tensor - (input == self.zero_tensor).float()

    def _create_parameters_PROBABILITY(
        self, begin_index: int, norm_params: List[NormalizationParameters]
    ):
        pass

    def _preprocess_PROBABILITY(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        clamped_input = torch.clamp(input, 0.01, 0.99)
        return self.negative_one_tensor * (
            ((self.one_tensor / clamped_input) - self.one_tensor).log()
        )

    def _create_parameters_CONTINUOUS_ACTION(
        self, begin_index: int, norm_params: List[NormalizationParameters]
    ):
        self._create_parameter(
            begin_index,
            "min_serving_value",
            torch.Tensor([p.min_value for p in norm_params]).type(self.dtype),
        )
        self._create_parameter(
            begin_index,
            "min_training_value",
            torch.ones(len(norm_params)).type(self.dtype) * -1 + EPS,
        )
        self._create_parameter(
            begin_index,
            "scaling_factor",
            (torch.ones(len(norm_params)).type(self.dtype) - EPS)
            * 2
            / torch.tensor([p.max_value - p.min_value for p in norm_params]).type(
                self.dtype
            ),
        )

    def _preprocess_CONTINUOUS_ACTION(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        min_serving_value = self._fetch_parameter(begin_index, "min_serving_value")
        min_training_value = self._fetch_parameter(begin_index, "min_training_value")
        scaling_factor = self._fetch_parameter(begin_index, "scaling_factor")
        continuous_action = (
            input - min_serving_value
        ) * scaling_factor + min_training_value
        return torch.clamp(continuous_action, -1 + EPS, 1 - EPS)

    def _create_parameters_CONTINUOUS(
        self, begin_index: int, norm_params: List[NormalizationParameters]
    ):
        self._create_parameter(
            begin_index,
            "means",
            torch.Tensor([p.mean for p in norm_params]).type(self.dtype),
        )
        self._create_parameter(
            begin_index,
            "stddevs",
            torch.Tensor([p.stddev for p in norm_params]).type(self.dtype),
        )

    def _preprocess_CONTINUOUS(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        means = self._fetch_parameter(begin_index, "means")
        stddevs = self._fetch_parameter(begin_index, "stddevs")
        continuous_output = (input - means) / stddevs
        return torch.clamp(continuous_output, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)

    def _create_parameters_BOXCOX(
        self, begin_index: int, norm_params: List[NormalizationParameters]
    ):
        self._create_parameter(
            begin_index,
            "shifts",
            torch.Tensor([p.boxcox_shift for p in norm_params]).type(self.dtype),
        )
        for p in norm_params:
            assert (
                abs(p.boxcox_lambda) > 1e-6
            ), "Invalid value for boxcox lambda: " + str(p.boxcox_lambda)
        self._create_parameter(
            begin_index,
            "lambdas",
            torch.Tensor([p.boxcox_lambda for p in norm_params]).type(self.dtype),
        )
        self._create_parameters_CONTINUOUS(begin_index, norm_params)

    def _preprocess_BOXCOX(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        shifts = self._fetch_parameter(begin_index, "shifts")
        lambdas = self._fetch_parameter(begin_index, "lambdas")
        boxcox_output = (
            # We can replace this with a normal pow() call after D8528654 lands
            self._manual_broadcast_matrix_scalar(
                torch.clamp(
                    input + shifts, 1e-6
                ),  # Clamp is necessary to prevent MISSING_VALUE from going to NaN
                lambdas,
                torch.pow,
            )
            - self.one_tensor
        ) / lambdas
        return self._preprocess_CONTINUOUS(begin_index, boxcox_output, norm_params)

    def _create_parameters_QUANTILE(
        self, begin_index: int, norm_params: List[NormalizationParameters]
    ):
        F = len(norm_params)

        num_quantiles = torch.tensor(
            [[float(len(p.quantiles)) - 1 for p in norm_params]]
        ).type(self.dtype)
        self._create_parameter(begin_index, "num_quantiles", num_quantiles)

        max_num_quantile_boundaries = int(
            torch.max(torch.tensor([len(p.quantiles) for p in norm_params]))
        )
        B = max_num_quantile_boundaries

        # The quantile boundaries form an FxB matrix, where B is the max # of boundaries

        # We take advantage of the fact that if the value is >= the max
        # quantile boundary it automatically gets a 1.0 to repeat the max quantile
        # so that we guarantee a square matrix.

        # We project the quantiles boundaries to 3d and create a 1xFxB tensor
        quantile_boundaries = torch.zeros(
            [1, len(norm_params), max_num_quantile_boundaries]
        ).type(self.dtype)
        max_quantile_boundaries = torch.zeros([1, len(norm_params)]).type(self.dtype)
        min_quantile_boundaries = torch.zeros([1, len(norm_params)]).type(self.dtype)
        for i, p in enumerate(norm_params):
            quantile_boundaries[0, i, :] = p.quantiles[-1]
            quantile_boundaries[0, i, 0 : len(p.quantiles)] = torch.tensor(
                p.quantiles
            ).type(self.dtype)
            max_quantile_boundaries[0, i] = max(p.quantiles)
            min_quantile_boundaries[0, i] = min(p.quantiles)

        quantile_boundaries = quantile_boundaries.type(self.dtype)
        max_quantile_boundaries = max_quantile_boundaries.type(self.dtype)
        min_quantile_boundaries = min_quantile_boundaries.type(self.dtype)

        self._create_parameter(begin_index, "quantile_boundaries", quantile_boundaries)
        self._create_parameter(
            begin_index, "max_quantile_boundaries", max_quantile_boundaries
        )
        self._create_parameter(
            begin_index, "min_quantile_boundaries", min_quantile_boundaries
        )
        self._create_parameter(
            begin_index,
            "quantile_boundary_mask",
            torch.ones([1, F, B]).type(self.dtype),
        )

    def _preprocess_QUANTILE(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: List[NormalizationParameters],
    ) -> torch.Tensor:
        """
        Replace the value with its percentile in the range [0, 1].

        This preprocesses several features in a single step by putting the
        quantile boundaries in the third dimension and broadcasting.

        The input is a JxF matrix where J is the batch size and F is the # of features.
        """

        # The number of quantiles is a 1xF matrix
        num_quantiles = self._fetch_parameter(begin_index, "num_quantiles")

        quantile_boundaries = self._fetch_parameter(begin_index, "quantile_boundaries")
        max_quantile_boundaries = self._fetch_parameter(
            begin_index, "max_quantile_boundaries"
        )
        min_quantile_boundaries = self._fetch_parameter(
            begin_index, "min_quantile_boundaries"
        )

        # Add a third dimension and repeat to create a JxFxB matrix, where the
        # inputs are repeated B times in the third dimension.  We need to
        # do this because we can't broadcast both operands in different
        # dimensions in the same operation.

        # repeat() isn't supported here yet, so multiply by a mask instead
        mask = self._fetch_parameter(begin_index, "quantile_boundary_mask")
        expanded_inputs = input.unsqueeze(2) * mask

        input_greater_than_or_equal_to = (
            expanded_inputs >= quantile_boundaries
        ).float()

        input_less_than = (expanded_inputs < quantile_boundaries).float()
        set_to_max = (input >= max_quantile_boundaries).float()
        set_to_min = (input <= min_quantile_boundaries).float()
        min_or_max = (set_to_min + set_to_max).float()
        interpolate = (min_or_max < self.one_hundredth_tensor).float()
        interpolate_left, _ = torch.max(
            (input_greater_than_or_equal_to * quantile_boundaries)
            + (input_less_than * self.min_tensor),
            dim=2,
        )
        interpolate_right, _ = torch.min(
            (input_less_than * quantile_boundaries)
            + (input_greater_than_or_equal_to * self.max_tensor),
            dim=2,
        )

        # This assumes that we need to interpolate and computes the value.
        # If we don't need to interpolate, this will be some bogus value, but it
        # will be multiplied by 0 so no big deal.
        left_start = torch.sum(input_greater_than_or_equal_to, dim=2) - self.one_tensor
        interpolated_values = (
            (
                left_start
                + (
                    (input - interpolate_left)
                    / (
                        (interpolate_right + self.epsilon_tensor) - interpolate_left
                    )  # Add a small amount to interpolate_right to avoid div-0
                )
            )
            / num_quantiles
        ).float()
        return set_to_max + (interpolate * interpolated_values).float()

    def _create_parameters_ENUM(
        self, begin_index: int, norm_params: NormalizationParameters
    ):
        self._create_parameter(
            begin_index,
            "enum_values",
            torch.Tensor(norm_params.possible_values).unsqueeze(0).type(self.dtype),
        )

    def _preprocess_ENUM(
        self,
        begin_index: int,
        input: torch.Tensor,
        norm_params: NormalizationParameters,
    ) -> torch.Tensor:
        enum_values = self._fetch_parameter(begin_index, "enum_values")
        return (input == enum_values).float()

    def _sort_features_by_normalization(self):
        """
        Helper function to return a sorted list from a normalization map.
        Also returns the starting index for each feature type"""
        # Sort features by feature type
        sorted_features = []
        feature_starts = []
        assert isinstance(
            list(self.normalization_parameters.keys())[0], int
        ), "Normalization Parameters need to be int"
        for feature_type in FEATURE_TYPES:
            feature_starts.append(len(sorted_features))
            for feature in sorted(self.normalization_parameters.keys()):
                norm = self.normalization_parameters[feature]
                if norm.feature_type == feature_type:
                    sorted_features.append(feature)
        return sorted_features, feature_starts

    def _get_type_boundaries(self) -> List[int]:
        feature_starts = []
        on_feature_type = -1
        for i, feature in enumerate(self.sorted_features):
            feature_type = self.normalization_parameters[feature].feature_type
            feature_type_index = FEATURE_TYPES.index(feature_type)
            assert (
                feature_type_index >= on_feature_type
            ), "Features are not sorted by feature type!"
            while feature_type_index > on_feature_type:
                feature_starts.append(i)
                on_feature_type += 1
        while on_feature_type < len(FEATURE_TYPES):
            feature_starts.append(len(self.sorted_features))
            on_feature_type += 1
        return feature_starts

    def _create_parameter(
        self, begin_index: int, name: str, t: torch.Tensor
    ) -> Parameter:
        p = Parameter(t, requires_grad=False)
        setattr(self, "_auto_parameter_" + str(begin_index) + "_" + name, p)
        return p

    def _fetch_parameter(self, begin_index: int, name: str) -> Parameter:
        return getattr(self, "_auto_parameter_" + str(begin_index) + "_" + name)

    def _manual_broadcast_matrix_scalar(
        self, t1: torch.Tensor, s1: torch.Tensor, fn
    ) -> torch.Tensor:
        # Some ONNX ops don't support broadcasting so we need to do some matrix magic
        return fn(t1, (t1 * self.zero_tensor) + s1).float()

    def _manual_broadcast_column_vec_row_vec(
        self, t1: torch.Tensor, t2: torch.Tensor, fn
    ) -> torch.Tensor:
        # Some ONNX ops don't support broadcasting so we need to do some matrix magic
        t2_ones = t2 / t2
        t1_mask = t1.mm(t2_ones)

        return fn(t1_mask, t2).float()

    def _check_preprocessing_output(self, batch, norm_params):
        """
        Check that preprocessed features fall within range of valid output.
        :param batch: torch tensor
        :param norm_params: list of normalization parameters
        """
        feature_type = norm_params[0].feature_type
        min_value, max_value = batch.min(), batch.max()
        if feature_type == "CONTINUOUS":
            # Continuous features may be in range (-inf, inf)
            pass
        elif bool(max_value > MAX_FEATURE_VALUE):
            raise Exception(
                "A {} feature type has max value {} which is > than accepted post pre-processing max of {}".format(
                    feature_type, max_value, MAX_FEATURE_VALUE
                )
            )
        elif bool(min_value < MIN_FEATURE_VALUE):
            raise Exception(
                "A {} feature type has min value {} which is < accepted post pre-processing min of {}".format(
                    feature_type, min_value, MIN_FEATURE_VALUE
                )
            )
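As a compact illustration of the per-feature-type transforms above, the sketch below restates the BINARY and CONTINUOUS cases as standalone functions. The clamp bounds are placeholder assumptions, not the module's actual MIN_FEATURE_VALUE / MAX_FEATURE_VALUE constants.

import torch

MIN_CLAMP, MAX_CLAMP = -3.0, 3.0  # assumed clamp range, for illustration only

def preprocess_binary(x: torch.Tensor) -> torch.Tensor:
    # 1.0 where the raw value is non-zero, 0.0 otherwise (mirrors _preprocess_BINARY,
    # which avoids `!=` for ONNX compatibility)
    return 1.0 - (x == 0.0).float()

def preprocess_continuous(x: torch.Tensor, mean: torch.Tensor, std: torch.Tensor) -> torch.Tensor:
    # per-column z-score followed by clamping (mirrors _preprocess_CONTINUOUS)
    return torch.clamp((x - mean) / std, MIN_CLAMP, MAX_CLAMP)

x = torch.tensor([[0.0, 10.0], [1.0, 12.0]])
print(preprocess_binary(x[:, :1]))
print(preprocess_continuous(x[:, 1:], mean=torch.tensor([11.0]), std=torch.tensor([1.0])))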
Example #33
0
class SelfAttnRNN(nn.Module):
    def __init__(self, args, data):
        super().__init__()
        self.n_input = 1
        self.m = data.m
        self.w = args.window
        self.hid = args.n_hidden
        self.rnn_cell = nn.RNNCell(input_size=self.n_input,
                                   hidden_size=self.hid)
        self.V = Parameter(torch.Tensor(self.hid, 1))
        self.Wx = Parameter(torch.Tensor(self.hid, self.n_input))
        self.Wtlt = Parameter(torch.Tensor(self.hid, self.hid))
        self.Wh = Parameter(torch.Tensor(self.hid, self.hid))
        self.init_weights()
        self.out = nn.Linear(self.hid, 1)

    def init_weights(self):
        for p in self.parameters():
            if p.data.ndimension() >= 2:
                # Xavier-uniform init for weight matrices
                nn.init.xavier_uniform_(p.data)
            else:
                # uniform init in [-1/sqrt(n), 1/sqrt(n)] for vectors and biases
                stdv = 1. / math.sqrt(p.size(0))
                p.data.uniform_(-stdv, stdv)

    def forward(self, x):
        '''
        Args: x: (batch, time_step, m)
        Returns: out of shape (batch, m), and None
        '''
        b, w, m = x.size()
        x = x.permute(0, 2, 1).contiguous().view(
            x.size(0) * x.size(2), x.size(1), self.n_input)  # x, 20, 1
        Htlt = []
        H = []
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        for step in range(self.w):  # forloop each history step
            x_tp1 = x[:, step, :]  # [x, 1]
            if step == 0:
                hx = torch.zeros(b * m, self.hid).to(device)
                H.append(hx)
                h_tlt = torch.zeros(b * m, self.hid).to(device)
            else:
                h_tlt = Htlt[-1]
            h_his = torch.stack(H, dim=1)
            if step > 0:
                x_tp1_rp = x_tp1.repeat(1, step + 1).view(b * m, step + 1, -1)
                h_tlt_rp = h_tlt.repeat(1, step + 1).view(b * m, step + 1, -1)
            else:
                x_tp1_rp = x_tp1
                h_tlt_rp = h_tlt
            q1 = x_tp1_rp @ self.Wx.t()  # [x, 20]
            q2 = h_tlt_rp @ self.Wtlt.t()  # [x, 20]
            q3 = h_his @ self.Wh.t()  # [x, 20]
            a = torch.tanh(q1 + q2 + q3) @ self.V  # [x, 1]
            a = torch.softmax(a, dim=-1)
            h_tlt_t = h_his * a
            h_tlt_t = torch.sum(h_tlt_t, dim=1)
            Htlt.append(h_tlt_t)
            hx = self.rnn_cell(x_tp1, h_tlt_t)  # [x, 20]
            H.append(hx)
        h = H[-1]
        out = self.out(h)
        out = out.view(b, m)
        return out, None
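The step-wise loop above builds an additive attention over all previous hidden states. The hedged sketch below restates that scoring rule as a standalone function, e_s = V^T tanh(Wx x_t + Wtlt h~ + Wh h_s) with softmax taken over the history dimension; shapes and names are illustrative and do not exactly mirror the example's broadcasting.

import torch

def attend_history(x_t, h_tlt, h_his, Wx, Wtlt, Wh, V):
    # x_t: (B, 1) current input, h_tlt: (B, H) previous summary, h_his: (B, S, H) past hidden states
    q = (x_t @ Wx.t() + h_tlt @ Wtlt.t()).unsqueeze(1) + h_his @ Wh.t()  # (B, S, H)
    e = torch.tanh(q) @ V                       # (B, S, 1) unnormalized scores
    a = torch.softmax(e, dim=1)                 # attention weights over the S history steps
    return (h_his * a).sum(dim=1)               # (B, H) attended summary h~_t

B, S, H = 4, 6, 20
out = attend_history(torch.randn(B, 1), torch.randn(B, H), torch.randn(B, S, H),
                     torch.randn(H, 1), torch.randn(H, H), torch.randn(H, H), torch.randn(H, 1))
print(out.shape)  # torch.Size([4, 20])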
Example #34
0
class GPLVM(Parameterized):
    """
    Gaussian Process Latent Variable Model (GPLVM) model.

    GPLVM is a Gaussian Process model whose training inputs are latent variables.
    The model is useful for dimensionality reduction of high dimensional data: the
    mapping from the low dimensional latent variable to the observed data is assumed
    to be a Gaussian Process. The high dimensional data then plays the role of the
    train output ``y``, and the target is to learn latent inputs which best explain
    ``y``. For the purpose of dimensionality reduction, the latent inputs should have
    lower dimension than ``y``.

    Following reference [1], we put a unit Gaussian prior on the input and approximate
    its posterior by a multivariate normal distribution with two variational
    parameters: ``X_loc`` and ``X_scale_tril``.

    For example, we can do dimensional reduction on Iris dataset as follows:

        >>> # With y as the 2D Iris data of shape 150x4, we want to reduce its dimension
        >>> # to a tensor X of shape 150x2, we will use GPLVM.

        .. doctest::
           :hide:

            >>> # Simulating iris data.
            >>> y = torch.stack([dist.Normal(4.8, 0.1).sample((150,)),
            ...                 dist.Normal(3.2, 0.3).sample((150,)),
            ...                 dist.Normal(1.5, 0.4).sample((150,)),
            ...                 dist.Exponential(0.5).sample((150,))])

        >>> # First, define the initial values for X_loc parameter:
        >>> X_loc = torch.zeros(150, 2)
        >>> # Then, define a Gaussian Process model with input X_loc and output y:
        >>> kernel = gp.kernels.RBF(input_dim=2, lengthscale=torch.ones(2))
        >>> Xu = torch.zeros(20, 2)  # initial inducing inputs of sparse model
        >>> gpmodel = gp.models.SparseGPRegression(X_loc, y, kernel, Xu)
        >>> # Finally, wrap gpmodel by GPLVM, optimize, and get the "learned" mean of X:
        >>> gplvm = gp.models.GPLVM(gpmodel)
        >>> gplvm.optimize()  # doctest: +SKIP
        >>> X = gplvm.get_param("X_loc")

    Reference:

    [1] Bayesian Gaussian Process Latent Variable Model
    Michalis K. Titsias, Neil D. Lawrence

    :param ~pyro.contrib.gp.models.model.GPModel base_model: A Pyro Gaussian Process
        model object. Note that ``base_model.X`` will be the initial value for the
        variational parameter ``X_loc``.
    :param str name: Name of this model.
    """
    def __init__(self, base_model, name="GPLVM"):
        super(GPLVM, self).__init__(name)
        if base_model.X.dim() != 2:
            raise ValueError("GPLVM model only works with 2D latent X, but got "
                             "X.dim() = {}.".format(base_model.X.dim()))
        self.base_model = base_model
        self.y = self.base_model.y

        self.X_loc = Parameter(self.base_model.X)

        C = self.X_loc.shape[1]
        X_scale_tril_shape = self.X_loc.shape + (C,)
        Id = torch.eye(C, out=self.X_loc.new_empty(C, C))
        X_scale_tril = Id.expand(X_scale_tril_shape)
        self.X_scale_tril = Parameter(X_scale_tril)
        self.set_constraint("X_scale_tril", constraints.lower_cholesky)

        self._call_base_model_guide = True

    def model(self):
        self.set_mode("model", recursive=False)

        # sample X from unit multivariate normal distribution
        zero_loc = self.X_loc.new_zeros(self.X_loc.shape)
        C = self.X_loc.shape[1]
        Id = torch.eye(C, out=self.X_loc.new_empty(C, C))
        X_name = param_with_module_name(self.name, "X")
        X = pyro.sample(X_name, dist.MultivariateNormal(zero_loc, scale_tril=Id)
                                    .independent(zero_loc.dim()-1))

        self.base_model.set_data(X, self.y)
        self.base_model.model()

    def guide(self):
        self.set_mode("guide", recursive=False)

        # sample X from variational multivariate normal distribution
        X_loc = self.get_param("X_loc")
        X_scale_tril = self.get_param("X_scale_tril")
        X_name = param_with_module_name(self.name, "X")
        X = pyro.sample(X_name,
                        dist.MultivariateNormal(X_loc, scale_tril=X_scale_tril)
                            .independent(X_loc.dim()-1))

        self.base_model.set_data(X, self.y)
        if self._call_base_model_guide:
            self.base_model.guide()

    def forward(self, **kwargs):
        """
        Forward method has the same signature as its ``base_model``. Note that the train
        input data of ``base_model`` is sampled from GPLVM.
        """
        # avoid calling base_model's guide two times
        self._call_base_model_guide = False
        self.guide()
        self._call_base_model_guide = True
        return self.base_model(**kwargs)

    def optimize(self, optimizer=optim.Adam({}), num_steps=1000):
        """
        A convenient method to optimize parameters for GPLVM model using
        :class:`~pyro.infer.svi.SVI`.

        :param ~optim.PyroOptim optimizer: A Pyro optimizer.
        :param int num_steps: Number of steps to run SVI.
        :returns: a list of losses during the training procedure
        :rtype: list
        """
        if not isinstance(optimizer, optim.PyroOptim):
            raise ValueError("Optimizer should be an instance of "
                             "pyro.optim.PyroOptim class.")
        svi = infer.SVI(self.model, self.guide, optimizer, loss=infer.Trace_ELBO())
        losses = []
        for i in range(num_steps):
            losses.append(svi.step())
        return losses
Example #35
0
    def __init__(self, args, data):
        super().__init__()
        self.x_h = 1
        self.f_h = data.m
        self.m = data.m
        self.d = data.d
        self.w = args.window
        self.h = args.horizon
        self.adj = data.adj
        self.o_adj = data.orig_adj
        if args.cuda:
            self.adj = sparse_mx_to_torch_sparse_tensor(
                normalize_adj2(data.orig_adj.cpu().numpy())).to_dense().cuda()
        else:
            self.adj = sparse_mx_to_torch_sparse_tensor(
                normalize_adj2(data.orig_adj.cpu().numpy())).to_dense()
        self.dropout = args.dropout
        self.n_hidden = args.n_hidden
        half_hid = int(self.n_hidden / 2)
        self.V = Parameter(torch.Tensor(half_hid))
        self.bv = Parameter(torch.Tensor(1))
        self.W1 = Parameter(torch.Tensor(half_hid, self.n_hidden))
        self.b1 = Parameter(torch.Tensor(half_hid))
        self.W2 = Parameter(torch.Tensor(half_hid, self.n_hidden))
        self.act = F.elu
        self.Wb = Parameter(torch.Tensor(self.m, self.m))
        self.wb = Parameter(torch.Tensor(1))
        self.k = args.k
        self.conv = nn.Conv1d(1, self.k, self.w)
        self.conv_long = nn.Conv1d(1, self.k, self.w - self.k, dilation=2)
        self.n_spatial = args.hidsp  # TODO: check that this equals k (was self.h)

        self.conv1 = GraphConvLayer(self.k * 3, self.n_hidden)  # self.k
        self.conv2 = GraphConvLayer(self.n_hidden, self.n_spatial)

        if args.rnn_model == 'LSTM':
            self.rnn = nn.LSTM(input_size=self.x_h,
                               hidden_size=self.n_hidden,
                               num_layers=args.n_layer,
                               dropout=args.dropout,
                               batch_first=True,
                               bidirectional=args.bi)
        elif args.rnn_model == 'GRU':
            self.rnn = nn.GRU(input_size=self.x_h,
                              hidden_size=self.n_hidden,
                              num_layers=args.n_layer,
                              dropout=args.dropout,
                              batch_first=True,
                              bidirectional=args.bi)
        elif args.rnn_model == 'RNN':
            self.rnn = nn.RNN(input_size=self.x_h,
                              hidden_size=self.n_hidden,
                              num_layers=args.n_layer,
                              dropout=args.dropout,
                              batch_first=True,
                              bidirectional=args.bi)
        else:
            raise LookupError('only LSTM, GRU and RNN are supported')

        hidden_size = (int(args.bi) + 1) * self.n_hidden
        # NOTE: do not overwrite self.n_hidden with hidden_size here; doing so caused a bug in the bidirectional case
        self.out = nn.Linear(hidden_size + self.n_spatial, 1)

        self.residual_window = 0
        self.ratio = 1.0
        if (self.residual_window > 0):
            self.residual_window = min(self.residual_window, args.window)
            self.residual = nn.Linear(self.residual_window, 1)
        self.init_weights()
Example #36
0
class ConvVDO(ModuleWrapper):

    def __init__(self, in_channels, out_channels, kernel_size, alpha_shape, stride=1,
                 padding=0, dilation=1, prior='loguni', bias=True):
        super(ConvVDO, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = (kernel_size, kernel_size)
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.alpha_shape = alpha_shape
        self.groups = 1
        self.weight = Parameter(torch.Tensor(
            out_channels, in_channels, *self.kernel_size))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
        else:
            self.register_parameter('bias', None)
        self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride, self.padding, self.dilation, self.groups)
        self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride, self.padding, self.dilation, self.groups)
        self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
        self.reset_parameters()
        self.zero_mean = False
        self.permute_sigma = False
        self.prior = prior
        if prior == 'loguni':
            self.kl_fun = metrics.kl_loguni
        else:
            self.kl_fun = metrics.kl_ard

    def reset_parameters(self):
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)
        self.log_alpha.data.fill_(-5.0)

    def forward(self, x):
        if self.zero_mean:
            lrt_mean = self.op_bias(x, 0.0 * self.weight)
        else:
            lrt_mean = self.op_bias(x, self.weight)

        sigma2 = Variable.exp(self.log_alpha) * self.weight * self.weight
        if self.permute_sigma:
            sigma2 = sigma2.view(-1)[torch.randperm(self.weight.nelement()).cuda()].view(self.weight.shape)

        lrt_std = Variable.sqrt(1e-16 + self.op_nobias(x * x, sigma2))
        if self.training:
            eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
        else:
            eps = 0.0
        return lrt_mean + lrt_std * eps

    def kl_reg(self):
        return self.weight.nelement() / self.log_alpha.nelement() * metrics.kl_loguni(self.log_alpha)

    def __repr__(self):
        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        s += ', padding={padding}'
        s += ', alpha_shape=' + str(self.alpha_shape)
        s += ', prior=' + self.prior
        s += ', dilation={dilation}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)
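The forward() above implements the local reparameterization trick: a standard convolution with W gives the activation mean, a convolution of x*x with alpha * W^2 gives the activation variance, and a Gaussian sample is drawn per activation at training time. Below is a hedged restatement using current torch calls (torch.exp / torch.randn_like instead of the legacy Variable API); the kernel and log_alpha values are random placeholders.

import torch
import torch.nn.functional as F

def vdo_conv_forward(x, weight, log_alpha, bias=None, training=True):
    mean = F.conv2d(x, weight, bias, stride=1, padding=1)                         # E[out]
    sigma2 = torch.exp(log_alpha) * weight * weight                               # per-weight variance
    std = torch.sqrt(1e-16 + F.conv2d(x * x, sigma2, None, stride=1, padding=1))  # sqrt(Var[out])
    if not training:
        return mean
    return mean + std * torch.randn_like(std)                                     # sampled activations

x = torch.randn(2, 3, 8, 8)
w = torch.randn(16, 3, 3, 3) * 0.1
log_alpha = torch.full((1, 1), -5.0)
print(vdo_conv_forward(x, w, log_alpha).shape)  # torch.Size([2, 16, 8, 8])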
Example #37
0
 def __init__(self, in_features: int, out_features: int):
     super(ArcMarginProduct, self).__init__()
     self.weight = Parameter(torch.FloatTensor(out_features, in_features), requires_grad=True)
     self.reset_parameters()
 def _build_model(self):
     self.linear = nn.utils.weight_norm(
         nn.Linear(self.input_dim, self.out_dim))
     self.bias = Parameter(torch.Tensor(self.out_dim))  # assumed bias of size out_dim (the original passed the torch.Tensor class itself)
     self.register_buffer('running_mean', torch.zeros(self.out_dim))
     self.reset_parameter()
Example #39
0
 def beta(self, value):
     self._beta = Parameter(free_form(torch.as_tensor(value)))
Example #41
0
    def __init__(self,
                 basic_conv1=Conv2d_Hori_Veri_Cross,
                 basic_conv2=Conv2d_Diag_Cross,
                 theta=0.8):
        super(DC_CDN, self).__init__()

        self.conv1 = nn.Sequential(
            basic_conv1(3,
                        64,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )

        self.Block1 = nn.Sequential(
            basic_conv1(64,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            basic_conv1(128,
                        196,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(196),
            nn.ReLU(),
            basic_conv1(196,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        )

        self.Block2 = nn.Sequential(
            basic_conv1(128,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            basic_conv1(128,
                        196,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(196),
            nn.ReLU(),
            basic_conv1(196,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        )

        self.Block3 = nn.Sequential(
            basic_conv1(128,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            basic_conv1(128,
                        196,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(196),
            nn.ReLU(),
            basic_conv1(196,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        )

        self.lastconv1 = nn.Sequential(
            basic_conv1(128 * 3,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
        )

        self.lastconv2 = nn.Sequential(
            basic_conv1(128,
                        64,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )

        self.lastconv3 = nn.Sequential(
            #basic_conv1(64, 1, kernel_size=3, stride=1, padding=1, bias=False, theta= theta),
            nn.Conv2d(128, 1, kernel_size=1, stride=1, padding=0, bias=False),
            nn.ReLU(),
        )

        # 2nd stream
        self.conv1_2 = nn.Sequential(
            basic_conv2(3,
                        64,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )

        self.Block1_2 = nn.Sequential(
            basic_conv2(64,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            basic_conv2(128,
                        196,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(196),
            nn.ReLU(),
            basic_conv2(196,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        )

        self.Block2_2 = nn.Sequential(
            basic_conv2(128,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            basic_conv2(128,
                        196,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(196),
            nn.ReLU(),
            basic_conv2(196,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        )

        self.Block3_2 = nn.Sequential(
            basic_conv2(128,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            basic_conv2(128,
                        196,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(196),
            nn.ReLU(),
            basic_conv2(196,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        )

        self.lastconv1_2 = nn.Sequential(
            basic_conv2(128 * 3,
                        128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(128),
            nn.ReLU(),
        )

        self.lastconv2_2 = nn.Sequential(
            basic_conv2(128,
                        64,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=False,
                        theta=theta),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )

        #self.lastconv3_2 = nn.Sequential(
        #    basic_conv2(64, 1, kernel_size=3, stride=1, padding=1, bias=False, theta= theta),
        #    #nn.Conv2d(64, 1, kernel_size=1, stride=1, padding=0, bias=False),
        #    nn.ReLU(),
        #)

        #self.HP_branch1 = Parameter(torch.ones([3,1]))
        self.HP_branch1 = Parameter(torch.zeros([3, 1]))
        #self.HP_branch2 = Parameter(torch.ones([3,1]))
        self.HP_branch2 = Parameter(torch.zeros([3, 1]))

        self.downsample32x32 = nn.Upsample(size=(32, 32), mode='bilinear')
Example #42
0
 def __init__(self, feat_dim, num_class, margin=0.35, scale=32):
     super(AM_Softmax, self).__init__()
     self.weight = Parameter(torch.Tensor(feat_dim, num_class))
     self.weight.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5)
     self.margin = margin
     self.scale = scale
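Example #42 only shows the AM-Softmax weight initialization; the hedged sketch below shows how such a weight is typically used to form margin-adjusted logits (cosine similarity, margin subtracted from the target class, then scaled). It is a generic illustration, not this example's own forward pass.

import torch
import torch.nn.functional as F

def am_softmax_logits(feats, weight, labels, margin=0.35, scale=32):
    # feats: (B, feat_dim), weight: (feat_dim, num_class), labels: (B,)
    cos = F.normalize(feats, dim=1) @ F.normalize(weight, dim=0)   # cosine similarities
    one_hot = F.one_hot(labels, num_classes=weight.size(1)).to(cos.dtype)
    return scale * (cos - margin * one_hot)                        # margin only on the target class

feats = torch.randn(4, 128)
weight = torch.randn(128, 10)
labels = torch.randint(0, 10, (4,))
print(am_softmax_logits(feats, weight, labels).shape)  # torch.Size([4, 10])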
class MultiheadAttention(nn.Module):
    """Multi-headed attention.
    See "Attention Is All You Need" for more details.
    """

    def __init__(self, embed_dim, num_heads, attn_dropout=0.,
                 bias=True, add_bias_kv=False, add_zero_attn=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.attn_dropout = attn_dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
        self.register_parameter('in_proj_bias', None)
        if bias:
            self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.in_proj_weight)
        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.in_proj_bias is not None:
            nn.init.constant_(self.in_proj_bias, 0.)
            nn.init.constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(self, query, key, value, attn_mask=None):
        """Input shape: Time x Batch x Channel
        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Timesteps can be masked by supplying a T x T mask in the
        `attn_mask` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """
        qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
        kv_same = key.data_ptr() == value.data_ptr()

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert key.size() == value.size()

        saved_state = None

        if qkv_same:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif kv_same:
            # encoder-decoder attention
            q = self.in_proj_q(query)

            if key is None:
                assert value is None
                k = v = None
            else:
                k, v = self.in_proj_kv(key)
        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)

        # reshape to (bsz * num_heads, tgt_len, head_dim) for batched attention
        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

        src_len = k.size(1)

        if self.add_zero_attn:
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
        
        attn_weights = torch.bmm(q, k.transpose(1, 2))
        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            # attn_mask is additive and broadcasts over the (bsz * num_heads) dimension
            attn_weights += attn_mask.unsqueeze(0)
                
        attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)
        # attn_weights = F.relu(attn_weights)
        # attn_weights = attn_weights / torch.max(attn_weights)
        attn_weights = F.dropout(attn_weights, p=self.attn_dropout, training=self.training)

        attn = torch.bmm(attn_weights, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]

        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        # average attention weights over heads
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.sum(dim=1) / self.num_heads
        return attn, attn_weights

    def in_proj_qkv(self, query):
        return self._in_proj(query).chunk(3, dim=-1)

    def in_proj_kv(self, key):
        return self._in_proj(key, start=self.embed_dim).chunk(2, dim=-1)

    def in_proj_q(self, query, **kwargs):
        return self._in_proj(query, end=self.embed_dim, **kwargs)

    def in_proj_k(self, key):
        return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)

    def in_proj_v(self, value):
        return self._in_proj(value, start=2 * self.embed_dim)

    def _in_proj(self, input, start=0, end=None, **kwargs):
        weight = kwargs.get('weight', self.in_proj_weight)
        bias = kwargs.get('bias', self.in_proj_bias)
        weight = weight[start:end, :]
        if bias is not None:
            bias = bias[start:end]
        return F.linear(input, weight, bias)
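# Minimal self-attention usage sketch for the MultiheadAttention module above;
# the shapes are illustrative assumptions (50 timesteps, batch of 8, embed_dim 32)
# and torch is assumed to be imported as in the rest of these examples.
mha = MultiheadAttention(embed_dim=32, num_heads=4, attn_dropout=0.1)
x = torch.randn(50, 8, 32)                 # Time x Batch x Channel, as documented above
attn_out, attn_weights = mha(x, x, x)      # self-attention: same tensor for query/key/value
# attn_out: (50, 8, 32); attn_weights: (8, 50, 50), averaged over the 4 heads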
Example #44
0
 def test_parameter_sharing(self):
     param = Parameter(torch.arange(1., 26).view(5, 5))
     self._test_autograd_sharing(param, is_parameter=True)
Example #45
0
    def __init__(self,
                 *,
                 inputSize,
                 hiddenSize,
                 train=True,
                 dr=0.5,
                 drMethod='gal+sem',
                 gpu=0):
        super(LSTMcell_untied, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        self.w_xi = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xf = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xo = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xc = Parameter(torch.Tensor(hiddenSize, inputSize))

        self.w_hi = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hf = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_ho = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hc = Parameter(torch.Tensor(hiddenSize, hiddenSize))

        self.b_i = Parameter(torch.Tensor(hiddenSize))
        self.b_f = Parameter(torch.Tensor(hiddenSize))
        self.b_o = Parameter(torch.Tensor(hiddenSize))
        self.b_c = Parameter(torch.Tensor(hiddenSize))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        self.train = train
        if gpu >= 0:
            self = self.cuda(gpu)
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()
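    # The __init__ above ends with self.reset_parameters(), which the snippet does
    # not show. A common initialisation for such untied LSTM weights is uniform in
    # +/- 1/sqrt(hiddenSize); this is a sketch of one plausible implementation, not
    # necessarily the original one (math is assumed to be imported).
    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hiddenSize)
        for weight in self.parameters():
            weight.data.uniform_(-std, std)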
Example #46
0
 def test_cuda_parameter_sharing(self):
     param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5))
     self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True)
 def __init__(self):
     super().__init__()
     self.mu = Parameter(FloatTensor([0.0]))
     self.log_variance = Parameter(FloatTensor([0.0]))
    def __init__(self, num_layers=50, drop_ratio=0.4, mode='ir_se',
                 stn_mode='resnet'):
        super(ResnetFaceSTN, self).__init__()
        assert num_layers in [50, 100, 152]
        assert mode in ['ir', 'ir_se']
        assert stn_mode in ['resnet', 'cbp']

        blocks = get_blocks(num_layers)
        if mode == 'ir':
            unit_module = bottleneck_IR
        elif mode == 'ir_se':
            unit_module = bottleneck_IR_SE


        if stn_mode == 'cbp':
            self.localization = Sequential(
                Conv2d(3, 16, kernel_size=7, padding=1, stride=2, bias=False),
                BatchNorm2d(16),
                PReLU(16),
                Conv2d(16, 32, kernel_size=5, padding=1, stride=2, bias=False),
                BatchNorm2d(32),
                PReLU(32),
                Conv2d(32, 32, kernel_size=5, padding=1, stride=2, bias=False),
                BatchNorm2d(32),
                PReLU(32),
                Conv2d(32, 64, kernel_size=5, padding=1, stride=2, bias=False),
                BatchNorm2d(64),
                PReLU(64),
                Conv2d(64, 64, kernel_size=5, padding=1, stride=1, bias=False),
                BatchNorm2d(64),
                PReLU(64)
            )

            self.fc_loc = Sequential(
                Flatten(),
                Linear(64 * 4 * 4, 6)
            )

        elif stn_mode == 'resnet':
            self.localization = Sequential(
                bottleneck_IR(3, 16, 2),
                bottleneck_IR(16, 32, 2),
                bottleneck_IR(32, 32, 2),
                bottleneck_IR(32, 64, 2),
                bottleneck_IR(64, 64, 1),
                torch.nn.AdaptiveAvgPool2d(1)
            )

            self.fc_loc = Sequential(
                Flatten(),
                Linear(64 * 1 * 1, 6)
            )


        self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False),
                                      BatchNorm2d(64),
                                      PReLU(64))
        self.output_layer = Sequential(BatchNorm2d(512),
                                       Dropout(drop_ratio),
                                       Flatten(),
                                       Linear(512 * 7 * 7, 512),
                                       BatchNorm1d(512))
        modules = []
        for block in blocks:
            for bottleneck in block:
                modules.append(
                    unit_module(bottleneck.in_channel,
                                bottleneck.depth,
                                bottleneck.stride))
        self.body = Sequential(*modules)

        self.fc_loc[1].weight.data.zero_()
        # WARNING remember to change the bias according to input size
        # NOTE for img size 128 -> 112
        self.fc_loc[1].bias.data.copy_(torch.tensor([1, 0, 0,
                                                     0, 1, 0],
                                                    dtype=torch.float32))

        self.warp_param_adder = Parameter(torch.ones(1, 1))
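    # The snippet above only defines layers. In a spatial transformer the
    # localisation branch typically predicts a 2x3 affine matrix that warps the
    # input before it enters the recognition trunk; the method below is a sketch
    # under that assumption (F = torch.nn.functional), not the original forward.
    def stn(self, x):
        theta = self.fc_loc(self.localization(x)).view(-1, 2, 3)    # affine params per image
        grid = F.affine_grid(theta, x.size(), align_corners=False)
        return F.grid_sample(x, grid, align_corners=False)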
Example #49
0
 def _create_parameter(self, begin_index: int, name: str,
                       t: torch.Tensor) -> Parameter:
     p = Parameter(t, requires_grad=False)
     setattr(self, "_auto_parameter_" + str(begin_index) + "_" + name, p)
     return p
Example #50
0
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation,
                 output_padding,
                 groups,
                 p_logvar_init=-3,
                 p_pi=1.0,
                 q_logvar_init=-5):
        super(_ConvNd, self).__init__()
        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.output_padding = output_padding
        self.groups = groups

        # initialize log variance of p and q
        self.p_logvar_init = p_logvar_init
        self.q_logvar_init = q_logvar_init

        # approximate posterior weights...
        self.qw_mean = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size))
        self.qw_logvar = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size))

        # optionally add bias
        # self.qb_mean = Parameter(torch.Tensor(out_channels))
        # self.qb_logvar = Parameter(torch.Tensor(out_channels))

        # ...and output...
        self.conv_qw_mean = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size))
        self.conv_qw_std = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size))

        # ...as normal distributions
        self.qw = Normal(mu=self.qw_mean, logvar=self.qw_logvar)
        # self.qb = Normal(mu=self.qb_mean, logvar=self.qb_logvar)

        self.conv_qw = Normalout(mu=self.conv_qw_mean, si=self.conv_qw_std)

        # initialise
        self.log_alpha = Parameter(torch.Tensor(1, 1))

        # prior model
        # (does not have any trainable parameters so we use fixed normal or fixed mixture normal distributions)
        self.pw = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)
        # self.pb = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)

        # initialize all parameters
        self.reset_parameters()
Example #51
0
    def __init__(
        self,
        normalization_parameters: Dict[str, NormalizationParameters],
        use_gpu: bool,
        typed_output: bool = False,
    ) -> None:
        super(Preprocessor, self).__init__()
        self.normalization_parameters = normalization_parameters
        self.sorted_features, self.sorted_feature_boundaries = (
            self._sort_features_by_normalization()
        )
        self.typed_output = typed_output

        cuda_available = torch.cuda.is_available()
        logger.info("CUDA availability: {}".format(cuda_available))
        if use_gpu and cuda_available:
            logger.info("Using GPU: GPU requested and available.")
            self.use_gpu = True
            self.dtype = torch.cuda.FloatTensor
        else:
            logger.info("NOT Using GPU: GPU not requested or not available.")
            self.use_gpu = False
            self.dtype = torch.FloatTensor

        # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net,
        # We need to make tensors for every numeric literal
        self.zero_tensor = Parameter(
            torch.tensor([0.0]).type(self.dtype), requires_grad=False
        )
        self.one_tensor = Parameter(
            torch.tensor([1.0]).type(self.dtype), requires_grad=False
        )
        self.one_half_tensor = Parameter(
            torch.tensor([0.5]).type(self.dtype), requires_grad=False
        )
        self.one_hundredth_tensor = Parameter(
            torch.tensor([0.01]).type(self.dtype), requires_grad=False
        )
        self.negative_one_tensor = Parameter(
            torch.tensor([-1.0]).type(self.dtype), requires_grad=False
        )
        self.missing_tensor = Parameter(
            torch.tensor([MISSING_VALUE]).type(self.dtype), requires_grad=False
        )
        self.min_tensor = Parameter(
            torch.tensor([-1e20]).type(self.dtype), requires_grad=False
        )
        self.max_tensor = Parameter(
            torch.tensor([1e20]).type(self.dtype), requires_grad=False
        )
        self.epsilon_tensor = Parameter(
            torch.tensor([EPS]).type(self.dtype), requires_grad=False
        )

        feature_starts = self._get_type_boundaries()
        for i, feature_type in enumerate(FEATURE_TYPES):
            begin_index = feature_starts[i]
            if (i + 1) == len(FEATURE_TYPES):
                end_index = len(self.normalization_parameters)
            else:
                end_index = feature_starts[i + 1]
            if begin_index == end_index:
                continue  # No features of this type
            if feature_type == ENUM:
                # Process one-at-a-time
                for j in range(begin_index, end_index):
                    norm_params = self.normalization_parameters[self.sorted_features[j]]
                    func = getattr(self, "_create_parameters_" + feature_type)
                    func(j, norm_params)
            else:
                norm_params = []
                for f in self.sorted_features[begin_index:end_index]:
                    norm_params.append(self.normalization_parameters[f])
                func = getattr(self, "_create_parameters_" + feature_type)
                func(begin_index, norm_params)
Example #52
0
class VariationalSparseGP(GPModel):
    r"""
    Variational Sparse Gaussian Process model.

    In the :class:`.VariationalGP` model, when the number of input data points :math:`X`
    is large, inverting the covariance matrix :math:`k(X, X)` (for the log likelihood and
    for prediction) becomes computationally expensive. This model introduces an
    additional inducing-input parameter :math:`X_u` to solve that problem. Given
    inputs :math:`X`, their noisy observations :math:`y`, and the inducing-input
    parameters :math:`X_u`, the model takes the form:

    .. math::
        [f, u] &\sim \mathcal{GP}(0, k([X, X_u], [X, X_u])),\\
        y & \sim p(y) = p(y \mid f) p(f),

    where :math:`p(y \mid f)` is the likelihood.

    We take a variational approach in this model, approximating the posterior
    :math:`p(f,u \mid y)` by :math:`q(f,u)`. Precisely, :math:`q(f) = p(f\mid u)q(u)`,
    where :math:`q(u)` is a multivariate normal distribution with two parameters
    ``u_loc`` and ``u_scale_tril``, which are learned during variational inference.

    .. note:: This model can also be learned with MCMC, as in reference [2]. See also
        :class:`.GPModel`.

    .. note:: This model has :math:`\mathcal{O}(NM^2)` complexity for training and
        :math:`\mathcal{O}(M^3)` complexity for testing. Here, :math:`N` is the number
        of training inputs and :math:`M` is the number of inducing inputs. The size of
        the variational parameters is :math:`\mathcal{O}(M^2)`.

    References:

    [1] `Scalable variational Gaussian process classification`,
    James Hensman, Alexander G. de G. Matthews, Zoubin Ghahramani

    [2] `MCMC for Variationally Sparse Gaussian Processes`,
    James Hensman, Alexander G. de G. Matthews, Maurizio Filippone, Zoubin Ghahramani

    :param torch.Tensor X: An input tensor of training data. Its first dimension
        is the number of data points.
    :param torch.Tensor y: An output tensor of training data. Its last dimension
        is the number of data points.
    :param ~pyro.contrib.gp.kernels.kernel.Kernel kernel: A Pyro kernel object, which
        is the covariance function :math:`k`.
    :param torch.Tensor Xu: Initial values for inducing points, which are parameters
        of our model.
    :param ~pyro.contrib.gp.likelihoods.likelihood.Likelihood likelihood: A likelihood
        object.
    :param callable mean_function: An optional mean function :math:`m` of this Gaussian
        process. By default, we use zero mean.
    :param torch.Size latent_shape: Shape for latent processes (`batch_shape` of
        :math:`q(u)`). By default, it equals the output batch shape ``y.shape[:-1]``.
        For multi-class classification problems, ``latent_shape[-1]`` should
        correspond to the number of classes.
    :param int num_data: The size of the full training dataset. It is useful for
        training this model with mini-batches.
    :param bool whiten: A flag to tell if variational parameters ``u_loc`` and
        ``u_scale_tril`` are transformed by the inverse of ``Luu``, where ``Luu`` is
        the lower triangular decomposition of :math:`kernel(X_u, X_u)`. Enabling this
        flag will help optimization.
    :param float jitter: A small positive term which is added to the diagonal part of
        a covariance matrix to help stabilize its Cholesky decomposition.
    """
    def __init__(self,
                 X,
                 y,
                 kernel,
                 Xu,
                 likelihood,
                 mean_function=None,
                 latent_shape=None,
                 num_data=None,
                 whiten=False,
                 jitter=1e-6):
        super().__init__(X, y, kernel, mean_function, jitter)

        self.likelihood = likelihood
        self.Xu = Parameter(Xu)

        y_batch_shape = self.y.shape[:-1] if self.y is not None else torch.Size(
            [])
        self.latent_shape = latent_shape if latent_shape is not None else y_batch_shape

        M = self.Xu.size(0)
        u_loc = self.Xu.new_zeros(self.latent_shape + (M, ))
        self.u_loc = Parameter(u_loc)

        identity = eye_like(self.Xu, M)
        u_scale_tril = identity.repeat(self.latent_shape + (1, 1))
        self.u_scale_tril = PyroParam(u_scale_tril, constraints.lower_cholesky)

        self.num_data = num_data if num_data is not None else self.X.size(0)
        self.whiten = whiten
        self._sample_latent = True

    @pyro_method
    def model(self):
        self.set_mode("model")

        M = self.Xu.size(0)
        Kuu = self.kernel(self.Xu).contiguous()
        Kuu.view(-1)[::M + 1] += self.jitter  # add jitter to the diagonal
        Luu = Kuu.cholesky()

        zero_loc = self.Xu.new_zeros(self.u_loc.shape)
        if self.whiten:
            identity = eye_like(self.Xu, M)
            pyro.sample(
                self._pyro_get_fullname("u"),
                dist.MultivariateNormal(
                    zero_loc,
                    scale_tril=identity).to_event(zero_loc.dim() - 1))
        else:
            pyro.sample(
                self._pyro_get_fullname("u"),
                dist.MultivariateNormal(
                    zero_loc, scale_tril=Luu).to_event(zero_loc.dim() - 1))

        f_loc, f_var = conditional(self.X,
                                   self.Xu,
                                   self.kernel,
                                   self.u_loc,
                                   self.u_scale_tril,
                                   Luu,
                                   full_cov=False,
                                   whiten=self.whiten,
                                   jitter=self.jitter)

        f_loc = f_loc + self.mean_function(self.X)
        if self.y is None:
            return f_loc, f_var
        else:
            # we would like to load likelihood's parameters outside poutine.scale context
            self.likelihood._load_pyro_samples()
            with poutine.scale(scale=self.num_data / self.X.size(0)):
                return self.likelihood(f_loc, f_var, self.y)

    @pyro_method
    def guide(self):
        self.set_mode("guide")
        self._load_pyro_samples()

        pyro.sample(
            self._pyro_get_fullname("u"),
            dist.MultivariateNormal(
                self.u_loc,
                scale_tril=self.u_scale_tril).to_event(self.u_loc.dim() - 1))

    def forward(self, Xnew, full_cov=False):
        r"""
        Computes the mean and covariance matrix (or variance) of Gaussian Process
        posterior on a test input data :math:`X_{new}`:

        .. math:: p(f^* \mid X_{new}, X, y, k, X_u, u_{loc}, u_{scale\_tril})
            = \mathcal{N}(loc, cov).

        .. note:: Variational parameters ``u_loc``, ``u_scale_tril``, the
            inducing-point parameter ``Xu``, together with kernel's parameters have
            been learned from a training procedure (MCMC or SVI).

        :param torch.Tensor Xnew: An input tensor of test data. Note that
            ``Xnew.shape[1:]`` must be the same as ``self.X.shape[1:]``.
        :param bool full_cov: A flag to decide if we want to predict full covariance
            matrix or just variance.
        :returns: loc and covariance matrix (or variance) of :math:`p(f^*(X_{new}))`
        :rtype: tuple(torch.Tensor, torch.Tensor)
        """
        self._check_Xnew_shape(Xnew)
        self.set_mode("guide")

        loc, cov = conditional(Xnew,
                               self.Xu,
                               self.kernel,
                               self.u_loc,
                               self.u_scale_tril,
                               full_cov=full_cov,
                               whiten=self.whiten,
                               jitter=self.jitter)
        return loc + self.mean_function(Xnew), cov
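# End-to-end training sketch, assuming this class mirrors
# pyro.contrib.gp.models.VariationalSparseGP (the kernel, likelihood and data
# below are illustrative placeholders):
import torch
import pyro
import pyro.contrib.gp as gp

X = torch.randn(100, 1)                               # 100 one-dimensional training inputs
y = torch.sin(X).squeeze(-1) + 0.1 * torch.randn(100)
Xu = X[::10].clone()                                  # 10 inducing points initialised from the data

vsgp = gp.models.VariationalSparseGP(X, y, gp.kernels.RBF(input_dim=1), Xu,
                                     likelihood=gp.likelihoods.Gaussian(), whiten=True)
optimizer = torch.optim.Adam(vsgp.parameters(), lr=0.01)
elbo_loss = pyro.infer.Trace_ELBO().differentiable_loss
for step in range(1000):
    optimizer.zero_grad()
    loss = elbo_loss(vsgp.model, vsgp.guide)          # stochastic ELBO over the model/guide pair
    loss.backward()
    optimizer.step()
mean, var = vsgp(torch.linspace(-3, 3, 50).unsqueeze(-1))   # posterior predictive at test inputs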
Example #53
0
class ChebConvAtt(MessagePassing):
    r"""The chebyshev spectral graph convolutional operator with attention from the
    `Attention Based Spatial-Temporal Graph Convolutional 
    Networks for Traffic Flow Forecasting." <https://ojs.aaai.org/index.php/AAAI/article/view/3881>`_ paper
    :math:`\mathbf{\hat{L}}` denotes the scaled and normalized Laplacian
    :math:`\frac{2\mathbf{L}}{\lambda_{\max}} - \mathbf{I}`.
    
    Args:
        in_channels (int): Size of each input sample.
        out_channels (int): Size of each output sample.
        K (int): Chebyshev filter size :math:`K`.
        normalization (str, optional): The normalization scheme for the graph
            Laplacian (default: :obj:`"sym"`):
            1. :obj:`None`: No normalization
            :math:`\mathbf{L} = \mathbf{D} - \mathbf{A}`
            2. :obj:`"sym"`: Symmetric normalization
            :math:`\mathbf{L} = \mathbf{I} - \mathbf{D}^{-1/2} \mathbf{A}
            \mathbf{D}^{-1/2}`
            3. :obj:`"rw"`: Random-walk normalization
            :math:`\mathbf{L} = \mathbf{I} - \mathbf{D}^{-1} \mathbf{A}`
            You need to pass :obj:`lambda_max` to the :meth:`forward` method of
            this operator in case the normalization is non-symmetric.
            :obj:`\lambda_max` should be a :class:`torch.Tensor` of size
            :obj:`[num_graphs]` in a mini-batch scenario and a
            scalar/zero-dimensional tensor when operating on single graphs.
            You can pre-compute :obj:`lambda_max` via the
            :class:`torch_geometric.transforms.LaplacianLambdaMax` transform.
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
        **kwargs (optional): Additional arguments of
            :class:`torch_geometric.nn.conv.MessagePassing`.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 K,
                 normalization=None,
                 bias=True,
                 **kwargs):
        kwargs.setdefault('aggr', 'add')
        super(ChebConvAtt, self).__init__(**kwargs)

        assert K > 0
        assert normalization in [None, 'sym', 'rw'], 'Invalid normalization'

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.normalization = normalization
        self.weight = Parameter(torch.Tensor(K, in_channels, out_channels))

        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        glorot(self.weight)
        zeros(self.bias)

    def __norm__(self,
                 edge_index,
                 num_nodes: Optional[int],
                 edge_weight: OptTensor,
                 normalization: Optional[str],
                 lambda_max,
                 dtype: Optional[int] = None,
                 batch: OptTensor = None):

        edge_index, edge_weight = remove_self_loops(edge_index, edge_weight)

        edge_index, edge_weight = get_laplacian(edge_index, edge_weight,
                                                normalization, dtype,
                                                num_nodes)

        if batch is not None and lambda_max.numel() > 1:
            lambda_max = lambda_max[batch[edge_index[0]]]

        edge_weight = (2.0 * edge_weight) / lambda_max
        edge_weight.masked_fill_(edge_weight == float('inf'), 0)

        edge_index, edge_weight = add_self_loops(edge_index,
                                                 edge_weight,
                                                 fill_value=-1.,
                                                 num_nodes=num_nodes)
        assert edge_weight is not None

        return edge_index, edge_weight

    def forward(self,
                x,
                edge_index,
                spatial_attention,
                edge_weight: OptTensor = None,
                batch: OptTensor = None,
                lambda_max: OptTensor = None):
        """
        Making a forward pass of the ChebConvAtt layer.
        B is the batch size. N_nodes is the number of nodes in the graph. 
        F_in is the dimension of input features (in_channels). 
        F_out is the dimension of output features (out_channels).
        
        Arg types:
            * x (PyTorch Float Tensor) - Node features for T time periods, with shape (B, N_nodes, F_in).
            * edge_index (Tensor array) - Edge indices.
            * spatial_attention (PyTorch Float Tensor) - Spatial attention weights, with shape (B, N_nodes, N_nodes).
            * edge_weight (PyTorch Float Tensor, optional) - Edge weights corresponding to edge indices.
            * batch (PyTorch Tensor, optional) - Batch labels for each node.
            * lambda_max (optional, but mandatory if normalization is not 'sym') - Largest eigenvalue of the Laplacian.

        Return types:
            * output (PyTorch Float Tensor) - Hidden state tensor for all nodes, with shape (B, N_nodes, F_out).
        """
        if self.normalization != 'sym' and lambda_max is None:
            raise ValueError('You need to pass `lambda_max` to `forward()` in '
                             'case the normalization is non-symmetric.')

        if lambda_max is None:
            lambda_max = torch.tensor(2.0, dtype=x.dtype, device=x.device)
        if not isinstance(lambda_max, torch.Tensor):
            lambda_max = torch.tensor(lambda_max,
                                      dtype=x.dtype,
                                      device=x.device)
        assert lambda_max is not None

        edge_index, norm = self.__norm__(edge_index,
                                         x.size(self.node_dim),
                                         edge_weight,
                                         self.normalization,
                                         lambda_max,
                                         dtype=x.dtype,
                                         batch=batch)
        row, col = edge_index
        Att_norm = norm * spatial_attention[:, row, col]
        num_nodes = x.size(self.node_dim)
        TAx_0 = torch.matmul(
            (torch.eye(num_nodes, device=x.device) * spatial_attention).permute(0, 2, 1), x)
        out = torch.matmul(TAx_0, self.weight[0])
        # L_tilde = torch.sparse_coo_tensor(edge_index,norm,(num_nodes,num_nodes)).to_dense()
        # propagate_type: (x: Tensor, norm: Tensor)
        edge_index_transpose = edge_index[[
            1, 0
        ]]  # transpose according to the paper
        if self.weight.size(0) > 1:
            TAx_1 = self.propagate(edge_index_transpose,
                                   x=TAx_0,
                                   norm=Att_norm,
                                   size=None)
            out = out + torch.matmul(TAx_1, self.weight[1])

        for k in range(2, self.weight.size(0)):
            TAx_2 = self.propagate(edge_index_transpose,
                                   x=TAx_1,
                                   norm=norm,
                                   size=None)
            TAx_2 = 2. * TAx_2 - TAx_0
            out = out + torch.matmul(TAx_2, self.weight[k])
            TAx_0, TAx_1 = TAx_1, TAx_2

        if self.bias is not None:
            out += self.bias

        return out

    def message(self, x_j, norm):
        if norm.dim() == 1:
            return norm.view(-1, 1) * x_j
        else:
            d1, d2 = norm.shape
            return norm.view(d1, d2, 1) * x_j

    def __repr__(self):
        return '{}({}, {}, K={}, normalization={})'.format(
            self.__class__.__name__, self.in_channels, self.out_channels,
            self.weight.size(0), self.normalization)
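# Shape-level usage sketch for the ChebConvAtt layer above (all tensors are
# random placeholders; B=4 graphs in a batch, N_nodes=10, F_in=8, F_out=16,
# and a recent torch_geometric MessagePassing is assumed to be in scope):
conv = ChebConvAtt(in_channels=8, out_channels=16, K=3, normalization='sym')
x = torch.randn(4, 10, 8)                                            # (B, N_nodes, F_in)
edge_index = torch.randint(0, 10, (2, 40))                           # random edges, illustration only
spatial_attention = torch.softmax(torch.randn(4, 10, 10), dim=-1)    # (B, N_nodes, N_nodes)
out = conv(x, edge_index, spatial_attention)                         # (B, N_nodes, F_out)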
class AlternatingHighwayLSTM(torch.nn.Module):
    """
    A stacked LSTM with LSTM layers which alternate between going forwards over
    the sequence and going backwards, with highway connections between each of
    the alternating layers. This implementation is based on the description in
    `Deep Semantic Role Labeling: What Works and What's Next
    <https://homes.cs.washington.edu/~luheng/files/acl2017_hllz.pdf>`_ .

    Parameters
    ----------
    input_size : int, required
        The dimension of the inputs to the LSTM.
    hidden_size : int, required
        The dimension of the outputs of the LSTM.
    num_layers : int, required
        The number of stacked LSTMs to use.
    recurrent_dropout_probability: float, optional (default = 0.0)
        The dropout probability to be used in a dropout scheme as stated in
        `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks
        <https://arxiv.org/abs/1512.05287>`_ .

    Returns
    -------
    output : PackedSequence
        The outputs of the interleaved LSTMs per timestep. A tensor of shape
        (batch_size, max_timesteps, hidden_size) where for a given batch
        element, all outputs past the sequence length for that batch are
        zero tensors.
    """
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int = 1,
                 recurrent_dropout_probability: float = 0) -> None:
        super(AlternatingHighwayLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.recurrent_dropout_probability = recurrent_dropout_probability
        self.training = True

        # Input dimensions consider the fact that we do
        # all of the LSTM projections (and highway parts)
        # in a single matrix multiplication.
        input_projection_size = 6 * hidden_size
        state_projection_size = 5 * hidden_size
        bias_size = 5 * hidden_size

        # Here we are creating a single weight and bias with the
        # parameters for all layers unfolded into it. This is necessary
        # because unpacking and re-packing the weights inside the
        # kernel would be slow, as it would happen every time it is called.
        total_weight_size = 0
        total_bias_size = 0
        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size

            input_weights = input_projection_size * layer_input_size
            state_weights = state_projection_size * hidden_size
            total_weight_size += input_weights + state_weights

            total_bias_size += bias_size

        self.weight = Parameter(torch.FloatTensor(total_weight_size))
        self.bias = Parameter(torch.FloatTensor(total_bias_size))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        self.bias.data.zero_()
        weight_index = 0
        bias_index = 0
        for i in range(self.num_layers):
            input_size = self.input_size if i == 0 else self.hidden_size

            # Create a tensor of the right size and initialize it.
            init_tensor = self.weight.new_zeros(input_size, self.hidden_size * 6)
            block_orthogonal(init_tensor, [input_size, self.hidden_size])
            # Copy it into the flat weight.
            self.weight.data[weight_index: weight_index + init_tensor.nelement()]\
                .view_as(init_tensor).copy_(init_tensor)
            weight_index += init_tensor.nelement()

            # Same for the recurrent connection weight.
            init_tensor = self.weight.new_zeros(self.hidden_size, self.hidden_size * 5)
            block_orthogonal(init_tensor, [self.hidden_size, self.hidden_size])
            self.weight.data[weight_index: weight_index + init_tensor.nelement()]\
                .view_as(init_tensor).copy_(init_tensor)
            weight_index += init_tensor.nelement()

            # Set the forget bias to 1.
            self.bias.data[bias_index + self.hidden_size:bias_index + 2 * self.hidden_size].fill_(1)
            bias_index += 5 * self.hidden_size

    def forward(self, inputs: PackedSequence,  # pylint: disable=arguments-differ
                # pylint: disable=unused-argument
                initial_state: torch.Tensor = None) -> Tuple[PackedSequence, torch.Tensor]:
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            Currently, this is ignored.

        Returns
        -------
        output_sequence : ``PackedSequence``
            The encoded sequence of shape (batch_size, sequence_length, hidden_size)
        final_states: ``torch.Tensor``
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size).
        """
        inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

        # Kernel takes sequence length first tensors.
        inputs = inputs.transpose(0, 1)

        sequence_length, batch_size, _ = inputs.size()
        accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
        state_accumulator = inputs.new_zeros(*accumulator_shape)
        memory_accumulator = inputs.new_zeros(*accumulator_shape)

        dropout_weights = inputs.new_ones(self.num_layers, batch_size, self.hidden_size)
        if self.training:
            # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
            dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                .div_((1 - self.recurrent_dropout_probability))

        # Buffer for the per-layer gate activations computed by the kernel.
        gates = inputs.new_zeros(self.num_layers, sequence_length, batch_size, 6 * self.hidden_size)

        lengths_variable = torch.LongTensor(lengths)
        implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                         self.hidden_size,
                                                         num_layers=self.num_layers,
                                                         train=self.training)
        output, _ = implementation(inputs, self.weight, self.bias, state_accumulator,
                                   memory_accumulator, dropout_weights, lengths_variable, gates)

        # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
        # it as a Seq2VecEncoder.
        output = output.transpose(0, 1)
        output = pack_padded_sequence(output, lengths, batch_first=True)
        return output, None
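# Hypothetical usage sketch. The forward pass above dispatches to the compiled
# _AlternatingHighwayLSTMFunction CUDA kernel (as in AllenNLP), so the module and
# its inputs must live on the GPU for this to actually run:
from torch.nn.utils.rnn import pack_padded_sequence

lstm = AlternatingHighwayLSTM(input_size=100, hidden_size=200, num_layers=3).cuda()
padded = torch.randn(4, 20, 100, device='cuda')            # (batch, max_timesteps, input_size)
packed = pack_padded_sequence(padded, lengths=[20, 18, 15, 10], batch_first=True)
output, _ = lstm(packed)                                   # PackedSequence; pads back to (4, 20, 200)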