# Shared imports assumed by the examples below (each snippet was extracted
# from a repository that defined these at module level):
import math
import torch
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Parameter

Example #1
class VariationalDropout(nn.Module):
    def __init__(self, input_size, out_size, log_sigma2=-10, threshold=3):
        """
        :param input_size: An int of input size
        :param log_sigma2: Initial value of log sigma ^ 2.
               It is crucial for training since it determines the initial value of alpha
        :param threshold: Threshold used at evaluation: if log_alpha > threshold, the weight is zeroed
        :param out_size: An int of output size
        """
        super(VariationalDropout, self).__init__()

        self.input_size = input_size
        self.out_size = out_size

        self.theta = Parameter(t.FloatTensor(input_size, out_size))
        self.bias = Parameter(t.Tensor(out_size))

        self.log_sigma2 = Parameter(t.FloatTensor(input_size, out_size).fill_(log_sigma2))

        self.reset_parameters()

        self.threshold = threshold

    def forward(self, input): # Local Reparameterization Trick
        log_alpha = self.clip(self.log_sigma2 - t.log(self.theta ** 2))
        kld = self.kld(log_alpha)

        if not self.training:
            mask = log_alpha > self.threshold
            return t.addmm(self.bias, input, self.theta.masked_fill(mask, 0))

        mu = t.mm(input, self.theta)
        std = t.sqrt(t.mm(input ** 2, self.log_sigma2.exp()) + 1e-6)

        eps = Variable(t.randn(*mu.size()))
        if input.is_cuda:
            eps = eps.cuda()

        return std * eps + mu + self.bias, kld


    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.out_size)

        self.theta.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)

    def clip(self, input, to=8):
        input = input.masked_fill(input < -to, -to)
        input = input.masked_fill(input > to, to)

        return input

    def kld(self, log_alpha):  # approximation from the paper "Variational Dropout Sparsifies Deep Neural Networks"
        k = [0.63576, 1.87320, 1.48695]

        first_term = k[0] * t.sigmoid(k[1] + k[2] * log_alpha)
        second_term = 0.5 * t.log(1 + t.exp(-log_alpha))

        return - (first_term - second_term - k[0]).sum() / (self.input_size * self.out_size)
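For reference, the forward pass above samples pre-activations directly via the local reparameterization trick, and kld implements the approximation from Molchanov et al. (2017). In LaTeX, with per-weight posterior $W_{ij} \sim \mathcal{N}(\theta_{ij}, \sigma_{ij}^2)$ and input $A$:

$$B_{mj} = \mu_{mj} + \epsilon_{mj}\sqrt{\delta_{mj}}, \qquad \mu_{mj} = \sum_i A_{mi}\,\theta_{ij}, \qquad \delta_{mj} = \sum_i A_{mi}^2\,\sigma_{ij}^2, \qquad \epsilon_{mj} \sim \mathcal{N}(0, 1)$$

$$-D_{\mathrm{KL}} \approx k_1\,\sigma(k_2 + k_3 \log\alpha) - \tfrac{1}{2}\log\bigl(1 + e^{-\log\alpha}\bigr) - k_1, \qquad \log\alpha = \log\sigma^2 - \log\theta^2$$

with $(k_1, k_2, k_3) = (0.63576, 1.87320, 1.48695)$; kld returns the mean of $D_{\mathrm{KL}}$ over the weight matrix.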
Example #2
class VariationalDropout(nn.Module):
    def __init__(self, input_size, out_size, log_sigma2=-8, threshold=3):
        """
        :param input_size: An int of input size
        :param log_sigma2: Initial value of log sigma ^ 2.
               It is crucial for training since it determines the initial value of alpha
        :param threshold: Threshold used at evaluation: if log_alpha > threshold, the weight is zeroed
        :param out_size: An int of output size
        """
        super(VariationalDropout, self).__init__()

        self.input_size = input_size
        self.out_size = out_size

        self.theta = Parameter(t.FloatTensor(input_size, out_size))
        self.bias = Parameter(t.Tensor(out_size))
        self.prior_theta = 0.
        self.prior_log_sigma2 = -2.
        self.log_sigma2 = Parameter(
            t.FloatTensor(input_size, out_size).fill_(log_sigma2))
        self.sz = input_size * out_size
        self.s = Parameter(t.Tensor([scale]))  # scale is assumed to be a module-level constant (see the sketch after this example)
        self.code = t.Tensor([0.2, 0, -0.2])

        self.reset_parameters()

        self.k = [0.63576, 1.87320, 1.48695]

        self.threshold = threshold

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.out_size)

        self.theta.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)

    def clip(self):
        # clamp the log-variance (masked_fill is out-of-place, so assign the result back)
        self.log_sigma2.data = self.log_sigma2.data.masked_fill(self.log_sigma2.data < -10, -10)
        self.log_sigma2.data = self.log_sigma2.data.masked_fill(self.log_sigma2.data > 1, 1)
        # keep theta within one noise standard deviation of the codebook range
        bound = (0.2 + 0.3679 * t.sqrt(self.log_sigma2.exp())).detach()
        self.theta.data = t.where(self.theta < -bound, -bound, self.theta)
        self.theta.data = t.where(self.theta > bound, bound, self.theta)

    @staticmethod
    def clip__(input, to=8):
        input = input.masked_fill(input < -to, -to)
        input = input.masked_fill(input > to, to)

        return input


#     def kllu(self,log_alpha):
#         first_term = self.k[0] * F.sigmoid(self.k[1] + self.k[2] * log_alpha)
#         second_term = 0.5 * t.log(1 + t.exp(-log_alpha))
#         return -(first_term - second_term - self.k[0])

    def kld(self, mean, idx):

        window1 = gaussian_window(mean * self.s, 0.2)
        window2 = gaussian_window(mean * self.s, -0.2)

        log_alpha1 = self.log_sigma2 + 2 * t.log(self.s) - t.log(
            (mean * self.s - 0.2)**2)
        log_alpha2 = self.log_sigma2 + 2 * t.log(self.s) - t.log(
            (mean * self.s)**2)
        log_alpha3 = self.log_sigma2 + 2 * t.log(self.s) - t.log(
            (mean * self.s + 0.2)**2)

        F_KLLU1 = kllu(log_alpha1)
        F_KLLU2 = kllu(log_alpha2)
        F_KLLU3 = kllu(log_alpha3)
        F_KL = F_KLLU1 * window1 + F_KLLU3 * window2 + F_KLLU2 * (1 - window1 -
                                                                  window2)
        return F_KL.sum() / (self.sz)

    def forward(self, input, train, noquan):
        """
        :param input: A float tensor with shape of [batch_size, input_size]
        :return: A float tensor with shape of [batch_size, out_size] and the layer KL estimate
        """
        self.clip()
        c1 = (self.theta * self.s - 0.2)**2
        c2 = (self.theta * self.s)**2
        c3 = (self.theta * self.s + 0.2)**2
        mean = t.min(t.min(c1, c2), c3)
        c = t.stack((c1, c2, c3), 0)
        idx = t.argmin(c, 0)

        if not train and not noquan:
            """
            mask = log_alpha > self.threshold
            return t.addmm(self.bias, input, self.theta.masked_fill(mask, 0))
            """
            theta_q = self.theta.data.clone()
            theta_q[:] = self.code[idx].cuda() / self.s
            #             mask = log_alpha > self.threshold

            mu = t.mm(input, theta_q)
            kld = t.sum((theta_q - self.theta)**2)

            return mu + self.bias, kld
        if noquan:
            kld = 0
            """
            mask = log_alpha > self.threshold
            return t.addmm(self.bias, input, self.theta.masked_fill(mask, 0))
            """
            theta_q = self.theta.data.clone()
            mu = t.mm(input, theta_q)

            return mu + self.bias, kld
        # _kl_loss: closed-form Gaussian KL against the fixed prior, assumed defined
        # at module level (see the sketch after this example)
        kld = _kl_loss(self.theta, self.log_sigma2, self.prior_theta,
                       self.prior_log_sigma2) / self.sz
        mu = t.mm(input, self.theta * self.s)
        std = t.sqrt(t.mm(input**2, self.s**2 * self.log_sigma2.exp()) + 1e-6)

        eps = Variable(t.randn(*mu.size()))
        if input.is_cuda:
            eps = eps.cuda()

        return std * eps + mu + self.bias, kld

    def max_alpha(self):
        log_alpha = self.log_sigma2 - t.log(self.theta ** 2)
        return t.max(log_alpha.exp())
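The two quantizing examples (#2 and #3) call gaussian_window, kllu, and _kl_loss and read a global scale without defining them; presumably they live at module level in the original repository. Below is a minimal sketch of plausible definitions: kllu follows the commented-out version in Example #2, _kl_loss is the standard closed-form Gaussian KL, while the scale value and gaussian_window's width are guesses.

import math
import torch as t

scale = 1.0  # hypothetical value; the snippets only show that a global `scale` exists

def kllu(log_alpha, k=(0.63576, 1.87320, 1.48695)):
    # elementwise KL term, mirroring the commented-out kllu in Example #2
    return -(k[0] * t.sigmoid(k[1] + k[2] * log_alpha)
             - 0.5 * t.log(1 + t.exp(-log_alpha)) - k[0])

def gaussian_window(x, center, width=0.1):
    # soft indicator for weights near a codebook value (the width is a guess)
    return t.exp(-(x - center) ** 2 / (2 * width ** 2))

def _kl_loss(mu_q, log_sigma2_q, mu_p, log_sigma2_p):
    # closed-form KL between a factorized Gaussian posterior and a fixed Gaussian prior
    return 0.5 * (log_sigma2_p - log_sigma2_q
                  + (log_sigma2_q.exp() + (mu_q - mu_p) ** 2) / math.exp(log_sigma2_p)
                  - 1).sum()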
Example #3
class VariationalDropoutCNN(nn.Module):
    def __init__(self,
                 in_channel,
                 out_channel,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 log_sigma2=-8,
                 threshold=3):
        """
        :param in_channel: An int of input channel count
        :param log_sigma2: Initial value of log sigma ^ 2.
               It is crucial for training since it determines the initial value of alpha
        :param threshold: Threshold used at evaluation: if log_alpha > threshold, the weight is zeroed
        :param out_channel: An int of output channel count
        """
        super(VariationalDropoutCNN, self).__init__()
        #         self.m = img_row
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups

        self.theta = Parameter(
            t.Tensor(out_channel, in_channel // groups, kernel_size,
                     kernel_size))
        self.prior_theta = 0.
        self.prior_log_sigma2 = -2.
        #         self.bias = Parameter(t.Tensor(out_channel, in_channel // groups, kernel_size, kernel_size))
        #         self.bias = Parameter(t.Tensor(out_channel, self.m-kernel_size+1, self.m-kernel_size+1))
        self.sz = out_channel * (in_channel // groups) * kernel_size**2
        self.log_sigma2 = Parameter(
            t.FloatTensor(out_channel, in_channel // groups, kernel_size,
                          kernel_size).fill_(log_sigma2))
        self.s = Parameter(t.Tensor([scale]))  # scale is assumed to be a module-level constant
        self.code = t.Tensor([0.2, 0, -0.2])

        self.reset_parameters()

        self.k = [0.63576, 1.87320, 1.48695]

        self.threshold = threshold

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.out_channel)

        self.theta.data.uniform_(-stdv, stdv)
#         self.bias.data.uniform_(-stdv, stdv)

    @staticmethod
    def clip_logsig(input):
        input = input.masked_fill(input < -10, -10)
        input = input.masked_fill(input > 1, 1)

        return input

    def clip(self):
        # clamp the log-variance (masked_fill is out-of-place, so assign the result back)
        self.log_sigma2.data = self.log_sigma2.data.masked_fill(self.log_sigma2.data < -10, -10)
        self.log_sigma2.data = self.log_sigma2.data.masked_fill(self.log_sigma2.data > 1, 1)
        # keep theta within one noise standard deviation of the codebook range
        bound = (0.2 + 0.3679 * t.sqrt(self.log_sigma2.exp())).detach()
        self.theta.data = t.where(self.theta < -bound, -bound, self.theta)
        self.theta.data = t.where(self.theta > bound, bound, self.theta)

    def kld(self, idx):

        window1 = gaussian_window(self.theta * self.s, 0.2)
        window2 = gaussian_window(self.theta * self.s, -0.2)

        log_alpha1 = self.log_sigma2 + 2 * t.log(self.s) - t.log(
            (self.theta * self.s - 0.2)**2)
        log_alpha2 = self.log_sigma2 + 2 * t.log(self.s) - t.log(
            (self.theta * self.s)**2)
        log_alpha3 = self.log_sigma2 + 2 * t.log(self.s) - t.log(
            (self.theta * self.s + 0.2)**2)

        F_KLLU1 = kllu(log_alpha1)
        F_KLLU2 = kllu(log_alpha2)
        F_KLLU3 = kllu(log_alpha3)
        F_KL = F_KLLU1 * window1 + F_KLLU3 * window2 + F_KLLU2 * (1 - window1 -
                                                                  window2)
        return F_KL.sum() / (self.sz)

    def forward(self, input, train, noquan):
        """
        :param input: A float tensor with shape of [batch_size, in_channel, height, width]
        :return: A float tensor with shape of [batch_size, out_channel, out_height, out_width] and the layer KL estimate
        """
        self.clip()
        c1 = (self.theta * self.s - 0.2)**2
        c2 = (self.theta * self.s)**2
        c3 = (self.theta * self.s + 0.2)**2
        mean = t.min(t.min(c1, c2), c3)
        c = t.stack((c1, c2, c3), 0)
        idx = t.argmin(c, 0)

        if not train and not noquan:
            """
            mask = log_alpha > self.threshold
            return F.conv2d( input, weight = self.theta.masked_fill(mask, 0), stride=self.stride, 
                          padding=self.padding,dilation=self.dilation, groups=self.groups)
            """

            theta_q = self.theta.data.clone()

            theta_q[:] = self.code[idx].cuda() / self.s
            mu = F.conv2d(input,
                          weight=theta_q,
                          stride=self.stride,
                          padding=self.padding,
                          dilation=self.dilation,
                          groups=self.groups)

            kld = t.sum((theta_q - self.theta)**2)
            return mu, kld  # bias is disabled in this variant
        if noquan:
            kld = 0
            theta_q = self.theta.data.clone()
            mu = F.conv2d(input,
                          weight=theta_q,
                          stride=self.stride,
                          padding=self.padding,
                          dilation=self.dilation,
                          groups=self.groups)

            return mu, kld  # bias is disabled in this variant
        kld = _kl_loss(self.theta, self.log_sigma2, self.prior_theta,
                       self.prior_log_sigma2) / self.sz
        mu = F.conv2d(input,
                      weight=self.theta * self.s,
                      stride=self.stride,
                      padding=self.padding,
                      dilation=self.dilation,
                      groups=self.groups)
        std = t.sqrt(
            F.conv2d(input**2,
                     weight=self.log_sigma2.exp() * self.s**2,
                     stride=self.stride,
                     padding=self.padding,
                     dilation=self.dilation,
                     groups=self.groups) + 1e-6)

        eps = Variable(t.randn(*mu.size()))
        if input.is_cuda:
            eps = eps.cuda()
        return std * eps + mu, kld  # bias is disabled in this variant

    def max_alpha(self):
        log_alpha = self.log_sigma2 - t.log((self.theta - 0.2) ** 2)
        return t.max(log_alpha.exp())
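A minimal usage sketch for the convolutional variant, assuming the helper sketch above is in scope; the three calls exercise its three forward paths.

layer = VariationalDropoutCNN(in_channel=3, out_channel=16, kernel_size=3)
x = t.randn(8, 3, 32, 32)

out, kld = layer(x, train=True, noquan=False)   # training: sample via local reparameterization
out, err = layer(x, train=False, noquan=False)  # eval: weights snapped to the {0.2, 0, -0.2} / s codebook
out, _ = layer(x, train=False, noquan=True)     # eval without quantization (kld comes back as 0)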
Example #4
class VariationalDropout(nn.Module):
    def __init__(self, input_size, out_size, log_sigma2=-10, threshold=3):
        """
        :param input_size: An int of input size
        :param log_sigma2: Initial value of log_sigma^2 (crucial for training as it determines initial value of alpha)
        :param threshold: Threshold used at evaluation: if log_alpha > threshold, the weight is zeroed
        :param out_size: An int of output size
        """
        super(VariationalDropout, self).__init__()

        self.input_size = input_size
        self.out_size = out_size

        self.theta = Parameter(torch.FloatTensor(input_size, out_size))
        self.bias = Parameter(torch.Tensor(out_size))

        self.log_sigma2 = Parameter(
            torch.FloatTensor(input_size, out_size).fill_(log_sigma2))

        self.reset_parameters()

        self.k = [0.63576, 1.87320, 1.48695]

        self.threshold = threshold

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.out_size)
        self.theta.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)

    @staticmethod
    def clip(input, to=8):
        input = input.masked_fill(input < -to, -to)
        input = input.masked_fill(input > to, to)
        return input

    def kld(self, log_alpha):
        first_term = self.k[0] * torch.sigmoid(self.k[1] + self.k[2] * log_alpha)
        second_term = 0.5 * torch.log(1 + torch.exp(-log_alpha))
        return -(first_term - second_term - self.k[0]).sum() / (
            self.input_size * self.out_size)

    def forward(self, input):
        """
        :param input: A float tensor with shape of [batch_size, input_size]
        :return: A float tensor with shape of [batch_size, out_size] and the layer KL estimate
        """

        log_alpha = self.clip(self.log_sigma2 - torch.log(self.theta**2))
        kld = self.kld(log_alpha)

        if not self.training:
            mask = log_alpha > self.threshold
            return torch.addmm(self.bias, input,
                               self.theta.masked_fill(mask, 0))

        mu = torch.mm(input, self.theta)
        std = torch.sqrt(torch.mm(input**2, self.log_sigma2.exp()) + 1e-6)

        eps = Variable(torch.randn(*mu.size()))
        if input.is_cuda:
            eps = eps.cuda()

        return std * eps + mu + self.bias, kld

    def max_alpha(self):
        log_alpha = self.log_sigma2 - torch.log(self.theta ** 2)
        return torch.max(log_alpha.exp())
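A sketch of a typical training step with this variant, where the returned kld (here the positive KL estimate) is folded into the objective; kl_weight is a hypothetical annealing coefficient, not part of the snippet.

layer = VariationalDropout(784, 10).train()
optimizer = torch.optim.Adam(layer.parameters(), lr=1e-3)
kl_weight = 0.1  # hypothetical annealing coefficient

x = torch.randn(32, 784)
target = torch.randint(0, 10, (32,))
out, kld = layer(x)
loss = F.cross_entropy(out, target) + kl_weight * kld
optimizer.zero_grad()
loss.backward()
optimizer.step()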
Example #5
class VariationalDropout(nn.Module):
    def __init__(self, input_size, out_size, log_sigma2=-10, threshold=3):
        """
        :param input_size: An int of input size
        :param log_sigma2: Initial value of log sigma ^ 2.
               It is crucial for training since it determines the initial value of alpha
        :param threshold: Threshold used at evaluation: if log_alpha > threshold, the weight is zeroed
        :param out_size: An int of output size
        """
        super(VariationalDropout, self).__init__()

        self.input_size = input_size
        self.out_size = out_size

        self.theta = Parameter(t.FloatTensor(input_size, out_size))
        self.bias = Parameter(t.Tensor(out_size))

        self.log_sigma2 = Parameter(
            t.FloatTensor(input_size, out_size).fill_(log_sigma2))

        self.reset_parameters()

        self.k = [0.63576, 1.87320, 1.48695]

        self.threshold = threshold

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.out_size)

        self.theta.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)

    @staticmethod
    def clip(input, to=8.):
        input = input.masked_fill(input < -to, -to)
        input = input.masked_fill(input > to, to)

        return input

    def kld(self, log_alpha):

        first_term = self.k[0] * t.sigmoid(self.k[1] + self.k[2] * log_alpha)
        second_term = 0.5 * t.log(1 + t.exp(-log_alpha))
        return (first_term - second_term -
                self.k[0]).sum() / (self.input_size * self.out_size)

    def forward(self, input, train):
        """
        :param input: A float tensor with shape of [batch_size, input_size]
        :return: A float tensor with shape of [batch_size, out_size] and the negative layer-KL estimate
        """
        log_alpha = self.clip(self.log_sigma2 - t.log(self.theta**2))
        fh = open("log_alpha_values_during_training.txt", 'a')
        fh.write(
            str(self.input_size) + "||||" + str(log_alpha.data.numpy()[0][0]) +
            "\n")
        fh.close()
        #print(log_alpha.data.numpy()[0][0])
        kld = self.kld(log_alpha)

        if not train:
            mask = log_alpha > self.threshold
            zeroed_weights = int(mask.sum())
            total_weights = mask.numel()
            print('number of zeroed weights is {}'.format(zeroed_weights))
            print('total number of weights is {}'.format(total_weights))
            print('ratio for non zeroed weights is {}'.format(
                (total_weights - zeroed_weights) / total_weights))
            return t.addmm(self.bias, input, self.theta.masked_fill(mask, 0))

        mu = t.mm(input, self.theta)
        std = t.sqrt(t.mm(input**2, self.log_sigma2.exp()) + 1e-6)

        eps = Variable(t.randn(*mu.size()))
        if input.is_cuda:
            eps = eps.cuda()

        return std * eps + mu + self.bias, kld

    def max_alpha(self):
        log_alpha = self.log_sigma2 - t.log(self.theta ** 2)
        return t.max(log_alpha.exp())
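This variant takes an explicit train flag instead of using module.training, and its eval pass prints its own sparsity statistics; a short sketch of both calls.

layer = VariationalDropout(784, 300)
x = t.randn(32, 784)

out, kld = layer(x, train=True)   # training pass: returns (output, kld)
out = layer(x, train=False)       # eval pass: prints zeroed/total weight counts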
Example #6
class VariationalDropout(nn.Module):
    def __init__(self, input_size, out_size, log_sigma2=-10, threshold=3):
        """
        This module creates a fully connected layer with variational dropout enabled.
        
        :param input_size: An int of input size
        :param log_sigma2: Initial value of log sigma ^ 2.
               It is crucial for training since it determines initial value of alpha
        :param threshold: Threshold used at evaluation: if log_alpha > threshold, the weight is zeroed
        :param out_size: An int of output size
        """
        super(VariationalDropout, self).__init__()

        self.input_size = input_size
        self.out_size = out_size

        self.theta = Parameter(t.FloatTensor(
            input_size, out_size))  # fully connected weight
        self.bias = Parameter(t.Tensor(out_size))  # bias

        self.log_sigma2 = Parameter(
            t.FloatTensor(input_size, out_size).fill_(
                log_sigma2))  # log-variance of the Gaussian noise, one per weight

        self.reset_parameters()

        self.k = [0.63576, 1.87320, 1.48695]

        self.threshold = threshold  # used to zero a weight at evaluation time once its log_alpha exceeds this value

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.out_size)

        self.theta.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)

    @staticmethod
    def clip(input, to=8):
        input = input.masked_fill(input < -to, -to)
        input = input.masked_fill(input > to, to)

        return input

    def kld(self, log_alpha):

        first_term = self.k[0] * t.sigmoid(self.k[1] + self.k[2] * log_alpha)
        second_term = 0.5 * t.log(1 + t.exp(-log_alpha))

        return -(first_term - second_term - self.k[0]).sum() / (
            self.input_size * self.out_size)

    def forward(self, input):
        """
        :param input: A float tensor with shape of [batch_size, input_size]
        :return: A float tensor with shape of [batch_size, out_size] and the layer KL estimate
        """

        log_alpha = self.clip(self.log_sigma2 - t.log(self.theta**2))
        kld = self.kld(log_alpha)

        if not self.training:
            mask = log_alpha > self.threshold
            return t.addmm(self.bias, input, self.theta.masked_fill(mask, 0))

        mu = t.mm(input, self.theta)
        std = t.sqrt(t.mm(input**2, self.log_sigma2.exp()) + 1e-6)

        eps = Variable(
            t.randn(*mu.size()))  # sample from standard normal distribution
        if input.is_cuda:
            eps = eps.cuda()

        return std * eps + mu + self.bias, kld  # local reparameterization trick forms the Gaussian dropout

    def max_alpha(self):
        log_alpha = self.log_sigma2 - t.log(self.theta ** 2)
        return t.max(log_alpha.exp())
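Like the first example and Example #4, this module returns an (output, kld) pair in training mode but a bare tensor in eval mode, so call sites must branch on module.training; max_alpha offers a quick sparsification monitor. A short sketch:

layer = VariationalDropout(784, 300)
x = t.randn(32, 784)

layer.train()
out, kld = layer(x)   # stochastic output plus the KL penalty for the loss
print('max alpha: {:.4f}'.format(layer.max_alpha().item()))  # large alpha marks weights being pruned

layer.eval()
out = layer(x)        # deterministic output with thresholded weights zeroed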