Example No. 1
    def init_hidden(self, hidden_dim):
        """Trainable initial hidden state"""
        # NOTE: with requires_grad=False these states are in fact fixed zeros, not trained;
        # see the sketch after this example for a registered, learnable variant.
        enc_init_hx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
        if self.use_cuda:
            enc_init_hx = enc_init_hx.cuda()
        enc_init_cx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
        if self.use_cuda:
            enc_init_cx = enc_init_cx.cuda()
        return enc_init_hx, enc_init_cx
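Because the `Parameter` is created inside the method with `requires_grad=False` rather than registered on the module, the state above is never optimized. A minimal sketch of one way to make the initial LSTM state genuinely learnable (the class name, `input_dim`, and the LSTM itself are illustrative assumptions, not part of the original code):

import torch
import torch.nn as nn

class EncoderWithLearnedInit(nn.Module):
    """Sketch: register the initial hidden/cell state as real parameters."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        # Registered in __init__, so they follow .to(device)/.cuda() and are optimized.
        self.h0 = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        self.c0 = nn.Parameter(torch.zeros(1, 1, hidden_dim))

    def forward(self, x):
        # x: (seq_len, batch, input_dim)
        batch = x.size(1)
        h0 = self.h0.expand(-1, batch, -1).contiguous()
        c0 = self.c0.expand(-1, batch, -1).contiguous()
        return self.lstm(x, (h0, c0))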
Example No. 2
    def forward(self, y):
        # Use latent_y = (batch, num_hidden) as input to predict a sequence of ingredient words
        # y has size (batch,num_hidden)

        h_de = []  # store the output hidden vectors of gru
        gru_predicts = []  # store the predicts of gru for words

        h0_de = Parameter(
            torch.zeros((1, y.shape[0], self.num_glove), requires_grad=True))
        # if self.h0_en is None:
        #    self.init_hidden(torch.zeros((1, y.shape[0], self.num_glove), requires_grad=True))
        current_input = torch.cat(
            [y,
             torch.zeros(y.shape[0], self.num_glove).cuda(set_gpu_others)],
            1).unsqueeze(0)  # (1, batch, num_hidden+num_glove)
        current_input = self.hiddenMap2(
            self.relu(self.hiddenMap1(current_input)))
        # print('current_input: {}'.format(current_input.shape))
        prev_hidden = h0_de.cuda(set_gpu_others)
        # print('prev_hidden: {}'.format(prev_hidden.shape))

        for i in range(0, self.seq):  # one decoder step per output position
            # NOTE: current_hidden and prev_hidden hold the same values here;
            # the two names are used only to clarify their roles
            current_hidden, prev_hidden = self.gruLoop(current_input,
                                                       prev_hidden)
            # save gru output
            h_de.append(current_hidden)
            # compute next input to gru, the glove embedding vector of the current predicted word
            current_input, wordPredicts = self.getNextInput(y, current_hidden)

            gru_predicts.append(wordPredicts)

        # stack along dim 0 into a (seq, batch, num_word) tensor of predictions
        return torch.cat(gru_predicts, 0), torch.cat(h_de, 0)
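The loop above is the usual greedy unrolling pattern: each step runs one GRU step, classifies the hidden state into a word, and feeds that word's embedding back in as the next input. A stripped-down sketch of the same pattern under assumed names and sizes (gru, classifier, vocab_emb, and all dimensions are placeholders, not the module's actual attributes):

import torch
import torch.nn as nn

# Hypothetical sizes for illustration.
hidden_dim, vocab_size, emb_dim, seq_len, batch = 64, 1000, 50, 10, 8
gru = nn.GRU(emb_dim, hidden_dim)
classifier = nn.Linear(hidden_dim, vocab_size)
vocab_emb = nn.Embedding(vocab_size, emb_dim)

x = torch.zeros(1, batch, emb_dim)          # start-of-sequence input
h = torch.zeros(1, batch, hidden_dim)       # initial hidden state
predictions = []
for _ in range(seq_len):
    out, h = gru(x, h)                      # out: (1, batch, hidden_dim)
    logits = classifier(out.squeeze(0))     # (batch, vocab_size)
    predictions.append(logits.unsqueeze(0))
    next_word = logits.argmax(dim=1)        # greedy choice
    x = vocab_emb(next_word).unsqueeze(0)   # feed its embedding back in
predicts = torch.cat(predictions, 0)        # (seq_len, batch, vocab_size)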
Example No. 3
    def init_hidden(self, hidden_dim):
        """Trainable initial hidden state"""
        enc_init_hx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
        if self.use_cuda:
            enc_init_hx = enc_init_hx.cuda()

        # enc_init_hx = Parameter(enc_init_hx, requires_grad=True)
        # enc_init_hx.uniform_(-(1. / math.sqrt(hidden_dim)), 1. / math.sqrt(hidden_dim))

        enc_init_cx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
        if self.use_cuda:
            enc_init_cx = enc_init_cx.cuda()

        # enc_init_cx = nn.Parameter(enc_init_cx, requires_grad=True)
        # enc_init_cx.uniform_(-(1. / math.sqrt(hidden_dim)), 1. / math.sqrt(hidden_dim))
        return enc_init_hx, enc_init_cx
Example No. 4
    def init_mask(self, size_0, size_1, input_length):
        # build a (size_0, size_1) mask: 1 for positions >= input_length, 0 for the first input_length positions
        mask = Parameter(torch.ones(1), requires_grad=False)
        mask = mask.repeat(size_1).unsqueeze(0).repeat(size_0, 1)
        # for i in range(input_length)
        input_index = list(range(input_length))
        for i in range(size_0):
            mask[i][input_index] = 0
        # print(mask)
        mask = mask.byte()
        mask = mask.cuda()
        return mask
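Since the loop writes the same column indices into every row, the mask can also be built in one vectorized step. A minimal equivalent sketch, keeping the convention above (positions before input_length are 0, the rest are 1) and assuming a CUDA device as the original does:

import torch

def init_mask(size_0, size_1, input_length, device="cuda"):
    # uint8 mirrors the .byte() call above; torch.bool is preferred on recent PyTorch
    mask = torch.ones(size_0, size_1, dtype=torch.uint8, device=device)
    mask[:, :input_length] = 0
    return mask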
Example No. 5
    def forward(self, y):
        # compute latent vectors
        # indexVector, num_words_per_data, word_label = getIndexVector(y, self.max_seq)
        # indexVector = torch.from_numpy(indexVector).long().cuda(3)
        embed_vector = self.embedding(y)
        embed_vector = embed_vector.permute(1, 0, 2)

        # obtain gru output of hidden vectors
        h0_en = Parameter(torch.zeros((1, y.shape[0], self.num_hidden), requires_grad=True))
        self.gru.flatten_parameters()
        y_embeds, _ = self.gru(embed_vector, h0_en.cuda(3))

        att_y_embeds, multi_attention = self.getAttention(y_embeds)

        return att_y_embeds, multi_attention, y_embeds, embed_vector
Example No. 6
    def forward(self, y):
        # compute latent vectors
        encoder_t_embeds = self.embedding(y)
        encoder_t_embeds = encoder_t_embeds.permute(1, 0, 2)

        # obtain gru output of hidden vectors
        h0_en = Parameter(
            torch.zeros((1, y.shape[0], self.num_hidden), requires_grad=True))
        if self.CUDA:
            h0_en = h0_en.cuda()
        y_embeds, _ = self.gru(encoder_t_embeds, h0_en)

        att_y_embeds, multi_attention = self.getAttention(y_embeds)

        return att_y_embeds, encoder_t_embeds, multi_attention
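In the last two encoders (Examples No. 5 and No. 6), `h0_en` is rebuilt on every call and never registered on the module, so it behaves as a plain zero tensor rather than a learned parameter, and the hard-coded `.cuda(3)` / `self.CUDA` handling ties it to a specific device. A self-contained sketch of the same encoder pattern with a device-agnostic zero initial state (class and attribute names are illustrative; the attention step is omitted):

import torch
import torch.nn as nn

class TextEncoder(nn.Module):
    """Sketch of the GRU encoder pattern with a device-agnostic zero initial state."""

    def __init__(self, vocab_size, emb_dim, num_hidden):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.num_hidden = num_hidden
        self.gru = nn.GRU(emb_dim, num_hidden)

    def forward(self, y):
        # y: (batch, seq_len) of token indices
        embeds = self.embedding(y).permute(1, 0, 2)    # (seq_len, batch, emb_dim)
        h0 = torch.zeros(1, y.shape[0], self.num_hidden, device=y.device)
        y_embeds, _ = self.gru(embeds, h0)             # (seq_len, batch, num_hidden)
        return y_embeds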
Example No. 7
class ArcCos(nn.Module):
    def __init__(self, in_features, out_features, s=30.0, m=0.50, bias=False):
        super(ArcCos, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)

        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input, label):
        cosine = F.linear(F.normalize(input), F.normalize(self.weight.cuda()))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # torch.where: out_i = x_i if condition_i else y_i
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # torch.where can replace this on torch >= 0.4
        output *= self.s
        # print(output)

        return output
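A minimal usage sketch for the margin head above, with hypothetical sizes; note that this particular forward hard-codes `.cuda()` for the weight and the one-hot buffer, so a GPU is assumed:

import torch
import torch.nn as nn

embedding_dim, num_classes, batch = 512, 10, 4       # hypothetical sizes
head = ArcCos(embedding_dim, num_classes, s=30.0, m=0.50)

features = torch.randn(batch, embedding_dim).cuda()  # backbone embeddings
labels = torch.randint(0, num_classes, (batch,)).cuda()
logits = head(features, labels)                      # margin-adjusted, scaled cosines
loss = nn.CrossEntropyLoss()(logits, labels)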
Example No. 8
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance:
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
        """
    def __init__(self, in_features, out_features):
        super(ArcMarginProduct, self).__init__()
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        # nn.init.xavier_uniform_(self.weight)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)

    def forward(self, features):
        cosine = F.linear(F.normalize(features),
                          F.normalize(self.weight.cuda()))
        return cosine
Example No. 9
class PriorDistribution(nn.Module):
    def __init__(self, A, K, alpha, GPU=False):
        super(PriorDistribution, self).__init__()
        self.A = torch.tensor(A, dtype=torch.float)
        self.K = K
        self.alpha = alpha
        N = self.A.shape[0]
        self.prior_distribution_matrix = Parameter(data=torch.randn(
            size=[N, self.K], dtype=torch.float),
                                                   requires_grad=True)
        if GPU:
            self.A = self.A.cuda()
            # move the parameter's storage in place; assigning the result of .cuda()
            # back to the attribute would replace the registered Parameter with a
            # plain tensor (and recent PyTorch rejects that assignment)
            self.prior_distribution_matrix.data = self.prior_distribution_matrix.data.cuda()

    def forward(self):
        difference = self.A - torch.matmul(self.prior_distribution_matrix,
                                           self.prior_distribution_matrix.t())
        difference = torch.norm(difference, p=2)
        regular = torch.norm(self.prior_distribution_matrix, p=2)
        loss = difference + self.alpha * regular
        return loss
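A short optimization sketch for the prior above: the only trainable tensor is `prior_distribution_matrix`, and the forward pass already returns the loss ||A - M M^T|| + alpha * ||M||, so a plain gradient loop suffices (the adjacency matrix and all hyperparameters below are made up for illustration):

import torch

# hypothetical symmetric 0/1 adjacency matrix and settings
A = (torch.rand(50, 50) > 0.8).float()
A = torch.max(A, A.t())                      # symmetrize
model = PriorDistribution(A, K=8, alpha=0.1, GPU=False)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
for step in range(200):
    optimizer.zero_grad()
    loss = model()                           # ||A - M M^T|| + alpha * ||M||
    loss.backward()
    optimizer.step()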
Example No. 10
class ExclusiveLinear(nn.Module):
    r"""Implement of ArcFace (https://arxiv.org/pdf/1801.07698v1.pdf):
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            device_id: the ID of GPU where the model will be trained by model parallel.
                       if device_id=None, it will be trained on CPU without model parallel.
            s: norm of input feature
            m: margin
            cos(theta+m)
        """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.50, easy_margin=False):
        super(ExclusiveLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id

        self.s = s
        self.m = m

        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        if self.device_id is None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
            weight_norm = F.normalize(self.weight)
            cos = torch.mm(weight_norm, weight_norm.t())

            cos = cos.clamp(-1, 1)  # clamp() is out-of-place; keep its result
            cos1 = cos.detach()
            # mask each class's similarity with itself before taking the row max
            # (the original called .cuda(self.device_id[0]) here, which fails when device_id is None)
            cos1.scatter_(1, torch.arange(self.out_features, device=cos1.device).view(-1, 1), -100)
            max_cos, indices = torch.max(cos1, dim=1)
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))

            temp_weight = self.weight.cuda(self.device_id[0])
            cos = torch.mm(F.normalize(weight), F.normalize(temp_weight).t())

            cos = cos.clamp(-1, 1)  # keep the clamped result
            cos1 = cos.detach()
            length = weight.size()[0]
            cos1.scatter_(1, torch.arange(length).view(-1, 1).long().cuda(self.device_id[0]), -100)
            max_cos, indices = torch.max(cos1, dim=1)

            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])),
                                   dim=1)

                temp_weight = self.weight.cuda(self.device_id[i])
                # cos = torch.cat((cos, torch.mm(F.normalize(weight_transform), F.normalize(temp_weight).t()).cuda(self.device_id[0])), dim=0)
                cos = torch.mm(F.normalize(weight), F.normalize(temp_weight).t())

                cos = cos.clamp(-1, 1)  # keep the clamped result
                cos1 = cos.detach()
                length = weight.size()[0]
                cos1.scatter_(1, torch.arange(length).view(-1, 1).long().cuda(self.device_id[i]), -100)

                max_cos_, indices = torch.max(cos1, dim=1)
                max_cos = torch.cat((max_cos, max_cos_.cuda(self.device_id[0])), dim=0)

        exclusive_loss = torch.sum(max_cos) / self.out_features

        # cos1.scatter_(1, torch.arange(self.out_features).view(-1, 1).long().cuda(self.device_id[0]), -100)
        # mask = torch.zeros((self.out_features, self.out_features)).cuda(self.device_id[0])
        # mask.scatter_(1, indices.view(-1, 1).long(), 1)
        #
        # exclusive_loss = torch.dot(cos.view(cos.numel()), mask.view(mask.numel())) / self.out_features

        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))  # clamp guards against tiny negatives from rounding
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id is not None:
            one_hot = one_hot.cuda(self.device_id[0])
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)

        theta = torch.acos(torch.clamp(cosine, -1.0 + 1e-7, 1.0 - 1e-7))
        with torch.no_grad():
            # B_avg = torch.where(one_hot < 1, torch.exp(self.s * cosine), torch.zeros_like(cosine))
            B_avg_ = cosine[one_hot != 1]
            B_avg = torch.sum(torch.exp(self.s * B_avg_)) / input.size(0)
            # print(B_avg)
            theta_med = torch.median(theta[one_hot == 1])
            theta_sum = torch.sum(theta[one_hot != 1])

        # torch.where: out_i = x_i if condition_i else y_i
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # torch.where can replace this on torch >= 0.4

        print("=" * 60)
        print("s={} theta_med={} theta_sum={} B_avg={}".format(self.s, theta_med, theta_sum, B_avg))
        print("=" * 60)

        output *= self.s

        return output, exclusive_loss
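A minimal usage sketch for the layer above, assuming the single-device branch (device_id=None, with the indexing fix noted in the comments) and hypothetical sizes; the exclusive regularizer is simply added to the classification loss with a user-chosen weight (here called lam, not part of the original code):

import torch
import torch.nn as nn

embedding_dim, num_classes, batch, lam = 512, 10, 4, 0.1   # hypothetical values
head = ExclusiveLinear(embedding_dim, num_classes, device_id=None)

features = torch.randn(batch, embedding_dim)
labels = torch.randint(0, num_classes, (batch,))
logits, exclusive_loss = head(features, labels)            # ArcFace logits + weight-exclusivity term
loss = nn.CrossEntropyLoss()(logits, labels) + lam * exclusive_loss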
Example No. 11
class BBBLinearFactorial(nn.Module):
    """
    Describes a Linear fully connected Bayesian layer with
    a distribution over each of the weights and biases
    in the layer.
    """
    def __init__(self,
                 in_features,
                 out_features,
                 p_logvar_init=-3,
                 p_pi=1.0,
                 q_logvar_init=-5):
        # p_logvar_init, p_pi can be either:
        #   (list/tuple): the prior is a mixture of Gaussians with len(p_pi) == len(p_logvar_init) components
        #   float: a single Gaussian distribution
        # q_logvar_init: float; the approximate posterior is currently always a factorized Gaussian
        super(BBBLinearFactorial, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.p_logvar_init = p_logvar_init
        self.q_logvar_init = q_logvar_init

        # Approximate posterior weights...
        self.qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.qw_logvar = Parameter(torch.Tensor(out_features, in_features))

        # optionally add bias
        # self.qb_mean = Parameter(torch.Tensor(out_features))
        # self.qb_logvar = Parameter(torch.Tensor(out_features))

        # ...and output...
        self.fc_qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.fc_qw_std = Parameter(torch.Tensor(out_features, in_features))

        # ...as normal distributions
        self.qw = Normal(mu=self.qw_mean, logvar=self.qw_logvar)
        # self.qb = Normal(mu=self.qb_mean, logvar=self.qb_logvar)
        self.fc_qw = Normalout(mu=self.fc_qw_mean, std=self.fc_qw_std)

        # initialise
        self.log_alpha = Parameter(torch.Tensor(1, 1))

        # prior model
        self.pw = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)
        # self.pb = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)

        # initialize all parameters
        self.reset_parameters()

    def reset_parameters(self):
        # initialize (trainable) approximate posterior parameters
        stdv = 10.0 / math.sqrt(self.in_features)
        self.qw_mean.data.uniform_(-stdv, stdv)
        self.qw_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        # self.qb_mean.data.uniform_(-stdv, stdv)
        # self.qb_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.fc_qw_mean.data.uniform_(-stdv, stdv)
        self.fc_qw_std.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.log_alpha.data.uniform_(-stdv, stdv)

    def forward(self, input):
        raise NotImplementedError()

    def fcprobforward(self, input):
        """
        Probabilistic forwarding method.
        :param input: data tensor
        :return: output, kl-divergence
        """
        if cuda:
            input = input.cuda()
            qw_mean = self.qw_mean.cuda()
            log_alpha = self.log_alpha.cuda()
        else:
            input = input
            qw_mean = self.qw_mean
            log_alpha = self.log_alpha

        fc_qw_mean = F.linear(input=input, weight=qw_mean)
        fc_qw_si = torch.sqrt(1e-8 + F.linear(
            input=input.pow(2), weight=torch.exp(log_alpha) * qw_mean.pow(2)))

        if cuda:
            fc_qw_mean = fc_qw_mean.cuda()
            fc_qw_si = fc_qw_si.cuda()

        # sample from output
        if cuda:
            output = fc_qw_mean + fc_qw_si * torch.randn(
                fc_qw_mean.size()).cuda()
        else:
            output = fc_qw_mean + fc_qw_si * (torch.randn(fc_qw_mean.size()))

        w_sample = self.fc_qw.sample()

        # KL divergence
        qw_logpdf = self.fc_qw.logpdf(w_sample)

        kl = torch.sum(qw_logpdf - self.pw.logpdf(w_sample))

        if cuda:
            output = output.cuda()
            kl = kl.cuda()

        return output, kl

    def __repr__(self):
        return (self.__class__.__name__ + " (" + str(self.in_features) +
                " -> " + str(self.out_features) + ")")