def init_hidden(self, hidden_dim):
    """Initial hidden and cell states (zero-initialized, not trainable)."""
    enc_init_hx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
    if self.use_cuda:
        enc_init_hx = enc_init_hx.cuda()

    enc_init_cx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
    if self.use_cuda:
        enc_init_cx = enc_init_cx.cuda()

    return enc_init_hx, enc_init_cx
def forward(self, y):
    # Use latent_y = (batch, num_hidden) as input to predict a sequence of ingredient words
    # y has size (batch, num_hidden)
    h_de = []           # store the output hidden vectors of the GRU
    gru_predicts = []   # store the GRU predictions for words

    h0_de = Parameter(
        torch.zeros((1, y.shape[0], self.num_glove), requires_grad=True))
    # if self.h0_en is None:
    #     self.init_hidden(torch.zeros((1, y.shape[0], self.num_glove), requires_grad=True))

    current_input = torch.cat(
        [y, torch.zeros(y.shape[0], self.num_glove).cuda(set_gpu_others)],
        1).unsqueeze(0)  # (1, batch, num_hidden + num_glove)
    current_input = self.hiddenMap2(
        self.relu(self.hiddenMap1(current_input)))
    # print('current_input: {}'.format(current_input.shape))
    prev_hidden = h0_de.cuda(set_gpu_others)
    # print('prev_hidden: {}'.format(prev_hidden.shape))

    for i in range(0, self.seq):  # for each step of the decoder's max sequence length
        # NOTE: current_hidden = prev_hidden; we use different names to clarify their roles
        current_hidden, prev_hidden = self.gruLoop(current_input, prev_hidden)
        # save the GRU output
        h_de.append(current_hidden)
        # compute the next GRU input: the GloVe embedding of the currently predicted word
        current_input, wordPredicts = self.getNextInput(y, current_hidden)
        gru_predicts.append(wordPredicts)

    # make them tensors of shape (seq, batch, num_word)
    return torch.cat(gru_predicts, 0), torch.cat(h_de, 0)
def init_hidden(self, hidden_dim):
    """Initial hidden and cell states (zero-initialized, not trainable)."""
    enc_init_hx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
    if self.use_cuda:
        enc_init_hx = enc_init_hx.cuda()
    # enc_init_hx = Parameter(enc_init_hx, requires_grad=True)
    # enc_init_hx.uniform_(-(1. / math.sqrt(hidden_dim)), 1. / math.sqrt(hidden_dim))

    enc_init_cx = Parameter(torch.zeros(hidden_dim), requires_grad=False)
    if self.use_cuda:
        enc_init_cx = enc_init_cx.cuda()
    # enc_init_cx = nn.Parameter(enc_init_cx, requires_grad=True)
    # enc_init_cx.uniform_(-(1. / math.sqrt(hidden_dim)), 1. / math.sqrt(hidden_dim))

    return enc_init_hx, enc_init_cx
def init_mask(self, size_0, size_1, input_length):
    # Build a (size_0, size_1) byte mask of ones, with the first input_length columns zeroed out
    mask = Parameter(torch.ones(1), requires_grad=False)
    mask = mask.repeat(size_1).unsqueeze(0).repeat(size_0, 1)
    input_index = list(range(input_length))
    for i in range(size_0):
        mask[i][input_index] = 0
    # print(mask)
    mask = mask.byte()
    mask = mask.cuda()
    return mask
def forward(self, y):
    # compute latent vectors
    # indexVector, num_words_per_data, word_label = getIndexVector(y, self.max_seq)
    # indexVector = torch.from_numpy(indexVector).long().cuda(3)
    embed_vector = self.embedding(y)
    embed_vector = embed_vector.permute(1, 0, 2)

    # obtain the GRU output of hidden vectors
    h0_en = Parameter(
        torch.zeros((1, y.shape[0], self.num_hidden), requires_grad=True))
    self.gru.flatten_parameters()
    y_embeds, _ = self.gru(embed_vector, h0_en.cuda(3))

    att_y_embeds, multi_attention = self.getAttention(y_embeds)
    return att_y_embeds, multi_attention, y_embeds, embed_vector
def forward(self, y):
    # compute latent vectors
    encoder_t_embeds = self.embedding(y)
    encoder_t_embeds = encoder_t_embeds.permute(1, 0, 2)

    # obtain the GRU output of hidden vectors
    h0_en = Parameter(
        torch.zeros((1, y.shape[0], self.num_hidden), requires_grad=True))
    if self.CUDA:
        h0_en = h0_en.cuda()
    y_embeds, _ = self.gru(encoder_t_embeds, h0_en)

    att_y_embeds, multi_attention = self.getAttention(y_embeds)
    return att_y_embeds, encoder_t_embeds, multi_attention
class ArcCos(nn.Module):

    def __init__(self, in_features, out_features, s=30.0, m=0.50, bias=False):
        super(ArcCos, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input, label):
        cosine = F.linear(F.normalize(input), F.normalize(self.weight.cuda()))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # ------------- torch.where(out_i = {x_i if condition_i else y_i}) -------------
        output = (one_hot * phi) + (
            (1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)
        return output
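# Hedged usage sketch (not part of the original source): ArcCos acts as a classification
# head that takes feature embeddings plus integer class labels and returns scaled,
# margin-adjusted logits for cross-entropy. The shapes and hyperparameters below are
# illustrative assumptions; the layer as written requires CUDA because forward() moves
# the weight and the one-hot mask onto the GPU.
if torch.cuda.is_available():
    arc_head = ArcCos(in_features=512, out_features=1000, s=30.0, m=0.50)
    feats = torch.randn(8, 512).cuda()            # (batch, in_features) embeddings
    labels = torch.randint(0, 1000, (8,)).cuda()  # (batch,) class indices
    logits = arc_head(feats, labels)              # (batch, out_features), already scaled by s
    loss = nn.CrossEntropyLoss()(logits, labels)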
class ArcMarginProduct(nn.Module):
    r"""Implementation of large margin arc distance.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
    """

    def __init__(self, in_features, out_features):
        super(ArcMarginProduct, self).__init__()
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        # nn.init.xavier_uniform_(self.weight)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)

    def forward(self, features):
        cosine = F.linear(F.normalize(features), F.normalize(self.weight.cuda()))
        return cosine
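# Hedged usage sketch (not part of the original source): this ArcMarginProduct variant
# only produces cosine similarities between normalized features and normalized class
# weights; any margin or scaling is applied outside the module. Shapes are illustrative
# assumptions, and the .cuda() call inside forward() means CUDA must be available.
if torch.cuda.is_available():
    cos_head = ArcMarginProduct(in_features=512, out_features=1000)
    feats = torch.randn(8, 512).cuda()
    cosines = cos_head(feats)  # (batch, out_features), each entry in [-1, 1]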
class PriorDistribution(nn.Module):

    def __init__(self, A, K, alpha, GPU=False):
        super(PriorDistribution, self).__init__()
        self.A = torch.tensor(A, dtype=torch.float)
        self.K = K
        self.alpha = alpha
        N = self.A.shape[0]
        self.prior_distribution_matrix = Parameter(
            data=torch.randn(size=[N, self.K], dtype=torch.float),
            requires_grad=True)
        if GPU:
            self.A = self.A.cuda()
            self.prior_distribution_matrix = self.prior_distribution_matrix.cuda()

    def forward(self):
        difference = self.A - torch.matmul(self.prior_distribution_matrix,
                                           self.prior_distribution_matrix.t())
        difference = torch.norm(difference, p=2)
        regular = torch.norm(self.prior_distribution_matrix, p=2)
        loss = difference + self.alpha * regular
        return loss
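# Hedged usage sketch (not part of the original source): PriorDistribution.forward()
# returns the scalar loss ||A - M M^T||_2 + alpha * ||M||_2, so the N x K matrix M
# (prior_distribution_matrix) can be fit by plain gradient descent. The toy matrix A,
# the values of K, alpha, the learning rate, and the step count are illustrative assumptions.
import numpy as np
import torch.optim as optim

A = np.eye(16, dtype=np.float32)  # toy symmetric matrix to factorize
prior = PriorDistribution(A, K=4, alpha=1e-3, GPU=False)
optimizer = optim.Adam(prior.parameters(), lr=1e-2)
for _ in range(100):
    optimizer.zero_grad()
    loss = prior()  # scalar reconstruction + regularization loss
    loss.backward()
    optimizer.step()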
class ExclusiveLinear(nn.Module):
    r"""Implementation of ArcFace (https://arxiv.org/pdf/1801.07698v1.pdf) with an exclusive regularization term.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        device_id: the IDs of the GPUs used for model parallelism.
            If device_id=None, the layer runs on a single device without model parallelism.
        s: norm of the input feature
        m: margin, cos(theta + m)
    """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.50, easy_margin=False):
        super(ExclusiveLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        if self.device_id is None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
            weight_norm = F.normalize(self.weight)
            cos = torch.mm(weight_norm, weight_norm.t())
            cos = cos.clamp(-1, 1)
            cos1 = cos.detach()
            # mask out the diagonal (self-similarity) before taking the per-class maximum
            cos1.scatter_(1, torch.arange(self.out_features, device=cos1.device).view(-1, 1).long(), -100)
            max_cos, indices = torch.max(cos1, dim=1)
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
            temp_weight = self.weight.cuda(self.device_id[0])
            cos = torch.mm(F.normalize(weight), F.normalize(temp_weight).t())
            cos = cos.clamp(-1, 1)
            cos1 = cos.detach()
            length = weight.size()[0]
            cos1.scatter_(1, torch.arange(length).view(-1, 1).long().cuda(self.device_id[0]), -100)
            max_cos, indices = torch.max(cos1, dim=1)
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])), dim=1)
                temp_weight = self.weight.cuda(self.device_id[i])
                # cos = torch.cat((cos, torch.mm(F.normalize(weight_transform), F.normalize(temp_weight).t()).cuda(self.device_id[0])), dim=0)
                cos = torch.mm(F.normalize(weight), F.normalize(temp_weight).t())
                cos = cos.clamp(-1, 1)
                cos1 = cos.detach()
                length = weight.size()[0]
                cos1.scatter_(1, torch.arange(length).view(-1, 1).long().cuda(self.device_id[i]), -100)
                max_cos_, indices = torch.max(cos1, dim=1)
                max_cos = torch.cat((max_cos, max_cos_.cuda(self.device_id[0])), dim=0)

        exclusive_loss = torch.sum(max_cos) / self.out_features
        # cos1.scatter_(1, torch.arange(self.out_features).view(-1, 1).long().cuda(self.device_id[0]), -100)
        # mask = torch.zeros((self.out_features, self.out_features)).cuda(self.device_id[0])
        # mask.scatter_(1, indices.view(-1, 1).long(), 1)
        # exclusive_loss = torch.dot(cos.view(cos.numel()), mask.view(mask.numel())) / self.out_features

        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id is not None:
            one_hot = one_hot.cuda(self.device_id[0])
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)

        theta = torch.acos(torch.clamp(cosine, -1.0 + 1e-7, 1.0 - 1e-7))
        with torch.no_grad():
            # B_avg = torch.where(one_hot < 1, torch.exp(self.s * cosine), torch.zeros_like(cosine))
            B_avg_ = cosine[one_hot != 1]
            B_avg = torch.sum(torch.exp(self.s * B_avg_)) / input.size(0)
            # print(B_avg)
            theta_med = torch.median(theta[one_hot == 1])
            theta_sum = torch.sum(theta[one_hot != 1])

        # ------------- torch.where(out_i = {x_i if condition_i else y_i}) -------------
        output = (one_hot * phi) + (
            (1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        print("=" * 60)
        print("s={} theta_med={} theta_sum={} B_avg={}".format(self.s, theta_med, theta_sum, B_avg))
        print("=" * 60)
        output *= self.s

        return output, exclusive_loss
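# Hedged usage sketch (not part of the original source): ExclusiveLinear returns both the
# margin-adjusted logits and an "exclusive" regularization term (the mean of the largest
# off-diagonal cosine between class weight vectors), which is added to the classification
# loss. The feature size, class count, and 0.1 loss weight are illustrative assumptions;
# device_id=None keeps everything on a single device.
excl_head = ExclusiveLinear(in_features=512, out_features=100, device_id=None)
feats = torch.randn(8, 512)
labels = torch.randint(0, 100, (8,))
logits, exclusive_loss = excl_head(feats, labels)
total_loss = nn.CrossEntropyLoss()(logits, labels) + 0.1 * exclusive_loss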
class BBBLinearFactorial(nn.Module):
    """Linear (fully connected) Bayesian layer with a distribution over each of the
    weights and biases in the layer.
    """

    def __init__(self, in_features, out_features, p_logvar_init=-3, p_pi=1.0, q_logvar_init=-5):
        # p_logvar_init, p_pi can be either:
        #   (list/tuple): the prior is a mixture of Gaussians, components = len(p_pi) = len(p_logvar_init)
        #   float: the prior is a single Gaussian
        # q_logvar_init: float; the approximate posterior is currently always a factorized Gaussian
        super(BBBLinearFactorial, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.p_logvar_init = p_logvar_init
        self.q_logvar_init = q_logvar_init

        # Approximate posterior weights...
        self.qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.qw_logvar = Parameter(torch.Tensor(out_features, in_features))
        # optionally add bias
        # self.qb_mean = Parameter(torch.Tensor(out_features))
        # self.qb_logvar = Parameter(torch.Tensor(out_features))

        # ...and output...
        self.fc_qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.fc_qw_std = Parameter(torch.Tensor(out_features, in_features))

        # ...as normal distributions
        self.qw = Normal(mu=self.qw_mean, logvar=self.qw_logvar)
        # self.qb = Normal(mu=self.qb_mean, logvar=self.qb_logvar)
        self.fc_qw = Normalout(mu=self.fc_qw_mean, std=self.fc_qw_std)

        # initialise
        self.log_alpha = Parameter(torch.Tensor(1, 1))

        # prior model
        self.pw = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)
        # self.pb = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)

        # initialize all parameters
        self.reset_parameters()

    def reset_parameters(self):
        # initialize (trainable) approximate posterior parameters
        stdv = 10.0 / math.sqrt(self.in_features)
        self.qw_mean.data.uniform_(-stdv, stdv)
        self.qw_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        # self.qb_mean.data.uniform_(-stdv, stdv)
        # self.qb_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.fc_qw_mean.data.uniform_(-stdv, stdv)
        self.fc_qw_std.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.log_alpha.data.uniform_(-stdv, stdv)

    def forward(self, input):
        raise NotImplementedError()

    def fcprobforward(self, input):
        """Probabilistic forwarding method.

        :param input: data tensor
        :return: output, KL divergence
        """
        if cuda:
            input = input.cuda()
            qw_mean = self.qw_mean.cuda()
            log_alpha = self.log_alpha.cuda()
        else:
            qw_mean = self.qw_mean
            log_alpha = self.log_alpha

        fc_qw_mean = F.linear(input=input, weight=qw_mean)
        fc_qw_si = torch.sqrt(1e-8 + F.linear(
            input=input.pow(2), weight=torch.exp(log_alpha) * qw_mean.pow(2)))
        if cuda:
            fc_qw_mean = fc_qw_mean.cuda()
            fc_qw_si = fc_qw_si.cuda()

        # sample from the output distribution
        if cuda:
            output = fc_qw_mean + fc_qw_si * torch.randn(fc_qw_mean.size()).cuda()
        else:
            output = fc_qw_mean + fc_qw_si * torch.randn(fc_qw_mean.size())

        w_sample = self.fc_qw.sample()

        # KL divergence
        qw_logpdf = self.fc_qw.logpdf(w_sample)
        kl = torch.sum(qw_logpdf - self.pw.logpdf(w_sample))

        if cuda:
            output = output.cuda()
            kl = kl.cuda()

        return output, kl

    def __repr__(self):
        return (self.__class__.__name__ + " (" + str(self.in_features) + " -> "
                + str(self.out_features) + ")")
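# Hedged usage sketch (not part of the original source, kept as comments because the
# layer depends on helpers not shown here): BBBLinearFactorial is meant to be called via
# fcprobforward(), which returns the stochastic layer output together with a KL term to
# be added (suitably weighted) to the data loss. This assumes Normal, Normalout,
# distribution_selector and the module-level `cuda` flag are importable from the same
# codebase; `targets` and `num_batches` below are assumed to exist in the training loop.
#
# bbb_fc = BBBLinearFactorial(in_features=784, out_features=10)
# x = torch.randn(32, 784)
# logits, kl = bbb_fc.fcprobforward(x)
# loss = F.cross_entropy(logits, targets) + kl / num_batches  # ELBO-style objective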