def forward(self, input, label):
    # --------------------------- cos(theta) & phi(theta) ---------------------------
    if self.device_id is None:
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
    else:
        x = input
        sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
        temp_x = x.cuda(self.device_id[0])
        weight = sub_weights[0].cuda(self.device_id[0])
        cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
        for i in range(1, len(self.device_id)):
            temp_x = x.cuda(self.device_id[i])
            weight = sub_weights[i].cuda(self.device_id[i])
            cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])), dim=1)

    sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
    phi = cosine * self.cos_m - sine * self.sin_m
    if self.easy_margin:
        phi = torch.where(cosine > 0, phi, cosine)
    else:
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)

    # --------------------------- convert label to one-hot ---------------------------
    one_hot = torch.zeros(cosine.size())
    if self.device_id is not None:
        one_hot = one_hot.cuda(self.device_id[0])
    one_hot.scatter_(1, label.view(-1, 1).long(), 1)

    # torch.where(condition, x, y): out_i = x_i if condition_i else y_i
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is >= 0.4
    output *= self.s

    return output
def PeepholeLSTMCell(input: torch.Tensor,
                     hidden: Tuple[torch.Tensor, torch.Tensor],
                     w_ih: torch.Tensor,
                     w_hh: torch.Tensor,
                     w_ip: torch.Tensor,
                     w_fp: torch.Tensor,
                     w_op: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    An LSTM cell with peephole connections and without biases.

    Mostly ripped from the PyTorch autograd LSTM implementation.
    """
    hx, cx = hidden
    gates = F.linear(input, w_ih) + F.linear(hx, w_hh)
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    peep_i = w_ip.unsqueeze(0).expand_as(cx) * cx
    ingate = ingate + peep_i
    peep_f = w_fp.unsqueeze(0).expand_as(cx) * cx
    forgetgate = forgetgate + peep_f

    ingate = F.sigmoid(ingate)
    forgetgate = F.sigmoid(forgetgate)
    cellgate = F.tanh(cellgate)
    cy = (forgetgate * cx) + (ingate * cellgate)
    peep_o = w_op.unsqueeze(0).expand_as(cy) * cy
    outgate = outgate + peep_o
    hy = outgate * F.tanh(cy)

    return hy, cy
def f(params, inputs, mode):
    o = inputs.view(inputs.size(0), 1, 28, 28)
    o = F.conv2d(o, params['conv0.weight'], params['conv0.bias'], stride=2)
    o = F.relu(o)
    o = F.conv2d(o, params['conv1.weight'], params['conv1.bias'], stride=2)
    o = F.relu(o)
    o = o.view(o.size(0), -1)
    o = F.linear(o, params['linear2.weight'], params['linear2.bias'])
    o = F.relu(o)
    o = F.linear(o, params['linear3.weight'], params['linear3.bias'])
    return o
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    hx, cx = hidden
    gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    ingate = F.sigmoid(ingate)
    forgetgate = F.sigmoid(forgetgate)
    cellgate = F.tanh(cellgate)
    outgate = F.sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * F.tanh(cy)

    return hy, cy
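# Usage sketch (not from the original source): a minimal, hypothetical driver for the
# functional LSTMCell defined directly above. Weight shapes follow the standard PyTorch
# convention (4*hidden_size x input_size and 4*hidden_size x hidden_size); all names and
# sizes below are illustrative assumptions, and `import torch.nn.functional as F` is
# assumed at module level as in the other snippets.
import torch

def run_lstm_cell_demo(batch=2, input_size=5, hidden_size=3, steps=4):
    w_ih = torch.randn(4 * hidden_size, input_size)
    w_hh = torch.randn(4 * hidden_size, hidden_size)
    hx = torch.zeros(batch, hidden_size)
    cx = torch.zeros(batch, hidden_size)
    for _ in range(steps):
        x_t = torch.randn(batch, input_size)
        hx, cx = LSTMCell(x_t, (hx, cx), w_ih, w_hh)
    return hx, cx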
def decode(self, hiddens):
    """
    Given the value for the hidden activations, do a decoding pass
    (update every other hidden activation and the visible input layer).
    """
    # starting with the hiddens,
    # update the reconstructed x and every other hidden layer using the activations from layers below and above
    for i in range(1, len(hiddens), 2):  # odd layers
        # grab the parameters to use!
        (encode_w, bias), _ = self.layers[i]
        # encode up from below
        hidden = F.linear(input=hiddens[i - 1], weight=encode_w, bias=bias)
        # decode down from above (if this isn't the top layer)
        if i < len(hiddens) - 1:
            (encode_w1, _), decode_w = self.layers[i + 1]
            if decode_w is None:
                decode_w = encode_w1.t()
            hidden = hidden + F.linear(input=hiddens[i + 1], weight=decode_w)
        # pre-activation noise
        hidden = self.hidden_corrupt(hidden)
        # apply activation
        hidden = self.hidden_act(hidden)
        # post-activation noise
        hidden = self.hidden_corrupt(hidden)
        # done with this hidden layer
        hiddens[i] = hidden

    # now do the reconstructed x!
    (encode_w1, _), decode_w = self.layers[0]
    if decode_w is None:
        decode_w = encode_w1.t()
    x_recon = F.linear(input=hiddens[0], weight=decode_w, bias=self.visible_bias)
    x_recon = self.visible_act(x_recon)

    # sample from p(X|H...) - sampling needs to be correct for the input type, i.e. binomial for binary MNIST
    if self.input_sampling:
        if isinstance(self.visible_act, nn.Sigmoid):
            sampled = self.sampling_fn(x_recon)
        else:
            print("Input sampling isn't defined for activation {!s}".format(type(self.visible_act)))
            sampled = x_recon
    else:
        sampled = x_recon

    return x_recon, hiddens, sampled
def do_decode(self, siz, seq_len, context_encoding, target):
    ses_inf_vec = self.ses_inf(context_encoding)
    context_encoding = self.tanh(self.ses_to_dec(context_encoding))
    hid_n, preds, lm_preds = context_encoding, [], []

    hid_n = hid_n.view(self.num_lyr, siz, self.hid_size)
    inp_tok = Variable(torch.ones(siz, 1).long())
    lm_hid = Variable(torch.zeros(self.num_lyr, siz, self.hid_size))
    if use_cuda:
        lm_hid = lm_hid.cuda()
        inp_tok = inp_tok.cuda()

    for i in range(seq_len):
        # initially tc_ratio is 1 but then slowly decays to 0 (to match inference time)
        if torch.randn(1)[0] < self.tc_ratio:
            inp_tok = target[:, i].unsqueeze(1)

        inp_tok_embedding = self.embed_in(inp_tok)
        emb_inf_vec = self.emb_inf(inp_tok_embedding)

        inp_tok_embedding = self.drop(inp_tok_embedding)

        hid_o, hid_n = self.rnn(inp_tok_embedding, hid_n)
        dec_hid_vec = self.dec_inf(hid_o)

        total_hid_o = dec_hid_vec + ses_inf_vec + emb_inf_vec
        hid_o_mx = max_out(total_hid_o)
        hid_o_mx = F.linear(hid_o_mx, self.embed_in.weight) if self.shared_weight else self.embed_out(hid_o_mx)
        preds.append(hid_o_mx)

        if self.train_lm:
            lm_o, lm_hid = self.lm(inp_tok_embedding, lm_hid)
            lm_o = self.lin3(lm_o)
            lm_o = F.linear(lm_o, self.embed_in.weight) if self.shared_weight else self.embed_out(lm_o)
            lm_preds.append(lm_o)

        # greedy decoding: pick the next input token as the argmax over the vocabulary.
        # we can ignore the last symbol, which is the padding token, so inp_tok will be a
        # value between 0 and 10002 excluding padding_idx. Technically no softmax is
        # needed here, since the max score already gives the max softmax probability.
        op = hid_o[:, :, :-1]
        op = F.log_softmax(op, dim=2)
        max_val, inp_tok = torch.max(op, dim=2)

    dec_o = torch.cat(preds, 1)
    dec_lmo = torch.cat(lm_preds, 1) if self.train_lm else None
    return dec_o, dec_lmo
def _in_proj(self, input, start=0, end=None):
    weight = self.in_proj_weight
    bias = self.in_proj_bias
    weight = weight[start:end, :]
    if bias is not None:
        bias = bias[start:end]
    return F.linear(input, weight, bias)
def forward(self, x):
    if self.device_id is None:
        out = F.linear(x, self.weight, self.bias)
    else:
        sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
        sub_biases = torch.chunk(self.bias, len(self.device_id), dim=0)
        temp_x = x.cuda(self.device_id[0])
        weight = sub_weights[0].cuda(self.device_id[0])
        bias = sub_biases[0].cuda(self.device_id[0])
        out = F.linear(temp_x, weight, bias)
        for i in range(1, len(self.device_id)):
            temp_x = x.cuda(self.device_id[i])
            weight = sub_weights[i].cuda(self.device_id[i])
            bias = sub_biases[i].cuda(self.device_id[i])
            out = torch.cat((out, F.linear(temp_x, weight, bias).cuda(self.device_id[0])), dim=1)
    return out
def incremental_forward(self, input):
    """Forward convolution one time step at a time.

    This function maintains an internal state to buffer signal and accepts a
    single frame as input. If the input order changes between time steps, call
    reorder_incremental_state. To apply to fresh inputs, call
    clear_incremental_state.
    """
    # reshape weight
    weight = self._get_linearized_weight()
    kw = self.kernel_size[0]

    bsz = input.size(0)  # input: bsz x len x dim
    if kw > 1:
        input = input.data
        if self.input_buffer is None:
            self.input_buffer = input.new(bsz, kw, input.size(2))
            self.input_buffer.zero_()
        else:
            # shift buffer
            self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone()
        # append next input
        self.input_buffer[:, -1, :] = input[:, -1, :]
        input = torch.autograd.Variable(self.input_buffer, volatile=True)
    output = F.linear(input.view(bsz, -1), weight, self.bias)
    return output.view(bsz, 1, -1)
def forward(self, input):
    self.epsilon_weight.normal_()
    bias = self.bias
    if bias is not None:
        self.epsilon_bias.normal_()
        bias = bias + self.sigma_bias * self.epsilon_bias.data
    return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias)
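# A hedged sketch (not the authors' class) of the NoisyLinear-style module that the
# forward above appears to belong to: learnable mean and sigma parameters plus
# non-trainable epsilon buffers that are resampled on every call. The class name and
# initialization constants below are illustrative assumptions.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyLinearSketch(nn.Module):
    def __init__(self, in_features, out_features, sigma_init=0.017):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        self.register_buffer('epsilon_weight', torch.zeros(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))
        self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
        self.register_buffer('epsilon_bias', torch.zeros(out_features))
        bound = math.sqrt(3.0 / in_features)
        nn.init.uniform_(self.weight, -bound, bound)

    def forward(self, x):
        # resample independent Gaussian noise, then perturb weight and bias
        self.epsilon_weight.normal_()
        self.epsilon_bias.normal_()
        weight = self.weight + self.sigma_weight * self.epsilon_weight
        bias = self.bias + self.sigma_bias * self.epsilon_bias
        return F.linear(x, weight, bias)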
def forward(self, x):
    # flatten input
    if len(x.size()) > 2:
        x = x.view(-1, int(np.prod(x.size()[1:])))
    # corrupt input
    x = self.input_corrupt(x)
    corrupted = x
    # encode
    for layer in self.encode_layers:
        x = layer(x)
        x = F.relu(x)
    # decode
    if self.tied_weights:
        for i, (layer, bias) in enumerate(self.decode_params):
            x = F.linear(x, weight=layer.weight.t(), bias=bias)
            if i == len(self.decode_params) - 1:
                x = self.visible_act(x)
            else:
                x = F.relu(x)
    else:
        for i, layer in enumerate(self.decode_layers):
            x = layer(x)
            if i == len(self.decode_layers) - 1:
                x = self.visible_act(x)
            else:
                x = F.relu(x)
    return x, corrupted
def forward(self, input):
    weight = self.weight
    if self.shared:
        # detach weight to prevent gradients from changing weight
        # (but need to detach every time so weights are up to date)
        weight = weight.detach()
    return F.linear(input, weight, self.bias)
def F_affine3d(x, matrix, center=True):
    A = matrix[:3, :3]
    b = matrix[:3, 3]

    # make a meshgrid of normal coordinates
    coords = Variable(th_iterproduct(x.size(1), x.size(2), x.size(3)).float(),
                      requires_grad=False)

    if center:
        # shift the coordinates so the center is the origin
        coords[:, 0] = coords[:, 0] - (x.size(1) / 2. + 0.5)
        coords[:, 1] = coords[:, 1] - (x.size(2) / 2. + 0.5)
        coords[:, 2] = coords[:, 2] - (x.size(3) / 2. + 0.5)

    # apply the coordinate transformation
    new_coords = F.linear(coords, A, b)

    if center:
        # shift the coordinates back to undo the centering
        new_coords[:, 0] = new_coords[:, 0] + (x.size(1) / 2. + 0.5)
        new_coords[:, 1] = new_coords[:, 1] + (x.size(2) / 2. + 0.5)
        new_coords[:, 2] = new_coords[:, 2] + (x.size(3) / 2. + 0.5)

    # map new coordinates using trilinear interpolation
    x_transformed = F_trilinear_interp3d(x, new_coords)

    return x_transformed
def regression_step(self, sequence_hiddens):
    """
    Given the history list of GSN hiddens, make the next full list of GSN hiddens from our regression parameters.
    """
    sequence_reverse = sequence_hiddens[::-1]
    hiddens = []
    for layer, _ in enumerate(self.sizes[1:]):
        if layer % 2 == 0:
            # do the window calculation for the layer!
            regression_terms = []
            for window in range(self.window_size):
                if window < len(sequence_reverse):
                    regression_weight = self.regression_weights[layer][window]
                    regression_bias = self.regression_biases[layer] if window == 0 else None
                    regression_terms.append(
                        F.linear(
                            input=sequence_reverse[window][layer],
                            weight=regression_weight,
                            bias=regression_bias
                        )
                    )
                else:
                    regression_terms.append(self.missing_biases[layer][window])
            hiddens.append(sum(regression_terms))
        else:
            hiddens.append(None)
    return hiddens
def forward(self, x):
    lrt_mean = self.bias
    lrt_std = torch.sqrt(1e-16 + F.linear(x * x, self.sigma * self.sigma))
    if self.training:
        eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
    else:
        eps = 0.0
    return lrt_mean + eps * lrt_std
def forward(self, x):
    if self.training:
        eps = Variable(torch.bernoulli(self.probs) - 0.5)
    else:
        eps = 0.0
    output = F.linear(x, self.W * eps)
    if self.bias is not None:
        output = output + self.bias
    return output
def forward(self, x):
    if self.training:
        eps = Variable(self.W.data.new(self.W.size()).uniform_() - 0.5)
    else:
        eps = 0.0
    output = F.linear(x, self.W * eps)
    if self.bias is not None:
        output = output + self.bias
    return output
def forward(self, x):
    if self.zero_mean:
        lrt_mean = 0.0
    else:
        lrt_mean = F.linear(x, self.W)
    if self.bias is not None:
        lrt_mean = lrt_mean + self.bias

    sigma2 = torch.exp(self.log_alpha) * self.W * self.W
    if self.permute_sigma:
        sigma2 = sigma2.view(-1)[torch.randperm(self.in_features * self.out_features).cuda()].view(self.out_features, self.in_features)

    lrt_std = torch.sqrt(1e-16 + F.linear(x * x, sigma2))
    if self.training:
        eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
    else:
        eps = 0.0
    return lrt_mean + lrt_std * eps
def forward(self, x):
    if self.deterministic:
        assert self.training == False, "Flag deterministic is True. This should not be used in training."
        return F.linear(x, self.post_weight_mu, self.bias_mu)

    batch_size = x.size()[0]
    # compute z
    # note that we reparametrise according to [2] Eq. (11) (not [1])
    z = reparametrize(self.z_mu.repeat(batch_size, 1), self.z_logvar.repeat(batch_size, 1),
                      sampling=self.training, cuda=self.cuda)

    # apply local reparametrisation trick, see [1] Eq. (6),
    # to the parametrisation given in [3] Eq. (6)
    xz = x * z
    mu_activations = F.linear(xz, self.weight_mu, self.bias_mu)
    var_activations = F.linear(xz.pow(2), self.weight_logvar.exp(), self.bias_logvar.exp())

    return reparametrize(mu_activations, var_activations.log(), sampling=self.training, cuda=self.cuda)
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): """ A modified LSTM cell with hard sigmoid activation on the input, forget and output gates. """ hx, cx = hidden gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) ingate = hard_sigmoid(ingate) forgetgate = hard_sigmoid(forgetgate) cellgate = F.tanh(cellgate) outgate = hard_sigmoid(outgate) cy = (forgetgate * cx) + (ingate * cellgate) hy = outgate * F.tanh(cy) return hy, cy
def f(input, params, mode):
    x = F.conv2d(input, params['conv0'], padding=1)
    g0 = group(x, params, 'group0', mode, 1)
    g1 = group(g0, params, 'group1', mode, 2)
    g2 = group(g1, params, 'group2', mode, 2)
    o = F.relu(utils.batch_norm(g2, params, 'bn', mode))
    o = F.avg_pool2d(o, 8, 1, 0)
    o = o.view(o.size(0), -1)
    o = F.linear(o, params['fc.weight'], params['fc.bias'])
    return o
def encode(self, x, hiddens): """ Given the value for x and hidden activations, do an encoding pass (update every other hidden activation) """ # starting with x and the hiddens, # update every other hidden layer using the activations from layers below and above corrupted_x = self.input_corrupt(x) for i in range(0, len(hiddens), 2): # even layers # grab the parameters to use! (encode_w, bias), _ = self.layers[i] # encode up from below # if first layer, use x, otherwise use the hidden from below if i == 0: below = corrupted_x else: below = hiddens[i-1] hidden = F.linear(input=below, weight=encode_w, bias=bias) # decode down from above (if this isn't the top layer) if i < len(hiddens)-1: (encode_w1, _), decode_w = self.layers[i+1] if decode_w is None: decode_w = encode_w1.t() hidden = hidden + F.linear(input=hiddens[i+1], weight=decode_w) # pre-activation noise if not (i == 0 and self.noiseless_h1): hidden = self.hidden_corrupt(hidden) # apply activation hidden = self.hidden_act(hidden) # post-activation noise if not (i == 0 and self.noiseless_h1): hidden = self.hidden_corrupt(hidden) # donezo for the hidden layer hiddens[i] = hidden return hiddens
def do_decode_tc(self, context_encoding, target, target_lengths):
    target_emb = self.embed_in(target)
    target_emb = self.drop(target_emb)
    # below will be used later as a crude approximation of an LM
    emb_inf_vec = self.emb_inf(target_emb)

    target_emb = torch.nn.utils.rnn.pack_padded_sequence(target_emb, target_lengths, batch_first=True)

    init_hidn = self.tanh(self.ses_to_dec(context_encoding))
    init_hidn = init_hidn.view(self.num_lyr, target.size(0), self.hid_size)

    hid_o, hid_n = self.rnn(target_emb, init_hidn)
    hid_o, _ = torch.nn.utils.rnn.pad_packed_sequence(hid_o, batch_first=True)
    # linear layers are not compatible with PackedSequence, so we need to unpack; outputs will be 0s at padded timesteps!

    dec_hid_vec = self.dec_inf(hid_o)
    ses_inf_vec = self.ses_inf(context_encoding)
    total_hid_o = dec_hid_vec + ses_inf_vec + emb_inf_vec

    hid_o_mx = max_out(total_hid_o)
    hid_o_mx = F.linear(hid_o_mx, self.embed_in.weight) if self.shared_weight else self.embed_out(hid_o_mx)

    if self.train_lm:
        siz = target.size(0)
        lm_hid0 = Variable(torch.zeros(self.num_lyr, siz, self.hid_size))
        if use_cuda:
            lm_hid0 = lm_hid0.cuda()
        lm_o, lm_hid = self.lm(target_emb, lm_hid0)
        lm_o, _ = torch.nn.utils.rnn.pad_packed_sequence(lm_o, batch_first=True)
        lm_o = self.lin3(lm_o)
        lm_o = F.linear(lm_o, self.embed_in.weight) if self.shared_weight else self.embed_out(lm_o)
        return hid_o_mx, lm_o
    else:
        return hid_o_mx, None
def forward(self, input, label):
    # --------------------------- cos(theta) & phi(theta) ---------------------------
    cosine = F.linear(F.normalize(input), F.normalize(self.weight))
    phi = cosine - self.m
    # --------------------------- convert label to one-hot ---------------------------
    one_hot = torch.zeros(cosine.size(), device='cuda')
    # one_hot = one_hot.cuda() if cosine.is_cuda else one_hot
    one_hot.scatter_(1, label.view(-1, 1).long(), 1)
    # torch.where(condition, x, y): out_i = x_i if condition_i else y_i
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is >= 0.4
    output *= self.s
    return output
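# A hedged usage sketch for an additive-margin (CosFace-style) head like the forward
# above; this is not the authors' full class. The weight acts as a learnable matrix of
# class prototypes, and the scaled logits feed a standard cross-entropy loss. The class
# name and the s/m defaults are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AddMarginHeadSketch(nn.Module):
    def __init__(self, in_features, num_classes, s=30.0, m=0.35):
        super().__init__()
        self.s, self.m = s, m
        self.weight = nn.Parameter(torch.empty(num_classes, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        # cosine similarity between normalized features and class prototypes
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        phi = cosine - self.m  # subtract the margin only for the target class
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        return self.s * (one_hot * phi + (1.0 - one_hot) * cosine)

# typical use: logits = head(embeddings, labels); loss = F.cross_entropy(logits, labels)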
def forward(self, input):
    self.epsilon_input.normal_()
    self.epsilon_output.normal_()

    func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x))
    eps_in = func(self.epsilon_input.data)
    eps_out = func(self.epsilon_output.data)

    bias = self.bias
    if bias is not None:
        bias = bias + self.sigma_bias * eps_out.t()
    noise_v = torch.mul(eps_in, eps_out)
    return F.linear(input, self.weight + self.sigma_weight * noise_v, bias)
def f(input, params, pooling_classif=True):
    o = F.conv2d(input, params['conv0.weight'], params['conv0.bias'], 2, 3)
    o = F.relu(o)
    o = F.max_pool2d(o, 3, 2, 1)
    o_g0 = group(o, params, 'group0', 1, blocks[0])
    o_g1 = group(o_g0, params, 'group1', 2, blocks[1])
    o_g2 = group(o_g1, params, 'group2', 2, blocks[2])
    o_g3 = group(o_g2, params, 'group3', 2, blocks[3])
    if pooling_classif:
        o = F.avg_pool2d(o_g3, 7, 1, 0)
        o = o.view(o.size(0), -1)
        o = F.linear(o, params['fc.weight'], params['fc.bias'])
    return o
def test_reuse_function(self):
    @torch.jit.compile(nderivs=0)
    def clinear(*args):
        return F.linear(*args)

    def cast(x):
        return x

    input = Variable(cast(torch.randn(1, 1)))
    weights = Variable(cast(torch.randn(1, 1)))
    bias = Variable(cast(torch.randn(1, 1)))

    # linear AKA addmm without bias is of particular interest
    # because we allocate a zero-filled new variable when we execute,
    # and then *fill* it with the result
    r1_ = clinear(input, weights)
    with self.assertCompiled(clinear):
        r1 = clinear(r1_, weights)
    r2 = F.linear(F.linear(input, weights), weights)
    self.assertEqual(r1, r2)
def forward(self, X): """ Funciton call to generate the output, every time we call it, the dynamic graph is created. There can be difference between forward in training and test: - In dropout we do not zero neurons in test - In Variational Inference we dont randombly sample from the posterior We create the forward pass by performing operations between the input X (Nsam_batch, Ndim) and the parameters of the model that we should have initialized in the __init__ """ # o2 = torch.mm(X, self.weight) + self.bias o2 = F.linear(X, self.weight, self.bias) return o2
def forward(self, x, label):
    cosine = F.linear(F.normalize(x), F.normalize(self.weight))
    sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
    phi = cosine * self.cos_m - sine * self.sin_m
    if self.easy_margin:
        phi = torch.where(cosine > 0, phi, cosine)
    else:
        phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)

    one_hot = torch.zeros(cosine.size(), device='cuda')
    one_hot.scatter_(1, label.view(-1, 1).long(), 1)
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= self.s
    return output
def forward(self, input, label):
    # lambda = max(lambda_min, base * (1 + gamma * iteration) ** (-power))
    self.iter += 1
    self.lamb = max(self.LambdaMin, self.base * (1 + self.gamma * self.iter) ** (-1 * self.power))

    # --------------------------- cos(theta) & phi(theta) ---------------------------
    if self.device_id is None:
        cos_theta = F.linear(F.normalize(input), F.normalize(self.weight))
    else:
        x = input
        sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
        temp_x = x.cuda(self.device_id[0])
        weight = sub_weights[0].cuda(self.device_id[0])
        cos_theta = F.linear(F.normalize(temp_x), F.normalize(weight))
        for i in range(1, len(self.device_id)):
            temp_x = x.cuda(self.device_id[i])
            weight = sub_weights[i].cuda(self.device_id[i])
            cos_theta = torch.cat((cos_theta, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])), dim=1)

    cos_theta = cos_theta.clamp(-1, 1)
    cos_m_theta = self.mlambda[self.m](cos_theta)
    theta = cos_theta.data.acos()
    k = (self.m * theta / 3.14159265).floor()
    phi_theta = ((-1.0) ** k) * cos_m_theta - 2 * k
    NormOfFeature = torch.norm(input, 2, 1)

    # --------------------------- convert label to one-hot ---------------------------
    one_hot = torch.zeros(cos_theta.size())
    if self.device_id is not None:
        one_hot = one_hot.cuda(self.device_id[0])
    one_hot.scatter_(1, label.view(-1, 1), 1)

    # --------------------------- calculate output ---------------------------
    output = (one_hot * (phi_theta - cos_theta) / (1 + self.lamb)) + cos_theta
    output *= NormOfFeature.view(-1, 1)

    return output
def linear_classifier(x, param_dict):
    """
    Classifier.
    """
    return F.linear(x, param_dict['weight_mean'], param_dict['bias_mean'])
def forward(self, x, time):
    o = self.context[:, int(time)]
    return F.linear(x * o, self.weight, self.bias)
def forward(self, input):
    return F.linear(input, self.mask * self.weight, self.bias)
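# A minimal sketch of the module that the one-line forward above could belong to: a
# linear layer whose weight is elementwise-gated by a fixed binary mask stored as a
# buffer. The class name, initialization, and mask handling are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskedLinearSketch(nn.Module):
    def __init__(self, in_features, out_features, mask):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.01)
        self.bias = nn.Parameter(torch.zeros(out_features))
        # mask: (out_features, in_features) tensor of 0/1 entries, kept fixed during training
        self.register_buffer('mask', mask.float())

    def forward(self, input):
        return F.linear(input, self.mask * self.weight, self.bias)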
def _forward(self, x, weights):
    x = torch.flatten(x)
    x = F.linear(x, weights['lin.weight'], bias=weights['lin.bias'])
    x = x.reshape((1, 10, self.largest_w, self.largest_h))
    return x
def test_jit_class_using_function():
    LinearJIT = nnfusion.jit(torch.nn.Linear)

    model = LinearJIT(8, 8).cuda().eval()
    t = torch.randn(1, 8, device="cuda")
    assert_allclose(F.linear(t, model.weight, model.bias), model(t))
def forward(self, x):
    out = F.linear(F.normalize(x), F.normalize(self.weight))
    return out
def _forward_rho(rho, incr, ctx, input, weight, bias):
    ctx.save_for_backward(input, weight, bias)
    ctx.rho = rho
    ctx.incr = incr
    return F.linear(input, weight, bias)
def _forward_alpha_beta(ctx, input, weight, bias):
    Z = F.linear(input, weight, bias)
    ctx.save_for_backward(input, weight, bias)
    return Z
def forward(self, x, vars=None, bn_training=True): """ This function can be called by finetunning, however, in finetunning, we dont wish to update running_mean/running_var. Thought weights/bias of bn is updated, it has been separated by fast_weights. Indeed, to not update running_mean/running_var, we need set update_bn_statistics=False but weight/bias will be updated and not dirty initial theta parameters via fast_weiths. :param x: [b, 1, 28, 28] :param vars: :param bn_training: set False to not update :return: x, loss, likelihood, kld """ if vars is None: vars = self.vars idx = 0 bn_idx = 0 for name, param in self.config: if name is 'conv2d': w, b = vars[idx], vars[idx + 1] # remember to keep synchrozied of forward_encoder and forward_decoder! x = F.conv2d(x, w, b, stride=param[4], padding=param[5]) idx += 2 # print(name, param, '\tout:', x.shape) elif name is 'convt2d': w, b = vars[idx], vars[idx + 1] # remember to keep synchrozied of forward_encoder and forward_decoder! x = F.conv_transpose2d(x, w, b, stride=param[4], padding=param[5]) idx += 2 # print(name, param, '\tout:', x.shape) elif name is 'linear': w, b = vars[idx], vars[idx + 1] x = F.linear(x, w, b) idx += 2 # print('forward:', idx, x.norm().item()) elif name is 'bn': w, b = vars[idx], vars[idx + 1] running_mean, running_var = self.vars_bn[bn_idx], self.vars_bn[ bn_idx + 1] x = F.batch_norm(x, running_mean, running_var, weight=w, bias=b, training=bn_training) idx += 2 bn_idx += 2 elif name is 'flatten': # print(x.shape) x = x.view(x.size(0), -1) elif name is 'reshape': # [b, 8] => [b, 2, 2, 2] x = x.view(x.size(0), *param) elif name is 'relu': x = F.relu(x, inplace=param[0]) elif name is 'leakyrelu': x = F.leaky_relu(x, negative_slope=param[0], inplace=param[1]) elif name is 'tanh': x = F.tanh(x) elif name is 'sigmoid': x = torch.sigmoid(x) elif name is 'upsample': x = F.upsample_nearest(x, scale_factor=param[0]) elif name is 'max_pool2d': x = F.max_pool2d(x, param[0], param[1], param[2]) elif name is 'avg_pool2d': x = F.avg_pool2d(x, param[0], param[1], param[2]) else: raise NotImplementedError # make sure variable is used properly assert idx == len(vars) assert bn_idx == len(self.vars_bn) return x
def forward(self, x):
    return F.linear(x, self.weight + self.sigma_weight * self.epsilon_weight,
                    self.bias + self.sigma_bias * self.epsilon_bias)
def f(x):
    for (weight, bias, act, norm) in zip(weights, biases, self.acts, self.layer_norms):
        x = norm(act(self.dropout(F.linear(x, weight, bias))))
    return x
def forward(self, prev_output_tokens, encoder_out_dict, incremental_state=None):
    encoder_out = encoder_out_dict['encoder_out']
    encoder_padding_mask = encoder_out_dict['encoder_padding_mask']

    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:]
    bsz, seqlen = prev_output_tokens.size()

    # get outputs from encoder
    encoder_outs, _, _ = encoder_out[:3]
    srclen = encoder_outs.size(0)

    # embed tokens
    x = self.embed_tokens(prev_output_tokens)
    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # initialize previous states (or get from cache during incremental generation)
    cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
    if cached_state is not None:
        prev_hiddens, prev_cells, input_feed = cached_state
    else:
        _, encoder_hiddens, encoder_cells = encoder_out[:3]
        num_layers = len(self.layers)
        prev_hiddens = [encoder_hiddens[i] for i in range(num_layers)]
        prev_cells = [encoder_cells[i] for i in range(num_layers)]
        input_feed = x.data.new(bsz, self.encoder_output_units).zero_()

    attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
    outs = []
    for j in range(seqlen):
        # input feeding: concatenate context vector from previous time step
        input = torch.cat((x[j, :, :], input_feed), dim=1)

        for i, rnn in enumerate(self.layers):
            # recurrent cell
            hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))

            # hidden state becomes the input to the next layer
            input = F.dropout(hidden, p=self.dropout_out, training=self.training)

            # save state for next time step
            prev_hiddens[i] = hidden
            prev_cells[i] = cell

        # apply attention using the last layer's hidden state
        if self.attention is not None:
            out, attn_scores[:, j, :] = self.attention(hidden, encoder_outs, encoder_padding_mask)
        else:
            out = hidden
        out = F.dropout(out, p=self.dropout_out, training=self.training)

        # input feeding
        input_feed = out

        # save final output
        outs.append(out)

    # cache previous states (no-op except during incremental generation)
    utils.set_incremental_state(
        self, incremental_state, 'cached_state', (prev_hiddens, prev_cells, input_feed))

    # collect outputs across time steps
    x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
    if not self.training and self.need_attn:
        attn_scores = attn_scores.transpose(0, 2)
    else:
        attn_scores = None

    # project back to size of vocabulary
    if self.adaptive_softmax is None:
        if hasattr(self, 'additional_fc'):
            x = self.additional_fc(x)
            x = F.dropout(x, p=self.dropout_out, training=self.training)
        if self.share_input_output_embed:
            x = F.linear(x, self.embed_tokens.weight)
        else:
            x = self.fc_out(x)
    return x, attn_scores
def h_to_v(self, h):
    p_v = Function.sigmoid(Function.linear(h, self.W.t(), self.v_bias))
    sample_v = self.sample_p(p_v)
    return p_v, sample_v
def linear_biprec(input, weight, bias=None, num_bits_grad=None):
    out1 = F.linear(input.detach(), weight, bias)
    out2 = F.linear(input, weight.detach(), bias.detach() if bias is not None else None)
    out2 = quantize_grad(out2, num_bits=num_bits_grad)
    return out1 + out2 - out1.detach()
def v_to_h(self, v):
    p_h = Function.sigmoid(Function.linear(v, self.W, self.h_bias))
    sample_h = self.sample_p(p_h)
    return p_h, sample_h
def forward(self, x):
    return F.linear(x, self.W_(), self.bias)
def forward(self, x):
    return F.linear(x, self.omega * self.weight, self.beta * self.bias)
def forward(self, u, y=None):
    x = F.linear(u, self.weight, self.bias)
    if y is not None:
        x += F.linear(y, self.weight, self.bias)
    return x
def forward(self, x):
    out = F.linear(x, self.weight, self.bias)
    out = self.activation(out)
    return out
def forward(self, input):
    return F.linear(input, self.weight * self.lr_mul, bias=self.bias * self.lr_mul)
def forward(self, x):
    W = self.weight * self.weight_scale / torch.sqrt(
        torch.sum(self.weight ** 2, dim=1, keepdim=True))
    return F.linear(x, W, self.bias)
def IndRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None):
    hy = F.tanh(F.linear(input, w_ih, b_ih) + torch.mul(w_hh, hidden))
    return hy
def IndRNNReLuCell(input, hidden, w_ih, w_hh, b_ih=None):
    hy = F.relu(F.linear(input, w_ih, b_ih) + torch.mul(w_hh, hidden))
    return hy
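# Hypothetical usage sketch for the IndRNN cells above: w_hh is a per-unit recurrent
# weight vector (one scalar per hidden unit), so the recurrent term is an elementwise
# product that broadcasts over the batch. Shapes and initialization are illustrative
# assumptions, and `import torch.nn.functional as F` is assumed as in the other snippets.
import torch

def run_indrnn_demo(batch=2, input_size=5, hidden_size=3, steps=4):
    w_ih = torch.randn(hidden_size, input_size)
    w_hh = torch.rand(hidden_size)  # elementwise recurrent weights, one per hidden unit
    hidden = torch.zeros(batch, hidden_size)
    for _ in range(steps):
        x_t = torch.randn(batch, input_size)
        hidden = IndRNNReLuCell(x_t, hidden, w_ih, w_hh)
    return hidden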
def free_energy_cost(self, v):
    vbias_term = v.mv(self.v_bias)
    wx_b = Function.linear(v, self.W, self.h_bias)
    hidden_term = wx_b.exp().add(1).log().sum(1)
    return (-hidden_term - vbias_term).mean()
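# A hedged sketch of the RBM module that v_to_h, h_to_v, and free_energy_cost above
# appear to belong to ("Function" in those snippets is presumably torch.nn.functional).
# Parameter shapes, initialization, and the Bernoulli sampler are standard-RBM
# assumptions, not the original authors' code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class RBMSketch(nn.Module):
    def __init__(self, n_visible, n_hidden):
        super().__init__()
        self.W = nn.Parameter(torch.randn(n_hidden, n_visible) * 0.01)
        self.v_bias = nn.Parameter(torch.zeros(n_visible))
        self.h_bias = nn.Parameter(torch.zeros(n_hidden))

    def sample_p(self, p):
        # draw binary samples from the given Bernoulli probabilities
        return torch.bernoulli(p)

    def v_to_h(self, v):
        p_h = torch.sigmoid(F.linear(v, self.W, self.h_bias))
        return p_h, self.sample_p(p_h)

    def h_to_v(self, h):
        p_v = torch.sigmoid(F.linear(h, self.W.t(), self.v_bias))
        return p_v, self.sample_p(p_v)

    def free_energy_cost(self, v):
        vbias_term = v.mv(self.v_bias)
        wx_b = F.linear(v, self.W, self.h_bias)
        hidden_term = wx_b.exp().add(1).log().sum(1)
        return (-hidden_term - vbias_term).mean()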
def decode(self, x):
    maskedweight = self.weight * self.mask
    catweight = torch.cat((maskedweight, self.weight_fc), dim=0)
    return F.linear(x, catweight.t(), self.vbias)
def forward(self, features):
    cosine = F.linear(l2_norm(features), l2_norm(self.weight))
    return cosine
def forward(self, x):
    maskedweight = self.weight * self.mask
    catweight = torch.cat((maskedweight, self.weight_fc), dim=0)
    catbias = torch.cat((self.bias, self.bias_fc))
    return self.dropout(self.enc_act_func(F.linear(x, catweight, catbias)))
def _forward_pattern(attribution, ctx, input, weight, bias, pattern):
    ctx.save_for_backward(input, weight, pattern)
    ctx.attribution = attribution
    return F.linear(input, weight, bias)
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = F.linear(x, self.embed_out) return x, {'attn': attn, 'inner_states': inner_states}
def forward(self, x, weights=None, get_feat=False):
    if weights is None:
        # block1
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        # block2
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.pool(x)
        # block3
        x = F.relu(self.bn5(self.conv5(x)))
        x = F.relu(self.bn6(self.conv6(x)))
        x = self.pool(x)

        x = x.view(-1, 196 * 4 * 4)
        feat = F.relu(self.bn7(self.fc1(x)))
        x = self.fc2(feat)
        if get_feat:
            return x, feat
        else:
            return x
    else:
        list_weights = list(weights.items())
        count = 0
        # block1
        x = F.conv2d(x, list_weights[count][1], list_weights[count + 1][1], padding=1, stride=1)
        count += 2
        x = F.batch_norm(x, self.bn1.running_mean, self.bn1.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        x = F.threshold(x, 0, 0, inplace=True)
        x = F.conv2d(x, list_weights[count][1], list_weights[count + 1][1], padding=1, stride=1)
        count += 2
        x = F.batch_norm(x, self.bn2.running_mean, self.bn2.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        x = F.threshold(x, 0, 0, inplace=True)
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # block2
        x = F.conv2d(x, list_weights[count][1], list_weights[count + 1][1], padding=1, stride=1)
        count += 2
        x = F.batch_norm(x, self.bn3.running_mean, self.bn3.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        x = F.threshold(x, 0, 0, inplace=True)
        x = F.conv2d(x, list_weights[count][1], list_weights[count + 1][1], padding=1, stride=1)
        count += 2
        x = F.batch_norm(x, self.bn4.running_mean, self.bn4.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        x = F.threshold(x, 0, 0, inplace=True)
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # block3
        x = F.conv2d(x, list_weights[count][1], list_weights[count + 1][1], padding=1, stride=1)
        count += 2
        x = F.batch_norm(x, self.bn5.running_mean, self.bn5.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        x = F.threshold(x, 0, 0, inplace=True)
        x = F.conv2d(x, list_weights[count][1], list_weights[count + 1][1], padding=1, stride=1)
        count += 2
        x = F.batch_norm(x, self.bn6.running_mean, self.bn6.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        x = F.threshold(x, 0, 0, inplace=True)
        x = F.max_pool2d(x, kernel_size=2, stride=2)

        x = x.view(-1, 196 * 4 * 4)
        x = F.linear(x, list_weights[count][1], list_weights[count + 1][1])
        count += 2
        x = F.batch_norm(x, self.bn7.running_mean, self.bn7.running_var,
                         list_weights[count][1], list_weights[count + 1][1], training=True)
        count += 2
        feat = F.threshold(x, 0, 0, inplace=True)
        x = F.linear(feat, list_weights[count][1], list_weights[count + 1][1])
        return x