def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        if self.device_id is None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])), dim=1) 
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))  # clamp avoids NaN from rounding error
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id is not None:
            one_hot = one_hot.cuda(self.device_id[0])
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # ------------- torch.where(out_i = x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # torch.where can be used on torch >= 0.4
        output *= self.s

        return output
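
# A minimal functional sketch of the margin logic above, assuming an ArcFace-style
# head: embeddings `emb` of shape (batch, dim), class weights `w` of shape
# (num_classes, dim), and typical scale s=64.0 / margin m=0.5. The easy_margin /
# th / mm branch is omitted for brevity.
import math
import torch
import torch.nn.functional as F

def arcface_logits(emb, w, label, s=64.0, m=0.5):
    cos_m, sin_m = math.cos(m), math.sin(m)
    cosine = F.linear(F.normalize(emb), F.normalize(w))           # cos(theta)
    sine = torch.sqrt((1.0 - cosine.pow(2)).clamp(0, 1))
    phi = cosine * cos_m - sine * sin_m                           # cos(theta + m)
    one_hot = torch.zeros_like(cosine).scatter_(1, label.view(-1, 1).long(), 1)
    return s * torch.where(one_hot.bool(), phi, cosine)
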
Example #2
def PeepholeLSTMCell(input: torch.Tensor,
                     hidden: Tuple[torch.Tensor, torch.Tensor],
                     w_ih: torch.Tensor,
                     w_hh: torch.Tensor,
                     w_ip: torch.Tensor,
                     w_fp: torch.Tensor,
                     w_op: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    An LSTM cell with peephole connections without biases.

    Mostly adapted from the PyTorch autograd LSTM implementation.
    """
    hx, cx = hidden
    gates = F.linear(input, w_ih) + F.linear(hx, w_hh)

    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    peep_i = w_ip.unsqueeze(0).expand_as(cx) * cx
    ingate = ingate + peep_i
    peep_f = w_fp.unsqueeze(0).expand_as(cx) * cx
    forgetgate = forgetgate + peep_f

    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    cy = (forgetgate * cx) + (ingate * cellgate)
    peep_o = w_op.unsqueeze(0).expand_as(cy) * cy
    outgate = outgate + peep_o
    hy = outgate * torch.tanh(cy)

    return hy, cy
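
# A quick smoke test for PeepholeLSTMCell above with random, untrained weights
# (assumed shapes: batch=2, input_size=3, hidden_size=4; the four gates are stacked
# along dim 0 of w_ih / w_hh, and the peephole weights are per-unit vectors).
import torch

batch, input_size, hidden_size = 2, 3, 4
inp = torch.randn(batch, input_size)
hx = torch.zeros(batch, hidden_size)
cx = torch.zeros(batch, hidden_size)
w_ih = torch.randn(4 * hidden_size, input_size)
w_hh = torch.randn(4 * hidden_size, hidden_size)
w_ip, w_fp, w_op = (torch.randn(hidden_size) for _ in range(3))
hy, cy = PeepholeLSTMCell(inp, (hx, cx), w_ih, w_hh, w_ip, w_fp, w_op)  # each (2, 4)
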
Example #3
def f(params, inputs, mode):
    o = inputs.view(inputs.size(0), 1, 28, 28)
    o = F.conv2d(o, params['conv0.weight'], params['conv0.bias'], stride=2)
    o = F.relu(o)
    o = F.conv2d(o, params['conv1.weight'], params['conv1.bias'], stride=2)
    o = F.relu(o)
    o = o.view(o.size(0), -1)
    o = F.linear(o, params['linear2.weight'], params['linear2.bias'])
    o = F.relu(o)
    o = F.linear(o, params['linear3.weight'], params['linear3.bias'])
    return o
Example #4
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    hx, cx = hidden
    gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = torch.sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * torch.tanh(cy)
    return hy, cy
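
# A cross-check sketch, assuming torch.nn.LSTMCell uses the same (i, f, g, o) gate
# ordering: the functional LSTMCell above should reproduce the built-in module
# when fed the module's own weights.
import torch
import torch.nn as nn

ref = nn.LSTMCell(3, 4)
x, h, c = torch.randn(2, 3), torch.zeros(2, 4), torch.zeros(2, 4)
h_ref, c_ref = ref(x, (h, c))
h_fn, c_fn = LSTMCell(x, (h, c), ref.weight_ih, ref.weight_hh, ref.bias_ih, ref.bias_hh)
assert torch.allclose(h_ref, h_fn, atol=1e-6) and torch.allclose(c_ref, c_fn, atol=1e-6)
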
Example #5
    def decode(self, hiddens):
        """
        Given the value for the hidden activations, do a decoding pass
        (update every other hidden activation and the visible input layer).
        """
        # starting with the hiddens,
        # update the reconstructed x and every other hidden layer using the activations from layers below and above
        for i in range(1, len(hiddens), 2):  # odd layers
            # grab the parameters to use!
            (encode_w, bias), _ = self.layers[i]
            # encode up from below
            hidden = F.linear(input=hiddens[i-1], weight=encode_w, bias=bias)

            # decode down from above (if this isn't the top layer)
            if i < len(hiddens) - 1:
                (encode_w1, _), decode_w = self.layers[i+1]
                if decode_w is None:
                    decode_w = encode_w1.t()
                hidden = hidden + F.linear(input=hiddens[i+1], weight=decode_w)

            # pre-activation noise
            hidden = self.hidden_corrupt(hidden)

            # apply activation
            hidden = self.hidden_act(hidden)

            # post-activation noise
            hidden = self.hidden_corrupt(hidden)

            # donezo for the hidden layer
            hiddens[i] = hidden

        # now do the reconstructed x!
        (encode_w1, _), decode_w = self.layers[0]
        if decode_w is None:
            decode_w = encode_w1.t()
        x_recon = F.linear(input=hiddens[0], weight=decode_w, bias=self.visible_bias)
        x_recon = self.visible_act(x_recon)
        # sample from p(X|H...); sampling must match the input type (e.g. binomial/Bernoulli for binary MNIST)
        if self.input_sampling:
            if isinstance(self.visible_act, nn.Sigmoid):
                sampled = self.sampling_fn(x_recon)
            else:
                print("Input sampling isn't defined for activation {!s}".format(type(self.visible_act)))
                sampled = x_recon
        else:
            sampled = x_recon

        return x_recon, hiddens, sampled
    def do_decode(self, siz, seq_len, context_encoding, target):
        ses_inf_vec = self.ses_inf(context_encoding)
        context_encoding = self.tanh(self.ses_to_dec(context_encoding))
        hid_n, preds, lm_preds = context_encoding, [], []

        hid_n = hid_n.view(self.num_lyr, siz, self.hid_size)
        inp_tok = Variable(torch.ones(siz, 1).long())
        lm_hid = Variable(torch.zeros(self.num_lyr, siz, self.hid_size))
        if use_cuda:
            lm_hid = lm_hid.cuda()
            inp_tok = inp_tok.cuda()


        for i in range(seq_len):
            # initially tc_ratio is 1 but then slowly decays to 0 (to match inference time)
            if torch.rand(1)[0] < self.tc_ratio:  # uniform draw against the teacher-forcing ratio
                inp_tok = target[:, i].unsqueeze(1)

            inp_tok_embedding = self.embed_in(inp_tok)
            emb_inf_vec = self.emb_inf(inp_tok_embedding)

            inp_tok_embedding = self.drop(inp_tok_embedding)

            hid_o, hid_n = self.rnn(inp_tok_embedding, hid_n)
            dec_hid_vec = self.dec_inf(hid_o)

            total_hid_o = dec_hid_vec + ses_inf_vec + emb_inf_vec
            hid_o_mx = max_out(total_hid_o)

            hid_o_mx = F.linear(hid_o_mx, self.embed_in.weight) if self.shared_weight else self.embed_out(hid_o_mx)
            preds.append(hid_o_mx)

            if self.train_lm:
                lm_o, lm_hid = self.lm(inp_tok_embedding, lm_hid)
                lm_o = self.lin3(lm_o)
                lm_o = F.linear(lm_o, self.embed_in.weight) if self.shared_weight else self.embed_out(lm_o)
                lm_preds.append(lm_o)

            op = hid_o_mx[:, :, :-1]  # vocabulary scores, dropping the final padding symbol
            op = F.log_softmax(op, dim=2)
            max_val, inp_tok = torch.max(op, dim=2)
            # now inp_tok will be val between 0 and 10002 ignoring padding_idx
            # here we do greedy decoding
            # so we can ignore the last symbol which is a padding token
            # technically we don't need a softmax here since we only take the argmax; the max score already gives the max softmax probability

        dec_o = torch.cat(preds, 1)
        dec_lmo = torch.cat(lm_preds, 1) if self.train_lm else None
        return dec_o, dec_lmo
Example #7
 def _in_proj(self, input, start=0, end=None):
     weight = self.in_proj_weight
     bias = self.in_proj_bias
     weight = weight[start:end, :]
     if bias is not None:
         bias = bias[start:end]
     return F.linear(input, weight, bias)
 def forward(self, x):
     if self.device_id is None:
         out = F.linear(x, self.weight, self.bias)
     else:
         sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
         sub_biases = torch.chunk(self.bias, len(self.device_id), dim=0)
         temp_x = x.cuda(self.device_id[0])
         weight = sub_weights[0].cuda(self.device_id[0])
         bias = sub_biases[0].cuda(self.device_id[0])
         out = F.linear(temp_x, weight, bias)
         for i in range(1, len(self.device_id)):
             temp_x = x.cuda(self.device_id[i])
             weight = sub_weights[i].cuda(self.device_id[i])
             bias = sub_biases[i].cuda(self.device_id[i])
             out = torch.cat((out, F.linear(temp_x, weight, bias).cuda(self.device_id[0])), dim=1)
     return out
    def incremental_forward(self, input):
        """Forward convolution one time step at a time.

        This function maintains an internal state to buffer signal and accepts
        a single frame as input. If the input order changes between time steps,
        call reorder_incremental_state. To apply to fresh inputs, call
        clear_incremental_state.
        """
        # reshape weight
        weight = self._get_linearized_weight()
        kw = self.kernel_size[0]

        bsz = input.size(0)  # input: bsz x len x dim
        if kw > 1:
            input = input.data
            if self.input_buffer is None:
                self.input_buffer = input.new(bsz, kw, input.size(2))
                self.input_buffer.zero_()
            else:
                # shift buffer
                self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone()
            # append next input
            self.input_buffer[:, -1, :] = input[:, -1, :]
            input = torch.autograd.Variable(self.input_buffer, volatile=True)
        output = F.linear(input.view(bsz, -1), weight, self.bias)
        return output.view(bsz, 1, -1)
 def forward(self, input):
     self.epsilon_weight.normal_()
     bias = self.bias
     if bias is not None:
         self.epsilon_bias.normal_()
         bias = bias + self.sigma_bias * self.epsilon_bias.data
     return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias)
Example #11
    def forward(self, x):
        # flatten input
        if len(x.size()) > 2:
            x = x.view(-1, int(np.prod(x.size()[1:])))
        # corrupt input
        x = self.input_corrupt(x)
        corrupted = x
        # encode
        for layer in self.encode_layers:
            x = layer(x)
            x = F.relu(x)
        # decode
        if self.tied_weights:
            for i, (layer, bias) in enumerate(self.decode_params):
                x = F.linear(x, weight=layer.weight.t(), bias=bias)
                if i == len(self.decode_params)-1:
                    x = self.visible_act(x)
                else:
                    x = F.relu(x)
        else:
            for i, layer in enumerate(self.decode_layers):
                x = layer(x)
                if i == len(self.decode_layers)-1:
                    x = self.visible_act(x)
                else:
                    x = F.relu(x)

        return x, corrupted
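
# A small shape sketch of the tied-weights decode step above, using a hypothetical
# encoder nn.Linear(784, 128): its transposed weight maps the code back to input space.
import torch
import torch.nn as nn
import torch.nn.functional as F

enc = nn.Linear(784, 128)
h = torch.relu(enc(torch.randn(2, 784)))                 # encode: (2, 128)
recon = F.linear(h, weight=enc.weight.t(), bias=None)    # decode: (2, 784)
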
Example #12
 def forward(self, input):
     weight = self.weight
     if self.shared:
         # detach weight to prevent gradients from changing weight
         # (but need to detach every time so weights are up to date)
         weight = weight.detach()
     return F.linear(input, weight, self.bias)
Example #13
def F_affine3d(x, matrix, center=True):
    A = matrix[:3,:3]
    b = matrix[:3,3]

    # make a meshgrid of normal coordinates
    coords = Variable(th_iterproduct(x.size(1),x.size(2),x.size(3)).float(),
                requires_grad=False)

    if center:
        # shift the coordinates so center is the origin
        coords[:,0] = coords[:,0] - (x.size(1) / 2. + 0.5)
        coords[:,1] = coords[:,1] - (x.size(2) / 2. + 0.5)
        coords[:,2] = coords[:,2] - (x.size(3) / 2. + 0.5)

    
    # apply the coordinate transformation
    new_coords = F.linear(coords, A, b)

    if center:
        # shift the coordinates back to the original (uncentered) frame
        new_coords[:,0] = new_coords[:,0] + (x.size(1) / 2. + 0.5)
        new_coords[:,1] = new_coords[:,1] + (x.size(2) / 2. + 0.5)
        new_coords[:,2] = new_coords[:,2] + (x.size(3) / 2. + 0.5)

    # map new coordinates using bilinear interpolation
    x_transformed = F_trilinear_interp3d(x, new_coords)

    return x_transformed
 def regression_step(self, sequence_hiddens):
     """
     Given the history list of GSN hiddens, make the next full list of GSN hiddens from our regression parameters.
     """
     sequence_reverse = sequence_hiddens[::-1]
     hiddens = []
     for layer, _ in enumerate(self.sizes[1:]):
         if layer % 2 == 0:
             # do the window calculation for the layer!
             regression_terms = []
             for window in range(self.window_size):
                 if window < len(sequence_reverse):
                     regression_weight = self.regression_weights[layer][window]
                     regression_bias = self.regression_biases[layer] if window == 0 else None
                     regression_terms.append(
                         F.linear(
                             input=sequence_reverse[window][layer], weight=regression_weight, bias=regression_bias
                         )
                     )
                 else:
                     regression_terms.append(self.missing_biases[layer][window])
             hiddens.append(sum(regression_terms))
         else:
             hiddens.append(None)
     return hiddens
 def forward(self, x):
     lrt_mean = self.bias
     lrt_std = torch.sqrt(1e-16 + F.linear(x * x, self.sigma * self.sigma))
     if self.training:
         eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
     else:
         eps = 0.0
     return lrt_mean + eps * lrt_std
Example #16
 def forward(self, x):
     if self.training:
         eps = Variable(torch.bernoulli(self.probs) - 0.5)
     else:
         eps = 0.0
     output = F.linear(x, self.W*eps)
     if self.bias is not None:
         output = output + self.bias
     return output
Example #17
 def forward(self, x):
     if self.training:
         eps = Variable(self.W.data.new(self.W.size()).uniform_() - 0.5)
     else:
         eps = 0.0
     output = F.linear(x, self.W*eps)
     if self.bias is not None:
         output = output + self.bias
     return output
Example #18
    def forward(self, x):
        if self.zero_mean:
            lrt_mean = 0.0
        else:
            lrt_mean = F.linear(x, self.W)
        if self.bias is not None:
            lrt_mean = lrt_mean + self.bias

        sigma2 = torch.exp(self.log_alpha) * self.W * self.W
        if self.permute_sigma:
            sigma2 = sigma2.view(-1)[torch.randperm(self.in_features * self.out_features).cuda()].view(self.out_features, self.in_features)

        lrt_std = torch.sqrt(1e-16 + F.linear(x * x, sigma2))
        if self.training:
            eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
        else:
            eps = 0.0
        return lrt_mean + lrt_std * eps
    def forward(self, x):
        if self.deterministic:
            assert not self.training, "Flag deterministic is True. This should not be used in training."
            return F.linear(x, self.post_weight_mu, self.bias_mu)

        batch_size = x.size()[0]
        # compute z
        # note that we reparametrise according to [2] Eq. (11) (not [1])
        z = reparametrize(self.z_mu.repeat(batch_size, 1), self.z_logvar.repeat(batch_size, 1), sampling=self.training,
                          cuda=self.cuda)

        # apply local reparametrisation trick see [1] Eq. (6)
        # to the parametrisation given in [3] Eq. (6)
        xz = x * z
        mu_activations = F.linear(xz, self.weight_mu, self.bias_mu)
        var_activations = F.linear(xz.pow(2), self.weight_logvar.exp(), self.bias_logvar.exp())

        return reparametrize(mu_activations, var_activations.log(), sampling=self.training, cuda=self.cuda)
Example #20
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    """
    A modified LSTM cell with hard sigmoid activation on the input, forget and output gates.
    """
    hx, cx = hidden
    gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    ingate = hard_sigmoid(ingate)
    forgetgate = hard_sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = hard_sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * torch.tanh(cy)

    return hy, cy
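
# `hard_sigmoid` is not defined in this snippet; a common piecewise-linear
# definition (an assumption here, matching e.g. the Keras hard sigmoid) is:
import torch

def hard_sigmoid(x):
    return torch.clamp(0.2 * x + 0.5, min=0.0, max=1.0)
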
 def f(input, params, mode):
     x = F.conv2d(input, params['conv0'], padding=1)
     g0 = group(x, params, 'group0', mode, 1)
     g1 = group(g0, params, 'group1', mode, 2)
     g2 = group(g1, params, 'group2', mode, 2)
     o = F.relu(utils.batch_norm(g2, params, 'bn', mode))
     o = F.avg_pool2d(o, 8, 1, 0)
     o = o.view(o.size(0), -1)
     o = F.linear(o, params['fc.weight'], params['fc.bias'])
     return o
Example #22
    def encode(self, x, hiddens):
        """
        Given the value for x and hidden activations, do an encoding pass (update every other hidden activation)
        """
        # starting with x and the hiddens,
        # update every other hidden layer using the activations from layers below and above
        corrupted_x = self.input_corrupt(x)
        for i in range(0, len(hiddens), 2):  # even layers
            # grab the parameters to use!
            (encode_w, bias), _ = self.layers[i]
            # encode up from below
            # if first layer, use x, otherwise use the hidden from below
            if i == 0:
                below = corrupted_x
            else:
                below = hiddens[i-1]
            hidden = F.linear(input=below, weight=encode_w, bias=bias)

            # decode down from above (if this isn't the top layer)
            if i < len(hiddens)-1:
                (encode_w1, _), decode_w = self.layers[i+1]
                if decode_w is None:
                    decode_w = encode_w1.t()
                hidden = hidden + F.linear(input=hiddens[i+1], weight=decode_w)


            # pre-activation noise
            if not (i == 0 and self.noiseless_h1):
                hidden = self.hidden_corrupt(hidden)

            # apply activation
            hidden = self.hidden_act(hidden)

            # post-activation noise
            if not (i == 0 and self.noiseless_h1):
                hidden = self.hidden_corrupt(hidden)

            # donezo for the hidden layer
            hiddens[i] = hidden

        return hiddens
    def do_decode_tc(self, context_encoding, target, target_lengths):
        #print(target.size(), target_lengths)
        target_emb = self.embed_in(target)
        target_emb = self.drop(target_emb)
        # below will be used later as a crude approximation of an LM
        emb_inf_vec = self.emb_inf(target_emb)

        target_emb = torch.nn.utils.rnn.pack_padded_sequence(target_emb, target_lengths, batch_first=True)

        #print(context_encoding.size())
        init_hidn = self.tanh(self.ses_to_dec(context_encoding))
        #print(init_hidn.size())
        init_hidn = init_hidn.view(self.num_lyr, target.size(0), self.hid_size)

        hid_o, hid_n = self.rnn(target_emb, init_hidn)
        hid_o, _ = torch.nn.utils.rnn.pad_packed_sequence(hid_o, batch_first=True)
        # linear layers are not compatible with PackedSequence, so unpack; padded timesteps will be zeros

        dec_hid_vec = self.dec_inf(hid_o)
        ses_inf_vec = self.ses_inf(context_encoding)
        #print(dec_hid_vec.size(), ses_inf_vec.size(), emb_inf_vec.size())
        total_hid_o = dec_hid_vec + ses_inf_vec + emb_inf_vec

        hid_o_mx = max_out(total_hid_o)
        hid_o_mx = F.linear(hid_o_mx, self.embed_in.weight) if self.shared_weight else self.embed_out(hid_o_mx)

        if self.train_lm:
            siz = target.size(0)

            lm_hid0 = Variable(torch.zeros(self.num_lyr, siz, self.hid_size))
            if use_cuda:
                lm_hid0 = lm_hid0.cuda()

            lm_o, lm_hid = self.lm(target_emb, lm_hid0)
            lm_o, _ = torch.nn.utils.rnn.pad_packed_sequence(lm_o, batch_first=True)
            lm_o = self.lin3(lm_o)
            lm_o = F.linear(lm_o, self.embed_in.weight) if self.shared_weight else self.embed_out(lm_o)
            return hid_o_mx, lm_o
        else:
            return hid_o_mx, None
    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        phi = cosine - self.m
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size(), device='cuda')
        # one_hot = one_hot.cuda() if cosine.is_cuda else one_hot
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # ------------- torch.where(out_i = x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # torch.where can be used on torch >= 0.4
        output *= self.s

        return output
    def forward(self, input):
        self.epsilon_input.normal_()
        self.epsilon_output.normal_()

        func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x))
        eps_in = func(self.epsilon_input.data)
        eps_out = func(self.epsilon_output.data)

        bias = self.bias
        if bias is not None:
            bias = bias + self.sigma_bias * eps_out.t()
        noise_v = torch.mul(eps_in, eps_out)  # broadcasts eps_in against eps_out into a factorized noise matrix
        return F.linear(input, self.weight + self.sigma_weight * noise_v, bias)
 def f(input, params, pooling_classif=True):
     o = F.conv2d(input, params['conv0.weight'], params['conv0.bias'], 2, 3)
     o = F.relu(o)
     o = F.max_pool2d(o, 3, 2, 1)
     o_g0 = group(o, params, 'group0', 1, blocks[0])
     o_g1 = group(o_g0, params, 'group1', 2, blocks[1])
     o_g2 = group(o_g1, params, 'group2', 2, blocks[2])
     o_g3 = group(o_g2, params, 'group3', 2, blocks[3])
     if pooling_classif:
         o = F.avg_pool2d(o_g3, 7, 1, 0)
         o = o.view(o.size(0), -1)
         o = F.linear(o, params['fc.weight'], params['fc.bias'])
     return o
Example #27
    def test_reuse_function(self):
        @torch.jit.compile(nderivs=0)
        def clinear(*args):
            return F.linear(*args)

        def cast(x):
            return x

        input = Variable(cast(torch.randn(1, 1)))
        weights = Variable(cast(torch.randn(1, 1)))
        bias = Variable(cast(torch.randn(1, 1)))

        # linear AKA addmm without bias is of particular interest
        # because we allocate a zero-filled new variable when we execute,
        # and then *fill* it with the result

        r1_ = clinear(input, weights)
        with self.assertCompiled(clinear):
            r1 = clinear(r1_, weights)
        r2 = F.linear(F.linear(input, weights), weights)

        self.assertEqual(r1, r2)
Example #28
    def forward(self, X):
        """
        Function call to generate the output; every time we call it, the dynamic graph is created.
        There can be differences between forward in training and test:
            - In dropout we do not zero neurons in test
            - In Variational Inference we don't randomly sample from the posterior
        
        We create the forward pass by performing operations between the input X (Nsam_batch, Ndim)
        and the parameters of the model that we should have initialized in the __init__
        """
        
#        o2 = torch.mm(X, self.weight.t()) + self.bias  # equivalent to the F.linear call below
        o2 = F.linear(X, self.weight, self.bias)
        return o2
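
# Quick sanity check that F.linear applies the weight transposed,
# i.e. F.linear(X, W, b) == X @ W.t() + b, which is why the commented-out
# torch.mm alternative above needs self.weight.t().
import torch
import torch.nn.functional as F

X, W, b = torch.randn(4, 3), torch.randn(5, 3), torch.randn(5)
assert torch.allclose(F.linear(X, W, b), X @ W.t() + b, atol=1e-6)
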
    def forward(self, x, label):
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)

        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output
    def forward(self, input, label):
        # lambda = max(lambda_min,base*(1+gamma*iteration)^(-power))
        self.iter += 1
        self.lamb = max(self.LambdaMin, self.base * (1 + self.gamma * self.iter) ** (-1 * self.power))

        # --------------------------- cos(theta) & phi(theta) ---------------------------
        if self.device_id is None:
            cos_theta = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cos_theta = F.linear(F.normalize(temp_x), F.normalize(weight))
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cos_theta = torch.cat((cos_theta, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])), dim=1)

        cos_theta = cos_theta.clamp(-1, 1)
        cos_m_theta = self.mlambda[self.m](cos_theta)
        theta = cos_theta.data.acos()
        k = (self.m * theta / 3.14159265).floor()
        phi_theta = ((-1.0) ** k) * cos_m_theta - 2 * k
        NormOfFeature = torch.norm(input, 2, 1)

        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cos_theta.size())
        if self.device_id is not None:
            one_hot = one_hot.cuda(self.device_id[0])
        one_hot.scatter_(1, label.view(-1, 1), 1)

        # --------------------------- Calculate output ---------------------------
        output = (one_hot * (phi_theta - cos_theta) / (1 + self.lamb)) + cos_theta
        output *= NormOfFeature.view(-1, 1)

        return output
Example #31
def linear_classifier(x, param_dict):
    """
    Classifier.
    """
    return F.linear(x, param_dict['weight_mean'], param_dict['bias_mean'])
Example #32
 def forward(self, x, time):
     o = self.context[:, int(time)]
     return F.linear(x * o, self.weight, self.bias)
Example #33
 def forward(self, input):
     return F.linear(input, self.mask * self.weight, self.bias)
Example #34
 def _forward(self, x, weights):
     x = torch.flatten(x)
     x = F.linear(x, weights['lin.weight'], bias=weights['lin.bias'])
     x = x.reshape((1, 10, self.largest_w, self.largest_h))
     return x
Example #35
def test_jit_class_using_function():
    LinearJIT = nnfusion.jit(torch.nn.Linear)

    model = LinearJIT(8, 8).cuda().eval()
    t = torch.randn(1, 8, device="cuda")
    assert_allclose(F.linear(t, model.weight, model.bias), model(t))
Example #36
 def forward(self, x):
     out = F.linear(F.normalize(x), F.normalize(self.weight))
     return out
Example #37
def _forward_rho(rho, incr, ctx, input, weight, bias):
    ctx.save_for_backward(input, weight, bias)
    ctx.rho = rho
    ctx.incr = incr
    return F.linear(input, weight, bias)
Example #38
def _forward_alpha_beta(ctx, input, weight, bias):
    Z = F.linear(input, weight, bias)
    ctx.save_for_backward(input, weight, bias)
    return Z
    def forward(self, x, vars=None, bn_training=True):
        """
        This function can be called for fine-tuning; however, during fine-tuning we don't wish to update
        running_mean/running_var. Although the BN weight/bias are updated, they are kept separate as fast_weights.
        In other words, to avoid updating running_mean/running_var, set bn_training=False;
        the weight/bias are still updated through the fast weights without dirtying the initial theta parameters.
        :param x: [b, 1, 28, 28]
        :param vars:
        :param bn_training: set False to not update the BN running statistics
        :return: x
        """

        if vars is None:
            vars = self.vars

        idx = 0
        bn_idx = 0

        for name, param in self.config:
            if name == 'conv2d':
                w, b = vars[idx], vars[idx + 1]
                # remember to keep forward_encoder and forward_decoder synchronized!
                x = F.conv2d(x, w, b, stride=param[4], padding=param[5])
                idx += 2
                # print(name, param, '\tout:', x.shape)
            elif name == 'convt2d':
                w, b = vars[idx], vars[idx + 1]
                # remember to keep forward_encoder and forward_decoder synchronized!
                x = F.conv_transpose2d(x,
                                       w,
                                       b,
                                       stride=param[4],
                                       padding=param[5])
                idx += 2
                # print(name, param, '\tout:', x.shape)
            elif name == 'linear':
                w, b = vars[idx], vars[idx + 1]
                x = F.linear(x, w, b)
                idx += 2
                # print('forward:', idx, x.norm().item())
            elif name == 'bn':
                w, b = vars[idx], vars[idx + 1]
                running_mean, running_var = self.vars_bn[bn_idx], self.vars_bn[
                    bn_idx + 1]
                x = F.batch_norm(x,
                                 running_mean,
                                 running_var,
                                 weight=w,
                                 bias=b,
                                 training=bn_training)
                idx += 2
                bn_idx += 2

            elif name == 'flatten':
                # print(x.shape)
                x = x.view(x.size(0), -1)
            elif name == 'reshape':
                # [b, 8] => [b, 2, 2, 2]
                x = x.view(x.size(0), *param)
            elif name == 'relu':
                x = F.relu(x, inplace=param[0])
            elif name == 'leakyrelu':
                x = F.leaky_relu(x, negative_slope=param[0], inplace=param[1])
            elif name == 'tanh':
                x = torch.tanh(x)
            elif name == 'sigmoid':
                x = torch.sigmoid(x)
            elif name == 'upsample':
                x = F.upsample_nearest(x, scale_factor=param[0])
            elif name == 'max_pool2d':
                x = F.max_pool2d(x, param[0], param[1], param[2])
            elif name == 'avg_pool2d':
                x = F.avg_pool2d(x, param[0], param[1], param[2])

            else:
                raise NotImplementedError

        # make sure variable is used properly
        assert idx == len(vars)
        assert bn_idx == len(self.vars_bn)

        return x
Example #40
 def forward(self, x):
     return F.linear(x,
                     self.weight + self.sigma_weight * self.epsilon_weight,
                     self.bias + self.sigma_bias * self.epsilon_bias)
Example #41
 def f(x):
     for (weight, bias, act, norm) in zip(weights, biases, self.acts,
                                          self.layer_norms):
         x = norm(act(self.dropout(F.linear(x, weight, bias))))
     return x
Example #42
    def forward(self,
                prev_output_tokens,
                encoder_out_dict,
                incremental_state=None):
        encoder_out = encoder_out_dict['encoder_out']
        encoder_padding_mask = encoder_out_dict['encoder_padding_mask']

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        bsz, seqlen = prev_output_tokens.size()

        # get outputs from encoder
        encoder_outs, _, _ = encoder_out[:3]
        srclen = encoder_outs.size(0)

        # embed tokens
        x = self.embed_tokens(prev_output_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # initialize previous states (or get from cache during incremental generation)
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            prev_hiddens, prev_cells, input_feed = cached_state
        else:
            _, encoder_hiddens, encoder_cells = encoder_out[:3]
            num_layers = len(self.layers)
            prev_hiddens = [encoder_hiddens[i] for i in range(num_layers)]
            prev_cells = [encoder_cells[i] for i in range(num_layers)]
            input_feed = x.data.new(bsz, self.encoder_output_units).zero_()

        attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
        outs = []
        for j in range(seqlen):
            # input feeding: concatenate context vector from previous time step
            input = torch.cat((x[j, :, :], input_feed), dim=1)

            for i, rnn in enumerate(self.layers):
                # recurrent cell
                hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))

                # hidden state becomes the input to the next layer
                input = F.dropout(hidden,
                                  p=self.dropout_out,
                                  training=self.training)

                # save state for next time step
                prev_hiddens[i] = hidden
                prev_cells[i] = cell

            # apply attention using the last layer's hidden state
            if self.attention is not None:
                out, attn_scores[:, j, :] = self.attention(
                    hidden, encoder_outs, encoder_padding_mask)
            else:
                out = hidden
            out = F.dropout(out, p=self.dropout_out, training=self.training)

            # input feeding
            input_feed = out

            # save final output
            outs.append(out)

        # cache previous states (no-op except during incremental generation)
        utils.set_incremental_state(self, incremental_state, 'cached_state',
                                    (prev_hiddens, prev_cells, input_feed))

        # collect outputs across time steps
        x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
        if not self.training and self.need_attn:
            attn_scores = attn_scores.transpose(0, 2)
        else:
            attn_scores = None

        # project back to size of vocabulary
        if self.adaptive_softmax is None:
            if hasattr(self, 'additional_fc'):
                x = self.additional_fc(x)
                x = F.dropout(x, p=self.dropout_out, training=self.training)
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = self.fc_out(x)
        return x, attn_scores
 def h_to_v(self, h):
     # 'Function' here presumably aliases torch.nn.functional in the source module
     p_v = Function.sigmoid(Function.linear(h, self.W.t(), self.v_bias))
     sample_v = self.sample_p(p_v)
     return p_v, sample_v
def linear_biprec(input, weight, bias=None, num_bits_grad=None):
    # out1: gradients flow to weight/bias only (the input is detached)
    out1 = F.linear(input.detach(), weight, bias)
    # out2: gradients flow to the input only; quantize_grad presumably quantizes
    # the gradient passing back through out2 to num_bits_grad bits
    out2 = F.linear(input, weight.detach(),
                    bias.detach() if bias is not None else None)
    out2 = quantize_grad(out2, num_bits=num_bits_grad)
    # forward value equals a plain F.linear; backward splits as described above
    return out1 + out2 - out1.detach()
 def v_to_h(self, v):
     p_h = Function.sigmoid(Function.linear(v, self.W, self.h_bias))
     sample_h = self.sample_p(p_h)
     return p_h, sample_h
 def forward(self, x):
     return F.linear(x, self.W_(), self.bias)
Example #47
 def forward(self, x):
     return F.linear(x, self.omega * self.weight, self.beta * self.bias)
 def forward(self, u, y=None):
     x = F.linear(u, self.weight, self.bias)
     if y is not None:
         x += F.linear(y, self.weight, self.bias)
     return x
Example #49
 def forward(self, x):
     out = F.linear(x, self.weight, self.bias)
     out = self.activation(out)
     return out
Example #50
 def forward(self, input):
     return F.linear(input, self.weight * self.lr_mul, bias=self.bias * self.lr_mul)
 def forward(self, x):
     W = self.weight * self.weight_scale / torch.sqrt(
         torch.sum(self.weight**2, dim=1, keepdim=True))
     return F.linear(x, W, self.bias)
Example #52
def IndRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None):
    hy = torch.tanh(F.linear(input, w_ih, b_ih) + torch.mul(w_hh, hidden))
    return hy
Example #53
def IndRNNReLuCell(input, hidden, w_ih, w_hh, b_ih=None):
    hy = F.relu(F.linear(input, w_ih, b_ih) + torch.mul(w_hh, hidden))
    return hy
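
# A smoke-test sketch for the IndRNN cells above (assumed shapes: batch=2,
# input_size=3, hidden_size=4). Note that w_hh is a per-unit recurrent vector
# combined elementwise with the previous hidden state, not a full recurrent matrix.
import torch

inp, hidden = torch.randn(2, 3), torch.zeros(2, 4)
w_ih, w_hh = torch.randn(4, 3), torch.randn(4)
hy = IndRNNReLuCell(inp, hidden, w_ih, w_hh)  # shape (2, 4)
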
 def free_energy_cost(self, v):
     vbias_term = v.mv(self.v_bias)
     wx_b = Function.linear(v, self.W, self.h_bias)
     hidden_term = wx_b.exp().add(1).log().sum(1)
     return (-hidden_term - vbias_term).mean()
Example #55
 def decode(self, x):
     maskedweight = self.weight * self.mask
     catweight = torch.cat((maskedweight, self.weight_fc), dim=0)
     return F.linear(x, catweight.t(), self.vbias)
 def forward(self, features):
     cosine = F.linear(l2_norm(features), l2_norm(self.weight))
     return cosine
Example #57
 def forward(self, x):
     maskedweight = self.weight * self.mask
     catweight = torch.cat((maskedweight, self.weight_fc), dim=0)
     catbias = torch.cat((self.bias, self.bias_fc))
     return self.dropout(self.enc_act_func(F.linear(x, catweight, catbias)))
Example #58
def _forward_pattern(attribution, ctx, input, weight, bias, pattern):
    ctx.save_for_backward(input, weight, pattern)
    ctx.attribution = attribution
    return F.linear(input, weight, bias)
    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the last decoder layer's output of shape `(batch, tgt_len,
                  vocab)`
                - the last decoder layer's attention weights of shape `(batch,
                  tgt_len, src_len)`
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder_out['encoder_out'] if encoder_out is not None else None,
                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None,
            )
            inner_states.append(x)

        if self.normalize:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = F.linear(x, self.embed_out)

        return x, {'attn': attn, 'inner_states': inner_states}
            def forward(self, x, weights=None, get_feat=False):
                if weights is None:
                    # block1
                    x = F.relu(self.bn1(self.conv1(x)))
                    x = F.relu(self.bn2(self.conv2(x)))
                    x = self.pool(x)
                    # block2
                    x = F.relu(self.bn3(self.conv3(x)))
                    x = F.relu(self.bn4(self.conv4(x)))
                    x = self.pool(x)
                    # block3
                    x = F.relu(self.bn5(self.conv5(x)))
                    x = F.relu(self.bn6(self.conv6(x)))
                    x = self.pool(x)

                    x = x.view(-1, 196 * 4 * 4)
                    feat = F.relu(self.bn7(self.fc1(x)))
                    x = self.fc2(feat)
                    if get_feat:
                        return x, feat
                    else:
                        return x
                else:
                    list_weights = list(weights.items())
                    count = 0
                    #block1
                    x = F.conv2d(x,
                                 list_weights[count][1],
                                 list_weights[count + 1][1],
                                 padding=1,
                                 stride=1)
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn1.running_mean,
                                     self.bn1.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    x = F.threshold(x, 0, 0, inplace=True)
                    x = F.conv2d(x,
                                 list_weights[count][1],
                                 list_weights[count + 1][1],
                                 padding=1,
                                 stride=1)
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn2.running_mean,
                                     self.bn2.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    x = F.threshold(x, 0, 0, inplace=True)
                    x = F.max_pool2d(x, kernel_size=2, stride=2)
                    #block2
                    x = F.conv2d(x,
                                 list_weights[count][1],
                                 list_weights[count + 1][1],
                                 padding=1,
                                 stride=1)
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn3.running_mean,
                                     self.bn3.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    x = F.threshold(x, 0, 0, inplace=True)
                    x = F.conv2d(x,
                                 list_weights[count][1],
                                 list_weights[count + 1][1],
                                 padding=1,
                                 stride=1)
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn4.running_mean,
                                     self.bn4.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    x = F.threshold(x, 0, 0, inplace=True)
                    x = F.max_pool2d(x, kernel_size=2, stride=2)
                    #block3
                    x = F.conv2d(x,
                                 list_weights[count][1],
                                 list_weights[count + 1][1],
                                 padding=1,
                                 stride=1)
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn5.running_mean,
                                     self.bn5.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    x = F.threshold(x, 0, 0, inplace=True)
                    x = F.conv2d(x,
                                 list_weights[count][1],
                                 list_weights[count + 1][1],
                                 padding=1,
                                 stride=1)
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn6.running_mean,
                                     self.bn6.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    x = F.threshold(x, 0, 0, inplace=True)
                    x = F.max_pool2d(x, kernel_size=2, stride=2)

                    x = x.view(-1, 196 * 4 * 4)
                    x = F.linear(x, list_weights[count][1],
                                 list_weights[count + 1][1])
                    count += 2
                    x = F.batch_norm(x,
                                     self.bn7.running_mean,
                                     self.bn7.running_var,
                                     list_weights[count][1],
                                     list_weights[count + 1][1],
                                     training=True)
                    count += 2
                    feat = F.threshold(x, 0, 0, inplace=True)
                    x = F.linear(feat, list_weights[count][1],
                                 list_weights[count + 1][1])
                    return x