Example #1
    def forward(self, images, questions):

        N, T, _, _, _ = images.size()

        # bs x 5 x 3 x 224 x 224
        img_feats = self.cnn(images.contiguous().view(
            -1, images.size(2), images.size(3), images.size(4)))
        img_feats = self.cnn_fc_layer(img_feats)

        img_feats_tr = self.img_tr(img_feats)

        ques_feats = self.q_rnn(questions)
        ques_feats_repl = ques_feats.view(N, 1, -1).repeat(1, T, 1)
        ques_feats_repl = ques_feats_repl.view(N * T, -1)

        ques_feats_tr = self.ques_tr(ques_feats_repl)

        ques_img_feats = torch.cat([ques_feats_tr, img_feats_tr], 1)

        att_feats = self.att(ques_img_feats)
        att_probs = F.softmax(att_feats.view(N, T), dim=1)
        att_probs2 = att_probs.view(N, T, 1).repeat(1, 1, 64)

        att_img_feats = torch.mul(att_probs2, img_feats.view(N, T, 64))
        att_img_feats = torch.sum(att_img_feats, dim=1)

        mul_feats = torch.mul(ques_feats, att_img_feats)

        scores = self.classifier(mul_feats)

        return scores, att_probs
    def forward(self, x):
        x = self.embed(x)
        x = self.dropout(x)
        # x = x.view(len(x), x.size(1), -1)
        # x = embed.view(len(x), embed.size(1), -1)
        bilstm_out, self.hidden = self.bilstm(x, self.hidden)

        bilstm_out = torch.transpose(bilstm_out, 0, 1)
        bilstm_out = torch.transpose(bilstm_out, 1, 2)
        # bilstm_out = F.max_pool1d(bilstm_out, bilstm_out.size(2)).squeeze(2)
        bilstm_out = F.max_pool1d(bilstm_out, bilstm_out.size(2))
        bilstm_out = bilstm_out.squeeze(2)

        hidden2label = self.hidden2label1(F.tanh(bilstm_out))

        gate_layer = F.sigmoid(self.gate_layer(bilstm_out))
        # calculate highway layer values
        gate_hidden_layer = torch.mul(hidden2label, gate_layer)
        # Using hidden2label in the carry path below would also run, but it would not
        # match the Highway Networks formula, which carries the raw input through (1 - gate):
        # gate_input = torch.mul((1 - gate_layer), hidden2label)
        gate_input = torch.mul((1 - gate_layer), bilstm_out)
        highway_output = torch.add(gate_hidden_layer, gate_input)

        logit = self.logit_layer(highway_output)

        return logit
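Below is a minimal standalone sketch of the Highway Networks combination the comments above refer to, y = t * H(x) + (1 - t) * x; the layer names and hidden_dim are hypothetical, not taken from the class above.

import torch
import torch.nn as nn

# Highway gating: the transform gate t mixes H(x) with the untouched input x.
hidden_dim = 64                                 # hypothetical size
transform = nn.Linear(hidden_dim, hidden_dim)   # H(x)
gate = nn.Linear(hidden_dim, hidden_dim)        # produces the gate t

x = torch.randn(8, hidden_dim)
t = torch.sigmoid(gate(x))
y = torch.mul(t, torch.tanh(transform(x))) + torch.mul(1.0 - t, x)
print(y.shape)  # torch.Size([8, 64])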
Example #3
 def forward(self, theta, matches, return_outliers=False):
     if isinstance(theta,Variable): # handle normal batch transformations
         batch_size=theta.size()[0]
         theta=theta.clone()
         mask = self.geometricTnf(expand_dim(self.mask_id,0,batch_size),theta)
         if return_outliers:
             mask_outliers = self.geometricTnf(expand_dim(1.0-self.mask_id,0,batch_size),theta)
         if self.normalize:
             epsilon=1e-5
             mask = torch.div(mask,
                              torch.sum(torch.sum(torch.sum(mask+epsilon,3),2),1).unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask))
             if return_outliers:
                 mask_outliers = torch.div(mask_outliers,
                                           torch.sum(torch.sum(torch.sum(mask_outliers+epsilon,3),2),1).unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask_outliers))
         score = torch.sum(torch.sum(torch.sum(torch.mul(mask,matches),3),2),1)
         if return_outliers:
             score_outliers = torch.sum(torch.sum(torch.sum(torch.mul(mask_outliers,matches),3),2),1)
             return (score,score_outliers)
     elif isinstance(theta,list): # handle multiple transformations per batch item, batch is in list format (used for RANSAC)
         batch_size = len(theta)
         score = []
         for b in range(batch_size):
             sample_size=theta[b].size(0)
             s=self.forward(theta[b],expand_dim(matches[b,:,:,:].unsqueeze(0),0,sample_size))
             score.append(s)
     return score
def train_init(init_net, meta_alpha, loss_fn, image, target_bbox, evaluator):
    init_net.train()
    
    # Draw pos/neg samples
    pos_examples = gen_samples(SampleGenerator('gaussian', image.size, 0.1, 1.2),
                               target_bbox, opts['n_pos_init'], opts['overlap_pos_init'])
    neg_examples = np.concatenate([
                    gen_samples(SampleGenerator('uniform', image.size, 1, 2, 1.1), 
                                target_bbox, opts['n_neg_init']//2, opts['overlap_neg_init']),
                    gen_samples(SampleGenerator('whole', image.size, 0, 1.2, 1.1),
                                target_bbox, opts['n_neg_init']//2, opts['overlap_neg_init'])])
    # Crop images
    crop_size = opts['img_size']
    padding = opts['padding']
    image = np.asarray(image)
    pos_regions = extract_regions(image, pos_examples, crop_size, padding)
    neg_regions = extract_regions(image, neg_examples, crop_size, padding)
    pos_regions_var = Variable(torch.from_numpy(pos_regions[:opts['batch_pos']]))
    neg_regions_var = Variable(torch.from_numpy(neg_regions[:opts['batch_neg']]))
    if opts['use_gpu']:
        pos_regions_var = pos_regions_var.cuda()
        neg_regions_var = neg_regions_var.cuda()
    
    # training
    tracker_init_weights = OrderedDict((name, param) for (name, param) in init_net.named_parameters())
    tracker_keys = [name for (name, _) in init_net.named_parameters()]
    # the first iteration
    pos_score = init_net.forward(pos_regions_var)
    neg_score = init_net.forward(neg_regions_var)
    init_loss = loss_fn(pos_score,neg_score)
    init_acc,init_acc_pos,init_acc_neg = evaluator(pos_score, neg_score)
    grads = torch.autograd.grad(init_loss, tracker_init_weights.values(), create_graph=True)
    tracker_weights = OrderedDict((name, param - torch.mul(alpha,grad)) for
                                  ((name, param),(_,alpha),grad) in
                                  zip(tracker_init_weights.items(),
                                      meta_alpha.items(), grads))
    # rest of iterations
    for i in range(opts['n_init_updates']-1):
        pos_score = init_net.forward(pos_regions_var, tracker_weights)
        neg_score = init_net.forward(neg_regions_var, tracker_weights)
        loss = loss_fn(pos_score,neg_score)
        grads = torch.autograd.grad(loss, tracker_weights.values(), create_graph=True)
        tracker_weights = OrderedDict((name, param - torch.mul(alpha,grad))
                                      for ((name, param),(_,alpha),grad) in
                                      zip(tracker_weights.items(),meta_alpha.items(), grads))
    # update tracker
    init_net.copy_meta_weights(tracker_weights)
    init_net.eval()
    pos_score = init_net.forward(pos_regions_var)
    neg_score = init_net.forward(neg_regions_var)
    acc,acc_pos,acc_neg = evaluator(pos_score, neg_score)

    pos_regions_var = Variable(torch.from_numpy(pos_regions))
    neg_regions_var = Variable(torch.from_numpy(neg_regions))
    if opts['use_gpu']:
        pos_regions_var = pos_regions_var.cuda()
        neg_regions_var = neg_regions_var.cuda()
    pos_feats = init_net(pos_regions_var, out_layer='features')
    neg_feats = init_net(neg_regions_var, out_layer='features')
    return pos_feats.data.clone(), neg_feats.data.clone(), init_acc, acc
Example #5
    def updateGradInput(self, input, gradOutput):
        v1 = input[0]
        v2 = input[1]
        v1, v2 = self._makeContiguous(v1, v2)

        if len(self.gradInput) != 2:
            if self.gradInput[0] is None:
                self.gradInput[0] = v1.new()
            if self.gradInput[1] is None:
                self.gradInput[1] = v1.new()
            self.gradInput = self.gradInput[:2]

        gw1 = self.gradInput[0]
        gw2 = self.gradInput[1]
        gw1.resize_as_(v1).copy_(v2)
        gw2.resize_as_(v1).copy_(v1)

        torch.mul(self.w1, self.w22, out=self.buffer)
        gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
        gw1.mul_(self.w.expand_as(v1))

        torch.mul(self.w1, self.w32, out=self.buffer)
        gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
        gw2.mul_(self.w.expand_as(v1))

        go = gradOutput.contiguous().view(-1, 1).expand_as(v1)
        gw1.mul_(go)
        gw2.mul_(go)

        return self.gradInput
    def __call__(self, image_batch, theta_aff, theta_aff_tps, use_cuda=True):
        
        sampling_grid_aff = self.affTnf(image_batch=None,
                                        theta_batch=theta_aff.view(-1,2,3),
                                        return_sampling_grid=True,
                                        return_warped_image=False)
      
        sampling_grid_aff_tps = self.tpsTnf(image_batch=None,
                                       theta_batch=theta_aff_tps,
                                       return_sampling_grid=True,
                                       return_warped_image=False)
        
        if self.padding_crop_factor is not None:
            sampling_grid_aff_tps = sampling_grid_aff_tps*self.padding_crop_factor;

        # put 1e10 value in region out of bounds of sampling_grid_aff
        in_bound_mask_aff = ((sampling_grid_aff[:,:,:,0]>-1) * (sampling_grid_aff[:,:,:,0]<1) * (sampling_grid_aff[:,:,:,1]>-1) * (sampling_grid_aff[:,:,:,1]<1)).unsqueeze(3)
        in_bound_mask_aff = in_bound_mask_aff.expand_as(sampling_grid_aff)
        sampling_grid_aff = torch.mul(in_bound_mask_aff.float(),sampling_grid_aff)
        sampling_grid_aff = torch.add((in_bound_mask_aff.float()-1)*(1e10),sampling_grid_aff)       
        
        # compose transformations
        sampling_grid_aff_tps_comp = F.grid_sample(sampling_grid_aff.transpose(2,3).transpose(1,2), sampling_grid_aff_tps).transpose(1,2).transpose(2,3)
            
        # put 1e10 value in region out of bounds of sampling_grid_aff_tps_comp
        in_bound_mask_aff_tps=((sampling_grid_aff_tps[:,:,:,0]>-1) * (sampling_grid_aff_tps[:,:,:,0]<1) * (sampling_grid_aff_tps[:,:,:,1]>-1) * (sampling_grid_aff_tps[:,:,:,1]<1)).unsqueeze(3)
        in_bound_mask_aff_tps=in_bound_mask_aff_tps.expand_as(sampling_grid_aff_tps_comp)
        sampling_grid_aff_tps_comp=torch.mul(in_bound_mask_aff_tps.float(),sampling_grid_aff_tps_comp)
        sampling_grid_aff_tps_comp = torch.add((in_bound_mask_aff_tps.float()-1)*(1e10),sampling_grid_aff_tps_comp)       

        # sample transformed image
        warped_image_batch = F.grid_sample(image_batch, sampling_grid_aff_tps_comp)
        
        return warped_image_batch
Example #7
    def forward(self, input1):
        self.batchgrid3d = torch.zeros(torch.Size([input1.size(0)]) + self.grid3d.size())

        for i in range(input1.size(0)):
            self.batchgrid3d[i] = self.grid3d

        self.batchgrid3d = Variable(self.batchgrid3d)
        #print(self.batchgrid3d)

        x = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,0:4]), 3)
        y = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,4:8]), 3)
        z = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,8:]), 3)
        #print(x)
        r = torch.sqrt(x**2 + y**2 + z**2) + 1e-5

        #print(r)
        theta = torch.acos(z/r)/(np.pi/2)  - 1
        #phi = torch.atan(y/x)
        phi = torch.atan(y/(x + 1e-5))  + np.pi * x.lt(0).type(torch.FloatTensor) * (y.ge(0).type(torch.FloatTensor) - y.lt(0).type(torch.FloatTensor))
        phi = phi/np.pi


        output = torch.cat([theta,phi], 3)

        return output
    def forward(self, title, pg):

        r_gate = F.sigmoid(self.wrx(title) + self.wrh(pg))
        i_gate = F.sigmoid(self.wix(title) + self.wih(pg))
        n_gate = F.tanh(self.wnx(title) + torch.mul(r_gate, self.wnh(pg)))
        result =  torch.mul(i_gate, pg) + torch.mul(torch.add(-i_gate, 1), n_gate)
        return result
    def updateGradInput(self, input, y):
        v1 = input[0]
        v2 = input[1]

        gw1 = self.gradInput[0]
        gw2 = self.gradInput[1]
        gw1.resize_as_(v1).copy_(v2)
        gw2.resize_as_(v1).copy_(v1)

        torch.mul(self.w1, self.w22, out=self.buffer)
        gw1.addcmul_(-1, self.buffer.expand_as(v1), v1)
        gw1.mul_(self.w.expand_as(v1))

        torch.mul(self.w1, self.w32, out=self.buffer)
        gw2.addcmul_(-1, self.buffer.expand_as(v1), v2)
        gw2.mul_(self.w.expand_as(v1))

        # self._idx = self._outputs <= 0
        torch.le(self._outputs, 0, out=self._idx)
        self._idx = self._idx.view(-1, 1).expand(gw1.size())
        gw1[self._idx] = 0
        gw2[self._idx] = 0

        torch.eq(y, 1, out=self._idx)
        self._idx = self._idx.view(-1, 1).expand(gw2.size())
        gw1[self._idx] = gw1[self._idx].mul_(-1)
        gw2[self._idx] = gw2[self._idx].mul_(-1)

        if self.sizeAverage:
            gw1.div_(y.size(0))
            gw2.div_(y.size(0))

        return self.gradInput
 def forward(self, inputs, targets, step, weight_constraint_lambda, logger):
     n = inputs.size(0)
     # Compute pairwise distance; replace with the official implementation once it is merged
     # features = F.normalize(inputs)
     features = inputs
     dist = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n)
     dist = dist + dist.t()
     dist.addmm_(1, -2, features, features.t())
     dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability
     # get the positive label mask
     mask = targets.expand(n, n).eq(targets.expand(n, n).t())
     mask = mask.float()
     positive_dist = torch.mul(dist, mask)
     negative_dist = torch.mul(mask, dist.max()) + torch.mul(dist, 1 - mask)
     indexes_ap = []
     indexes_ng = []
     dist_ap = []
     dist_an = []
     for i in range(n):
         pos_dist, pos_index = positive_dist[i].max(0)
         neg_dist, neg_index = negative_dist[i].min(0)
         dist_ap.append(pos_dist)
         dist_an.append(neg_dist)
         indexes_ap.append(pos_index)
         indexes_ng.append(neg_index)
     dist_ap = torch.cat(dist_ap)
     dist_an = torch.cat(dist_an)
     indexes_ap = torch.cat(indexes_ap)
     indexes_ng = torch.cat(indexes_ng)
     pair_adp_inputs = []
     for i in range(n):
         pair_adp_inputs.append(torch.cat([inputs[i, :], inputs[indexes_ap.data[i], :]]))
     # for i in range(n):
         pair_adp_inputs.append(torch.cat([inputs[i, :], inputs[indexes_ng.data[i], :]]))
     pair_adp_inputs = torch.stack(pair_adp_inputs)
     # Compute adp_pairwise distance; replace with the official implementation once it is merged
     dist_adp = self.AdpsubM(pair_adp_inputs, n)  # [2*batchsize] [ap,ng]*batchsize
     # dist_constraint = torch.norm(dist-dist.t())
     dist_ap_adp = dist_adp[::2]
     dist_an_adp = dist_adp[1::2]
     # Compute ranking hinge loss
     y = dist_an.data.new()
     y.resize_as_(dist_an.data)
     y.fill_(1)
     y = Variable(y)
     # dist_neg_constr = 1/torch.norm(dist[mask==0])
     trip_loss = self.softmargin_loss(dist_an - dist_ap, y)
     trip_loss_adp = self.softmargin_loss(dist_an_adp - dist_ap_adp, y)
     loss = trip_loss + trip_loss_adp
     # loss = trip_loss
     if logger:
         # logger.scalar_summary('Metric_constraint', Metric_constraint.data[0], step)
         # logger.scalar_summary('dist_constraint', dist_constraint.data[0], step)
         # logger.histo_summary('W',W.data.cpu().numpy(),step)
         logger.histo_summary('dist_apt', dist_adp.data.cpu().numpy(), step)
         logger.histo_summary('dist', dist.data.cpu().numpy(), step)
         logger.scalar_summary('trip_loss', trip_loss.data[0], step)
     prec = (dist_an.data > dist_ap.data).sum() * 1. / y.size(0)
     return trip_loss_adp, prec
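The pairwise-distance block above uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b; the short check below (illustrative tensors only) verifies it against torch.cdist, with an explicit matmul in place of the in-place addmm_.

import torch

features = torch.randn(6, 4)
n = features.size(0)
dist = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n)
dist = dist + dist.t()
dist = dist - 2.0 * features @ features.t()   # same role as dist.addmm_(1, -2, ...)
dist = dist.clamp(min=1e-12).sqrt()
print(torch.allclose(dist, torch.cdist(features, features), atol=1e-5))  # True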
Example #11
 def forward(self, x):
     x0 = self.conv.forward(x.float())
     x = self.pool_mil(x0)
     x = x.squeeze(2).squeeze(2)
     x1 = torch.add(torch.mul(x0.view(x.size(0), 1000, -1), -1), 1)
     cumprod = torch.cumprod(x1, 2)
     out = torch.max(x, torch.add(torch.mul(cumprod[:, :, -1], -1), 1))
     #out = F.softmax(out)
     return out
Example #12
 def forward(self, img, att_size=14):
     x0 = self.conv(img)
     x = self.pool_mil(x0)
     x = x.squeeze(2).squeeze(2)
     x = self.l1(x)
     x1 = torch.add(torch.mul(x.view(x.size(0), 1000, -1), -1), 1)
     cumprod = torch.cumprod(x1, 2)
     out = torch.max(x, torch.add(torch.mul(cumprod[:, :, -1], -1), 1))
     return out
Example #13
    def updateOutput(self, input):
        input1, input2 = input[0], input[1]

        if self.buffer is None:
            self.buffer = input1.new()

        torch.mul(input1, input2, out=self.buffer)
        torch.sum(self.buffer, 1, True, out=self.output)
        self.output.resize_(input1.size(0))
        return self.output
Example #14
    def custom_cross_entropy(x, y):
        sigmoid_x = torch.sigmoid(x)
        sigmoid_x2 = torch.sigmoid(x ** 2)
        neg_log_sigmoid_x = -1 * torch.log(sigmoid_x)
        neg_log_1_minus_sigmoid_x2 = -1 * torch.log(1 - sigmoid_x2)

        l1 = torch.mul(y, neg_log_sigmoid_x)
        l2 = torch.mul(1 - y, neg_log_1_minus_sigmoid_x2)

        return torch.sum(l1 + l2)
    def forward(self, x):
        bahs, chs, _, _ = x.size()

        # Returns a new tensor with the same data as the self tensor but of a different size.
        chn_se = self.avg_pool(x).view(bahs, chs)
        chn_se = self.channel_excitation(chn_se).view(bahs, chs, 1, 1)
        chn_se = torch.mul(x, chn_se)

        spa_se = self.spatial_se(x)
        spa_se = torch.mul(x, spa_se)
        return torch.add(chn_se, 1, spa_se)
Example #16
    def updateOutput(self, input):
        gaterInput, expertInputs = input

        # buffers
        if self._gaterView is None:
            self._gaterView = input[0].new()
        if self._expert is None:
            self._expert = input[0].new()
        if self._expertView is None:
            self._expertView = input[0].new()

        self.dimG = 1
        batchSize = gaterInput.size(0)

        if self.table or isinstance(expertInputs, list):
            self.table = True
            if gaterInput.size(self.dimG) != len(expertInputs):
                raise RuntimeError("Should be one gater output per expert")

            expertInput = expertInputs[0]
            if self.batchSize != batchSize:
                size = [1] * (expertInput.dim() + 1)
                if self.dimG > 0:
                    size[0] = gaterInput.size(0)
                size[self.dim] = gaterInput.size(self.dimG)
                self.size = torch.Size(size)
                self.output.resize_as_(expertInput)
                self.backwardSetup = False
                self.batchSize = batchSize

            self._gaterView = gaterInput.view(self.size)
            self.output.zero_()
            # multiply accumulate gater outputs by their commensurate expert
            for i, expertInput in enumerate(expertInputs):
                gate = self._gaterView.select(self.dim, i).expand_as(expertInput)
                self.output.addcmul_(expertInput, gate)
        else:
            if self.batchSize != batchSize:
                size = [1] * expertInputs.dim()
                if self.dimG > 0:
                    size[0] = gaterInput.size(0)
                size[self.dim] = gaterInput.size(self.dimG)
                self.size = torch.Size(size)
                self.output.resize_as_(expertInputs.select(self.dim, 0))
                self.batchSize = batchSize
                self.backwardSetup = False

            self._gaterView = gaterInput.view(self.size)
            torch.mul(self._gaterView.expand_as(expertInputs), expertInputs, out=self._expert)
            torch.sum(self._expert, self.dim, True, out=self.output)
            self.output.resize_as_(expertInputs.select(self.dim, 0))

        return self.output
    def accGradParameters(self, input, gradOutput, scale=1):
        inputSize, outputSize = self.weight.size(0), self.weight.size(1)

        """
        dy_j   2 * c_j * c_j * (w_j - x)    c_j * c_j * (w_j - x)
        ---- = -------------------------- = ---------------------
        dw_j    2 || c_j * (w_j - x) ||             y_j

        dy_j    2 * c_j * (w_j - x)^2    c_j * (w_j - x)^2
        ---- = ----------------------- = -----------------
        dc_j   2 || c_j * (w_j - x) ||         y_j
        #"""
        # assumes a preceding call to updateGradInput
        if input.dim() == 1:
            self.gradWeight.add_(-scale, self._repeat2)

            self._repeat.div_(self.diagCov)
            self._repeat.mul_(self._repeat)
            self._repeat.mul_(self.diagCov)

            if input.type() == 'torch.cuda.FloatTensor':
                self._repeat2.resize_as_(self._expand4).copy_(self._expand4)
                self._repeat2.mul_(self._repeat)
            else:
                torch.mul(self._repeat, self._expand4, out=self._repeat2)

            self.gradDiagCov.add_(self._repeat2)
        elif input.dim() == 2:
            if self._sum is None:
                self._sum = input.new()
            torch.sum(self._repeat2, 0, True, out=self._sum)
            self._sum.resize_(inputSize, outputSize)
            self.gradWeight.add_(-scale, self._sum)

            if input.type() == 'torch.cuda.FloatTensor':
                # requires lots of memory, but minimizes cudaMallocs and loops
                self._repeat.div_(self._repeat3)
                self._repeat.mul_(self._repeat)
                self._repeat.mul_(self._repeat3)
                self._repeat2.resize_as_(self._expand4).copy_(self._expand4)
                self._repeat.mul_(self._repeat2)
            else:
                self._repeat.div_(self._expand3)
                self._repeat.mul_(self._repeat)
                self._repeat.mul_(self._expand3)
                self._repeat.mul_(self._expand4)

            torch.sum(self._repeat, 0, True, out=self._sum)
            self._sum.resize_(inputSize, outputSize)
            self.gradDiagCov.add_(scale, self._sum)
        else:
            raise RuntimeError("1D or 2D input expected")
Example #18
    def forward(self, input1):
        self.batchgrid = torch.zeros(torch.Size([input1.size(0)]) + self.grid.size())

        for i in range(input1.size(0)):
            self.batchgrid[i] = self.grid

        self.batchgrid = Variable(self.batchgrid)
        #print self.batchgrid,  input1[:,:,:,0:3]
        #print self.batchgrid,  input1[:,:,:,4:6]
        x = torch.mul(self.batchgrid, input1[:,:,:,0:3])
        y = torch.mul(self.batchgrid, input1[:,:,:,3:6])

        output = torch.cat([torch.sum(x,3),torch.sum(y,3)], 3)
        return output
Example #19
    def updateOutput(self, input):
        input1, input2 = input[0], input[1]
        input1, input2 = self._makeContiguous(input1, input2)

        if self.buffer is None:
            self.buffer = input1.new()
            self.w1 = input1.new()
            self.w22 = input1.new()
            self.w = input1.new()
            self.w32 = input1.new()
            self.ones = input1.new()

        torch.mul(input1, input2, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w1, keepdim=True)

        epsilon = 1e-12
        torch.mul(input1, input1, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon)
        self.w22.reciprocal_()
        self.w.resize_as_(self.w22).copy_(self.w22)

        torch.mul(input2, input2, out=self.buffer)
        torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon)
        self.w32.reciprocal_()
        self.w.mul_(self.w32)
        self.w.sqrt_()

        torch.mul(self.w1, self.w, out=self.output)
        self.output.resize_(input1.size(0))

        return self.output
Example #20
    def accGradParameters(self, input, gradOutput, scale=1):
        if self._input is None:
            self._input = input.new()
            self._gradWeight = input.new()
            self._sum = input.new()

        batchSize = input.size(0)
        contiguousView(self._input, input, batchSize, -1)
        contiguousView(self._gradOutput, gradOutput, batchSize, -1)
        self._gradWeight = self.gradWeight.view(1, -1)

        torch.mul(self._input, self._gradOutput, out=self._repeat)
        torch.sum(self._repeat, 0, True, out=self._sum)
        self._gradWeight.add_(scale, self._sum)
Example #21
    def accGradParameters(self, input, gradOutput, scale=1):
        self._assertInputGradOutput(input, gradOutput)

        # make sure we have buffer:
        if self.buff1 is None:
            self.buff1 = input[0].new()
        self.buff1.resize_as_(input[0])

        # accumulate parameter gradients:
        for k in range(self.weight.size(0)):
            torch.mul(input[0], gradOutput.narrow(1, k, 1).expand_as(input[0]), out=self.buff1)
            self.gradWeight[k].addmm_(self.buff1.t(), input[1])

        if self.bias is not None:
            self.gradBias.add_(scale, gradOutput.sum(0, keepdim=False))
Example #22
    def backward(self, grad_output):
        input, output = self.saved_tensors
        grad_input = grad_output.new()

        if self._backend is not None:
            self._backend.SpatialCrossMapLRN_updateGradInput(
                self._backend.library_state,
                input,
                grad_output,
                grad_input,
                self.scale,
                output,
                self.size,
                self.alpha,
                self.beta,
                self.k
            )
        else:
            batch_size = input.size(0)
            channels = input.size(1)
            input_height = input.size(2)
            input_width = input.size(3)

            padded_ratio = input.new(channels + self.size - 1, input_height,
                                     input_width)
            accum_ratio = input.new(input_height, input_width)

            cache_ratio_value = 2 * self.alpha * self.beta / self.size
            inversePrePad = int(self.size - (self.size - 1) / 2)

            grad_input.resize_as_(input)
            torch.pow(self.scale, -self.beta, out=grad_input).mul_(grad_output)

            padded_ratio.zero_()
            padded_ratio_center = padded_ratio.narrow(0, inversePrePad,
                                                      channels)
            for n in range(batch_size):
                torch.mul(grad_output[n], output[n], out=padded_ratio_center)
                padded_ratio_center.div_(self.scale[n])
                torch.sum(
                    padded_ratio.narrow(0, 0, self.size - 1), 0, keepdim=False, out=accum_ratio)
                for c in range(channels):
                    accum_ratio.add_(padded_ratio[c + self.size - 1])
                    grad_input[n][c].addcmul_(-cache_ratio_value, input[n][c],
                                              accum_ratio)
                    accum_ratio.add_(-1, padded_ratio[c])

        return grad_input
 def forward(self, input_data):
     # input_data: batch_size * T - 1 * input_size
     input_weighted = Variable(input_data.data.new(input_data.size(0), self.T - 1, self.input_size).zero_())
     input_encoded = Variable(input_data.data.new(input_data.size(0), self.T - 1, self.hidden_size).zero_())
     # hidden, cell: initial states with dimension hidden_size
     hidden = self.init_hidden(input_data) # 1 * batch_size * hidden_size
     cell = self.init_hidden(input_data)
     # hidden.requires_grad = False
     # cell.requires_grad = False
     for t in range(self.T - 1):
         # Eqn. 8: concatenate the hidden states with each predictor
         x = torch.cat((hidden.repeat(self.input_size, 1, 1).permute(1, 0, 2),
                        cell.repeat(self.input_size, 1, 1).permute(1, 0, 2),
                        input_data.permute(0, 2, 1)), dim = 2) # batch_size * input_size * (2*hidden_size + T - 1)
         # Eqn. 9: Get attention weights
         x = self.attn_linear(x.view(-1, self.hidden_size * 2 + self.T - 1)) # (batch_size * input_size) * 1
         attn_weights = F.softmax(x.view(-1, self.input_size)) # batch_size * input_size, attn weights with values sum up to 1.
         # Eqn. 10: LSTM
         weighted_input = torch.mul(attn_weights, input_data[:, t, :]) # batch_size * input_size
         # Fix the warning about non-contiguous memory
         # see https://discuss.pytorch.org/t/dataparallel-issue-with-flatten-parameter/8282
         self.lstm_layer.flatten_parameters()
         _, lstm_states = self.lstm_layer(weighted_input.unsqueeze(0), (hidden, cell))
         hidden = lstm_states[0]
         cell = lstm_states[1]
         # Save output
         input_weighted[:, t, :] = weighted_input
         input_encoded[:, t, :] = hidden
     return input_weighted, input_encoded
Example #24
    def forward(self, inputs, input_lengths):
        """ Forward pass.

        # Arguments:
            inputs (Torch.Variable): Tensor of input sequences
            input_lengths (torch.LongTensor): Lengths of the sequences

        # Return:
            Tuple with (representations and attentions if self.return_attention else None).
        """
        logits = inputs.matmul(self.attention_vector)
        unnorm_ai = (logits - logits.max()).exp()

        # Compute a mask for the attention on the padded sequences
        # See e.g. https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/5
        max_len = unnorm_ai.size(1)
        idxes = torch.arange(0, max_len, out=torch.LongTensor(max_len)).unsqueeze(0)
        mask = Variable((idxes < input_lengths.unsqueeze(1)).float())

        # apply mask and renormalize attention scores (weights)
        masked_weights = unnorm_ai * mask
        att_sums = masked_weights.sum(dim=1, keepdim=True)  # sums per sequence
        attentions = masked_weights.div(att_sums)

        # apply attention weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(dim=1)

        return (representations, attentions if self.return_attention else None)
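The length-mask construction referenced in the comments above also works as a standalone snippet; the sketch below (made-up lengths) zeroes the attention weight of every position past a sequence's length and renormalizes.

import torch

input_lengths = torch.tensor([3, 5, 2])               # per-sequence lengths
max_len = int(input_lengths.max())
idxes = torch.arange(max_len).unsqueeze(0)            # 1 x max_len
mask = (idxes < input_lengths.unsqueeze(1)).float()   # batch x max_len

scores = torch.rand(3, max_len)
masked_weights = scores * mask
attentions = masked_weights / masked_weights.sum(dim=1, keepdim=True)
print(attentions.sum(dim=1))  # each row sums to 1 over its valid positions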
    def routing(self, x, b_IJ, W,batch_size,routing_iter):
        x1 = x.view(batch_size, 256, 1, 6, 6)
        x_tile = x1.repeat(1, 1, 10, 1, 1)
        x_view = x_tile.view(batch_size, 1152, 10, 8, 1)
        stride_i = W.repeat(batch_size, 1, 1, 1, 1)
        stride_j = stride_i.view(batch_size, 1152, 10, 16, 8)
        dot_op = torch.matmul(stride_j, x_view)
        dot_op_stopped = Variable(dot_op.data.clone(), requires_grad=False)

        for r_iter in range(routing_iter):
            id_capsule = F.softmax(b_IJ, dim=2)
            if r_iter == routing_iter - 1:
                route_I = torch.mul(id_capsule, dot_op)
                route_I_sum = torch.sum(route_I, dim=1, keepdim=True) + self.bias
                V_J = squash(route_I_sum,self.epsilon)
            if r_iter < routing_iter - 1:

                dot_op_stopped_tmp = dot_op_stopped.data.numpy()
                dot_op_stopped_tmp = np.reshape(dot_op_stopped_tmp, (batch_size, 1152, 10, 16, 1))
                id_capsule_tmp = id_capsule.data.numpy()
                route_I_tmp = id_capsule_tmp * dot_op_stopped_tmp
                route_I_tmp_sum = np.sum(route_I_tmp, axis=1, keepdims=True) + self.bias.data.numpy()
                V_J_tmp = squash(torch.Tensor(route_I_tmp_sum),self.epsilon)

                V_J_tmp_tiled = np.tile(V_J_tmp.numpy(), (1, 1152, 1, 1, 1))
                dot_op_stopped_tmp = np.reshape(dot_op_stopped_tmp, (batch_size, 1152, 10, 1, 16))

                u_produce_v = np.matmul(dot_op_stopped_tmp, V_J_tmp_tiled)

                b_IJ.data += torch.Tensor(u_produce_v)

        return V_J
    def updateGradInput(self, input, gradOutput):
        assert input.dim() == 4

        if input.type() == 'torch.cuda.FloatTensor':
            self._backend.SpatialCrossMapLRN_updateGradInput(
                self._backend.library_state,
                input,
                gradOutput,
                self.gradInput,
                self.scale,
                self.output,
                self.size,
                self.alpha,
                self.beta,
                self.k
            )
        else:
            batchSize = input.size(0)
            channels = input.size(1)
            inputHeight = input.size(2)
            inputWidth = input.size(3)

            if self.paddedRatio is None:
                self.paddedRatio = input.new()
            if self.accumRatio is None:
                self.accumRatio = input.new()
            self.paddedRatio.resize_(channels + self.size - 1, inputHeight, inputWidth)
            self.accumRatio.resize_(inputHeight, inputWidth)

            cacheRatioValue = 2 * self.alpha * self.beta / self.size
            inversePrePad = int(self.size - (self.size - 1) / 2)

            self.gradInput.resize_as_(input)
            torch.pow(self.scale, -self.beta, out=self.gradInput).mul_(gradOutput)

            self.paddedRatio.zero_()
            paddedRatioCenter = self.paddedRatio.narrow(0, inversePrePad, channels)
            for n in range(batchSize):
                torch.mul(gradOutput[n], self.output[n], out=paddedRatioCenter)
                paddedRatioCenter.div_(self.scale[n])
                torch.sum(self.paddedRatio.narrow(0, 0, self.size - 1), 0, keepdim=False, out=self.accumRatio)
                for c in range(channels):
                    self.accumRatio.add_(self.paddedRatio[c + self.size - 1])
                    self.gradInput[n][c].addcmul_(-cacheRatioValue, input[n][c], self.accumRatio)
                    self.accumRatio.add_(-1, self.paddedRatio[c])

        return self.gradInput
def train_init(tracker_net, meta_alpha, loss_fn, pos_regions, neg_regions, lh_pos_regions,
               lh_neg_regions, evaluator, train_all=False):
    if train_all:
        tracker_init_weights = OrderedDict((name, param) for (name, param) in tracker_net.named_parameters())
        tracker_keys = [name for (name, _) in tracker_net.named_parameters()]
    else:
        tracker_init_weights = OrderedDict((name, param) for (name, param) in tracker_net.named_parameters() if name.startswith('fc') )
        tracker_keys = [name for (name, _) in tracker_net.named_parameters() if name.startswith('fc')]

    # the first iteration
    pos_score = tracker_net.forward(pos_regions)
    neg_score = tracker_net.forward(neg_regions)
    loss = loss_fn(pos_score,neg_score)
    grads = torch.autograd.grad(loss, tracker_init_weights.values(), create_graph=True)
    tracker_weights = OrderedDict((name, param - torch.mul(meta_alpha,grad)) for
                                  ((name, param),(_,meta_alpha),grad) in
                                  zip(tracker_init_weights.items(),
                                      meta_alpha.items(), grads))

    for i in range(opts['n_init_updates']-1):
        pos_score = tracker_net.forward(pos_regions, tracker_weights)
        neg_score = tracker_net.forward(neg_regions, tracker_weights)
        loss = loss_fn(pos_score,neg_score)
        grads = torch.autograd.grad(loss, tracker_weights.values(), create_graph=True)
        tracker_weights = OrderedDict((name, param - torch.mul(meta_alpha,grad))
                                      for ((name, param),(_,meta_alpha),grad) in
                                      zip(tracker_weights.items(),meta_alpha.items(), grads))

    lh_pos_score = tracker_net.forward(lh_pos_regions, tracker_weights)
    lh_neg_score = tracker_net.forward(lh_neg_regions, tracker_weights)
    lh_loss = loss_fn(lh_pos_score,lh_neg_score)
    lh_acc,lh_acc_pos,lh_acc_neg = evaluator(lh_pos_score, lh_neg_score)
    
    pos_score = tracker_net.forward(pos_regions, tracker_weights)
    neg_score = tracker_net.forward(neg_regions, tracker_weights)
    loss = loss_fn(pos_score,neg_score)
    acc,acc_pos,acc_neg = evaluator(pos_score, neg_score)

    # compute meta grads for lookahead dataset
    grads = torch.autograd.grad(lh_loss, tracker_init_weights.values(), retain_graph=True)
    alpha_grads = torch.autograd.grad(lh_loss, meta_alpha.values())
    meta_init_grads = {}
    meta_alpha_grads = {}
    for i in range(len(tracker_keys)):
        meta_init_grads[tracker_keys[i]] = grads[i]
        meta_alpha_grads[tracker_keys[i]] = alpha_grads[i]
    return meta_init_grads, meta_alpha_grads, loss.data[0], lh_loss.data[0], acc, lh_acc
    def updateGradInput(self, input, y):
        self.gradInput.resize_as_(input).copy_(y)
        self.gradInput[torch.mul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0

        if self.sizeAverage:
            self.gradInput.mul_(1. / input.nelement())

        return self.gradInput
Example #29
    def forward(self, lvec, rvec):
        mult_dist = torch.mul(lvec, rvec)
        abs_dist = torch.abs(torch.add(lvec, -rvec))
        vec_dist = torch.cat((mult_dist, abs_dist), 1)

        out = F.sigmoid(self.wh(vec_dist))
        out = F.log_softmax(self.wp(out))
        return out
    def get_vert_context(self, vert_state, edge_state):
        verb_vert_state = vert_state[0]
        region_vert_state = vert_state[1:]
        verb_expanded_state = verb_vert_state.expand(region_vert_state.size(0), verb_vert_state.size(0))

        #print('vert shapes', verb_vert_state.size(), region_vert_state.size(), verb_expanded_state.size())

        verb_concat = torch.mul(verb_expanded_state, edge_state)
        region_concat = torch.mul(region_vert_state, edge_state)

        att_weighted_verb_per_edge = torch.mul(self.vert_att(verb_concat), edge_state)
        att_weighted_region = torch.mul(self.edge_att(region_concat), edge_state)
        att_weighted_verb = torch.sum(att_weighted_verb_per_edge, 0)

        vert_ctx = torch.cat((torch.unsqueeze(att_weighted_verb,0),att_weighted_region), 0)

        #print('vert context :', vert_ctx.size())
        return vert_ctx
Example #31
 def forward(self, x):
     # padding = self.dilation - (x.shape[-1] + self.dilation - 1) % self.dilation
     x = F.pad(x, (self.dilation, 0))
     return torch.mul(self.tanh(self.conv_f(x)), self.sig(self.conv_g(x)))
Example #32
 def forward(self, input, target, N_D, N_R):
     return torch.sum(-1. * torch.mul(target, input) + N_D / float(N_R) *
                      torch.mul((1 - target), (torch.exp(input) - 1)))
Example #33
def convert_from_tanh(input_tensor):
    out_tensor = torch.add(input_tensor, 1)
    out_tensor = torch.mul(out_tensor, 255 / 2)
    return out_tensor
Example #34
 def forward(self, shading, albedo):
     self.shading = shading.repeat(1, self.nc, 1, 1)
     self.img = torch.mul(self.shading, albedo)
     return self.img
Example #35
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class prediction scores for the img, Shape: [num_priors].
        overlap: (float) The overlap threshold for suppressing unnecessary boxes.
        top_k: (int) The maximum number of box predictions to consider.
    Return:
        The indices of the kept boxes with respect to num_priors.
    """

    keep = torch.Tensor(scores.size(0)).fill_(0).long()
    if boxes.numel() == 0:
        return keep
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    v, idx = scores.sort(0)  # sort in ascending order
    # I = I[v >= 0.01]
    idx = idx[-top_k:]  # indices of the top-k largest vals
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()

    # keep = torch.Tensor()
    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        # keep.append(i)
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # store element-wise max with next highest score
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1
        h = yy2 - yy1
        # check sizes of xx1 and xx2.. after each iteration
        w = torch.clamp(w, min=0.0)
        h = torch.clamp(h, min=0.0)
        inter = w * h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas
        union = (rem_areas - inter) + area[i]
        IoU = inter / union  # store result in iou
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
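A hedged usage sketch for nms() above with made-up (x1, y1, x2, y2) boxes; a recent PyTorch is assumed so that torch.clamp accepts tensor bounds inside the loop.

import torch

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 10., 10.],     # heavily overlaps the first box
                      [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep, count = nms(boxes, scores, overlap=0.5, top_k=200)
print(keep[:count])  # indices of the surviving boxes, highest score first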
    def forward(self, hidden, y_label, input_mask):
        """
        hidden: batch_size_x * (1 + batch_size_y) * max_seq_length * h_dim
        y_label: batch_size_x * batch_size_y
        input_mask: batch_size_x * (1 + batch_size_y) * max_seq_length
        """

        hidden = normalize(hidden)
        hidden = hidden.masked_fill(mask=~input_mask.unsqueeze(-1).expand_as(hidden), value=torch.tensor(0))

        x = hidden[:,0,:,:] # x * seq * dim
        y = hidden[:,1:,:,:] # x * y * seq * dim

        #h[x][0][m][q]
        #h[x][i][n][q]
        #h[x][i][m] = \sum n,q h[x][0][m][q] * h[x][i][n][q]

        """
        a[x][p][q]
        b[y][i][q]
        c[x][y][p] = \sum i,q a_xpq * b_yiq
        """

        x_score = x.unsqueeze(1) # x * 1 * seq * dim
        y_score = torch.sum(y, dim=2) # x * y * dim

        x_mask = input_mask[:,0,:] # x * seq

        score = self.attn_fn(x_score, y_score) / self.temperature # x * y * seq
        score = score.masked_fill(mask=~x_mask.unsqueeze(1).expand_as(score),value=float('-1e8'))
        attn = F.softmax(score, dim=2).unsqueeze(-1) # batch_size_x * batch_size_y * max_seq_length * 1

        x_expand = x.unsqueeze(-1).expand(x.size(0), x.size(1), x.size(2), y.size(1)) # batch_size_x * max_seq_length * h_dim * batch_size_y
        x_perm = x_expand.permute(0, 3, 2, 1) # batch_size_x * batch_size_y * h_dim * max_seq_length

        #print(hidden.shape, x.shape, x_perm.shape, y.shape, attn.shape)

        x_embed = torch.matmul(x_perm, attn).squeeze(-1)

        if not self.cat:
            x_embed = normalize(x_embed) # batch_size_x * batch_size_y * h_dim
        else:
            x_embed = normalize(torch.cat((x_embed, x[:,-1,:].unsqueeze(1).expand_as(x_embed)), dim=-1))

        #y_embed = normalize(torch.sum(y, dim=1)) # batch_size_y * h_dim
        y_embed = y[:,:,-1,:] # x * y * dim
        #y_embed_expand = y_embed.unsqueeze(0).expand(x_embed.size(0), y_embed.size(0), y_embed.size(1)) # batch_size_x * batch_size_y * h_dim

        sim = self.sim_fn(x_embed, y_embed) # x * y
        mse_loss = self.mse_loss(sim, y_label)
        weight = torch.ones_like(sim)
        weight[y_label==1] = self.pos_weight
        mse = torch.mean(torch.mul(mse_loss, weight))
        mse = torch.mean(mse_loss)

        #print(sim.shape, y_label.shape)
        pos_wrong = sum(sim[y_label==1.] < 0).item()
        neg_wrong = sum(sim[y_label==-1.] > 0).item()
        pos_count = len(y_label[y_label==1.])
        neg_count = len(y_label[y_label==-1.])

        return mse, (attn.squeeze(-1), sim, y_label, pos_wrong, neg_wrong, pos_count, neg_count)
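The contraction spelled out in the comments above, c[x][y][p] = sum over i, q of a[x][p][q] * b[y][i][q], has a one-line einsum equivalent; shapes below are illustrative.

import torch

a = torch.randn(2, 7, 4)   # x * p * q
b = torch.randn(3, 5, 4)   # y * i * q
c = torch.einsum('xpq,yiq->xyp', a, b)

# Equivalent two-step form: sum b over i, then contract with a over q.
c2 = torch.matmul(a, b.sum(dim=1).t()).permute(0, 2, 1)
print(c.shape, torch.allclose(c, c2, atol=1e-5))  # torch.Size([2, 3, 7]) True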
Example #37
 def score(self, potentials, parts, batch_dims=[0]):
     score = torch.mul(potentials, parts)
     batch = tuple((score.shape[b] for b in batch_dims))
     return self.semiring.prod(score.view(batch + (-1, )))
Example #38
 def _fusion_classif(self, x_v, x_q):
     x_mm = torch.mul(x_v, x_q)
     return x_mm
Example #39
    def _attention(self, input_v, x_q_vec):
        batch_size = input_v.size(0)
        width = input_v.size(2)
        height = input_v.size(3)

        # Process visual before fusion
        #x_v = input_v.view(batch_size*width*height, dim_features)
        x_v = input_v
        x_v = F.dropout(x_v,
                        p=self.opt['attention']['dropout_v'],
                        training=self.training)
        x_v = self.conv_v_att(x_v)
        if 'activation_v' in self.opt['attention']:
            x_v = getattr(F, self.opt['attention']['activation_v'])(x_v)
        x_v = x_v.view(batch_size, self.opt['attention']['dim_v'],
                       width * height)
        x_v = x_v.transpose(1, 2)

        # Process question before fusion
        x_q = F.dropout(x_q_vec,
                        p=self.opt['attention']['dropout_q'],
                        training=self.training)
        x_q = self.linear_q_att(x_q)
        if 'activation_q' in self.opt['attention']:
            x_q = getattr(F, self.opt['attention']['activation_q'])(x_q)
        x_q = x_q.view(batch_size, 1, self.opt['attention']['dim_q'])
        x_q = x_q.expand(batch_size, width * height,
                         self.opt['attention']['dim_q'])

        # First multimodal fusion
        x_att = self._fusion_att(x_v, x_q)

        if 'activation_mm' in self.opt['attention']:
            x_att = getattr(F, self.opt['attention']['activation_mm'])(x_att)

        # Process attention vectors
        x_att = F.dropout(x_att,
                          p=self.opt['attention']['dropout_mm'],
                          training=self.training)
        # can be optim to avoid two views and transposes
        x_att = x_att.view(batch_size, width, height,
                           self.opt['attention']['dim_mm'])
        x_att = x_att.transpose(2, 3).transpose(1, 2)
        x_att = self.conv_att(x_att)
        x_att = x_att.view(batch_size, self.opt['attention']['nb_glimpses'],
                           width * height)
        list_att_split = torch.split(x_att, 1, dim=1)
        list_att = []
        for x_att in list_att_split:
            x_att = x_att.contiguous()
            x_att = x_att.view(batch_size, width * height)
            x_att = F.softmax(x_att)
            list_att.append(x_att)

        # Apply attention vectors to input_v
        x_v = input_v.view(batch_size, self.opt['dim_v'], width * height)
        x_v = x_v.transpose(1, 2)

        list_v_att = []
        for i, x_att in enumerate(list_att):
            x_att = x_att.view(batch_size, width * height, 1)
            x_att = x_att.expand(batch_size, width * height, self.opt['dim_v'])
            x_v_att = torch.mul(x_att, x_v)
            x_v_att = x_v_att.sum(1)
            x_v_att = x_v_att.view(batch_size, self.opt['dim_v'])
            list_v_att.append(x_v_att)

        return list_v_att
Example #40
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, train_loader,
               writer):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, index, time, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
            index = index.cuda()
            time = time.cuda()
        batch_size = (inputs[0][0].size(0)
                      if isinstance(inputs[0], list) else inputs[0].size(0))
        val_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"]
            metadata = meta["metadata"]

            if cfg.NUM_GPUS:
                preds = preds.cpu()
                ori_boxes = ori_boxes.cpu()
                metadata = metadata.cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds, ori_boxes, metadata)

        else:
            if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                if not cfg.CONTRASTIVE.KNN_ON:
                    return
                train_labels = (model.module.train_labels if hasattr(
                    model, "module") else model.train_labels)
                yd, yi = model(inputs, index, time)
                K = yi.shape[1]
                C = (cfg.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM
                     )  # eg 400 for Kinetics400
                candidates = train_labels.view(1, -1).expand(batch_size, -1)
                retrieval = torch.gather(candidates, 1, yi)
                retrieval_one_hot = torch.zeros((batch_size * K, C)).cuda()
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                yd_transform = yd.clone().div_(cfg.CONTRASTIVE.T).exp_()
                probs = torch.mul(
                    retrieval_one_hot.view(batch_size, -1, C),
                    yd_transform.view(batch_size, -1, 1),
                )
                preds = torch.sum(probs, 1)
            else:
                preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    batch_size * max(
                        cfg.NUM_GPUS, 1
                    ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
                )
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err
                        },
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars({"Val/mAP": val_meter.full_map},
                               global_step=cur_epoch)
        else:
            all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
            all_labels = [
                label.clone().detach() for label in val_meter.all_labels
            ]
            if cfg.NUM_GPUS:
                all_preds = [pred.cpu() for pred in all_preds]
                all_labels = [label.cpu() for label in all_labels]
            writer.plot_eval(preds=all_preds,
                             labels=all_labels,
                             global_step=cur_epoch)

    val_meter.reset()
    def test_precedence_semantics(self):
        """Test semantics for __torch_function__ for functions that take
        multiple arguments

        For functions that take multiple arguments, the appropriate
        __torch_function__ implementation to call is determined by
        examining the types of the arguments. The precedence order is
        left-to-right in the argument list, except subclasses are always
        checked before superclasses. The first result of calling the
        implementations in precedence order that is not NotImplemented
        is returned to the user. If all implementations return
        NotImplemented, a TypeError is raised.

        All cases are tested with functions implemented in C++ and
        either foo or baz, which are Python functions defined above that
        are instrumented to obey the same dispatch rules as the
        functions in torch.functional.
        """
        # DiagonalTensor has a valid override and SubDiagonal has an
        # override that returns NotImplemented so we should call the
        # DiagonalTensor implementation, returning -1
        t1 = DiagonalTensor(5, 2)
        t2 = SubDiagonalTensor(5, 2)
        self.assertEqual(torch.div(t1, t2), -1)
        self.assertEqual(torch.div(t2, t1), -1)
        self.assertEqual(foo(t1, t2), -1)
        self.assertEqual(foo(t2, t1), -1)

        # SubTensor has an implementation that returns NotImplemented as
        # well so it should behave exactly like SubDiagonalTensor in the
        # test above
        t3 = SubTensor([[1, 2], [1, 2]])
        self.assertEqual(torch.div(t1, t3), -1)
        self.assertEqual(torch.div(t3, t1), -1)
        self.assertEqual(foo(t1, t3), -1)
        self.assertEqual(foo(t3, t1), -1)

        # div between SubTensor and SubDiagonalTensor should raise
        # TypeError since both have an implementation that
        # explicitly returns NotImplemented
        with self.assertRaises(TypeError):
            torch.div(t2, t3)
        with self.assertRaises(TypeError):
            torch.div(t3, t2)
        with self.assertRaises(TypeError):
            foo(t2, t3)
        with self.assertRaises(TypeError):
            foo(t3, t2)

        # none of DiagonalTensor, SubDiagonalTensor, or SubTensor have a
        # mul or a baz implementation so all ops should raise TypeError
        with self.assertRaises(TypeError):
            torch.mul(t1, t1)
        with self.assertRaises(TypeError):
            torch.mul(t1, t2)
        with self.assertRaises(TypeError):
            torch.mul(t1, t3)
        with self.assertRaises(TypeError):
            torch.mul(t2, t1)
        with self.assertRaises(TypeError):
            torch.mul(t2, t2)
        with self.assertRaises(TypeError):
            torch.mul(t2, t3)
        with self.assertRaises(TypeError):
            torch.mul(t3, t1)
        with self.assertRaises(TypeError):
            torch.mul(t3, t2)
        with self.assertRaises(TypeError):
            torch.mul(t3, t3)
        with self.assertRaises(TypeError):
            baz(t1, t1)
        with self.assertRaises(TypeError):
            baz(t1, t2)
        with self.assertRaises(TypeError):
            baz(t1, t3)
        with self.assertRaises(TypeError):
            baz(t2, t1)
        with self.assertRaises(TypeError):
            baz(t2, t2)
        with self.assertRaises(TypeError):
            baz(t2, t3)
        with self.assertRaises(TypeError):
            baz(t3, t1)
        with self.assertRaises(TypeError):
            baz(t3, t2)
        with self.assertRaises(TypeError):
            baz(t3, t3)
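A hedged minimal sketch of the __torch_function__ protocol the test above exercises, using a toy class rather than the test's DiagonalTensor/SubTensor fixtures: it handles one function and returns NotImplemented for everything else, so unhandled calls raise TypeError.

import torch

class Marker:
    HANDLED = {torch.mul}

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func not in cls.HANDLED:
            return NotImplemented        # no other override exists -> torch raises TypeError
        return "handled by Marker"

print(torch.mul(Marker(), Marker()))     # "handled by Marker"
try:
    torch.div(Marker(), Marker())        # not handled
except TypeError as e:
    print("TypeError raised:", e)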
def normalize_weights(network, eps=1e-3):
    """
    'Normalize' the weights of a network, so that for each hidden neuron, the
    norm of incoming weights to that neuron is sqrt(2), dividing the outputs
    of that neuron by the factor that the inputs were multiplied by. For a ReLU
    network, this operation preserves network functionality.
    network: a neural network; has to inherit from torch.nn.Module. Currently
             probably has to be an MLP.
    eps: a float that should be small relative to sqrt(2), to add stability.
    returns nothing: just modifies the network in-place
    """
    layers = get_weight_modules_from_live_net(network)
    for idx in range(len(layers) - 1):
        this_layer = layers[idx]
        next_layer = layers[idx + 1]
        assert 'fc_mod' in this_layer or 'conv_mod' in this_layer
        assert 'fc_mod' in next_layer or 'conv_mod' in next_layer
        inc_raw_weight_mod = (this_layer['fc_mod'] if 'fc_mod' in this_layer
                              else this_layer['conv_mod'])
        inc_raw_weights = inc_raw_weight_mod.weight
        inc_raw_bias = inc_raw_weight_mod.bias
        inc_weights_np = inc_raw_weights.detach().cpu().numpy()
        inc_biases_np = inc_raw_bias.detach().cpu().numpy()
        if 'bn_mod' in this_layer:
            bn_mod = this_layer['bn_mod']
            if hasattr(bn_mod, 'weight') and bn_mod.weight is not None:
                bn_weights_np = bn_mod.weight.detach().cpu().numpy()
                inc_weights_np = size_and_multiply_np(bn_weights_np,
                                                      inc_weights_np)
            inc_weights_np = size_sqrt_divide_np(bn_mod.running_var,
                                                 inc_weights_np)
        outgoing_weight_mod = (next_layer['fc_mod'] if 'fc_mod' in next_layer
                               else next_layer['conv_mod'])
        outgoing_weights = outgoing_weight_mod.weight
        num_neurons = inc_weights_np.shape[0]
        assert outgoing_weights.shape[1] % num_neurons == 0
        if 'fc_mod' in this_layer and 'fc_mod' in next_layer:
            assert outgoing_weights.shape[1] == num_neurons
        if 'conv_mod' in this_layer and 'conv_mod' in next_layer:
            assert outgoing_weights.shape[1] == num_neurons

        unsqueezed_bias = np.expand_dims(inc_biases_np, axis=1)
        flat_weights = inc_weights_np.reshape(inc_weights_np.shape[0], -1)
        all_inc_weights = np.concatenate((flat_weights, unsqueezed_bias),
                                         axis=1)
        scales = np.linalg.norm(all_inc_weights, axis=1)
        scales /= np.sqrt(2.)
        scales += eps
        scales = torch.from_numpy(scales)
        scales_rows = torch.unsqueeze(scales, 1)
        for i in range(2, len(inc_raw_weights.shape)):
            scales_rows = torch.unsqueeze(scales_rows, i)
        scales_mul = vector_stretch(scales, outgoing_weights.shape[1])
        for i in range(1, len(outgoing_weights.shape) - 1):
            scales_mul = torch.unsqueeze(scales_mul, i)

        incoming_weights_unpruned = True
        incoming_biases_unpruned = True
        outgoing_weights_unpruned = True
        for name, param in inc_raw_weight_mod.named_parameters():
            if name == 'weight_orig':
                param.data = torch.div(param, scales_rows)
                incoming_weights_unpruned = False
            if name == 'bias_orig':
                param.data = torch.div(param, scales)
                incoming_biases_unpruned = False
        for name, param in outgoing_weight_mod.named_parameters():
            if name == 'weight_orig':
                param.data = torch.mul(param, scales_mul)
                outgoing_weights_unpruned = False
        if incoming_weights_unpruned:
            inc_raw_weight_mod.weight.data = torch.div(inc_raw_weights,
                                                       scales_rows)
        if incoming_biases_unpruned:
            inc_raw_weight_mod.bias.data = torch.div(inc_raw_bias, scales)
        if outgoing_weights_unpruned:
            outgoing_weight_mod.weight.data = torch.mul(
                outgoing_weights, scales_mul)
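normalize_weights relies on the positive homogeneity of ReLU: dividing a unit's incoming weights and bias by a positive scale and multiplying its outgoing weights by the same scale leaves the network function unchanged. A minimal stand-alone sketch of that invariance with plain torch.nn (no get_weight_modules_from_live_net, batch norm handling, or eps term):

import torch
import torch.nn as nn

torch.manual_seed(0)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 3))
x = torch.randn(16, 4)
before = net(x)

with torch.no_grad():
    w_in, b_in = net[0].weight, net[0].bias   # incoming weights/bias of hidden units
    w_out = net[2].weight                     # outgoing weights of hidden units
    # scale so that each hidden unit's incoming [weights, bias] has norm sqrt(2)
    scales = torch.cat([w_in, b_in.unsqueeze(1)], dim=1).norm(dim=1) / (2 ** 0.5)
    w_in.div_(scales.unsqueeze(1))
    b_in.div_(scales)
    w_out.mul_(scales.unsqueeze(0))

after = net(x)
print(torch.allclose(before, after, atol=1e-5))  # True: the function is unchanged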
Exemple #43
0
def run_test_row_parallel_linear(rank, model_parallel_size, filename,
                                 filename_rpc):
    dist_init(rank, model_parallel_size, filename, filename_rpc)

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print(
            "> testing RowParallelLinear with model parallel size: {}".format(
                model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = layers.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(dLdA, input_size_coeff,
                          dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print("   error in dLdA on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print("   error in dLdb on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print("   error in dLdX on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(" >> passed the test :-)")
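The assertions above compare the model-parallel gradients against closed-form values: for L = sum(W * (X @ A^T + b)), dL/dA = W^T X, dL/db is the column sum of W, and dL/dX = W A. A single-process sketch of the same check on a plain nn.Linear, with no mpu/model-parallel machinery involved (sizes reuse the 13/17/7 coefficients above):

import torch
import torch.nn as nn

torch.manual_seed(0)
batch, d_in, d_out = 7, 13, 17
x = torch.randn(batch, d_in, requires_grad=True)
linear = nn.Linear(d_in, d_out)
loss_weight = torch.randn(batch, d_out)

loss = torch.mul(linear(x), loss_weight).sum()
loss.backward()

with torch.no_grad():
    dLdA = loss_weight.t() @ x           # expected linear.weight.grad
    dLdb = loss_weight.sum(dim=0)        # expected linear.bias.grad
    dLdX = loss_weight @ linear.weight   # expected x.grad

print((dLdA - linear.weight.grad).abs().max())  # ~0
print((dLdb - linear.bias.grad).abs().max())    # ~0
print((dLdX - x.grad).abs().max())              # ~0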
Exemple #44
0
    def one_step_forward(self, input, state, for_out):
        """
        Given the input and state, move one step forward in time.
        Also compute the derivative of the new state with respect to this
        input, unless the step is only used to produce output.
        :param input: input at time t
        :param state: state at time t-1
        :param for_out: True when the step is used for output only (e.g. at
            test time); in that case no input derivative is computed
        :return: state at time t, gradient with respect to the input (or None)
        """
        gate_inputs = torch.matmul(torch.cat([input, state], 1),
                                   self.gate_kernel)
        gate_inputs = gate_inputs + self.gate_bias

        value = F.sigmoid(gate_inputs)
        r, u = torch.chunk(value, chunks=2, dim=1)

        r_state = r * state

        candidate = torch.matmul(torch.cat([input, r_state], 1),
                                 self.candidate_kernel)
        candidate = candidate + self.candidate_bias

        c = self.activation(candidate)
        new_h = u * state + (1 - u) * c

        if for_out or not (self.derivative_needed):
            return new_h, None

        # Extract the ".data" for each of the variables so that further computations do not
        # affect the gradients of these variables.
        u = u.data
        c = c.data
        state = state.data
        value = value.data

        start = torch.ones([1, self.hidden_units])
        du = torch.mul(start, (state))
        # dstate = torch.mul(start, (u))
        du = du - torch.mul(start, (c))
        dc = torch.mul(start, (1 - u))  # 50 x 50
        if self.activation == F.leaky_relu:
            dcandidate = (
                dc * torch.where(candidate > 0, torch.ones_like(candidate),
                                 0.01 * torch.ones_like(candidate))
            )  # 50x50 * 1x50
        else:
            dcandidate = (dc * (1 - c**2))
        dinputs_rstate = torch.matmul(dcandidate,
                                      torch.transpose(self.candidate_kernel,
                                                      dim0=1,
                                                      dim1=0))  # 50 x 350
        dinputs = dinputs_rstate[:, :300]
        drstate = dinputs_rstate[:, 300:]
        # dstate = dstate + r * drstate
        dr = state * drstate
        dru = torch.cat([dr, du], dim=1)
        dgateinputs = dru * (value * (1 - value))
        dinputs_state = torch.matmul(
            dgateinputs, torch.transpose(self.gate_kernel, dim0=1, dim1=0))
        dinputs = dinputs + dinputs_state[:, :300]
        # dstate = dstate + dinputs_state[:, 300:]
        return new_h, dinputs
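The hand-rolled backward above is a vector-Jacobian product of the new state with a vector of ones, i.e. d(sum of new_h)/d(input). A quick way to obtain a reference value for such a derivation is torch.autograd.grad on an equivalent stand-alone cell; the sizes and parameter names here are illustrative, not the module's own:

import torch

torch.manual_seed(0)
input_units, hidden_units = 6, 5
gate_kernel = torch.randn(input_units + hidden_units, 2 * hidden_units)
gate_bias = torch.randn(2 * hidden_units)
candidate_kernel = torch.randn(input_units + hidden_units, hidden_units)
candidate_bias = torch.randn(hidden_units)

def step(inp, state):
    # same GRU-style update as one_step_forward, with a tanh candidate
    value = torch.sigmoid(torch.cat([inp, state], 1) @ gate_kernel + gate_bias)
    r, u = torch.chunk(value, chunks=2, dim=1)
    c = torch.tanh(torch.cat([inp, r * state], 1) @ candidate_kernel + candidate_bias)
    return u * state + (1 - u) * c

inp = torch.randn(1, input_units, requires_grad=True)
state = torch.randn(1, hidden_units)
new_h = step(inp, state)
reference, = torch.autograd.grad(new_h.sum(), inp)
print(reference)  # what a manual dinputs should reproduce for this cell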
Exemple #45
0
def native_layer_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    normalized_shape: List[int],
    mean: Tensor,
    rstd: Tensor,
    weight: Optional[Tensor],
    bias: Optional[Tensor],
    output_mask: List[bool],
) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
    input_shape = input.shape
    input_ndim = input.dim()

    axis = input_ndim - len(normalized_shape)
    inner_dims = input_shape[axis:]
    outer_dims = input_shape[:axis]
    inner_dim_indices = list(range(axis, input_ndim))
    outer_dim_indices = list(range(0, axis))

    N = 1
    for i in inner_dims:
        N *= i
    M = 1
    for i in outer_dims:
        M *= i
    if M <= 0 or N <= 0:
        return (
            input.new_zeros(input_shape),
            input.new_zeros(input_shape[axis:]),
            input.new_zeros(input_shape[axis:]),
        )

    mean_, rstd_ = recompute_mean_var(input,
                                      rstd,
                                      inner_dim_indices,
                                      keepdim=True)

    x_hat = (input - mean_) * rstd_
    if weight is not None:
        grad_x_hat = grad_out * weight
    else:
        grad_x_hat = grad_out
    a = grad_x_hat * N
    b = torch.sum(grad_x_hat, inner_dim_indices, True)
    c1 = torch.mul(grad_x_hat, x_hat)
    c2 = torch.sum(c1, inner_dim_indices, True)
    c3 = torch.mul(x_hat, c2)
    inner = a - b - c3

    if output_mask[0]:
        d_input: Optional[Tensor] = (rstd_ / N) * inner
    else:
        d_input = torch.zeros_like(
            input)  # should be None but doesn't work with vjp

    if output_mask[1] and weight is not None:
        if len(outer_dim_indices) > 0:
            d_weight: Optional[Tensor] = torch.sum(grad_out * x_hat,
                                                   outer_dim_indices, False)
        else:
            d_weight = grad_out * x_hat
    elif weight is not None:
        d_weight = torch.zeros_like(
            weight)  # should be None but doesn't work with vjp
    else:
        d_weight = torch.zeros(())  # should be None but doesn't work with vjp

    if output_mask[2] and bias is not None:
        if len(outer_dim_indices) > 0:
            d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices,
                                                 False)
        else:
            d_bias = grad_out
    elif bias is not None:
        d_bias = torch.zeros_like(
            bias)  # should be None but doesn't work with vjp
    else:
        d_bias = torch.zeros(())  # should be None but doesn't work with vjp

    return (d_input, d_weight, d_bias)
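With no affine weight, the d_input branch above reduces to rstd / N * (N * g_hat - sum(g_hat) - x_hat * sum(g_hat * x_hat)) with g_hat = grad_out. A small numeric sanity check of that expression against autograd on F.layer_norm, assuming nothing but torch itself (recompute_mean_var is not needed here):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(4, 6, requires_grad=True)
grad_out = torch.randn(4, 6)

y = F.layer_norm(x, (6,))
reference, = torch.autograd.grad(y, x, grad_out)

with torch.no_grad():
    eps = 1e-5
    mean = x.mean(dim=-1, keepdim=True)
    rstd = torch.rsqrt(x.var(dim=-1, unbiased=False, keepdim=True) + eps)
    x_hat = (x - mean) * rstd
    g_hat = grad_out  # no affine weight in this example
    N = x.shape[-1]
    inner = (g_hat * N - g_hat.sum(-1, keepdim=True)
             - x_hat * (g_hat * x_hat).sum(-1, keepdim=True))
    manual = (rstd / N) * inner

print(torch.allclose(reference, manual, atol=1e-5))  # True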
Exemple #46
0
    def _fusion_att(self, x_v, x_q):
        x_att = torch.mul(x_v, x_q)
        return x_att
def _abs_square(x):
    return torch.mul(x, x)
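Both helpers above are plain element-wise products; torch.mul broadcasts and is equivalent to the * operator, for example:

import torch

a = torch.tensor([1.0, -2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
assert torch.equal(torch.mul(a, b), a * b)   # the _fusion_att pattern
assert torch.equal(torch.mul(a, a), a * a)   # the _abs_square pattern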
Exemple #48
0
def main(args):
    """ Main translation function """
    # Load arguments from checkpoint
    torch.manual_seed(args.seed)
    state_dict = torch.load(
        args.checkpoint_path,
        map_location=lambda s, l: default_restore_location(s, 'cpu'))
    args_loaded = argparse.Namespace(**{
        **vars(args),
        **vars(state_dict['args'])
    })
    args_loaded.data = args.data
    args = args_loaded
    utils.init_logging(args)

    # Load dictionaries
    src_dict = Dictionary.load(
        os.path.join(args.data, 'dict.{:s}'.format(args.source_lang)))
    logging.info('Loaded a source dictionary ({:s}) with {:d} words'.format(
        args.source_lang, len(src_dict)))
    tgt_dict = Dictionary.load(
        os.path.join(args.data, 'dict.{:s}'.format(args.target_lang)))
    logging.info('Loaded a target dictionary ({:s}) with {:d} words'.format(
        args.target_lang, len(tgt_dict)))

    # Load dataset
    test_dataset = Seq2SeqDataset(
        src_file=os.path.join(args.data, 'test.{:s}'.format(args.source_lang)),
        tgt_file=os.path.join(args.data, 'test.{:s}'.format(args.target_lang)),
        src_dict=src_dict,
        tgt_dict=tgt_dict)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              num_workers=1,
                                              collate_fn=test_dataset.collater,
                                              batch_sampler=BatchSampler(
                                                  test_dataset,
                                                  9999999,
                                                  args.batch_size,
                                                  1,
                                                  0,
                                                  shuffle=False,
                                                  seed=args.seed))
    # Build model and criterion
    model = models.build_model(args, src_dict, tgt_dict)
    if args.cuda:
        model = model.cuda()
    model.eval()
    model.load_state_dict(state_dict['model'])
    logging.info('Loaded a model from checkpoint {:s}'.format(
        args.checkpoint_path))
    progress_bar = tqdm(test_loader, desc='| Generation', leave=False)

    # Iterate over the test set
    all_hyps = {}
    for i, sample in enumerate(progress_bar):

        # Create a beam search object for every input sentence in the batch
        batch_size = sample['src_tokens'].shape[0]
        searches = [
            BeamSearch(args.beam_size, args.max_len - 1, tgt_dict.unk_idx)
            for i in range(batch_size)
        ]

        with torch.no_grad():
            # Compute the encoder output
            encoder_out = model.encoder(sample['src_tokens'],
                                        sample['src_lengths'])
            #print("src_tokens:", type(sample['src_tokens']))
            outlen = len(tgt_dict.string(sample['src_tokens']))
            print("-words len:", outlen)
            #print("-size:", list(sample['src_tokens'].size()))
            #print("-content:", sample['src_tokens'])
            # __QUESTION 1: What is "go_slice" used for and what do its dimensions represent?
            go_slice = \
                torch.ones(sample['src_tokens'].shape[0], 1).fill_(tgt_dict.eos_idx).type_as(sample['src_tokens'])
            if args.cuda:
                go_slice = utils.move_to_cuda(go_slice)

            # Compute the decoder output at the first time step
            decoder_out, _ = model.decoder(go_slice, encoder_out)

            #print("Decoder_out:", type(decoder_out)) # <class 'torch.Tensor'>
            #print("-size:", list(decoder_out.size()))
            print("-content:", decoder_out)

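            # Length normalization in the style of the GNMT length penalty,
            # lp = ((5 + len)^a) / (5 + 1)^a; `a` is the length-penalty
            # exponent and is assumed to be defined elsewhere in this script.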
            lp_y = 1 / (((5 + outlen**a) / ((5 + 1)**a)))
            decoder_out = torch.mul(decoder_out, lp_y)
            print("-normalized:", decoder_out)

            # __QUESTION 2: Why do we keep one top candidate more than the beam size?
            log_probs, next_candidates = torch.topk(torch.log(
                torch.softmax(decoder_out, dim=2)),
                                                    args.beam_size + 1,
                                                    dim=-1)

        # Create number of beam_size beam search nodes for every input sentence
        for i in range(batch_size):
            for j in range(args.beam_size):
                best_candidate = next_candidates[i, :, j]
                backoff_candidate = next_candidates[i, :, j + 1]
                best_log_p = log_probs[i, :, j]
                backoff_log_p = log_probs[i, :, j + 1]
                next_word = torch.where(best_candidate == tgt_dict.unk_idx,
                                        backoff_candidate, best_candidate)
                log_p = torch.where(best_candidate == tgt_dict.unk_idx,
                                    backoff_log_p, best_log_p)
                log_p = log_p[-1]

                # Store the encoder_out information for the current input sentence and beam
                emb = encoder_out['src_embeddings'][:, i, :]
                lstm_out = encoder_out['src_out'][0][:, i, :]
                final_hidden = encoder_out['src_out'][1][:, i, :]
                final_cell = encoder_out['src_out'][2][:, i, :]
                try:
                    mask = encoder_out['src_mask'][i, :]
                except TypeError:
                    mask = None

                node = BeamSearchNode(searches[i], emb, lstm_out, final_hidden,
                                      final_cell, mask,
                                      torch.cat(
                                          (go_slice[i], next_word)), log_p, 1)
                # __QUESTION 3: Why do we add the node with a negative score?
                searches[i].add(-node.eval(), node)

        # Start generating further tokens until max sentence length reached
        for _ in range(args.max_len - 1):

            # Get the current nodes to expand
            nodes = [n[1] for s in searches for n in s.get_current_beams()]
            if nodes == []:
                break  # All beams ended in EOS

            # Reconstruct prev_words, encoder_out from current beam search nodes
            prev_words = torch.stack([node.sequence for node in nodes])
            encoder_out["src_embeddings"] = torch.stack(
                [node.emb for node in nodes], dim=1)
            lstm_out = torch.stack([node.lstm_out for node in nodes], dim=1)
            final_hidden = torch.stack([node.final_hidden for node in nodes],
                                       dim=1)
            final_cell = torch.stack([node.final_cell for node in nodes],
                                     dim=1)
            encoder_out["src_out"] = (lstm_out, final_hidden, final_cell)
            try:
                encoder_out["src_mask"] = torch.stack(
                    [node.mask for node in nodes], dim=0)
            except TypeError:
                encoder_out["src_mask"] = None

            with torch.no_grad():
                # Compute the decoder output by feeding it the decoded sentence prefix
                decoder_out, _ = model.decoder(prev_words, encoder_out)

            # see __QUESTION 2
            log_probs, next_candidates = torch.topk(torch.log(
                torch.softmax(decoder_out, dim=2)),
                                                    args.beam_size + 1,
                                                    dim=-1)

            # Create number of beam_size next nodes for every current node
            for i in range(log_probs.shape[0]):
                for j in range(args.beam_size):

                    best_candidate = next_candidates[i, :, j]
                    backoff_candidate = next_candidates[i, :, j + 1]
                    best_log_p = log_probs[i, :, j]
                    backoff_log_p = log_probs[i, :, j + 1]
                    next_word = torch.where(best_candidate == tgt_dict.unk_idx,
                                            backoff_candidate, best_candidate)
                    log_p = torch.where(best_candidate == tgt_dict.unk_idx,
                                        backoff_log_p, best_log_p)
                    log_p = log_p[-1]
                    next_word = torch.cat((prev_words[i][1:], next_word[-1:]))

                    # Get parent node and beam search object for corresponding sentence
                    node = nodes[i]
                    search = node.search

                    # __QUESTION 4: How are "add" and "add_final" different? What would happen if we did not make this distinction?

                    # Store the node as final if EOS is generated
                    if next_word[-1] == tgt_dict.eos_idx:
                        node = BeamSearchNode(
                            search, node.emb, node.lstm_out, node.final_hidden,
                            node.final_cell, node.mask,
                            torch.cat((prev_words[i][0].view([1]), next_word)),
                            node.logp, node.length)
                        search.add_final(-node.eval(), node)

                    # Add the node to current nodes for next iteration
                    else:
                        node = BeamSearchNode(
                            search, node.emb, node.lstm_out, node.final_hidden,
                            node.final_cell, node.mask,
                            torch.cat((prev_words[i][0].view([1]), next_word)),
                            node.logp + log_p, node.length + 1)
                        search.add(-node.eval(), node)

            # __QUESTION 5: What happens internally when we prune our beams?
            # How do we know we always maintain the best sequences?
            for search in searches:
                search.prune()

        # Segment into sentences
        best_sents = torch.stack(
            [search.get_best()[1].sequence[1:].cpu() for search in searches])
        decoded_batch = best_sents.numpy()

        output_sentences = [
            decoded_batch[row, :] for row in range(decoded_batch.shape[0])
        ]

        # __QUESTION 6: What is the purpose of this for loop?
        temp = list()
        for sent in output_sentences:
            first_eos = np.where(sent == tgt_dict.eos_idx)[0]
            if len(first_eos) > 0:
                temp.append(sent[:first_eos[0]])
            else:
                temp.append(sent)
        output_sentences = temp

        # Convert arrays of indices into strings of words
        output_sentences = [tgt_dict.string(sent) for sent in output_sentences]

        for ii, sent in enumerate(output_sentences):
            all_hyps[int(sample['id'].data[ii])] = sent

    # Write to file
    if args.output is not None:
        with open(args.output, 'w') as out_file:
            for sent_id in range(len(all_hyps.keys())):
                out_file.write(all_hyps[sent_id] + '\n')
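The generation loop above keeps beam_size + 1 candidates so that an <unk> best candidate can be replaced by the next-best word. A tiny stand-alone sketch of that back-off; unk_idx, the vocabulary size, and the scores are made up for illustration:

import torch

torch.manual_seed(0)
beam_size, vocab_size, unk_idx = 3, 10, 1
scores = torch.log_softmax(torch.randn(1, vocab_size), dim=-1)
log_probs, candidates = torch.topk(scores, beam_size + 1, dim=-1)

for j in range(beam_size):
    best, backoff = candidates[0, j], candidates[0, j + 1]
    next_word = torch.where(best == unk_idx, backoff, best)
    log_p = torch.where(best == unk_idx, log_probs[0, j + 1], log_probs[0, j])
    print(int(next_word), float(log_p))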
Exemple #49
0
def train_dqn(model, options, resume):
    """Train DQN

       model -- DQN model
       options -- parsed options (learning rate, maximum episode, epsilon
                  schedule, checkpoint frequency, ...)
       resume -- whether to resume from a previously saved model
    """
    best_time_step = 0.
    if resume:
        if options.weight is None:
            print('when resuming, you should give a weight file name.')
            return
        print('load previous model weight: {}'.format(options.weight))
        _, _, best_time_step = load_checkpoint(options.weight, model)

    flappyBird = game.GameState()
    optimizer = optim.RMSprop(model.parameters(), lr=options.lr)
    ceriterion = nn.MSELoss()

    action = [1, 0]
    o, r, terminal = flappyBird.frame_step(action)
    o = preprocess(o)
    model.set_initial_state()

    if options.cuda:
        model = model.cuda()
    # in the first `OBSERVE` time steps, we don't train the model
    for i in range(options.observation):
        action = model.get_action_randomly()
        o, r, terminal = flappyBird.frame_step(action)
        o = preprocess(o)
        model.store_transition(o, action, r, terminal)
    # start training
    for episode in range(options.max_episode):
        model.time_step = 0
        model.set_train()
        total_reward = 0.
        # begin an episode!
        while True:
            optimizer.zero_grad()
            action = model.get_action()
            o_next, r, terminal = flappyBird.frame_step(action)
            total_reward += options.gamma**model.time_step * r
            o_next = preprocess(o_next)
            model.store_transition(o_next, action, r, terminal)
            model.increase_time_step()
            # Step 1: obtain random minibatch from replay memory
            minibatch = random.sample(model.replay_memory, options.batch_size)
            state_batch = np.array([data[0] for data in minibatch])
            action_batch = np.array([data[1] for data in minibatch])
            reward_batch = np.array([data[2] for data in minibatch])
            next_state_batch = np.array([data[3] for data in minibatch])
            state_batch_var = Variable(torch.from_numpy(state_batch))
            next_state_batch_var = Variable(torch.from_numpy(next_state_batch),
                                            volatile=True)
            if options.cuda:
                state_batch_var = state_batch_var.cuda()
                next_state_batch_var = next_state_batch_var.cuda()
            # Step 2: calculate y
            q_value_next = model.forward(next_state_batch_var)

            q_value = model.forward(state_batch_var)

            y = reward_batch.astype(np.float32)
            max_q, _ = torch.max(q_value_next, dim=1)

            for i in range(options.batch_size):
                if not minibatch[i][4]:
                    y[i] += options.gamma*max_q.data[i]


            y = Variable(torch.from_numpy(y))
            action_batch_var = Variable(torch.from_numpy(action_batch))
            if options.cuda:
                y = y.cuda()
                action_batch_var = action_batch_var.cuda()
            q_value = torch.sum(torch.mul(action_batch_var, q_value), dim=1)

            loss = ceriterion(q_value, y)
            loss.backward()

            optimizer.step()
            # when the bird dies, the episode ends
            if terminal:
                break

        print('episode: {}, epsilon: {:.4f}, max time step: {}, total reward: {:.6f}'.format(
                episode, model.epsilon, model.time_step, total_reward))

        if model.epsilon > options.final_e:
            delta = (options.init_e - options.final_e)/options.exploration
            model.epsilon -= delta

        if episode % 100 == 0:
            ave_time = test_dqn(model, episode)

        if ave_time > best_time_step:
            best_time_step = ave_time
            save_checkpoint({
                'episode': episode,
                'epsilon': model.epsilon,
                'state_dict': model.state_dict(),
                'best_time_step': best_time_step,
                 }, True, 'checkpoint-episode-%d.pth.tar' %episode)
        elif episode % options.save_checkpoint_freq == 0:
            save_checkpoint({
                'episode:': episode,
                'epsilon': model.epsilon,
                'state_dict': model.state_dict(),
                'time_step': ave_time,
                 }, False, 'checkpoint-episode-%d.pth.tar' %episode)
        else:
            continue
        print('save checkpoint, episode={}, ave time step={:.2f}'.format(
                 episode, ave_time))
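The line q_value = torch.sum(torch.mul(action_batch_var, q_value), dim=1) above selects Q(s, a) for the action that was actually taken by masking the Q-value matrix with one-hot actions. A small illustration with made-up numbers:

import torch

q_values = torch.tensor([[1.0, 5.0],
                         [2.0, 0.5],
                         [3.0, 4.0]])   # 3 states, 2 actions
actions = torch.tensor([[0.0, 1.0],
                        [1.0, 0.0],
                        [0.0, 1.0]])    # one-hot actions taken
q_taken = torch.sum(torch.mul(actions, q_values), dim=1)
print(q_taken)                           # tensor([5.0000, 2.0000, 4.0000])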
Exemple #50
0
def run_test_parallel_embedding(rank, model_parallel_size, filename,
                                filename_rpc):
    dist_init(rank, model_parallel_size, filename, filename_rpc)

    if torch.distributed.get_rank() == 0:
        print("> testing parallel embedding with model parallel size {} ...".
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 17
    seq_length = 23
    vocab_size = 48
    hidden_size = 16
    seed = 1236

    set_random_seed(123)
    input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(
        0, vocab_size).cuda()
    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()

    set_random_seed(seed)
    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()

    output = embedding_original(input_data)
    loss_original = torch.mul(output, loss_weight).sum()
    loss_original.backward()

    set_random_seed(seed)
    embedding_parallel = layers.ParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_parallel(input_data)
    loss_parallel = torch.mul(output, loss_weight).sum()
    loss_parallel.backward()

    set_random_seed(seed)
    embedding_vocab_parallel = layers.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_vocab_parallel(input_data)
    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
    loss_vocab_parallel.backward()

    torch.distributed.barrier()
    error = loss_parallel.sub(loss_original).abs()
    print("   error in loss (parallel) on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, "error: {}".format(error)

    torch.distributed.barrier()
    error = loss_vocab_parallel.sub(loss_original).abs()
    print("   error in loss (vocab parallel) on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, "error: {}".format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   hidden_size // model_parallel_size,
                                   1)[mpu.get_model_parallel_rank()]
    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print("   error in grad (parallel) on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, "error: {}".format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   vocab_size // model_parallel_size,
                                   0)[mpu.get_model_parallel_rank()]
    error = embedding_vocab_parallel.weight.grad.sub(
        weight_grad_orig).abs().max()
    print("   error in grad (vocab parallel) on global rank {}: {}".format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, "error: {}".format(error)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(">> passed the test :-)")
def mean_squared_error(prediction, target):

    prediction, target = flatten(prediction), flatten(target)
    diff = prediction - target

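    # Note: despite the name, this returns the negative *sum* of squared
    # errors along dim 1 for each example; no mean is taken.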
    return -torch.sum(torch.mul(diff, diff), 1)
Exemple #52
0
    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Train the model after each episode."""
        self.update_step += 1

        experiences, demos = self.memory.sample(), self.demo_memory.sample()

        states, actions, rewards, next_states, dones = experiences
        demo_states, demo_actions, _, _, _ = demos
        new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states)
        pred_actions, _, _, _, _ = self.actor(demo_states)

        # train alpha
        if self.hyper_params["AUTO_ENTROPY_TUNING"]:
            alpha_loss = (-self.log_alpha *
                          (log_prob + self.target_entropy).detach()).mean()

            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.zeros(1)
            alpha = self.hyper_params["W_ENTROPY"]

        # Q function loss
        masks = 1 - dones
        q_1_pred = self.qf_1(states, actions)
        q_2_pred = self.qf_2(states, actions)
        v_target = self.vf_target(next_states)
        q_target = rewards + self.hyper_params["GAMMA"] * v_target * masks
        qf_1_loss = F.mse_loss(q_1_pred, q_target.detach())
        qf_2_loss = F.mse_loss(q_2_pred, q_target.detach())

        # V function loss
        v_pred = self.vf(states)
        q_pred = torch.min(self.qf_1(states, new_actions),
                           self.qf_2(states, new_actions))
        v_target = q_pred - alpha * log_prob
        vf_loss = F.mse_loss(v_pred, v_target.detach())

        # train Q functions
        self.qf_1_optimizer.zero_grad()
        qf_1_loss.backward()
        self.qf_1_optimizer.step()

        self.qf_2_optimizer.zero_grad()
        qf_2_loss.backward()
        self.qf_2_optimizer.step()

        # train V function
        self.vf_optimizer.zero_grad()
        vf_loss.backward()
        self.vf_optimizer.step()

        if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0:
            # bc loss
            qf_mask = torch.gt(
                self.qf_1(demo_states, demo_actions),
                self.qf_1(demo_states, pred_actions),
            ).to(device)
            qf_mask = qf_mask.float()
            n_qf_mask = int(qf_mask.sum().item())

            if n_qf_mask == 0:
                bc_loss = torch.zeros(1, device=device)
            else:
                bc_loss = (torch.mul(pred_actions, qf_mask) - torch.mul(
                    demo_actions, qf_mask)).pow(2).sum() / n_qf_mask

            # actor loss
            advantage = q_pred - v_pred.detach()
            actor_loss = (alpha * log_prob - advantage).mean()
            actor_loss = self.lambda1 * actor_loss + self.lambda2 * bc_loss

            # regularization
            if not self.is_discrete:  # iff the action is continuous
                mean_reg = self.hyper_params["W_MEAN_REG"] * mu.pow(2).mean()
                std_reg = self.hyper_params["W_STD_REG"] * std.pow(2).mean()
                pre_activation_reg = self.hyper_params[
                    "W_PRE_ACTIVATION_REG"] * (pre_tanh_value.pow(2).sum(
                        dim=-1).mean())
                actor_reg = mean_reg + std_reg + pre_activation_reg

                # actor loss + regularization
                actor_loss += actor_reg

            # train actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            common_utils.soft_update(self.vf, self.vf_target,
                                     self.hyper_params["TAU"])
        else:
            actor_loss = torch.zeros(1)
            n_qf_mask = 0

        return (
            actor_loss.item(),
            qf_1_loss.item(),
            qf_2_loss.item(),
            vf_loss.item(),
            alpha_loss.item(),
            n_qf_mask,
        )
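The Q-filter above only lets demonstration actions contribute to the behaviour-cloning loss where the critic scores the demo action higher than the policy's own action. A minimal sketch with made-up values; the max(n, 1) guard here stands in for the n_qf_mask == 0 branch above:

import torch

q_demo = torch.tensor([[2.0], [0.1], [3.0]])     # critic value of demo actions
q_policy = torch.tensor([[1.0], [0.5], [2.0]])   # critic value of policy actions
pred_actions = torch.tensor([[0.2], [0.4], [0.6]])
demo_actions = torch.tensor([[0.3], [0.1], [0.9]])

qf_mask = torch.gt(q_demo, q_policy).float()     # 1 where the demo looks better
n = int(qf_mask.sum().item())
bc_loss = (torch.mul(pred_actions, qf_mask) -
           torch.mul(demo_actions, qf_mask)).pow(2).sum() / max(n, 1)
print(qf_mask.squeeze(), bc_loss)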
Exemple #53
0
def native_batch_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    weight: Optional[Tensor],
    running_mean: Optional[Tensor],
    running_var: Optional[Tensor],
    save_mean: Optional[Tensor],
    save_invstd: Optional[Tensor],
    train: bool,
    eps: float,
    output_mask: List[bool],
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
    input_shape = input.shape
    input_rank = input.dim()
    assert input_rank >= 2, "rank of the input must be at least 2"

    axis = 1
    num_features = prod(input_shape) / input_shape[axis]
    mean = save_mean
    invstd = save_invstd
    if train:
        assert save_mean is not None and save_invstd is not None, "when train=True, save_mean and save_invstd are required"

        reduction_dims = [0] + list(range(2, input.dim()))
        assert invstd is not None  # for typing
        mean, invstd = recompute_mean_var(input,
                                          invstd,
                                          reduction_dims,
                                          keepdim=False)
    else:
        assert running_mean is not None and running_var is not None
        mean = running_mean
        invstd = torch.rsqrt(running_var + eps)

    broadcast_mask = [1] * input_rank
    broadcast_mask[axis] = input_shape[axis]

    reduction_axes: List[int] = []
    for i in range(input_rank):
        if i != axis:
            reduction_axes.append(i)

    mean = torch.reshape(mean, broadcast_mask)
    norm = 1.0 / num_features
    grad_output_sum = torch.sum(grad_out, reduction_axes)
    dot_p = torch.sum(grad_out * (input - mean), reduction_axes)

    grad_mean = torch.reshape(grad_output_sum * norm, broadcast_mask)
    proj_scale = torch.reshape(torch.mul(dot_p * norm, invstd * invstd),
                               broadcast_mask)

    if weight is None:
        grad_scale = torch.reshape(invstd, broadcast_mask) * 1.0
    else:
        grad_scale = torch.reshape(invstd * weight, broadcast_mask)

    if train:
        proj = (input - mean) * proj_scale
        grad_input = ((grad_out - proj) - grad_mean) * grad_scale
    else:
        grad_input = grad_out * grad_scale

    if output_mask[1]:
        grad_weight = dot_p * invstd
    elif weight is not None:
        grad_weight = torch.zeros_like(
            weight)  # should be None but doesn't work with vjp
    else:
        grad_weight = torch.zeros(
            ())  # should be None but doesn't work with vjp

    if output_mask[2]:
        grad_bias = grad_output_sum
    else:
        grad_bias = torch.zeros_like(
            grad_output_sum)  # should be None but doesn't work with vjp

    return (grad_input, grad_weight, grad_bias)
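As with the layer-norm case, the training-mode grad_input above, (grad_out - proj - grad_mean) * grad_scale, can be sanity-checked against autograd on a small input. This sketch uses plain F.batch_norm with an all-ones weight, so grad_scale reduces to invstd:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(4, 3, 5, requires_grad=True)
grad_out = torch.randn(4, 3, 5)
weight, bias = torch.ones(3), torch.zeros(3)

y = F.batch_norm(x, None, None, weight, bias, training=True, eps=1e-5)
reference, = torch.autograd.grad(y, x, grad_out)

with torch.no_grad():
    dims = [0, 2]
    mean = x.mean(dim=dims, keepdim=True)
    invstd = torch.rsqrt(x.var(dim=dims, unbiased=False, keepdim=True) + 1e-5)
    n = x.numel() / x.shape[1]
    grad_mean = grad_out.sum(dim=dims, keepdim=True) / n
    dot_p = (grad_out * (x - mean)).sum(dim=dims, keepdim=True)
    proj = (x - mean) * (dot_p / n) * invstd * invstd
    manual = (grad_out - proj - grad_mean) * invstd  # weight is all ones here

print(torch.allclose(reference, manual, atol=1e-5))  # True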