def test_cuda_small_tensors(self):
        # Check multiple small tensors which will likely use the same
        # underlying cached allocation
        ctx = mp.get_context('spawn')
        tensors = []
        for i in range(5):
            device = i % 2
            tensors += [torch.arange(i * 5, (i + 1) * 5).cuda(device)]

        inq = ctx.Queue()
        outq = ctx.Queue()
        inq.put(tensors)
        p = ctx.Process(target=sum_tensors, args=(inq, outq))
        p.start()

        results = []
        for i in range(5):
            results.append(outq.get())
        p.join()

        for i, tensor in enumerate(tensors):
            v, device, tensor_size, storage_size = results[i]
            self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
            self.assertEqual(device, i % 2)
            self.assertEqual(tensor_size, 5)
            self.assertEqual(storage_size, 5)
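# `sum_tensors` is defined elsewhere in the test module; a plausible sketch, inferred
# from the (value, device, tensor_size, storage_size) tuples unpacked above
# (the exact original may differ):
def sum_tensors(inq, outq):
    tensors = inq.get()
    for tensor in tensors:
        outq.put((tensor.sum().item(), tensor.get_device(),
                  tensor.numel(), tensor.storage().size()))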
Example #2
    def __call__(self, grid):
        batch_size, _, grid_dimX, grid_dimY, grid_dimZ = grid.size()

        k = 1.0

        x_coords = 2.0 * k * torch.arange(grid_dimX, dtype=torch.float32).unsqueeze(1).unsqueeze(1
            ).expand(grid_dimX, grid_dimY, grid_dimZ) / (grid_dimX - 1.0) - 1.0
        y_coords = 2.0 * k * torch.arange(grid_dimY, dtype=torch.float32).unsqueeze(1).unsqueeze(0
            ).expand(grid_dimX, grid_dimY, grid_dimZ) / (grid_dimY - 1.0) - 1.0
        z_coords = 2.0 * k * torch.arange(grid_dimZ, dtype=torch.float32).unsqueeze(0).unsqueeze(0
            ).expand(grid_dimX, grid_dimY, grid_dimZ) / (grid_dimZ - 1.0) - 1.0

        coords = torch.stack((x_coords, y_coords, z_coords), dim=0)

        if self.with_r:
            rs = ((x_coords ** 2) + (y_coords ** 2) + (z_coords ** 2)) ** 0.5
            rs = k * rs / torch.max(rs)
            rs = torch.unsqueeze(rs, dim=0)
            coords = torch.cat((coords, rs), dim=0)

        coords = torch.unsqueeze(coords, dim=0).repeat(batch_size, 1, 1, 1, 1)

        grid = torch.cat((coords.to(grid.device), grid), dim=1)

        return grid
 def test_broadcast_subspace(self):
     a = zeros((100, 100))
     v = Variable(torch.arange(0, 100))[:, None]
     b = Variable(torch.arange(99, -1, -1).long())
     a[b] = v
     expected = b.double().unsqueeze(1).expand(100, 100)
     self.assertEqual(a, expected)
Example #4
    def __call__(self, spec_f):

        spec_f, is_variable = _check_is_variable(spec_f)
        n_fft = spec_f.size(2)

        m_min = 0. if self.f_min == 0 else 2595 * np.log10(1. + (self.f_min / 700))
        m_max = 2595 * np.log10(1. + (self.f_max / 700))

        m_pts = torch.linspace(m_min, m_max, self.n_mels + 2)
        f_pts = (700 * (10**(m_pts / 2595) - 1))

        bins = torch.floor(((n_fft - 1) * 2) * f_pts / self.sr).long()

        fb = torch.zeros(n_fft, self.n_mels)
        for m in range(1, self.n_mels + 1):
            f_m_minus = bins[m - 1].item()
            f_m = bins[m].item()
            f_m_plus = bins[m + 1].item()

            if f_m_minus != f_m:
                fb[f_m_minus:f_m, m - 1] = (torch.arange(f_m_minus, f_m) - f_m_minus) / (f_m - f_m_minus)
            if f_m != f_m_plus:
                fb[f_m:f_m_plus, m - 1] = (f_m_plus - torch.arange(f_m, f_m_plus)) / (f_m_plus - f_m)

        fb = Variable(fb)
        spec_m = torch.matmul(spec_f, fb)  # (c, l, n_fft) dot (n_fft, n_mels) -> (c, l, n_mels)
        return spec_m if is_variable else spec_m.data
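# Quick sanity check of the HTK mel formulas above (illustrative only): 1000 Hz
# maps to roughly 1000 mel, and the inverse formula recovers the frequency.
import numpy as np
m = 2595 * np.log10(1. + 1000 / 700)     # ~1000.0 mel
f = 700 * (10 ** (m / 2595) - 1)         # ~1000.0 Hz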
Example #5
def meshgrid(x, y, row_major=True):
    '''Return meshgrid in range x & y.

    Args:
      x: (int) first dim range.
      y: (int) second dim range.
      row_major: (bool) row major or column major.

    Returns:
      (tensor) meshgrid, sized [x*y,2]

    Example:
    >> meshgrid(3,2)
    0  0
    1  0
    2  0
    0  1
    1  1
    2  1
    [torch.FloatTensor of size 6x2]

    >> meshgrid(3,2,row_major=False)
    0  0
    0  1
    0  2
    1  0
    1  1
    1  2
    [torch.FloatTensor of size 6x2]
    '''
    a = torch.arange(0,x)
    b = torch.arange(0,y)
    xx = a.repeat(y).view(-1,1)
    yy = b.view(-1,1).repeat(1,x).view(-1,1)
    return torch.cat([xx,yy],1) if row_major else torch.cat([yy,xx],1)
Example #6
def make_positions(tensor, padding_idx, left_pad, onnx_trace=False):
    """Replace non-padding symbols with their position numbers.

    Position numbers begin at padding_idx+1.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).
    """
    if onnx_trace:
        range_buf = torch._dim_arange(like=tensor, dim=1) + padding_idx + 1
        mask = tensor.ne(padding_idx)
        positions = range_buf.expand_as(tensor)
        if left_pad:
            positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1)
        return positions * mask.long() + padding_idx * (1 - mask.long())

    max_pos = padding_idx + 1 + tensor.size(1)
    if not hasattr(make_positions, 'range_buf'):
        make_positions.range_buf = tensor.new()
    make_positions.range_buf = make_positions.range_buf.type_as(tensor)
    if make_positions.range_buf.numel() < max_pos:
        torch.arange(padding_idx + 1, max_pos, out=make_positions.range_buf)
    mask = tensor.ne(padding_idx)
    positions = make_positions.range_buf[:tensor.size(1)].expand_as(tensor)
    if left_pad:
        positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1)
    return tensor.clone().masked_scatter_(mask, positions[mask])
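# Illustrative check (not from the original source), assuming right padding and
# padding_idx=1: non-padding tokens get positions 2, 3, ... and padding keeps its index.
import torch
tokens = torch.tensor([[5, 6, 1],
                       [7, 1, 1]])
print(make_positions(tokens, padding_idx=1, left_pad=False))
# tensor([[2, 3, 1],
#         [2, 1, 1]])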
    def test_int_assignment(self):
        x = Variable(torch.arange(0, 4).view(2, 2))
        x[1] = 5
        self.assertEqual(x.data.tolist(), [[0, 1], [5, 5]])

        x = Variable(torch.arange(0, 4).view(2, 2))
        x[1] = Variable(torch.arange(5, 7))
        self.assertEqual(x.data.tolist(), [[0, 1], [5, 6]])
Example #8
    def test_int_assignment(self):
        x = torch.arange(0, 4).view(2, 2)
        x[1] = 5
        self.assertEqual(x.tolist(), [[0, 1], [5, 5]])

        x = torch.arange(0, 4).view(2, 2)
        x[1] = torch.arange(5, 7)
        self.assertEqual(x.tolist(), [[0, 1], [5, 6]])
 def test_byte_tensor_assignment(self):
     x = Variable(torch.arange(0, 16).view(4, 4))
     b = Variable(torch.ByteTensor([True, False, True, False]))
     value = Variable(torch.Tensor([3, 4, 5, 6]))
     x[b] = value
     self.assertEqual(x[0], value)
     self.assertEqual(x[1].data, torch.arange(4, 8))
     self.assertEqual(x[2], value)
     self.assertEqual(x[3].data, torch.arange(12, 16))
Example #10
 def test_byte_tensor_assignment(self):
     x = torch.arange(0., 16).view(4, 4)
     b = torch.ByteTensor([True, False, True, False])
     value = torch.tensor([3., 4., 5., 6.])
     x[b] = value
     self.assertEqual(x[0], value)
     self.assertEqual(x[1], torch.arange(4, 8))
     self.assertEqual(x[2], value)
     self.assertEqual(x[3], torch.arange(12, 16))
Example #11
 def enumerate_support(self):
     total_count = int(self.total_count.max())
     if not self.total_count.min() == total_count:
         raise NotImplementedError("Inhomogeneous total count not supported by `enumerate_support`.")
     values = self._new(1 + total_count,)
     torch.arange(1 + total_count, out=values)
     values = values.view((-1,) + (1,) * len(self._batch_shape))
     values = values.expand((-1,) + self._batch_shape)
     return values
 def __init__(self, train_size, batch_size):
     self.num_data = train_size
     self.num_per_batch = int(train_size / batch_size)
     self.batch_size = batch_size
     self.range = torch.arange(0,batch_size).view(1, batch_size).long()
     self.leftover_flag = False
     if train_size % batch_size:
         self.leftover = torch.arange(self.num_per_batch*batch_size, train_size).long()
         self.leftover_flag = True
Example #13
 def backward(ctx, grad_output):
     idx = grad_output.data.new().long()
     torch.arange(0, ctx.input_numel, out=idx)
     idx = idx.view(ctx.input_size)
     idx_unfolded = idx.unfold(ctx.dim, ctx.size, ctx.step)
     idx_unfolded = idx_unfolded.contiguous().view(-1)
     grad_input = Variable(grad_output.data.new(ctx.input_numel).zero_())
     grad_output = grad_output.contiguous().view(-1)
     grad_input = grad_input.index_add(0, Variable(idx_unfolded), grad_output)
     return grad_input.view(ctx.input_size), None, None, None
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)
    def findLR(self, model, optimizer, writer,
               start_lr=1e-7, end_lr=10, num_iters=50):
        model.train()

        losses = []
        lrs = np.logspace(np.log10(start_lr), np.log10(end_lr), num_iters)

        for lr in lrs:
            # Update LR
            for group in optimizer.param_groups: group['lr'] = lr

            batch = next(iter(self.data_loaders[0]))
            input_images, depthGT, maskGT = utils.unpack_batch_fixed(batch, self.cfg.device)
            # ------ define ground truth------
            XGT, YGT = torch.meshgrid([torch.arange(self.cfg.outH), # [H,W]
                                       torch.arange(self.cfg.outW)]) # [H,W]
            XGT, YGT = XGT.float(), YGT.float()
            XYGT = torch.cat([
                XGT.repeat([self.cfg.outViewN, 1, 1]), 
                YGT.repeat([self.cfg.outViewN, 1, 1])], dim=0) #[2V,H,W]
            XYGT = XYGT.unsqueeze(dim=0).to(self.cfg.device) #[1,2V,H,W]

            with torch.set_grad_enabled(True):
                optimizer.zero_grad()

                XYZ, maskLogit = model(input_images)
                XY = XYZ[:, :self.cfg.outViewN * 2, :, :]
                depth = XYZ[:, self.cfg.outViewN * 2:self.cfg.outViewN * 3, :,  :]
                mask = (maskLogit > 0).byte()
                # ------ Compute loss ------
                loss_XYZ = self.l1(XY, XYGT)
                loss_XYZ += self.l1(depth.masked_select(mask),
                                    depthGT.masked_select(mask))
                loss_mask = self.sigmoid_bce(maskLogit, maskGT)
                loss = loss_mask + self.cfg.lambdaDepth * loss_XYZ

                # Update weights
                loss.backward()
                # True Weight decay
                if self.cfg.trueWD is not None:
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.data = param.data.add(
                                -self.cfg.trueWD * group['lr'], param.data)
                optimizer.step()

            losses.append(loss.item())

        fig, ax = plt.subplots()
        ax.plot(lrs, losses)
        ax.set_xlabel('learning rate')
        ax.set_ylabel('loss')
        ax.set_xscale('log')
        writer.add_figure('findLR', fig)
    def __call__(self, image):

        x_coords = 2.0 * torch.arange(self.image_height).unsqueeze(
            1).expand(self.image_height, self.image_width) / 255.0 - 1.0
        y_coords = 2.0 * torch.arange(self.image_width).unsqueeze(
            0).expand(self.image_height, self.image_width) / 255.0 - 1.0
        coords = torch.stack((x_coords, y_coords), dim=0)

        image = torch.cat((coords, image), dim=0)

        return image
    def __init__(self, input_dim: int, max_len: int = 5000) -> None:
        super().__init__()

        # Compute the positional encodings once in log space.
        positional_encoding = torch.zeros(max_len, input_dim, requires_grad=False)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, input_dim, 2).float() * -(math.log(10000.0) / input_dim))
        positional_encoding[:, 0::2] = torch.sin(position * div_term)
        positional_encoding[:, 1::2] = torch.cos(position * div_term)
        positional_encoding = positional_encoding.unsqueeze(0)
        self.register_buffer('positional_encoding', positional_encoding)
Example #18
 def __init__(self, roi_size = 128, n_segments = 49):
     super().__init__()
     
     self.roi_size = roi_size
     self.n_segments = n_segments
     
     X_grid = torch.arange(0, roi_size).view(1, -1).expand(1, 1, roi_size, roi_size)
     Y_grid = torch.arange(0, roi_size).view(-1, 1).expand(1, 1, roi_size, roi_size)
     
     self.X_grid = nn.Parameter(X_grid.contiguous(), requires_grad=False)
     self.Y_grid = nn.Parameter(Y_grid.contiguous(), requires_grad=False)
Example #19
def get_subtree(tree, actions, batch_size, num_actions):
    # gets the subtree corresponding to actions taken
    action_indices = actions[:,0]
    output = []
    for i, x in enumerate(tree[1:]):
        batch_starts = cudify(torch.arange(0, batch_size) * x.size(0) / batch_size)
        indices = []
        for b in range(batch_size):
            indices.append(cudify(torch.arange(action_indices[b] * num_actions**i, (action_indices[b]+1) * num_actions**i)) + batch_starts[b])
        indices = torch.cat(indices).long()
        output.append(x[indices])
    return output
Example #20
 def __init__(self, dropout, dim, max_len=5000):
     pe = torch.zeros(max_len, dim)
     position = torch.arange(0, max_len).unsqueeze(1)
     div_term = torch.exp(torch.arange(0, dim, 2) *
                          -(math.log(10000.0) / dim))
     pe[:, 0::2] = torch.sin(position * div_term)
     pe[:, 1::2] = torch.cos(position * div_term)
     pe = pe.unsqueeze(1)
     super(PositionalEncoding, self).__init__()
     self.register_buffer('pe', pe)
     self.dropout = nn.Dropout(p=dropout)
     self.dim = dim
Example #21
def make_positions(tokens, padding_idx, left_pad, offset=0):
    seqlen = tokens.size(1)
    if not hasattr(make_positions, 'range'):
        make_positions.range = tokens.new()
    if make_positions.range.numel() < offset + seqlen:
        # offset positions by the padding index
        torch.arange(padding_idx + 1, padding_idx + 1 + offset + seqlen,
                     out=make_positions.range)
    mask = tokens.ne(padding_idx)
    positions = make_positions.range[offset:offset+seqlen].expand_as(tokens)
    if left_pad:
        positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1)
    return tokens.clone().masked_scatter_(mask, positions[mask])
Example #22
 def setUp(self):
     self.v = Variable(torch.Tensor([3]))
     self.vs = Variable(torch.Tensor([[0], [1], [2], [3]]))
     self.vs_expanded = self.vs.expand(4, 3)
     self.test_data = Variable(torch.Tensor([[3], [3], [3]]))
     self.batch_test_data_1 = Variable(torch.arange(0, 4).unsqueeze(1).expand(4, 3))
     self.batch_test_data_2 = Variable(torch.arange(4, 8).unsqueeze(1).expand(4, 3))
     self.batch_test_data_3 = Variable(torch.Tensor([[3], [3], [3], [3]]))
     self.expected_support = [[0], [1], [2], [3]]
     self.expected_support_non_vec = [3]
     self.analytic_mean = 3
     self.analytic_var = 0
     self.n_samples = 10
Example #23
    def __init__(self, beam_size, batch_size, pad, bos, eos, n_best, mb_device,
                 global_scorer, min_length, max_length, return_attention,
                 block_ngram_repeat, exclusion_tokens, memory_lengths,
                 stepwise_penalty, ratio):
        super(BeamSearch, self).__init__(
            pad, bos, eos, batch_size, mb_device, beam_size, min_length,
            block_ngram_repeat, exclusion_tokens, return_attention,
            max_length)
        # beam parameters
        self.global_scorer = global_scorer
        self.beam_size = beam_size
        self.n_best = n_best
        self.batch_size = batch_size
        self.ratio = ratio

        # result caching
        self.hypotheses = [[] for _ in range(batch_size)]

        # beam state
        self.top_beam_finished = torch.zeros([batch_size], dtype=torch.uint8)
        self.best_scores = torch.full([batch_size], -1e10, dtype=torch.float,
                                      device=mb_device)

        self._batch_offset = torch.arange(batch_size, dtype=torch.long)
        self._beam_offset = torch.arange(
            0, batch_size * beam_size, step=beam_size, dtype=torch.long,
            device=mb_device)
        self.topk_log_probs = torch.tensor(
            [0.0] + [float("-inf")] * (beam_size - 1), device=mb_device
        ).repeat(batch_size)
        self.select_indices = None
        self._memory_lengths = memory_lengths

        # buffers for the topk scores and 'backpointer'
        self.topk_scores = torch.empty((batch_size, beam_size),
                                       dtype=torch.float, device=mb_device)
        self.topk_ids = torch.empty((batch_size, beam_size), dtype=torch.long,
                                    device=mb_device)
        self._batch_index = torch.empty([batch_size, beam_size],
                                        dtype=torch.long, device=mb_device)
        self.done = False
        # "global state" of the old beam
        self._prev_penalty = None
        self._coverage = None

        self._stepwise_cov_pen = (
                stepwise_penalty and self.global_scorer.has_cov_pen)
        self._vanilla_cov_pen = (
            not stepwise_penalty and self.global_scorer.has_cov_pen)
        self._cov_pen = self.global_scorer.has_cov_pen
Example #24
    def updateGradInput(self, input, gradOutput):
        input, mask = input
        if input.type() == 'torch.cuda.FloatTensor':
            torch.arange(0, mask.nelement(), out=self._maskIndexBufferCPU).resize_(mask.size())
            self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy_(self._maskIndexBufferCPU)
        else:
            torch.arange(0, mask.nelement(), out=self._maskIndexBuffer).resize_(mask.size())

        torch.masked_select(self._maskIndexBuffer, mask, out=self._maskIndices)
        self._gradBuffer.resize_(input.nelement()).zero_()
        self._gradBuffer.scatter_(0, self._maskIndices, gradOutput)
        self._gradBuffer.resize_(input.size())
        self.gradInput = [self._gradBuffer, self._gradMask.resize_(mask.size()).fill_(0)]
        return self.gradInput
Example #25
def test_inference_whiten_vsgp():
    N = 1000
    X = dist.Uniform(torch.zeros(N), torch.ones(N)*5).sample()
    y = 0.5 * torch.sin(3*X) + dist.Normal(torch.zeros(N), torch.ones(N)*0.5).sample()
    kernel = RBF(input_dim=1)
    Xu = torch.arange(0, 5.5, 0.5)

    vsgp = VariationalSparseGP(X, y, kernel, Xu, Gaussian(), whiten=True)
    vsgp.optimize(optim.Adam({"lr": 0.01}), num_steps=1000)

    Xnew = torch.arange(0, 5.05, 0.05)
    loc, var = vsgp(Xnew, full_cov=False)
    target = 0.5 * torch.sin(3*Xnew)

    assert_equal((loc - target).abs().mean().item(), 0, prec=0.07)
Example #26
def test_inference_sgpr():
    N = 1000
    X = dist.Uniform(torch.zeros(N), torch.ones(N)*5).sample()
    y = 0.5 * torch.sin(3*X) + dist.Normal(torch.zeros(N), torch.ones(N)*0.5).sample()
    kernel = RBF(input_dim=1)
    Xu = torch.arange(0, 5.5, 0.5)

    sgpr = SparseGPRegression(X, y, kernel, Xu)
    sgpr.optimize(optim.Adam({"lr": 0.01}), num_steps=1000)

    Xnew = torch.arange(0, 5.05, 0.05)
    loc, var = sgpr(Xnew, full_cov=False)
    target = 0.5 * torch.sin(3*Xnew)

    assert_equal((loc - target).abs().mean().item(), 0, prec=0.07)
Example #27
 def test_step(self):
     v = Variable(torch.arange(10))
     self.assertEqual(v[::1], v)
     self.assertEqual(v[::2].data.tolist(), [0, 2, 4, 6, 8])
     self.assertEqual(v[::3].data.tolist(), [0, 3, 6, 9])
     self.assertEqual(v[::11].data.tolist(), [0])
     self.assertEqual(v[1:6:2].data.tolist(), [1, 3, 5])
    def imgEncodeTorch(self, abimg):
        abimg = abimg.cuda()
        w, h = abimg.shape[1], abimg.shape[2]
        label = torch.zeros((w*h, 313))
        label = label.cuda()

        (dists, indexes) = self.nbrs.kneighbors(
            abimg.view(abimg.shape[0], -1).t(), self.NN)
        dists = torch.from_numpy(dists).float().cuda()
        indexes = torch.from_numpy(indexes).cuda()

        weights = torch.exp(-dists**2/(2*self.sigma**2)).cuda()
        weights = weights/torch.sum(weights, dim=1).view(-1, 1)

        pixel_indexes = torch.Tensor.long(torch.arange(
            start=0, end=abimg.shape[1]*abimg.shape[2])[:, np.newaxis])
        pixel_indexes = pixel_indexes.cuda()
        label[pixel_indexes, indexes] = weights
        label = label.t().contiguous().view(313, w, h)

        rebal_indexes = indexes[:, 0]
        rebal_weights = self.weights[rebal_indexes]
        rebal_weights = rebal_weights.view(w, h)
        rebal_label = rebal_weights * label

        return rebal_label
Example #29
 def test_int_indices_broadcast(self):
     # From the NumPy indexing example
     x = Variable(torch.arange(0, 12).view(4, 3))
     rows = Variable(torch.LongTensor([0, 3]))
     columns = Variable(torch.LongTensor([0, 2]))
     result = x[rows[:, None], columns]
     self.assertEqual(result.data.tolist(), [[0, 2], [9, 11]])
Example #30
    def forward(self, x, labels):
        """
        Args:
        - x: feature matrix with shape (batch_size, feat_dim).
        - labels: ground truth labels with shape (batch_size).
        """
        batch_size = x.size(0)
        distmat = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(batch_size, self.num_classes) + \
                  torch.pow(self.centers, 2).sum(dim=1, keepdim=True).expand(self.num_classes, batch_size).t()
        distmat.addmm_(1, -2, x, self.centers.t())

        classes = torch.arange(self.num_classes).long()
        if self.use_gpu: classes = classes.cuda()
        labels = labels.unsqueeze(1).expand(batch_size, self.num_classes)
        mask = labels.eq(classes.expand(batch_size, self.num_classes))

        dist = []
        for i in range(batch_size):
            value = distmat[i][mask[i]]
            value = value.clamp(min=1e-12, max=1e+12) # for numerical stability
            dist.append(value)
        dist = torch.cat(dist)
        loss = dist.mean()

        return loss
Example #31
    def forward(self, fpn_fms, rcnn_rois, labels=None, bbox_targets=None, eval = False, flip_fms = None, extra = {}):
        # input p2-p5
        pred_emd_pred_cls_0, pred_emd_pred_delta_0, pred_emd_pred_cls_1, pred_emd_pred_delta_1,\
            pred_ref_pred_cls_0, pred_ref_pred_delta_0, pred_ref_pred_cls_1, pred_ref_pred_delta_1,\
                pool_features,refine_features = self._half_forward(fpn_fms, rcnn_rois, keep_pool_feature = True)
        

        if self.training or eval:
            loss0 = emd_loss(
                        pred_emd_pred_delta_0, pred_emd_pred_cls_0,
                        pred_emd_pred_delta_1, pred_emd_pred_cls_1,
                        bbox_targets, labels)
            loss1 = emd_loss(
                        pred_emd_pred_delta_1, pred_emd_pred_cls_1,
                        pred_emd_pred_delta_0, pred_emd_pred_cls_0,
                        bbox_targets, labels)

            loss2 = emd_loss(
                        pred_ref_pred_delta_0, pred_ref_pred_cls_0,
                        pred_ref_pred_delta_1, pred_ref_pred_cls_1,
                        bbox_targets, labels)
            loss3 = emd_loss(
                        pred_ref_pred_delta_1, pred_ref_pred_cls_1,
                        pred_ref_pred_delta_0, pred_ref_pred_cls_0,
                        bbox_targets, labels)
                        
            loss_rcnn = torch.cat([loss0, loss1], axis=1)
            loss_ref = torch.cat([loss2, loss3], axis=1)
            with torch.no_grad():
                _, min_indices_rcnn = loss_rcnn.min(axis=1)
                _, min_indices_ref = loss_ref.min(axis=1)
            loss_rcnn = loss_rcnn[torch.arange(loss_rcnn.shape[0]), min_indices_rcnn]
            loss_rcnn = loss_rcnn.sum()/loss_rcnn.shape[0]
            loss_ref = loss_ref[torch.arange(loss_ref.shape[0]), min_indices_ref]
            loss_ref = loss_ref.sum()/loss_ref.shape[0]
            #loss, _ = loss.min(axis=1)
            #loss_emd = loss.sum()/loss.shape[0]
            
            loss_dict = {}
            loss_dict['loss_rcnn_emd'] = loss_rcnn
            loss_dict['loss_ref_emd'] = loss_ref

            if self.args.flip_JSD:
                if self.args.flip_JSD_0g:
                    with torch.no_grad():
                        _, _, _, _,\
                        f_pred_ref_pred_cls_0, _, \
                            f_pred_ref_pred_cls_1, _ = self._half_forward(flip_fms, rcnn_rois)
                else:
                    _, _, _, _,\
                        f_pred_ref_pred_cls_0, _, \
                            f_pred_ref_pred_cls_1, _ = self._half_forward(flip_fms, rcnn_rois)
                loss_flip_JSD = _flip_loss_JSD(F.softmax(pred_ref_pred_cls_0, dim=-1),F.softmax(f_pred_ref_pred_cls_0, dim=-1))
                loss_flip_JSD += _flip_loss_JSD(F.softmax(pred_ref_pred_cls_1, dim=-1),F.softmax(f_pred_ref_pred_cls_1, dim=-1))
                loss_dict['loss_flip_JSD'] = loss_flip_JSD 
         
            if self.args.diff_loss:   
                loss_dict['diff_loss'] =  _diff_loss(refine_features[0],refine_features[1])     


            return loss_dict
        else:
            pred_ref_scores_0 = F.softmax(pred_ref_pred_cls_0, dim=-1)
            pred_ref_scores_1 = F.softmax(pred_ref_pred_cls_1, dim=-1)

            pred_bbox_0 = restore_bbox(rcnn_rois[:, 1:5], pred_ref_pred_delta_0, True)
            pred_bbox_1 = restore_bbox(rcnn_rois[:, 1:5], pred_ref_pred_delta_1, True)
            pred_bbox_0 = torch.cat([pred_bbox_0, pred_ref_scores_0[:, 1].reshape(-1,1)], dim=1)
            pred_bbox_1 = torch.cat([pred_bbox_1, pred_ref_scores_1[:, 1].reshape(-1,1)], dim=1)
            pred_bbox = torch.cat((pred_bbox_0, pred_bbox_1), dim=1).reshape(-1,5)
            return pred_bbox
We will see what these mean in this tutorial.
"""

import torch
from torch import nn
import torch.nn.functional as F


## Tensors

# Tensors are the most basic data type in pytorch.
# They are very similar to numpy arrays in terms of the interface and supported functions.
# For example:
a = torch.tensor([1, 2, 3])  # numpy: a = np.array([1, 2, 3])
b = torch.arange(12).reshape(4, 3)  # numpy: b = np.arange(12).reshape((4, 3))
c = torch.full((2, 2), 7)  # numpy: c = np.full((2, 2), 7)
print(a + b)  # numpy: a + b; note the broadcasting here
print(b.sum(dim=1))  # numpy: b.sum(axis=1)
print(a.type(torch.float))  # equivalently a.float() or a.to(torch.float); numpy: a.astype(float)
print(b[1:3, 2])  # numpy: b[1:3, 2]

# pytorch tensors are more powerful than numpy arrays. For example, you can move them to GPUs for
# faster computation (e.g., matrix multiplication).
b = b.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))  # .to() returns a new tensor, so reassign

# They also support gradient tracking, as we will see below.
# You may ask, why did we introduce numpy first then? First of all, numpy is still very widely used
# beyond automatic differentiation. Even for ML/NLP, it is often used for data loading, metric
# calculation, etc. Second, since pytorch tensors have a very similar interface to numpy arrays,
# understanding the latter makes it easy to learn the former.
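# For a quick taste, a minimal illustrative sketch of gradient tracking (the full
# treatment follows later in the tutorial):
w = torch.tensor([2.0, 3.0], requires_grad=True)
loss = (w ** 2).sum()    # a scalar function of w
loss.backward()          # populates w.grad with d(loss)/dw
print(w.grad)            # tensor([4., 6.])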
Example #33
    def forward(self, x, targets=None):
        nA = self.num_anchors
        nB = x.size(0)
        nG = x.size(2)
        stride = self.image_dim / nG

        # Tensors for cuda support
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

        prediction = x.view(nB, nA, self.bbox_attrs, nG,
                            nG).permute(0, 1, 3, 4, 2).contiguous()

        # Get outputs
        x = torch.sigmoid(prediction[..., 0])  # Center x
        y = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height
        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

        # Calculate offsets for each grid
        grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG,
                                                      nG]).type(FloatTensor)
        grid_y = torch.arange(nG).repeat(nG,
                                         1).t().view([1, 1, nG,
                                                      nG]).type(FloatTensor)
        scaled_anchors = FloatTensor([(a_w / stride, a_h / stride)
                                      for a_w, a_h in self.anchors])
        anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
        anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))

        # Add offset and scale with anchors
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + grid_x
        pred_boxes[..., 1] = y.data + grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

        # Training
        if targets is not None:

            if x.is_cuda:
                self.mse_loss = self.mse_loss.cuda()
                self.bce_loss = self.bce_loss.cuda()
                self.ce_loss = self.ce_loss.cuda()

            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(
                pred_boxes=pred_boxes.cpu().data,
                pred_conf=pred_conf.cpu().data,
                pred_cls=pred_cls.cpu().data,
                target=targets.cpu().data,
                anchors=scaled_anchors.cpu().data,
                num_anchors=nA,
                num_classes=self.num_classes,
                grid_size=nG,
                ignore_thres=self.ignore_thres,
                img_dim=self.image_dim,
            )

            nProposals = int((pred_conf > 0.5).sum().item())
            recall = float(nCorrect / nGT) if nGT else 1
            precision = float(nCorrect / nProposals)

            # Handle masks
            mask = Variable(mask.type(ByteTensor))
            conf_mask = Variable(conf_mask.type(ByteTensor))

            # Handle target variables
            tx = Variable(tx.type(FloatTensor), requires_grad=False)
            ty = Variable(ty.type(FloatTensor), requires_grad=False)
            tw = Variable(tw.type(FloatTensor), requires_grad=False)
            th = Variable(th.type(FloatTensor), requires_grad=False)
            tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
            tcls = Variable(tcls.type(LongTensor), requires_grad=False)

            # Get conf mask where gt and where there is no gt
            conf_mask_true = mask
            conf_mask_false = conf_mask - mask

            # Mask outputs to ignore non-existing objects
            loss_x = self.mse_loss(x[mask], tx[mask])
            loss_y = self.mse_loss(y[mask], ty[mask])
            loss_w = self.mse_loss(w[mask], tw[mask])
            loss_h = self.mse_loss(h[mask], th[mask])
            loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false])\
                      + self.bce_loss(pred_conf[conf_mask_true], tconf[conf_mask_true])
            loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask],
                                               torch.argmax(tcls[mask], 1))
            loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

            return (
                loss,
                loss_x.item(),
                loss_y.item(),
                loss_w.item(),
                loss_h.item(),
                loss_conf.item(),
                loss_cls.item(),
                recall,
                precision,
            )

        else:
            # If not in training phase return predictions
            output = torch.cat(
                (
                    pred_boxes.view(nB, -1, 4) * stride,
                    pred_conf.view(nB, -1, 1),
                    pred_cls.view(nB, -1, self.num_classes),
                ),
                -1,
            )
            return output
Example #34
 def fliphor(self, inputs):
     inv_idx = torch.arange(inputs.size(3) - 1, -1,
                            -1).long()  # N x C x H x W
     return inputs.index_select(3, inv_idx)
Example #35
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu_ids", default='0', type=str)
    parser.add_argument(
        "--bert_config_file",
        default=
        'check_points/pretrain_models/bert_wwm_ext_base/bert_config.json',
        type=str,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default='check_points/pretrain_models/bert_wwm_ext_base/vocab.txt',
        type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--init_restore_dir",
        required=True,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--input_dir", required=True, default='dataset/CHID')
    parser.add_argument(
        "--output_dir",
        required=True,
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument(
        "--predict_file",
        required=True,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument('--output_file',
                        type=str,
                        default='predictions_test.json')

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=64,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--max_num_choices",
        default=10,
        type=int,
        help=
        "The maximum number of cadicate answer,  shorter than this will be padded."
    )
    parser.add_argument("--predict_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        '--fp16',
        default=True,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")

    args = parser.parse_args()
    print(args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    print("device: {}, distributed training: {}, 16-bits training: {}".format(
        device, bool(args.local_rank != -1), args.fp16))

    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)

    test_example_file = os.path.join(
        args.input_dir,
        'test_examples_{}.pkl'.format(str(args.max_seq_length)))
    test_feature_file = os.path.join(
        args.input_dir,
        'test_features_{}.pkl'.format(str(args.max_seq_length)))

    eval_features = generate_input(args.predict_file,
                                   None,
                                   test_example_file,
                                   test_feature_file,
                                   tokenizer,
                                   max_seq_length=args.max_seq_length,
                                   max_num_choices=args.max_num_choices,
                                   is_training=False)

    # Prepare model
    if 'albert' in args.bert_config_file:
        bert_config = ALBertConfig.from_json_file(args.bert_config_file)
        model = ALBertForMultipleChoice(bert_config,
                                        num_choices=args.max_num_choices)
    else:
        bert_config = BertConfig.from_json_file(args.bert_config_file)
        model = BertForMultipleChoice(bert_config,
                                      num_choices=args.max_num_choices)
    model = model.to(device)
    if args.init_restore_dir.endswith('.pth') or \
            args.init_restore_dir.endswith('.pt') or \
            args.init_restore_dir.endswith('.bin'):
        pass
    else:
        args.init_restore_dir = glob(args.init_restore_dir + '*.pth')
        assert len(args.init_restore_dir) == 1
        args.init_restore_dir = args.init_restore_dir[0]
    torch_init_model(model, args.init_restore_dir)
    if args.fp16:
        model = model.half()

    print("***** Running predictions *****")
    print("Num split examples = %d", len(eval_features))
    print("Batch size = %d", args.predict_batch_size)

    all_example_ids = [f.example_id for f in eval_features]
    all_tags = [f.tag for f in eval_features]
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_masks = torch.tensor([f.input_masks for f in eval_features],
                                   dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_choice_masks = torch.tensor([f.choice_masks for f in eval_features],
                                    dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids,
                              all_choice_masks, all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    print("Start evaluating")
    for input_ids, input_masks, segment_ids, choice_masks, example_indices in tqdm(
            eval_dataloader, desc="Evaluating", disable=None):
        if len(all_results) == 0:
            print('shape of input_ids: {}'.format(input_ids.shape))
        input_ids = input_ids.to(device)
        input_masks = input_masks.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_logits = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_masks,
                                 labels=None)
        for i, example_index in enumerate(example_indices):
            logits = batch_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                RawResult(unique_id=unique_id,
                          example_id=all_example_ids[unique_id],
                          tag=all_tags[unique_id],
                          logit=logits))
    else:
        print("prediction is over")

    print('decoding raw results')
    tmp_predict_file = os.path.join(args.output_dir,
                                    "test_raw_predictions.pkl")
    output_prediction_file = os.path.join(args.output_dir, args.output_file)
    results = get_final_predictions(all_results, tmp_predict_file, g=True)
    write_predictions(results, output_prediction_file)
    print('predictions saved to {}'.format(output_prediction_file))
def find_top_rpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, returns the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
            comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
Example #37
def mask_cross_entropy(pred, target, label):
    num_rois = pred.size()[0]
    inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
    pred_slice = pred[inds, label].squeeze(1)
    return F.binary_cross_entropy_with_logits(
        pred_slice, target, reduction='elementwise_mean')[None]
Example #38
def compute_log_R_O_nfac(log_p, so_perms=None):
    """
    Computes all first and second order log ratio's by computing P(S)
    for all second order sets leaving two elements out of S
    where the individual P(S) are computed by naive enumeration of all permutations
    This is inefficient especially for large sample sizes but can be used
    to validate alternative implementations
    """

    k = log_p.size(-1)
    if k == 1:
        # If k = 1, second order is not defined, and first order
        # P(S\{s}) / P(S) = P{{}} / P({s}) = 1 / p_s
        # log (1 / p_s) = - log p_s
        return -log_p[...], None

    if so_perms is None:
        if k in SO_PERM_CACHE:
            so_perms = SO_PERM_CACHE[k]
        else:
            so_perms = all_2nd_order_perms(torch.arange(k, dtype=torch.long), device=log_p.device)
            SO_PERM_CACHE[k] = so_perms

        # perm_ids = all_perms(torch.arange(k - 2, dtype=torch.long), device=log_p.device)

    keys, rest = so_perms
    first, second = torch.unbind(keys, -1)

    norm1 = log1mexp(log_p[..., first])
    norm2 = norm1 + log1mexp(log_p[..., second] - norm1)

    # Second order leave out log_probabilities
    log_P2s = log_p.new_zeros(log_p.size(0), k, k)

    if k > 2:  # For k = 2, the remainder set is empty with log probability zero
        # Index to get
        # (batch_size, num_second_orders, num_perms, rest=k-2)
        log_p_rest = log_p[..., rest] - norm2[..., None, None]

        # (batch_size, num_second_orders, num_perms)
        logprobs = log_pl_rec(log_p_rest, -1)

        # (batch_size, num_second_orders)
        log_P = logprobs.logsumexp(-1)


        log_P2s[:, first, second] = log_P
        log_P2s[:, second, first] = log_P

    # Compute first order log_P
    log_P1s = torch.zeros_like(log_p)
    for i in range(k):
        # P(S) = sum_{s in S} p(s) P^{D\s}(S\s)
        log_p_without_i = torch.cat((log_p[:, :i], log_p[:, i + 1:]), -1) - log1mexp(log_p[:, i, None])
        log_P2s_without_i = torch.cat((log_P2s[:, i, :i], log_P2s[:, i, i + 1:]), -1)
        log_P1s[:, i] = (log_p_without_i + log_P2s_without_i).logsumexp(-1)
        log_P2s[:, i, i] = log_P1s[:, i]

    log_P = (log_p + log_P1s).logsumexp(-1)

    # Bit hacky, but if (almost) all probability mass is on a few categories we run
    # into numerical problems, since the probability of the other classes is basically
    # zero. In that case we can simply compute an exact gradient instead.
    # We choose this where the probability mass > 1 - 1e-5, so approx. logprob > -1e-5
    is_exact = log_p.logsumexp(-1) > -1e-5
    
    log_R1 = log_P1s - log_P[..., None]
    log_R2 = log_P2s - log_P1s[..., None]

    log_R1[is_exact] = 0
    log_R2[is_exact] = 0

    assert not torch.isnan(log_R1).any()
    assert not torch.isnan(log_R2).any()

    return log_R1, log_R2
    def test_step_result_preds(self, batch, batch_idx, optimizer_idx=None):
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self(x)

        loss_test = self.loss(y, y_hat)

        # acc
        labels_hat = torch.argmax(y_hat, dim=1)
        test_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
        test_acc = torch.tensor(test_acc)

        test_acc = test_acc.type_as(x)

        # Do regular EvalResult Logging
        result = EvalResult(checkpoint_on=loss_test)
        result.log('test_loss', loss_test)
        result.log('test_acc', test_acc)

        batch_size = x.size(0)
        lst_of_str = [random.choice(['dog', 'cat']) for i in range(batch_size)]
        lst_of_int = [random.randint(500, 1000) for i in range(batch_size)]
        lst_of_lst = [[x] for x in lst_of_int]
        lst_of_dict = [{k: v} for k, v in zip(lst_of_str, lst_of_int)]

        # This is passed in from pytest via parameterization
        option = getattr(self, 'test_option', 0)
        prediction_file = getattr(self, 'prediction_file', 'predictions.pt')

        lazy_ids = torch.arange(batch_idx * self.batch_size, batch_idx * self.batch_size + x.size(0))

        # Base
        if option == 0:
            result.write('idxs', lazy_ids, prediction_file)
            result.write('preds', labels_hat, prediction_file)

        # Check mismatching tensor len
        elif option == 1:
            result.write('idxs', torch.cat((lazy_ids, lazy_ids)), prediction_file)
            result.write('preds', labels_hat, prediction_file)
        
        # write multi-dimension
        elif option == 2:
            result.write('idxs', lazy_ids, prediction_file)
            result.write('preds', labels_hat, prediction_file)
            result.write('x', x, prediction_file)
        
        # write str list
        elif option == 3:
            result.write('idxs', lazy_ids, prediction_file)
            result.write('vals', lst_of_str, prediction_file)

        # write int list
        elif option == 4:
            result.write('idxs', lazy_ids, prediction_file)
            result.write('vals', lst_of_int, prediction_file)

        # write nested list
        elif option == 5:
            result.write('idxs', lazy_ids, prediction_file)
            result.write('vals', lst_of_lst, prediction_file)

        # write dict list
        elif option == 6:
            result.write('idxs', lazy_ids, prediction_file)
            result.write('vals', lst_of_dict, prediction_file)

        return result
    def forward(self, features, labels=None, mask=None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf
        Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        """

        device = (torch.device('cuda')
                  if features.is_cuda
                  else torch.device('cpu'))

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')

        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)

        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError(
                    'Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(device)

        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            self.temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss
Example #41
def make_ix_like(input, dim=0):
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)
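# Illustrative check (not from the original source): for a (2, 3) input and dim=1,
# make_ix_like returns the indices 1..d shaped to broadcast along that dim.
import torch
z = torch.randn(2, 3)
print(make_ix_like(z, dim=1))   # tensor([[1., 2., 3.]])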
Example #42
node_embs = torch.randn((n_nodes, emb_dims), requires_grad=True)
attn_dist = None
node_attns = [None] * n_nodes
trans_attns = [dict() for _ in range(n_nodes)]
trans_norm_factors = [dict() for _ in range(n_nodes)]
node_outs = [None] * n_nodes
state_fn = torch.nn.Linear(emb_dims, emb_dims)
transform_fn = [torch.nn.Linear(emb_dims, emb_dims) for _ in range(n_nodes)]
a2w_fn = lambda x: x

# input
input = torch.randn((batch_size, emb_dims), requires_grad=True)

# for master input
node_outs[0] = (input, torch.arange(batch_size))

out_nei_ids = nodes[master_input]['out_neis']
out, subbat_idx = node_outs[0]
state = state_fn(out)
out_nei_embs = node_embs.index_select(0, torch.tensor(out_nei_ids))
transition = torch.tensordot(state, out_nei_embs, dims=([1], [1])).softmax(1)
attn_sent = transition

attn_dist = torch.zeros((subbat_idx.size(0), n_nodes)).index_copy_(1, torch.tensor(out_nei_ids), attn_sent.data)
V, I = attn_dist.topk(k)
mask = torch.zeros_like(attn_dist).scatter_(1, I, torch.ones(I.size()))
mask_gt = torch.gt(attn_dist, epsilon).float()
mask.mul_(mask_gt)
V_gt = torch.gt(V, epsilon).float()
V.mul_(V_gt)
Example #43
 def make_dxx_lut(layout,
                  block,
                  step,
                  trans,
                  device,
                  transform=lambda idx: idx):
     # load-balancing
     _empty = torch.tensor([], dtype=torch.int64, device=layout.device)
     segments = _empty.clone()
     column = _empty.clone()
     depth = _empty.clone()
     lockid = _empty.clone()
     maxid = _empty.clone()
     offsets = _empty.clone()
     current_offset = 0
     current_maxid = 0
     for z in range(layout.size(0)):
         if trans:
             sizes = torch.sum(layout[z, :, :], 1)
         else:
             sizes = torch.sum(layout[z, :, :], 0)
         z_segments, z_column, z_lockid, z_maxid, z_offsets = _sparse_matmul.load_balance(
             sizes, block)
         z_depth = z * torch.ones_like(z_segments)
         z_lockid[z_lockid > 0] += current_maxid
         current_maxid = z_lockid.max()
         # concatenate depth
         segments = torch.cat((segments, z_segments))
         column = torch.cat((column, z_column))
         depth = torch.cat((depth, z_depth))
         maxid = torch.cat((maxid, z_maxid))
         offsets = torch.cat((offsets, current_offset + z_offsets))
         lockid = torch.cat((lockid, z_lockid))
         current_offset += layout[z, :, :].sum()
     segments *= step
     # pointer increments
     if trans:
         nnz = layout.nonzero()
     else:
         nnz = layout.transpose(1, 2).nonzero()
     num_blocks = nnz.size(0)
     offsets = torch.min(offsets,
                         (num_blocks - 1) * torch.ones_like(offsets))
     idx = transform(nnz[:, 2] * block)
     xincs = idx.clone()
     xincs[1:] -= idx[:-1]
     # divide block into multiple steps
     div = block // step
     xincs = xincs.view(-1, 1).repeat(1, div)
     xincs[:, 1:] = step
     xincs[:, 0] -= (div - 1) * step
     # first increment for each reduction is actually the offset
     xincs[offsets[segments > 0], 0] = idx[offsets[segments > 0]]
     xincs = xincs.view(-1)
     # block-mode input increments
     if trans:
         widx = torch.arange(num_blocks)
     else:
         widx = _empty.clone()
         current_offset = 0
         for z in range(layout.size(0)):
             layoutw = layout[z, :, :].clone()
             msum = layoutw.sum()
             layoutw[layoutw > 0] = 1 + torch.arange(msum)
             widx = torch.cat(
                 (widx, current_offset + layoutw.T[layoutw.T > 0] - 1))
             current_offset += msum
     widx = widx
     wincs = widx * block * block
     wincs[1:] -= widx[:-1] * block * block
     wincs = wincs.view(-1, 1).repeat(1, div)
     if trans:
         wincs[:, 1:] = step
         wincs[:, 0] -= (div - 1) * step
     else:
         wincs[:, 1:] = step * block
         wincs[:, 0] -= (div - 1) * step * block
     wincs[offsets[segments > 0], 0] = widx[offsets[segments > 0]]
     wincs = wincs.view(-1)
     # adjust offset and segment size
     offsets *= 2 * div
     segments *= div
     # create header
     width = column.size(0)
     offsets += 6 * width
     header = torch.stack((offsets, segments, column, depth, lockid, maxid),
                          dim=1).view(-1).contiguous()
     incs = torch.stack((xincs, wincs), dim=1).view(-1).contiguous()
     incs = torch.cat(
         (incs, torch.zeros(2, device=incs.device, dtype=incs.dtype)))
     # create lut
     lut = torch.cat((header, incs))
     lut = lut.type(torch.int32).to(device)
     # create locks
     num_locks = max(1, lockid.max())
     return lut, num_locks, width, None
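A brief hedged sketch of the kind of input this routine expects, inferred from the loop above: layout is an integer {0, 1} mask of nonzero blocks with one slice per head, and the trans flag decides whether block counts are taken per row or per column.

import torch

num_heads, blocks_per_dim = 2, 4
layout = (torch.rand(num_heads, blocks_per_dim, blocks_per_dim) > 0.5).long()
sizes_if_trans = torch.sum(layout[0, :, :], 1)      # nonzero blocks per row (trans=True branch)
sizes_if_not_trans = torch.sum(layout[0, :, :], 0)  # nonzero blocks per column (trans=False branch)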
Beispiel #44
0
    def forward(self, pts, fts, qrs):
        N = pts.shape[0]  # batch size

        point_num = pts.shape[1]
        # xconv operation
        _, indices_dilated = self.knn_indices_general(qrs, pts, True)
        indices = indices_dilated[:, :, ::self.
                                  dilation, :]  # indices of K nearest (dilation: d) points
        indices = (
            indices.view(-1, 2)[:, 1].cpu() +
            torch.arange(0, N * point_num, point_num).view(-1, 1).repeat(
                1, self.P * self.K).view(-1)).cpu().numpy()
        if self.sorting_method is not None:
            raise NotImplementedError

        nn_pts = (pts.contiguous().view(-1, 3))[indices].view(
            N, self.P, self.K, 3)  # coordinates of nearest-neighbour points
        nn_pts_center = qrs.unsqueeze(
            dim=2)  # (N, P, 1, 3) # coordinates of queries
        nn_pts_local_origin = nn_pts - nn_pts_center  # (N, P, K, 3) # relative coordinates

        knn_pts_len = torch.norm(
            nn_pts_local_origin, dim=3,
            keepdim=False).detach()  # (N,P,K) # stop gradient here!
        # (N,P,1,1): mean neighbour distance, used below to normalise the local coordinates
        nn_pts_max_len = torch.unsqueeze(torch.mean(knn_pts_len,
                                                    dim=-1,
                                                    keepdim=True),
                                         dim=-1)
        nn_pts_local = nn_pts_local_origin / nn_pts_max_len

        nn_fts_from_pts_0 = self._modules['BN1'].forward(
            (self._modules['dense1'].forward(nn_pts_local.view(-1, 3)))).view(
                N, self.P, self.K, self.C_pts_fts)
        nn_fts_from_pts = self._modules['BN2'].forward(
            self._modules['dense2'].forward(nn_fts_from_pts_0.view(-1, self.C_pts_fts))).view(N, self.P, self.K,\
            self.C_pts_fts)  # shape: (N,P,K,C_pts_fts)

        if fts is None:
            nn_fts_input = nn_fts_from_pts  # no concat!
        else:
            nn_fts_from_prev = (fts.contiguous().view(
                N * point_num,
                -1))[indices].contiguous().view(N, self.P, self.K,
                                                -1)  # the F matrix
            nn_fts_input = torch.cat([nn_fts_from_pts, nn_fts_from_prev],
                                     dim=-1)

        if self.with_X_transformation:
            ######################## X-transformation #########################
            nn_pts_local = nn_pts_local.transpose(1,
                                                  3).transpose(2,
                                                               3)  # (N,3,P,K)
            X_0 = self._modules["x_trans_conv1"].forward(nn_pts_local)
            X_0_KK = X_0.view(N, self.K, self.K,
                              self.P).transpose(1, 2).transpose(2,
                                                                3)  # (N,K,P,K)

            X_1 = self._modules['x_trans_depthConv1'].forward(
                X_0_KK)  # (N,K*K,P,1)
            X_1_KK = X_1.view(N, self.K, self.K,
                              self.P).transpose(1, 2).transpose(2,
                                                                3)  # (N,K,P,K)
            X_2 = self._modules['x_trans_depthConv2'].forward(
                X_1_KK)  # (N,K*K,P,1)
            X_2_KK = X_2.view(N, self.K, self.K,
                              self.P).transpose(1, 2).transpose(2,
                                                                3)  # (N,K,P,K)
            X_2_KK = X_2_KK.transpose(1, 2).transpose(
                2, 3)  # (N,P,K,K) # output of Step 4 of algorithm 1
            fts_X = torch.matmul(
                X_2_KK, nn_fts_input)  # output of Step 5 of algorithm 1
            ###################################################################
        else:
            fts_X = nn_fts_input

        fts_conv_3d = self._modules['fts_conv'].forward(
            fts_X.transpose(1, 3).transpose(2, 3)).transpose(
                1, 2).contiguous().view(-1, self.C)
        fts_conv_3d = self._modules["fts_conv_BN"].forward(fts_conv_3d).view(
            N, self.P, self.C)  # (N,P,C)

        if self.late_bn:
            raise NotImplementedError

        if self.with_global:
            fts_global = self._modules['dense3'].forward(qrs)
            return torch.cat([fts_global, fts_conv_3d], dim=-1)
        else:
            return fts_conv_3d
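A tiny shape-level sketch (assumed toy sizes) of the core X-transformation step above: a learned (K x K) matrix per representative point is applied to its K neighbour features, matching Steps 4-5 referenced in the comments.

import torch

N, P, K, C = 2, 8, 4, 16
X = torch.randn(N, P, K, K)             # stands in for X_2_KK after the final transpose
nn_feats = torch.randn(N, P, K, C)      # stands in for nn_fts_input
fts_X = torch.matmul(X, nn_feats)       # (N, P, K, C)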
Beispiel #45
0
    def summarization_step(self, lambda_coeff):
        """
        Summarization step.
        Can also be used for denoising auto-encoding.
        """
        assert lambda_coeff >= 0
        if lambda_coeff == 0:
            return
        params = self.params
        self.encoder.train()
        self.decoder.train()

        (table_entities, table_types, table_values, table_feats, table_labels, summaries, summary_labels) = self.get_batch('sm')
        enc_x1, enc_xlen = table_entities
        enc_x2, _ = table_types
        enc_x3, _ = table_values
        enc_x4, _ = table_feats
        enc_label, _ = table_labels

        dec_x, dec_xlen = summaries

        seq_length, batch_size = dec_x.size()

        # target words to predict
        alen = torch.arange(dec_xlen.max(), dtype=torch.long, device=dec_xlen.device)
        pred_mask = alen[:, None] < dec_xlen[None] - 1  # do not predict anything given the last target word

        dec_y = dec_x[1:].masked_select(pred_mask[:-1])
        assert len(dec_y) == (dec_xlen - 1).sum().item()

        # cuda
        if params.cuda:
            enc_x1, enc_x2, enc_x3, enc_x4, enc_xlen = to_cuda(enc_x1, enc_x2, enc_x3, enc_x4, enc_xlen)
            dec_x, dec_xlen, dec_y = to_cuda(dec_x, dec_xlen, dec_y)
        
        # encode source sentence
        encoder_output = self.encoder('fwd', x1=enc_x1, x2=enc_x2, x3=enc_x3, x4=enc_x4, lengths=enc_xlen)

        if params.sm_step_with_cs_proba:
            scores = self.encoder('score', tensor=encoder_output) 
            encoder_output = encoder_output * scores

        encoder_output = encoder_output.transpose(0, 1)

        # decode target sentence
        decoder_output = self.decoder('fwd', x=dec_x, lengths=dec_xlen, causal=True, 
                                      src_enc=encoder_output, src_len=enc_xlen)

        _, loss = self.decoder('predict', tensor=decoder_output, pred_mask=pred_mask, y=dec_y)

        self.stats['sm'].append(loss.item())
        loss = lambda_coeff * loss

        # optimize
        self.optimize(loss, ['encoder', 'decoder'])

        # number of processed sentences / words
        self.n_sentences += params.batch_size
        self.stats['processed_s'] += dec_xlen.size(0)
        self.stats['processed_w'] += (dec_xlen - 1).sum().item()
        self.stats['lambda_sm'] = lambda_coeff
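A minimal standalone sketch (toy lengths) of the prediction-mask construction used above: position t of a target sequence is predicted only while t < length - 1, so nothing is predicted after the last real word.

import torch

dec_xlen = torch.tensor([3, 5])                       # target lengths for a toy batch of 2
alen = torch.arange(dec_xlen.max(), dtype=torch.long)
pred_mask = alen[:, None] < dec_xlen[None] - 1        # (max_len, batch)
assert pred_mask.sum(0).tolist() == [2, 4]            # length - 1 predictions per sequence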
Beispiel #46
0
def kNN(args, C, model, average, trainloader, testloader, K, recompute_memory=0):
    model.eval()
    model_time = AverageMeter()
    cluster_time = AverageMeter()
    total = 0

    testsize = testloader.dataset.__len__()
    ndata = trainloader.dataset.__len__()

    if recompute_memory:
        trainFeatures = torch.zeros(ndata, args.low_dim).cuda()
    else:
        trainFeatures = average.memory  # (num_samples, low_dim)

    # this is cifar10
    # trainLabels = torch.tensor(trainloader.dataset.targets).long().cuda()

    # this is for UCF101 or Kinetics
    trainLabels = torch.tensor([sample['label'] for sample in trainloader.dataset.data]).long().cuda()

    if recompute_memory:
        print('\nRecomputing memory bank....')
        # use test transform to go through all train samples and retrieve features as memory
        # transform_bak = trainloader.dataset.transform
        # trainloader.dataset.transform = testloader.dataset.transform
        temploader = torch.utils.data.DataLoader(trainloader.dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.n_threads,
                                                 pin_memory=True)
        memory_idx = torch.arange(args.batch_size * args.clips_num).view(args.clips_num, args.batch_size).t().cuda()
        batchSize = args.batch_size
        with torch.no_grad():
            for batch_idx, (inputs, _, indices) in enumerate(temploader):
                inputs = torch.cat(inputs, dim=0)
                inputs = inputs.cuda()

                bs = inputs.size(0)

                _, _, features = model(inputs)

                if batch_idx == len(temploader) - 1:
                    batch_size = bs // args.clips_num
                    memory_idx = torch.arange(bs).view(args.clips_num, batch_size).t().cuda()
                    f_means = torch.mean(features[memory_idx], dim=1)
                    trainFeatures[batch_idx * batchSize:, :] = f_means
                else:
                    f_means = torch.mean(features[memory_idx], dim=1)  # (batchSize, dim)
                    trainFeatures[batch_idx * batchSize : (batch_idx+1) * batchSize, :] = f_means
        # trainloader.dataset.transform = transform_bak
        print('Finished!')

    top1 = 0
    top5 = 0
    # save plt distribution
    # Yd = torch.zeros(testsize, K).cuda()
    # NN_labels = torch.zeros(testsize, K).long().cuda()
    # labels = torch.zeros(testsize).long().cuda()

    with torch.no_grad():
        retrieval_one_hot = torch.zeros(K, C).cuda()
        for batch_idx, (inputs, targets, _) in enumerate(testloader):
            end = time.time()
            targets = targets.cuda()
            inputs = inputs.cuda()

            batchSize = inputs.size(0)
            _, _, features = model(inputs)
            total += targets.size(0)

            model_time.update(time.time() - end)
            end = time.time()

            # dist = pearson_coefficient_bank(features, trainFeatures.t())
            dist = torch.mm(features, trainFeatures.t())
            yd, yi = dist.topk(K, dim=1, largest=True, sorted=True)
            candidates = trainLabels.view(1, -1).expand(batchSize, -1)
            retrieval = torch.gather(candidates, 1, yi)

            # if batch_idx == 0:
            #     show_distribution(yd.cpu(), retrieval.cpu(), targets.cpu())
            # if batch_idx < len(testloader)-1:
            #     Yd[batch_idx*batchSize : (batch_idx+1)*batchSize, :] = yd
            #     NN_labels[batch_idx*batchSize : (batch_idx+1)*batchSize, :] = retrieval
            #     labels[batch_idx*batchSize : (batch_idx+1)*batchSize] = targets
            # else:
            #     Yd[batch_idx*batchSize:, :] = yd
            #     NN_labels[batch_idx*batchSize:, :] = retrieval
            #     labels[batch_idx*batchSize:] = targets

            retrieval_one_hot.resize_(batchSize * K, C).zero_()
            retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1) # inverse operation of torch.gather
            yd_transform = torch.exp(torch.div(yd, args.nce_t))  # temperature-scaled exponential weights for the non-parametric softmax

            probs = torch.sum(torch.mul(retrieval_one_hot.view(batchSize, -1 , C), yd_transform.view(batchSize, -1, 1)), 1)
            _, predictions = probs.sort(1, True)

            # find which predictions match the target
            correct = predictions.eq(targets.view(-1, 1))
            cluster_time.update(time.time() - end)

            top1 = top1 + correct.narrow(1, 0, 1).sum().item()
            top5 = top5 + correct.narrow(1, 0, 5).sum().item()

            if (batch_idx+1) % 100 == 0:
                print('Test [{}/{}]\t'
                      'Model time: {model_time.val:.3f} ({model_time.avg:.3f})\t'
                      'Cluster time: {cluster_time.val:.3f} ({cluster_time.avg:.3f})\t'
                      'Top1: {:.2f} Top5: {:.2f}'.format(batch_idx+1, len(testloader), top1*100./total, top5*100./total,
                                                         model_time=model_time, cluster_time=cluster_time))

    print(top1*100./total)

    return top1*100./total, top5*100./total
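A self-contained sketch (toy sizes, assumed temperature) of the weighted k-NN voting performed above: the labels of the K retrieved neighbours are one-hot encoded and each vote is weighted by exp(similarity / temperature).

import torch

batch, K, C, nce_t = 2, 3, 5, 0.07
yd = torch.rand(batch, K)                       # similarities of the top-K neighbours
retrieval = torch.randint(0, C, (batch, K))     # their class labels
one_hot = torch.zeros(batch * K, C).scatter_(1, retrieval.view(-1, 1), 1)
weights = torch.exp(yd / nce_t)
probs = (one_hot.view(batch, K, C) * weights.view(batch, K, 1)).sum(1)
pred = probs.argmax(1)                          # top-1 prediction per query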
Beispiel #47
0
    def fit(self, training_data,
            loss_func='kl',
            p_ij=None,
            pretrain=False,
            epochs=10,
            verbose=False,
            optimizer=torch.optim.Adam,
            batch_size=500,
            learning_rate=0.01):
        
        assert training_data.shape[1] == self.input_dim, "Input training data must have the same feature dimension as `input_dim`"
        
        self.p_ij = p_ij
        self._epochs = epochs
        
        if pretrain:
            self.pretrain(training_data, epochs=5, verbose=verbose, batch_size=batch_size)
        
        if self.p_ij is None:
            self.p_ij = p_ij_sym(training_data.detach().cpu().numpy(), self.perplexity, verbose=verbose).toarray()
            
        dataset = torch.utils.data.TensorDataset(training_data, torch.arange(training_data.shape[0]))
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optim = optimizer(self.parameters(), lr=learning_rate)
        
        if verbose:
            print('{time}: Beginning training for {epochs} epochs'.format(
                time=datetime.datetime.now(),
                epochs=epochs))
            
        loss_func = {
            'kl': kullback_leibler_loss,
            'kl_rev': kullback_leibler_reverse_loss,
            'js': jensen_shannon_loss,
            'frob': frobenius_loss,
            #'bat': bhattacharyya_loss,
            'tot': total_variational_loss
        }[loss_func]
        
        for epoch in range(epochs):
            running_loss = 0
            for batch, data in enumerate(dataloader):
                
                features, indices = data
                
                p = submatrix(self.p_ij, indices.numpy())
                p = p / p.sum()
                
                if epoch < 10:
                    # exaggeration test
                    exaggeration = 10.
                    p *= exaggeration
                    
                if self.use_cuda:
                    features = features.cuda()
                    p = p.cuda()
                    
                optim.zero_grad()
                
                q = q_ij(self(features), self.alpha)
                q = q / q.sum()
                
                loss = loss_func(p, q)
                
                if epoch < 10:
                    # exaggeration test
                    loss = loss / exaggeration - np.log(exaggeration)
                
                loss.backward()
                optim.step()
                    
                running_loss += loss.item()
                
            if verbose:
                print('{time}: Loss after epoch {ep}: {rloss}'.format(
                    time=datetime.datetime.now(),
                    ep=epoch+1,
                    rloss=running_loss))

        if verbose:
            print('{time}: Finished training'.format(
                time=datetime.datetime.now()))
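A note on the correction `loss = loss / exaggeration - np.log(exaggeration)` above (assuming the 'kl' loss and p normalised before scaling): since KL(c*p || q) = c*KL(p || q) + c*log(c) for a constant c, dividing by c and subtracting log(c) recovers the value of the un-exaggerated KL(p || q), so losses logged during the exaggerated epochs stay comparable to later ones.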
Beispiel #48
0
    def _get_indexes_ce(predictions, targets, current_max_prob):

        predictions = torch.nn.functional.softmax(predictions, dim=1)

        return predictions[torch.arange(predictions.size(0)), targets] < current_max_prob
Beispiel #49
0
    def forward(self, fpn_fms, rpn_rois, rpn_rois_inds=None, im_info=None, gt_boxes=None, isEval=False, flip_fms=None, extra={}):
        
        if self.training or isEval:
            with torch.no_grad():
                rcnn_rois, rcnn_labels, rcnn_bbox_targets = fpn_roi_target(
                    rpn_rois, rpn_rois_inds, im_info, gt_boxes, top_k=2)
        else:
            rcnn_rois = rpn_rois

        pred_ref_pred_cls, pred_ref_pred_delta, pred_cls_unrefined, pred_delta_unrefined, \
        pool_features = self._recursive_forward(fpn_fms, rcnn_rois, keep_pool_feature = True)
        

        if self.training or isEval:

            #loss_rcnn = emd_loss_multi(pred_delta_unrefined, pred_cls_unrefined,rcnn_bbox_targets,rcnn_labels,top_k=2)
            #loss_ref = emd_loss_multi(pred_ref_pred_delta, pred_ref_pred_cls,rcnn_bbox_targets,rcnn_labels,top_k=2)

            loss0 = emd_loss(
                        pred_delta_unrefined[0], pred_cls_unrefined[0],
                        pred_delta_unrefined[1], pred_cls_unrefined[1],
                        rcnn_bbox_targets, rcnn_labels)
            loss1 = emd_loss(
                        pred_delta_unrefined[1], pred_cls_unrefined[1],
                        pred_delta_unrefined[0], pred_cls_unrefined[0],
                        rcnn_bbox_targets, rcnn_labels)

            loss2 = emd_loss(
                        pred_ref_pred_delta[0], pred_ref_pred_cls[0],
                        pred_ref_pred_delta[1], pred_ref_pred_cls[1],
                        rcnn_bbox_targets, rcnn_labels)
            loss3 = emd_loss(
                        pred_ref_pred_delta[1], pred_ref_pred_cls[1],
                        pred_ref_pred_delta[0], pred_ref_pred_cls[0],
                        rcnn_bbox_targets, rcnn_labels)
                        
            loss_rcnn = torch.cat([loss0, loss1], axis=1)
            loss_ref = torch.cat([loss2, loss3], axis=1)


            with torch.no_grad():
                _, min_indices_rcnn = loss_rcnn.min(axis=1)
                _, min_indices_ref = loss_ref.min(axis=1)
            loss_rcnn = loss_rcnn[torch.arange(loss_rcnn.shape[0]), min_indices_rcnn]
            loss_rcnn = loss_rcnn.sum()/loss_rcnn.shape[0]
            loss_ref = loss_ref[torch.arange(loss_ref.shape[0]), min_indices_ref]
            loss_ref = loss_ref.sum()/loss_ref.shape[0]


            loss_dict = {}
            loss_dict['loss_rcnn_emd'] = loss_rcnn
            loss_dict['loss_ref_emd'] = loss_ref

            if self.args.flip_JSD:
                if self.args.flip_JSD_0g:
                    with torch.no_grad():
                        f_pred_ref_pred_cls, _, pred_cls_unrefined, _ = self._recursive_forward(flip_fms, rcnn_rois)
                else:
                    f_pred_ref_pred_cls, _ = self._recursive_forward(flip_fms, rcnn_rois)
                loss_flip_JSD = _flip_loss_JSD(F.softmax(pred_ref_pred_cls[0], dim=-1),F.softmax(f_pred_ref_pred_cls[0], dim=-1))
                loss_flip_JSD += _flip_loss_JSD(F.softmax(pred_ref_pred_cls[1], dim=-1),F.softmax(f_pred_ref_pred_cls[1], dim=-1))
                loss_dict['loss_flip_JSD'] = loss_flip_JSD
                
               
            return loss_dict
        else:
            pred_bboxes = None
            for p_cls,p_delta in zip(pred_ref_pred_cls,pred_ref_pred_delta):
                pred_ref_scores = F.softmax(p_cls, dim=-1)
                pred_bbox = restore_bbox(rcnn_rois[:, 1:5], p_delta, True)
                if pred_bboxes is None:
                    pred_bboxes = torch.cat([pred_bbox, pred_ref_scores[:, 1].reshape(-1,1)], dim=1)
                else:
                    pred_bbox = torch.cat([pred_bbox, pred_ref_scores[:, 1].reshape(-1,1)], dim=1)
                    
                    pred_bboxes = torch.cat([pred_bboxes, pred_bbox], dim=0)
            
            #pred_bbox = torch.cat((pred_bbox_0, pred_bbox_1), dim=1).reshape(-1,5)
            return pred_bboxes
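A toy sketch (assumed shapes) of the EMD-style selection applied above: the two permutation losses are concatenated per RoI and, with gradients detached for the argmin, the smaller of the two is kept before averaging.

import torch

loss0 = torch.rand(6, 1)                            # permutation A, one value per RoI
loss1 = torch.rand(6, 1)                            # permutation B
loss_pair = torch.cat([loss0, loss1], dim=1)        # (num_rois, 2)
with torch.no_grad():
    _, min_idx = loss_pair.min(dim=1)
loss = loss_pair[torch.arange(loss_pair.shape[0]), min_idx].mean()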
Beispiel #50
0
def run(proc_id, n_gpus, args, devices, data):
    # Unpack data
    device = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    train_mask, val_mask, test_mask, n_classes, g = data
    nfeat = g.ndata.pop('feat')
    labels = g.ndata.pop('label')
    in_feats = nfeat.shape[1]

    train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
    val_nid = th.LongTensor(np.nonzero(val_mask)).squeeze()
    test_nid = th.LongTensor(np.nonzero(test_mask)).squeeze()

    # Create PyTorch DataLoader for constructing blocks
    n_edges = g.num_edges()
    train_seeds = np.arange(n_edges)
    if n_gpus > 0:
        num_per_gpu = (train_seeds.shape[0] + n_gpus - 1) // n_gpus
        train_seeds = train_seeds[proc_id * num_per_gpu :
                                  (proc_id + 1) * num_per_gpu \
                                  if (proc_id + 1) * num_per_gpu < train_seeds.shape[0]
                                  else train_seeds.shape[0]]

    # Create sampler
    sampler = dgl.dataloading.MultiLayerNeighborSampler(
        [int(fanout) for fanout in args.fan_out.split(',')])
    dataloader = dgl.dataloading.EdgeDataLoader(
        g,
        train_seeds,
        sampler,
        exclude='reverse_id',
        # For each edge with ID e in Reddit dataset, the reverse edge is e ± |E|/2.
        reverse_eids=th.cat(
            [th.arange(n_edges // 2, n_edges),
             th.arange(0, n_edges // 2)]),
        negative_sampler=NegativeSampler(g, args.num_negs, args.neg_share),
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False,
        pin_memory=True,
        num_workers=args.num_workers)

    # Define model and optimizer
    model = SAGE(in_feats, args.num_hidden, args.num_hidden, args.num_layers,
                 F.relu, args.dropout)
    model = model.to(device)
    if n_gpus > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[device],
                                        output_device=device)
    loss_fcn = CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training loop
    avg = 0
    iter_pos = []
    iter_neg = []
    iter_d = []
    iter_t = []
    best_eval_acc = 0
    best_test_acc = 0
    for epoch in range(args.num_epochs):
        tic = time.time()

        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.

        tic_step = time.time()
        for step, (input_nodes, pos_graph, neg_graph,
                   blocks) in enumerate(dataloader):
            batch_inputs = nfeat[input_nodes].to(device)
            d_step = time.time()

            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)
            blocks = [block.int().to(device) for block in blocks]
            # Compute loss and prediction
            batch_pred = model(blocks, batch_inputs)
            loss = loss_fcn(batch_pred, pos_graph, neg_graph)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            t = time.time()
            pos_edges = pos_graph.num_edges()
            neg_edges = neg_graph.num_edges()
            iter_pos.append(pos_edges / (t - tic_step))
            iter_neg.append(neg_edges / (t - tic_step))
            iter_d.append(d_step - tic_step)
            iter_t.append(t - d_step)
            if step % args.log_every == 0:
                gpu_mem_alloc = th.cuda.max_memory_allocated(
                ) / 1000000 if th.cuda.is_available() else 0
                print(
                    '[{}]Epoch {:05d} | Step {:05d} | Loss {:.4f} | Speed (samples/sec) {:.4f}|{:.4f} | Load {:.4f}| train {:.4f} | GPU {:.1f} MB'
                    .format(proc_id, epoch, step, loss.item(),
                            np.mean(iter_pos[3:]), np.mean(iter_neg[3:]),
                            np.mean(iter_d[3:]), np.mean(iter_t[3:]),
                            gpu_mem_alloc))
            tic_step = time.time()

            if step % args.eval_every == 0 and proc_id == 0:
                eval_acc, test_acc = evaluate(model, g, nfeat, labels,
                                              train_nid, val_nid, test_nid,
                                              device)
                print('Eval Acc {:.4f} Test Acc {:.4f}'.format(
                    eval_acc, test_acc))
                if eval_acc > best_eval_acc:
                    best_eval_acc = eval_acc
                    best_test_acc = test_acc
                print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format(
                    best_eval_acc, best_test_acc))
        toc = time.time()
        if proc_id == 0:
            print('Epoch Time(s): {:.4f}'.format(toc - tic))
        if epoch >= 5:
            avg += toc - tic
        if n_gpus > 1:
            th.distributed.barrier()

    if proc_id == 0:
        print('Avg epoch time: {}'.format(avg / (epoch - 4)))
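A quick standalone check (toy edge count) of the reverse-edge id mapping built for EdgeDataLoader above: edge e maps to e + |E|/2 in the first half and e - |E|/2 in the second, matching the Reddit convention noted in the comment.

import torch

n_edges = 6
reverse_eids = torch.cat([torch.arange(n_edges // 2, n_edges),
                          torch.arange(0, n_edges // 2)])
assert reverse_eids.tolist() == [3, 4, 5, 0, 1, 2]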
Beispiel #51
0
def interpolate_bilinear(grid,
                         query_points,
                         name='interpolate_bilinear',
                         indexing='ij'):
    """Similar to Matlab's interp2 function.
    Finds values for query points on a grid using bilinear interpolation.
    Args:
    grid: a 4-D float `Tensor` of shape `[batch, height, width, channels]`.
    query_points: a 3-D float `Tensor` of N points with shape `[batch, N, 2]`.
    name: a name for the operation (optional).
    indexing: whether the query points are specified as row and column (ij),
      or Cartesian coordinates (xy).
    Returns:
    values: a 3-D `Tensor` with shape `[batch, N, channels]`
    Raises:
    ValueError: if the indexing mode is invalid, or if the shape of the inputs
      invalid.
    """
    if indexing != 'ij' and indexing != 'xy':
        raise ValueError('Indexing mode must be \'ij\' or \'xy\'')

    shape = grid.shape
    if len(shape) != 4:
        msg = 'Grid must be 4 dimensional. Received size: '
        raise ValueError(msg + str(grid.shape))

    batch_size, height, width, channels = grid.shape

    shape = [batch_size, height, width, channels]
    query_type = query_points.dtype
    grid_type = grid.dtype

    num_queries = query_points.shape[1]
    #     print('Num queries', num_queries)

    alphas = []
    floors = []
    ceils = []
    index_order = [0, 1] if indexing == 'ij' else [1, 0]
    #     print(query_points.shape)
    unstacked_query_points = query_points.unbind(2)
    #     print('Squeezed query_points', unstacked_query_points[0].shape, unstacked_query_points[1].shape)

    for dim in index_order:
        queries = unstacked_query_points[dim]

        size_in_indexing_dimension = shape[dim + 1]

        # max_floor is size_in_indexing_dimension - 2 so that max_floor + 1
        # is still a valid index into the grid.
        max_floor = torch.tensor(size_in_indexing_dimension - 2,
                                 dtype=query_type)
        min_floor = torch.tensor(0.0, dtype=query_type)
        maxx = torch.max(min_floor, torch.floor(queries))
        floor = torch.min(maxx, max_floor)
        int_floor = floor.long()
        floors.append(int_floor)
        ceil = int_floor + 1
        ceils.append(ceil)

        # alpha has the same type as the grid, as we will directly use alpha
        # when taking linear combinations of pixel values from the image.
        alpha = torch.tensor(queries - floor, dtype=grid_type)
        min_alpha = torch.tensor(0.0, dtype=grid_type)
        max_alpha = torch.tensor(1.0, dtype=grid_type)
        alpha = torch.min(torch.max(min_alpha, alpha), max_alpha)

        # Expand alpha to [b, n, 1] so we can use broadcasting
        # (since the alpha values don't depend on the channel).
        alpha = torch.unsqueeze(alpha, 2)
        alphas.append(alpha)

    flattened_grid = torch.reshape(grid,
                                   [batch_size * height * width, channels])
    batch_offsets = torch.reshape(
        torch.arange(batch_size) * height * width, [batch_size, 1])

    # This wraps array_ops.gather. We reshape the image data such that the
    # batch, y, and x coordinates are pulled into the first dimension.
    # Then we gather. Finally, we reshape the output back. It's possible this
    # code would be made simpler by using array_ops.gather_nd.
    def gather(y_coords, x_coords, name):
        linear_coordinates = batch_offsets + y_coords * width + x_coords
        gathered_values = torch.gather(flattened_grid.t(), 1,
                                       linear_coordinates)
        return torch.reshape(gathered_values,
                             [batch_size, num_queries, channels])

    # grab the pixel values in the 4 corners around each query point
    top_left = gather(floors[0], floors[1], 'top_left')
    top_right = gather(floors[0], ceils[1], 'top_right')
    bottom_left = gather(ceils[0], floors[1], 'bottom_left')
    bottom_right = gather(ceils[0], ceils[1], 'bottom_right')

    interp_top = alphas[1] * (top_right - top_left) + top_left
    interp_bottom = alphas[1] * (bottom_right - bottom_left) + bottom_left
    interp = alphas[0] * (interp_bottom - interp_top) + interp_top

    return interp
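A scalar sanity sketch (assumed corner values) of the bilinear combination at the end of interpolate_bilinear: interpolate along x within the top and bottom rows, then along y between them.

top_left, top_right, bottom_left, bottom_right = 1.0, 2.0, 3.0, 4.0
alpha_y, alpha_x = 0.5, 0.5                    # alphas[0], alphas[1] for a single query
interp_top = alpha_x * (top_right - top_left) + top_left
interp_bottom = alpha_x * (bottom_right - bottom_left) + bottom_left
interp = alpha_y * (interp_bottom - interp_top) + interp_top
assert interp == 2.5                           # centre of the 2x2 neighbourhood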
Beispiel #52
0
    def forward(self, x, scale, gt_bboxes, gt_labels, original_size=None):
        if self.training:
            img_size = tuple(x.shape[2:])

            # Feature extractor from the base network (e.g. VGG16, ResNet-101)
            feature = self._extract_features(x)

            # Region Proposal Network
            rpn_result = self.rpn(feature, img_size, scale, gt_bboxes[0], gt_labels[0])
            roi, gt_roi_loc, gt_roi_label, rpn_loc_loss, rpn_cls_loss = rpn_result

            # RoI Pooling Layer
            roi_pool_feat = self._roi_pool(feature, roi)

            # bbox regression & classification
            roi_loc, roi_score = self._bbox_regression_and_classification(roi_pool_feat)

            # Faster R-CNN loss
            n_sample = roi_loc.shape[0]
            roi_loc = roi_loc.view(n_sample, -1, 4)
            roi_loc = roi_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]

            gt_roi_loc = at.totensor(gt_roi_loc)
            gt_roi_label = at.totensor(gt_roi_label).long()

            roi_loc_loss = _bbox_regression_loss(
                roi_loc.contiguous(),
                gt_roi_loc,
                gt_roi_label.data,
                self.roi_sigma
            )

            roi_cls_loss = F.cross_entropy(roi_score, gt_roi_label.cuda())

            # Stack losses
            losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
            losses = losses + [sum(losses)]

            return LossTuple(*losses)
        else:
            with t.no_grad():
                x = at.totensor(x).float()
                img_size = tuple(x.shape[2:])

                # Feature extractor from the base network (e.g. VGG16, ResNet)
                feature = self._extract_features(x)

                # Region Proposal Network
                roi = self.rpn(feature, img_size, scale, None, None)
                # RoI Pooling Layer
                roi_pool_feat = self._roi_pool(feature, roi)

                # bbox regression & classification
                roi_loc, roi_score = self._bbox_regression_and_classification(roi_pool_feat)

                roi_loc = roi_loc.data
                roi_score = roi_score.data
                roi = at.totensor(roi) / scale

                # Convert predictions to bounding boxes in image coordinates.
                # Bounding boxes are scaled to the scale of the input images.
                mean = t.tensor(self.loc_normalize_mean).cuda(). \
                    repeat(self.n_class)[None]
                std = t.tensor(self.loc_normalize_std).cuda(). \
                    repeat(self.n_class)[None]

                roi_loc = (roi_loc * std + mean)
                roi_loc = roi_loc.view(-1, self.n_class, 4)

                roi = roi.view(-1, 1, 4).expand_as(roi_loc)
                bbox = loc2bbox(at.tonumpy(roi).reshape(-1, 4),
                                at.tonumpy(roi_loc).reshape(-1, 4))
                bbox = at.totensor(bbox)
                bbox = bbox.view(-1, self.n_class * 4)

                # clip bbox
                bbox[:, 0::2] = bbox[:, 0::2].clamp(min=0, max=original_size[0])
                bbox[:, 1::2] = bbox[:, 1::2].clamp(min=0, max=original_size[1])

                prob = F.softmax(at.totensor(roi_score), dim=1)

                bbox, label, score = self._suppress(bbox, prob)

                return bbox, label, score
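A toy sketch (plain torch, assumed sizes) of the class-wise box selection in the training branch above: for each sampled RoI, the four regression values belonging to its ground-truth class are picked out with arange indexing.

import torch

n_sample, n_class = 5, 3
roi_loc = torch.randn(n_sample, n_class, 4)
gt_roi_label = torch.randint(0, n_class, (n_sample,))
selected = roi_loc[torch.arange(n_sample), gt_roi_label]   # (n_sample, 4)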
Beispiel #53
0
    def __call__(self, X, y):
        if not len(X.shape) == 2:
            raise ValueError("Expected X to have two dimensions but found %d." %
                             len(X.shape))
        return torch.arange(0, X.shape[1])
Beispiel #54
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produces None grad that breaks apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        all_is_impossibles = torch.tensor(
            [int(f.is_impossible) for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions, all_is_impossibles)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, _ = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model, state_dict=model_state_dict)
    model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, True,
                          args.null_score_diff_threshold)
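A minimal standalone sketch of the gradient-accumulation pattern used in the training loop above; the model, optimizer and data here are toy stand-ins, not the script's own objects.

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_steps = 4
for step in range(8):
    x, y = torch.randn(3, 4), torch.randn(3, 2)
    loss = torch.nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()                      # gradients accumulate across micro-batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()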
Beispiel #55
0
    def forward(self, x, x_mask, y, hidden):
        x_embedded = self.embed(x)
        y_embedded = self.embed(y)

        B, T = x.size()
        rev_index = torch.arange(T - 1, -1, -1).view(1, -1).expand(B, T).long()
        mask_length = torch.sum(1 - x_mask.data, 1).long().expand_as(rev_index)
        rev_index -= mask_length
        rev_index[rev_index < 0] = 0
        rev_index = Variable(rev_index)

        x_backward = Variable(x.data.new(x.data.size()).fill_(0))
        x_backward.scatter_(1, rev_index, x)
        x_backward_embedded = self.embed(x_backward)

        # encoder
        f_h = hidden[0]
        b_h = hidden[1]
        f_hiddens = []
        b_hiddens = []
        f_cells = []
        b_cells = []
        for i in range(T):
            f_h = self.fencoder(x_embedded[:, i, :], f_h)
            b_h = self.bencoder(x_backward_embedded[:, i, :], b_h)
            f_hiddens.append(f_h[0][-1].unsqueeze(
                1))  # f_h[0][-1]: hidden state of the last layer
            b_hiddens.append(b_h[0][-1].unsqueeze(1))
            f_cells.append(f_h[1][-1].unsqueeze(1))
            b_cells.append(b_h[1][-1].unsqueeze(1))

        f_hiddens = torch.cat(f_hiddens, 1)
        b_hiddens = torch.cat(b_hiddens, 1)
        f_cells = torch.cat(f_cells, 1)
        b_cells = torch.cat(b_cells, 1)
        hiddens = torch.cat([f_hiddens, b_hiddens], 2)
        cells = torch.cat([f_cells, b_cells], 2)

        # decoder
        B_y, T_y = y.size()
        h_mean = torch.mean(hiddens, 1).squeeze(1)
        c_mean = torch.mean(cells, 1).squeeze(1)
        hx, cx = [], []

        for i in range(self.num_layers):
            hx.append(h_mean.clone())
            cx.append(c_mean.clone())

        y_embedded = self.embed(y)

        context = h_mean
        out_hiddens = []

        for i in range(T_y):
            hx, cx = self.decoder(y_embedded[:, i, :], (hx, cx))

            att = self.att_layer(hx[-1].unsqueeze(1).expand_as(hiddens).contiguous()\
                , hiddens.contiguous())
            # code.interact(local=locals())
            context = (hiddens *
                       att.unsqueeze(2).expand_as(hiddens)).sum(1).squeeze(1)
            out_hiddens.append(torch.cat([hx[-1], context], 1).unsqueeze(1))

        out_hiddens = torch.cat(out_hiddens, 1)

        # code.interact(local=locals())
        # output layer
        decoded = self.linear(
            out_hiddens.view(
                out_hiddens.size(0) * out_hiddens.size(1),
                out_hiddens.size(2)))
        decoded = F.log_softmax(decoded, dim=1)
        return decoded.view(out_hiddens.size(0), out_hiddens.size(1),
                            decoded.size(1)), out_hiddens
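A self-contained variant (toy batch; gather-based rather than the scatter used above) of reversing right-padded sequences, the idea behind x_backward in the encoder.

import torch

x = torch.tensor([[1, 2, 3, 0, 0],
                  [4, 5, 6, 7, 8]])            # 0 marks padding
lengths = torch.tensor([3, 5])
B, T = x.size()
idx = torch.arange(T).view(1, -1).expand(B, T)
rev = (lengths.view(-1, 1) - 1 - idx).clamp(min=0)
rev = torch.where(idx < lengths.view(-1, 1), rev, idx)   # leave padded positions in place
x_backward = x.gather(1, rev)                  # [[3, 2, 1, 0, 0], [8, 7, 6, 5, 4]]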
Beispiel #56
0
def test_neighbor_sampler_on_cora(get_dataset):
    dataset = get_dataset(name='Cora')
    data = dataset[0]

    batch = torch.arange(10)
    loader = NeighborSampler(data.edge_index,
                             sizes=[-1, -1, -1],
                             node_idx=batch,
                             batch_size=10)

    class SAGE(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super().__init__()

            self.convs = torch.nn.ModuleList()
            self.convs.append(SAGEConv(in_channels, 16))
            self.convs.append(SAGEConv(16, 16))
            self.convs.append(SAGEConv(16, out_channels))

        def batch(self, x, adjs):
            for i, (edge_index, _, size) in enumerate(adjs):
                x_target = x[:size[1]]  # Target nodes are always placed first.
                x = self.convs[i]((x, x_target), edge_index)
            return x

        def full(self, x, edge_index):
            for conv in self.convs:
                x = conv(x, edge_index)
            return x

    model = SAGE(dataset.num_features, dataset.num_classes)

    _, n_id, adjs = next(iter(loader))
    out1 = model.batch(data.x[n_id], adjs)
    out2 = model.full(data.x, data.edge_index)[batch]
    assert torch.allclose(out1, out2, atol=1e-7)

    class GAT(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super().__init__()

            self.convs = torch.nn.ModuleList()
            self.convs.append(GATConv(in_channels, 16, heads=2))
            self.convs.append(GATConv(32, 16, heads=2))
            self.convs.append(GATConv(32, out_channels, heads=2, concat=False))

        def batch(self, x, adjs):
            for i, (edge_index, _, size) in enumerate(adjs):
                x_target = x[:size[1]]  # Target nodes are always placed first.
                x = self.convs[i]((x, x_target), edge_index)
            return x

        def full(self, x, edge_index):
            for conv in self.convs:
                x = conv(x, edge_index)
            return x

    model = GAT(dataset.num_features, dataset.num_classes)

    _, n_id, adjs = next(iter(loader))
    out1 = model.batch(data.x[n_id], adjs)
    out2 = model.full(data.x, data.edge_index)[batch]
    assert torch.allclose(out1, out2, atol=1e-7)
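
# A minimal sketch of the "target nodes are always placed first" convention relied on
# by both batch() methods above: for each hop, the first size[1] rows of the sampled
# node features are the target nodes, so they can be sliced off to form the bipartite
# (x_source, x_target) input. The numbers below are made up for illustration.
import torch

x = torch.randn(5, 16)      # features of all nodes sampled for this hop
size = (5, 3)               # (num_source_nodes, num_target_nodes) from the adj tuple
x_target = x[:size[1]]      # the first 3 rows are exactly the target nodes
assert x_target.shape == (3, 16)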
Beispiel #57
0
    def evaluate(self, data_loader, model, task=args.task):
        # reset metrics
        self._reset()

        # switch to evaluate mode
        model.eval()

        entity_ids = torch.arange(end=len(self.vocabs[ENTITY])).to(DEVICE)
        with torch.no_grad():
            for _, data in enumerate(data_loader):
                val_triples = data[TRIPLE]

                # get batch size
                batch_size = val_triples.shape[0]

                all_entities = entity_ids.repeat(batch_size, 1)

                heads = val_triples[:, 0]
                relations = val_triples[:, 1]
                tails = val_triples[:, 2]

                # expand for all entities
                expanded_heads = heads.reshape(-1, 1).repeat(1, all_entities.size()[1])
                expanded_relations = relations.reshape(-1, 1).repeat(1, all_entities.size()[1])

                expanded_triples = torch.stack(
                    (expanded_heads, expanded_relations, all_entities),
                    dim=2).reshape(-1, val_triples.shape[1])

                if args.demographic_aware:
                    expanded_demographics = data[DEMOGRAPHIC].reshape(-1, 1).repeat(
                        1, all_entities.size()[1]).reshape(-1, 1).squeeze()

                if args.prob_embedding:
                    expanded_probabilities = data[PROBABILITY].reshape(-1, 1).repeat(
                        1, all_entities.size()[1]).reshape(-1, 1).squeeze()

                # chunk data and predict results
                predicted_tails = []
                for i in range(0, len(expanded_triples), batch_size**2):
                    model_data = {
                        TRIPLE: expanded_triples[i:i + batch_size**2]
                    }

                    if args.demographic_aware:
                        model_data.update({
                            DEMOGRAPHIC:
                            expanded_demographics[i:i + batch_size**2]
                        })

                    if args.prob_embedding:
                        model_data.update({
                            PROBABILITY:
                            expanded_probabilities[i:i + batch_size**2]
                        })

                    predicted_tails.append(model.predict(model_data))

                predicted_tails = torch.cat(predicted_tails,
                                            dim=0).reshape(batch_size, -1)

                # rank results
                self._rank(predicted_tails, tails, task)

        return self._results()
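
# A minimal sketch (an assumption, since self._rank is not shown here) of how the rank
# of the gold tail can be derived from the chunked predictions collected above: count
# how many candidate entities scored at least as high as the gold tail.
import torch

def rank_true_tails(predicted_tails, true_tails):
    # predicted_tails: (batch_size, num_entities) scores; true_tails: (batch_size,) gold ids
    true_scores = predicted_tails.gather(1, true_tails.view(-1, 1))
    return (predicted_tails >= true_scores).sum(dim=1)  # rank 1 is best

# e.g. rank_true_tails(torch.tensor([[0.1, 0.9, 0.3]]), torch.tensor([1])) -> tensor([1])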
Beispiel #58
0
def heatmaps_to_keypoints(maps: torch.Tensor,
                          rois: torch.Tensor) -> torch.Tensor:
    """
    Extract predicted keypoint locations from heatmaps.

    Args:
        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for
            each ROI and each keypoint.
        rois (Tensor): (#ROIs, 4). The box of each ROI.

    Returns:
        Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
        (x, y, logit, score) for each keypoint.

    When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate,
    we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from
    Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
    """
    # The decorator use of torch.no_grad() was not supported by torchscript.
    # https://github.com/pytorch/pytorch/issues/44768
    maps = maps.detach()
    rois = rois.detach()

    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
    heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_rois, num_keypoints = maps.shape[:2]
    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)

    width_corrections = widths / widths_ceil
    height_corrections = heights / heights_ceil

    keypoints_idx = torch.arange(num_keypoints, device=maps.device)

    for i in range(num_rois):
        outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
        roi_map = F.interpolate(
            maps[[i]], size=outsize, mode="bicubic",
            align_corners=False).squeeze(0)  # #keypoints x H x W

        # softmax over the spatial region
        max_score, _ = roi_map.view(num_keypoints, -1).max(1)
        max_score = max_score.view(num_keypoints, 1, 1)
        tmp_full_resolution = (roi_map - max_score).exp_()
        tmp_pool_resolution = (maps[i] - max_score).exp_()
        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
        # so that the scores of objects of different absolute sizes will be more comparable
        roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum(
            (1, 2), keepdim=True)

        w = roi_map.shape[2]
        pos = roi_map.view(num_keypoints, -1).argmax(1)

        x_int = pos % w
        y_int = (pos - x_int) // w

        assert (roi_map_scores[keypoints_idx, y_int, x_int] ==
                roi_map_scores.view(num_keypoints, -1).max(1)[0]).all()

        x = (x_int.float() + 0.5) * width_corrections[i]
        y = (y_int.float() + 0.5) * height_corrections[i]

        xy_preds[i, :, 0] = x + offset_x[i]
        xy_preds[i, :, 1] = y + offset_y[i]
        xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int]
        xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int]

    return xy_preds
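
# A minimal sketch of the Heckbert 1990 conversion (c = d + 0.5) referenced in the
# docstring above, applied the same way as inside the ROI loop: take the argmax pixel
# of an upsampled heatmap, shift it by half a pixel, and scale/offset it into image
# coordinates. The 2x2 heatmap and unit corrections below are made-up toy values.
import torch

roi_map = torch.tensor([[[0.1, 0.2],
                         [0.9, 0.3]]])       # 1 keypoint, 2x2 upsampled heatmap
w = roi_map.shape[2]
pos = roi_map.view(1, -1).argmax(1)          # flattened argmax -> tensor([2])
x_int = pos % w                              # column index 0
y_int = (pos - x_int) // w                   # row index 1
x = (x_int.float() + 0.5) * 1.0              # +0.5: discrete pixel -> continuous coordinate
y = (y_int.float() + 0.5) * 1.0              # width/height corrections assumed to be 1.0 here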
Beispiel #59
0
b_size = b.size()
b_size

b.numel() # total number of elements in b (2*3); equivalent to b.nelement()

# create a tensor with the same shape as b
c = t.Tensor(b_size)
# create a tensor whose elements are 2 and 3
d = t.Tensor((2,3))
c, d

c.shape

t.ones(2, 3)
t.zeros(2, 3)
t.arange(1, 6, 2)
t.linspace(1, 10, 3)
t.randn(2, 3, device=t.device('cpu'))
t.randperm(5)
t.eye(2, 3, dtype=t.int)

scalar = t.tensor(3.14159)
print('scalar: %s, shape of scalar: %s' %(scalar, scalar.shape))

vector = t.tensor([1, 2])
print('vector: %s, shape of vector: %s' %(vector, vector.shape))

tensor = t.Tensor(1, 2)
tensor.shape

matrix = t.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]])
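
# A short illustration of the pitfall touched on above: t.Tensor treats integer
# arguments as a shape (contents uninitialized), while t.tensor always copies the
# data it is given and infers the dtype.
import torch as t

t.Tensor(2, 3).shape   # torch.Size([2, 3]) -- a shape, contents uninitialized
t.Tensor((2, 3))       # tensor([2., 3.])   -- a tuple is treated as data
t.tensor([2, 3])       # tensor([2, 3])     -- data, dtype inferred as int64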
Beispiel #60
0
    def _generate(
        self,
        model,
        sample,
        prefix_tokens=None,
        bos_token=None,
        **kwargs
    ):
        if not self.retain_dropout:
            model.eval()

        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in sample['net_input'].items()
            if k != 'prev_output_tokens'
        }

        src_tokens = encoder_input['src_tokens']
        src_lengths = (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)
        input_size = src_tokens.size()
        # batch dimension goes first followed by source lengths
        bsz = input_size[0]
        src_len = input_size[1]
        beam_size = self.beam_size

        if self.match_source_len:
            max_len = src_lengths.max().item()
        else:
            max_len = min(
                int(self.max_len_a * src_len + self.max_len_b),
                # exclude the EOS marker
                model.max_decoder_positions() - 1,
            )

        # compute the encoder output for each beam
        encoder_outs = model.forward_encoder(encoder_input)
        new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
        new_order = new_order.to(src_tokens.device).long()
        encoder_outs = model.reorder_encoder_out(encoder_outs, new_order)

        # initialize buffers
        scores = src_tokens.new(bsz * beam_size, max_len + 1).float().fill_(0)
        scores_buf = scores.clone()
        tokens = src_tokens.new(bsz * beam_size, max_len + 2).long().fill_(self.pad)
        tokens_buf = tokens.clone()
        tokens[:, 0] = self.eos if bos_token is None else bos_token
        attn, attn_buf = None, None

        # The blacklist indicates candidates that should be ignored.
        # For example, suppose we're sampling and have already finalized 2/5
        # samples. Then the blacklist would mark 2 positions as being ignored,
        # so that we only finalize the remaining 3 samples.
        blacklist = src_tokens.new_zeros(bsz, beam_size).eq(-1)  # forward and backward-compatible False mask

        # list of completed sentences
        finalized = [[] for i in range(bsz)]
        finished = [False for i in range(bsz)]
        num_remaining_sent = bsz

        # number of candidate hypos per step
        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS

        # offset arrays for converting between different indexing schemes
        bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens)
        cand_offsets = torch.arange(0, cand_size).type_as(tokens)

        # helper function for allocating buffers on the fly
        buffers = {}

        def buffer(name, type_of=tokens):  # noqa
            if name not in buffers:
                buffers[name] = type_of.new()
            return buffers[name]

        def is_finished(sent, step, unfin_idx):
            """
            Check whether generation has finished for a given sentence, i.e.
            whether beam_size hypotheses have already been finalized for it.
            """
            assert len(finalized[sent]) <= beam_size
            if len(finalized[sent]) == beam_size:
                return True
            return False

        def finalize_hypos(step, bbsz_idx, eos_scores):
            """
            Finalize the given hypotheses at this step, while keeping the total
            number of finalized hypotheses per sentence <= beam_size.

            Note: the input must be in the desired finalization order, so that
            hypotheses that appear earlier in the input are preferred to those
            that appear later.

            Args:
                step: current time step
                bbsz_idx: A vector of indices in the range [0, bsz*beam_size),
                    indicating which hypotheses to finalize
                eos_scores: A vector of the same size as bbsz_idx containing
                    scores for each hypothesis
            """
            assert bbsz_idx.numel() == eos_scores.numel()

            # clone relevant token and attention tensors
            tokens_clone = tokens.index_select(0, bbsz_idx)
            tokens_clone = tokens_clone[:, 1:step + 2]  # skip the first index, which is EOS
            assert not tokens_clone.eq(self.eos).any()
            tokens_clone[:, step] = self.eos
            attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None

            # compute scores per token position
            pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1]
            pos_scores[:, step] = eos_scores
            # convert from cumulative to per-position scores
            pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]

            # normalize sentence-level scores
            if self.normalize_scores:
                eos_scores /= (step + 1) ** self.len_penalty

            cum_unfin = []
            prev = 0
            for f in finished:
                if f:
                    prev += 1
                else:
                    cum_unfin.append(prev)

            sents_seen = set()
            for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), eos_scores.tolist())):
                unfin_idx = idx // beam_size
                sent = unfin_idx + cum_unfin[unfin_idx]

                sents_seen.add((sent, unfin_idx))

                if self.match_source_len and step > src_lengths[unfin_idx]:
                    score = -math.inf

                def get_hypo():

                    if attn_clone is not None:
                        # remove padding tokens from attn scores
                        hypo_attn = attn_clone[i]
                    else:
                        hypo_attn = None

                    return {
                        'tokens': tokens_clone[i],
                        'score': score,
                        'attention': hypo_attn,  # src_len x tgt_len
                        'alignment': None,
                        'positional_scores': pos_scores[i],
                    }

                if len(finalized[sent]) < beam_size:
                    finalized[sent].append(get_hypo())

            newly_finished = []
            for sent, unfin_idx in sents_seen:
                # check termination conditions for this sentence
                if not finished[sent] and is_finished(sent, step, unfin_idx):
                    finished[sent] = True
                    newly_finished.append(unfin_idx)
            return newly_finished

        reorder_state = None
        batch_idxs = None
        for step in range(max_len + 1):  # one extra step for EOS marker
            # reorder decoder internal states based on the prev choice of beams
            if reorder_state is not None:
                if batch_idxs is not None:
                    # update beam indices to take into account removed sentences
                    corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs)
                    reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size)
                model.reorder_incremental_state(reorder_state)
                encoder_outs = model.reorder_encoder_out(encoder_outs, reorder_state)

            lprobs, avg_attn_scores = model.forward_decoder(
                tokens[:, :step + 1], encoder_outs, temperature=self.temperature, **kwargs
            )

            lprobs[:, self.pad] = -math.inf  # never select pad
            lprobs[:, self.unk] -= self.unk_penalty  # apply unk penalty

            # handle min and max length constraints
            if step >= max_len:
                lprobs[:, :self.eos] = -math.inf
                lprobs[:, self.eos + 1:] = -math.inf
            elif step < self.min_len:
                lprobs[:, self.eos] = -math.inf

            # handle prefix tokens (possibly with different lengths)
            if prefix_tokens is not None and step < prefix_tokens.size(1):
                prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1)
                prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1))
                prefix_mask = prefix_toks.ne(self.pad)
                lprobs[prefix_mask] = -math.inf
                lprobs[prefix_mask] = lprobs[prefix_mask].scatter_(
                    -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs
                )
                # if prefix includes eos, then we should make sure tokens and
                # scores are the same across all beams
                eos_mask = prefix_toks.eq(self.eos)
                if eos_mask.any():
                    # validate that the first beam matches the prefix
                    first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[:, 0, 1:step + 1]
                    eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]
                    target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
                    assert (first_beam == target_prefix).all()

                    def replicate_first_beam(tensor, mask):
                        tensor = tensor.view(-1, beam_size, tensor.size(-1))
                        tensor[mask] = tensor[mask][:, :1, :]
                        return tensor.view(-1, tensor.size(-1))

                    # copy tokens, scores and lprobs from the first beam to all beams
                    tokens = replicate_first_beam(tokens, eos_mask_batch_dim)
                    scores = replicate_first_beam(scores, eos_mask_batch_dim)
                    lprobs = replicate_first_beam(lprobs, eos_mask_batch_dim)

            if self.no_repeat_ngram_size > 0:
                # for each beam and batch sentence, generate a list of previous ngrams
                gen_ngrams = [{} for bbsz_idx in range(bsz * beam_size)]
                for bbsz_idx in range(bsz * beam_size):
                    gen_tokens = tokens[bbsz_idx].tolist()
                    for ngram in zip(*[gen_tokens[i:] for i in range(self.no_repeat_ngram_size)]):
                        gen_ngrams[bbsz_idx][tuple(ngram[:-1])] = \
                                gen_ngrams[bbsz_idx].get(tuple(ngram[:-1]), []) + [ngram[-1]]

            # Record attention scores
            if avg_attn_scores is not None:
                if attn is None:
                    attn = scores.new(bsz * beam_size, src_tokens.size(1), max_len + 2)
                    attn_buf = attn.clone()
                attn[:, :, step + 1].copy_(avg_attn_scores)

            scores = scores.type_as(lprobs)
            scores_buf = scores_buf.type_as(lprobs)
            eos_bbsz_idx = buffer('eos_bbsz_idx')
            eos_scores = buffer('eos_scores', type_of=scores)

            self.search.set_src_lengths(src_lengths)

            if self.no_repeat_ngram_size > 0:
                def calculate_banned_tokens(bbsz_idx):
                    # before decoding the next token, prevent decoding of ngrams that have already appeared
                    ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist())
                    return gen_ngrams[bbsz_idx].get(ngram_index, [])

                if step + 2 - self.no_repeat_ngram_size >= 0:
                    # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
                    banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)]
                else:
                    banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)]

                for bbsz_idx in range(bsz * beam_size):
                    lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf

            cand_scores, cand_indices, cand_beams = self.search.step(
                step,
                lprobs.view(bsz, -1, self.vocab_size),
                scores.view(bsz, beam_size, -1)[:, :, :step],
            )

            # cand_bbsz_idx contains beam indices for the top candidate
            # hypotheses, with a range of values: [0, bsz*beam_size),
            # and dimensions: [bsz, cand_size]
            cand_bbsz_idx = cand_beams.add(bbsz_offsets)

            # finalize hypotheses that end in eos (except for blacklisted ones)
            eos_mask = cand_indices.eq(self.eos)
            eos_mask[:, :beam_size][blacklist] = 0

            # only consider eos when it's among the top beam_size indices
            torch.masked_select(
                cand_bbsz_idx[:, :beam_size],
                mask=eos_mask[:, :beam_size],
                out=eos_bbsz_idx,
            )

            finalized_sents = set()
            if eos_bbsz_idx.numel() > 0:
                torch.masked_select(
                    cand_scores[:, :beam_size],
                    mask=eos_mask[:, :beam_size],
                    out=eos_scores,
                )
                finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores)
                num_remaining_sent -= len(finalized_sents)

            assert num_remaining_sent >= 0
            if num_remaining_sent == 0:
                break
            assert step < max_len

            if len(finalized_sents) > 0:
                new_bsz = bsz - len(finalized_sents)

                # construct batch_idxs which holds indices of batches to keep for the next pass
                batch_mask = cand_indices.new_ones(bsz)
                batch_mask[cand_indices.new(finalized_sents)] = 0
                batch_idxs = batch_mask.nonzero().squeeze(-1)

                eos_mask = eos_mask[batch_idxs]
                cand_beams = cand_beams[batch_idxs]
                bbsz_offsets.resize_(new_bsz, 1)
                cand_bbsz_idx = cand_beams.add(bbsz_offsets)
                cand_scores = cand_scores[batch_idxs]
                cand_indices = cand_indices[batch_idxs]
                if prefix_tokens is not None:
                    prefix_tokens = prefix_tokens[batch_idxs]
                src_lengths = src_lengths[batch_idxs]
                blacklist = blacklist[batch_idxs]

                scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
                scores_buf.resize_as_(scores)
                tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
                tokens_buf.resize_as_(tokens)
                if attn is not None:
                    attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1)
                    attn_buf.resize_as_(attn)
                bsz = new_bsz
            else:
                batch_idxs = None

            # Set active_mask so that values > cand_size indicate eos or
            # blacklisted hypos and values < cand_size indicate candidate
            # active hypos. After this, the min values per row are the top
            # candidate active hypos.
            active_mask = buffer('active_mask')
            eos_mask[:, :beam_size] |= blacklist
            torch.add(
                eos_mask.type_as(cand_offsets) * cand_size,
                cand_offsets[:eos_mask.size(1)],
                out=active_mask,
            )

            # get the top beam_size active hypotheses, which are just the hypos
            # with the smallest values in active_mask
            active_hypos, new_blacklist = buffer('active_hypos'), buffer('new_blacklist')
            torch.topk(
                active_mask, k=beam_size, dim=1, largest=False,
                out=(new_blacklist, active_hypos)
            )

            # update blacklist to ignore any finalized hypos
            blacklist = new_blacklist.ge(cand_size)[:, :beam_size]
            assert (~blacklist).any(dim=1).all()

            active_bbsz_idx = buffer('active_bbsz_idx')
            torch.gather(
                cand_bbsz_idx, dim=1, index=active_hypos,
                out=active_bbsz_idx,
            )
            active_scores = torch.gather(
                cand_scores, dim=1, index=active_hypos,
                out=scores[:, step].view(bsz, beam_size),
            )

            active_bbsz_idx = active_bbsz_idx.view(-1)
            active_scores = active_scores.view(-1)

            # copy tokens and scores for active hypotheses
            torch.index_select(
                tokens[:, :step + 1], dim=0, index=active_bbsz_idx,
                out=tokens_buf[:, :step + 1],
            )
            torch.gather(
                cand_indices, dim=1, index=active_hypos,
                out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1],
            )
            if step > 0:
                torch.index_select(
                    scores[:, :step], dim=0, index=active_bbsz_idx,
                    out=scores_buf[:, :step],
                )
            torch.gather(
                cand_scores, dim=1, index=active_hypos,
                out=scores_buf.view(bsz, beam_size, -1)[:, :, step],
            )

            # copy attention for active hypotheses
            if attn is not None:
                torch.index_select(
                    attn[:, :, :step + 2], dim=0, index=active_bbsz_idx,
                    out=attn_buf[:, :, :step + 2],
                )

            # swap buffers
            tokens, tokens_buf = tokens_buf, tokens
            scores, scores_buf = scores_buf, scores
            if attn is not None:
                attn, attn_buf = attn_buf, attn

            # reorder incremental state in decoder
            reorder_state = active_bbsz_idx

        # sort by score descending
        for sent in range(len(finalized)):
            finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True)
        return finalized
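
# A minimal sketch of the offset bookkeeping used above (bbsz_offsets / cand_bbsz_idx):
# per-sentence beam indices in [0, beam_size) are turned into flattened indices into
# tensors of shape (bsz * beam_size, ...) by adding sentence_index * beam_size.
# The candidate choices below are made up for illustration.
import torch

bsz, beam_size = 2, 3
cand_size = 2 * beam_size
bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1)   # [[0], [3]]
cand_beams = torch.tensor([[0, 2, 1, 0, 2, 1],
                           [1, 0, 2, 1, 0, 2]])                  # per-sentence beam choices
cand_bbsz_idx = cand_beams + bbsz_offsets                        # sentence 1 rows shift by 3
# cand_bbsz_idx -> tensor([[0, 2, 1, 0, 2, 1],
#                          [4, 3, 5, 4, 3, 5]])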