def _get_single_item(self, index):

        start_ind, end_ind, pid, label, camid = self.seqset[index]

        imgseq = []
        flowseq = []
        for ind in range(start_ind, end_ind):
            fname = self.identities[pid][camid][ind]
            fpath_img = osp.join(self.root[0], fname)
            imgrgb = Image.open(fpath_img).convert('RGB')
            fpath_flow = osp.join(self.root[1], fname)
            flowrgb = Image.open(fpath_flow).convert('RGB')
            imgseq.append(imgrgb)
            flowseq.append(flowrgb)

        while len(imgseq) < self.seq_len:
            imgseq.append(imgrgb)
            flowseq.append(flowrgb)

        seq = [imgseq, flowseq]

        if self.transform is not None:
            seq = self.transform(seq)

        img_tensor = torch.stack(seq[0], 0)

        if len(self.root) == 2:
            flow_tensor = torch.stack(seq[1], 0)
        else:
            flow_tensor = None

        return img_tensor, flow_tensor, pid, camid
def lk_forward_backward_batch(features, locations, window, steps):
  sequence, C, H, W = list(features.size())
  seq, num_pts, _ = list(locations.size())
  assert seq == sequence, '{:} vs {:}'.format(features.size(), locations.size())

  previous_pts = [ locations[0] ]
  for iseq in range(1, sequence):
    feature_old = features.narrow(0, iseq-1, 1)
    feature_new = features.narrow(0, iseq  , 1)
    nextPts = lk_tensor_track_batch(feature_old, feature_new, previous_pts[iseq-1], window, steps, None)
    previous_pts.append(nextPts)

  fback_pts = [None] * (sequence-1) + [ previous_pts[-1] ]
  for iseq in range(sequence-2, -1, -1):
    feature_old = features.narrow(0, iseq+1, 1)
    feature_new = features.narrow(0, iseq  , 1)
    backPts = lk_tensor_track_batch(feature_old, feature_new, fback_pts[iseq+1]   , window, steps, None)
    fback_pts[iseq] = backPts

  back_pts = [None] * (sequence-1) + [ locations[-1] ]
  for iseq in range(sequence-2, -1, -1):
    feature_old = features.narrow(0, iseq+1, 1)
    feature_new = features.narrow(0, iseq  , 1)
    backPts = lk_tensor_track_batch(feature_old, feature_new, back_pts[iseq+1]    , window, steps, None)
    back_pts[iseq] = backPts

  return torch.stack(previous_pts), torch.stack(fback_pts), torch.stack(back_pts)
    def forward(self, hidden, encoder_outputs, attn_mask):

        # Create variable to store attention energies
        # hidden is 16 by 512
        # encoder_outputs is 16 by 72 by 512
        
        # this just uses the top layer of the 2-layer decoder. 
        # okay?
        hidden = hidden.squeeze(0)
        batch_size = hidden.size()[0]
        attn_energies = []
        for i in range(batch_size):
            attn_energies.append(self.score(hidden[i], encoder_outputs[i]))
        
        attn_energies = torch.stack(attn_energies).squeeze(0)
        # attn_energies is batch_size by 72
        if attn_mask is not None:
            attn_energies = attn_mask * attn_energies
            attn_energies[attn_energies == 0] = -1e10
        # i want to mask the attention energies
        if attn_mask is None:
            attn_energies = attn_energies.view(1, -1)
        attn_energies = self.softmax(attn_energies)
        
        context_vectors = []
        for i in range(batch_size):
            context_vectors.append(torch.matmul(attn_energies[i], encoder_outputs[i]))
                
        context_vectors = torch.stack(context_vectors)
        
        return context_vectors
Example #4
    def process_batch_for_length(self, sequences, c_sequences):
        """
        Assemble and pad data.
        """
        assert len(sequences) == len(c_sequences)
        lengths = Variable(self.tensor_type([len(seq) for seq in sequences]))
        max_length = max(len(seq) for seq in sequences)
        max_c_length = max(max(len(chars) for chars in seq)
                           for seq in c_sequences)

        def _padded(seq, max_length):
            _padded_seq = self.tensor_type(max_length).zero_()
            _padded_seq[:len(seq)] = self.tensor_type(seq)
            return _padded_seq
        sequences = Variable(torch.stack(
                [_padded(seq, max_length) for seq in sequences]))

        def _padded_char(seq, max_length, max_c_length):
            _padded = self.tensor_type(max_length, max_c_length).zero_()
            for ind, tok in enumerate(seq):
                _padded[ind, :len(tok)] = self.tensor_type(tok)
            return _padded

        c_sequences = Variable(torch.stack([
            _padded_char(seq, max_length, max_c_length)
            for seq in c_sequences]))

        return (sequences, c_sequences, lengths)
    def __getitem__(self, index):
        if self.mode == 'test':
            img_path, img_name = self.imgs[index]
            img = Image.open(os.path.join(img_path, img_name + '.jpg')).convert('RGB')
            if self.transform is not None:
                img = self.transform(img)
            return img_name, img

        img_path, mask_path = self.imgs[index]
        img = Image.open(img_path).convert('RGB')
        if self.mode == 'train':
            mask = sio.loadmat(mask_path)['GTcls']['Segmentation'][0][0]
            mask = Image.fromarray(mask.astype(np.uint8))
        else:
            mask = Image.open(mask_path)

        if self.joint_transform is not None:
            img, mask = self.joint_transform(img, mask)

        if self.sliding_crop is not None:
            img_slices, mask_slices, slices_info = self.sliding_crop(img, mask)
            if self.transform is not None:
                img_slices = [self.transform(e) for e in img_slices]
            if self.target_transform is not None:
                mask_slices = [self.target_transform(e) for e in mask_slices]
            img, mask = torch.stack(img_slices, 0), torch.stack(mask_slices, 0)
            return img, mask, torch.LongTensor(slices_info)
        else:
            if self.transform is not None:
                img = self.transform(img)
            if self.target_transform is not None:
                mask = self.target_transform(mask)
            return img, mask
    def _construct_previous(self, layer, direction, inputs, tree, idx):
        if direction == 'up':
            oidx = tree.children_idx(idx)
        else:
            oidx = tree.parents_idx(idx)

        if oidx:
            h_prev, c_prev = [], []

            for i in oidx:
                h_prev_i, c_prev_i = self._upward_downward(layer,
                                                           direction,
                                                           inputs,
                                                           tree, i)

                h_prev.append(h_prev_i)
                c_prev.append(c_prev_i)

            h_prev = torch.stack(h_prev, 1)
            c_prev = torch.stack(c_prev, 1)

        elif inputs.is_cuda:
            h_prev = torch.zeros(self.hidden_size, 1).cuda()
            c_prev = torch.zeros(self.hidden_size, 1).cuda()

        else:
            h_prev = torch.zeros(self.hidden_size, 1)
            c_prev = torch.zeros(self.hidden_size, 1)

        return oidx, (h_prev, c_prev)
Example #7
def singleTagLoss(pred_tag, keypoints):
    """
    associative embedding loss for one image
    """
    eps = 1e-6
    tags = []
    pull = 0
    for i in keypoints:
        tmp = []
        for j in i:
            if j[1]>0:
                tmp.append(pred_tag[j[0]])
        if len(tmp) == 0:
            continue
        tmp = torch.stack(tmp)
        tags.append(torch.mean(tmp, dim=0))
        pull = pull +  torch.mean((tmp - tags[-1].expand_as(tmp))**2)

    if len(tags) == 0:
        return make_input(torch.zeros([1]).float()), make_input(torch.zeros([1]).float())

    tags = torch.stack(tags)[:,0]

    num = tags.size()[0]
    size = (num, num, tags.size()[1])
    A = tags.unsqueeze(dim=1).expand(*size)
    B = A.permute(1, 0, 2)

    diff = A - B
    diff = torch.pow(diff, 2).sum(dim=2)[:,:,0]
    push = torch.exp(-diff)
    push = (torch.sum(push) - num)
    return push/((num - 1) * num + eps) * 0.5, pull/(num + eps)
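
# A minimal sketch of the pull/push idea above on toy tag values (illustration
# only; assumes `torch` is imported).
person_a = torch.tensor([0.9, 1.0, 1.1])   # predicted tags for person A's joints
person_b = torch.tensor([3.0, 3.1, 2.9])   # predicted tags for person B's joints
ref_a, ref_b = person_a.mean(), person_b.mean()
pull = ((person_a - ref_a) ** 2).mean() + ((person_b - ref_b) ** 2).mean()  # small: tags agree within a person
push = torch.exp(-(ref_a - ref_b) ** 2)  # near zero: the two reference tags are well separated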
Example #8
 def predict(self, x_de, x_en):
     bs = x_de.size(0)
     emb_de = self.embedding_de(x_de) # bs,n_de,word_dim
     emb_en = self.embedding_en(x_en) # bs,n_en,word_dim
     h = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
     c = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
     enc_h, _ = self.encoder(emb_de, (h, c))
     dec_h, _ = self.decoder(emb_en, (h, c))
     # all the same. enc_h is bs,n_de,hiddensz*n_directions. h and c are both n_layers*n_directions,bs,hiddensz
     if self.directions == 2:
         enc_h = self.dim_reduce(enc_h) # bs,n_de,hiddensz
     scores = torch.bmm(enc_h, dec_h.transpose(1,2))
     # (bs,n_de,hiddensz) * (bs,hiddensz,n_en) = (bs,n_de,n_en)
     y = [Variable(torch.cuda.LongTensor([sos_token]*bs))] # bs
     self.attn = []
     for t in range(x_en.size(1)-1): # iterate over english words, with teacher forcing
         attn_dist = F.softmax(scores[:,:,t],dim=1) # bs,n_de
         self.attn.append(attn_dist.data)
         if self.attn_type == "hard":
             _, argmax = attn_dist.max(1) # bs. for each batch, select most likely german word to pay attention to
             one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1).cuda())
             context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
         else:
             context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
         # the difference btwn hard and soft is just whether we use a one_hot or a distribution
         # context is bs,hiddensz
         pred = self.vocab_layer(torch.cat([dec_h[:,t,:], context], 1)) # bs,len(EN.vocab)
         _, next_token = pred.max(1) # bs
         y.append(next_token)
     self.attn = torch.stack(self.attn, 0).transpose(0, 1) # bs,n_en,n_de (for visualization!)
     y = torch.stack(y,0).transpose(0,1) # bs,n_en
     return y,self.attn
Example #9
def random_sample(batch):
    imgids, sentids, imgfeats, textfeats = batch

    ### image as anchor
    anchor_img, positive_text, negative_text = [],[],[]
    for i,iid in enumerate(imgids):
        for j,iid2 in enumerate(imgids):
            if iid!=iid2:
                anchor_img.append(imgfeats[i])
                positive_text.append(textfeats[i])
                negative_text.append(textfeats[j])
    anchor_img, positive_text, negative_text = torch.stack(anchor_img), torch.stack(positive_text), torch.stack(negative_text)

    ### text as anchor
    anchor_text, positive_img, negative_img = [],[],[]
    for i,iid in enumerate(imgids):
        for j,iid2 in enumerate(imgids):
            if iid!=iid2:
                anchor_text.append(textfeats[i])
                positive_img.append(imgfeats[i])
                negative_img.append(imgfeats[j])
    anchor_text, positive_img, negative_img = torch.stack(anchor_text), torch.stack(positive_img), torch.stack(negative_img)
    positive_text = positive_text.type(torch.FloatTensor)
    negative_text = negative_text.type(torch.FloatTensor)

    return anchor_img, positive_text, negative_text, anchor_text, positive_img, negative_img
    def __getitem__(self, index):
        img_path, mask_path = self.imgs[index]
        img, mask = Image.open(img_path).convert('RGB'), Image.open(mask_path)

        mask = np.array(mask)
        mask_copy = mask.copy()
        for k, v in self.id_to_trainid.items():
            mask_copy[mask == k] = v
        mask = Image.fromarray(mask_copy.astype(np.uint8))

        if self.joint_transform is not None:
            img, mask = self.joint_transform(img, mask)
        if self.sliding_crop is not None:
            img_slices, mask_slices, slices_info = self.sliding_crop(img, mask)
            if self.transform is not None:
                img_slices = [self.transform(e) for e in img_slices]
            if self.target_transform is not None:
                mask_slices = [self.target_transform(e) for e in mask_slices]
            img, mask = torch.stack(img_slices, 0), torch.stack(mask_slices, 0)
            return img, mask, torch.LongTensor(slices_info)
        else:
            if self.transform is not None:
                img = self.transform(img)
            if self.target_transform is not None:
                mask = self.target_transform(mask)
            return img, mask
Example #11
    def forward(self, z_seq, a_seq, term_seq):
        # x: [B,2,84,84]
        # T = x.size()[0]

        h = torch.zeros(1,self.h_size).cuda()
        z_losses = []
        term_losses = []
        for t in range(len(term_seq)-1):

            inter = self.encode_az(a_seq[t], z_seq[t])
            h = self.update_h(h, inter)
            z_pred, term_pred = self.predict_output(h, inter)

            z_loss = torch.mean((z_seq[t+1] - z_pred)**2)
            term_loss = F.binary_cross_entropy_with_logits(input=term_pred, target=term_seq[t+1])

            z_losses.append(z_loss)
            term_losses.append(term_loss)

        z_loss = torch.mean(torch.stack(z_losses))
        term_loss = torch.mean(torch.stack(term_losses)) 

        loss = z_loss + term_loss 

        return loss, z_loss, term_loss
Example #12
def default_collate(batch):
    "Puts each data field into a tensor with outer dimension batch size"
    if torch.is_tensor(batch[0]):
        out = None
        if _use_shared_memory:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum([x.numel() for x in batch])
            storage = batch[0].storage()._new_shared(numel)
            out = batch[0].new(storage)
        return torch.stack(batch, 0, out=out)
    elif type(batch[0]).__module__ == 'numpy':
        elem = batch[0]
        if type(elem).__name__ == 'ndarray':
            return torch.stack([torch.from_numpy(b) for b in batch], 0)
        if elem.shape == ():  # scalars
            py_type = float if elem.dtype.name.startswith('float') else int
            return numpy_type_map[elem.dtype.name](list(map(py_type, batch)))
    elif isinstance(batch[0], int):
        return torch.LongTensor(batch)
    elif isinstance(batch[0], float):
        return torch.DoubleTensor(batch)
    elif isinstance(batch[0], string_classes):
        return batch
    elif isinstance(batch[0], collections.Mapping):
        return {key: default_collate([d[key] for d in batch]) for key in batch[0]}
    elif isinstance(batch[0], collections.Sequence):
        transposed = zip(*batch)
        return [default_collate(samples) for samples in transposed]

    raise TypeError(("batch must contain tensors, numbers, dicts or lists; found {}"
                     .format(type(batch[0]))))
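
# A minimal sketch of what the dict branch above produces on a toy batch
# (illustration only; assumes `torch` is imported).
toy_batch = [{'img': torch.zeros(3, 4), 'label': torch.tensor(0)},
             {'img': torch.ones(3, 4), 'label': torch.tensor(1)}]
collated = {key: torch.stack([d[key] for d in toy_batch], 0) for key in toy_batch[0]}
# collated['img'].shape == torch.Size([2, 3, 4]); collated['label'] equals tensor([0, 1])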
Example #13
 def predict(self, x, attn_type = "hard"):
     #predict with greedy decoding
     emb = self.embedding(x)
     h = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
     c = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
     enc_h, _ = self.encoder(emb, (h, c))
     y = [Variable(torch.zeros(x.size(0)).long())]
     self.attn = []        
     for t in range(x.size(1)):
         emb_t = self.embedding(y[-1])
         dec_h, (h, c) = self.decoder(emb_t.unsqueeze(1), (h, c))
         scores = torch.bmm(enc_h, dec_h.transpose(1,2)).squeeze(2)
         attn_dist = F.softmax(scores, dim = 1)
         self.attn.append(attn_dist.data)
         if attn_type == "hard":
             _, argmax = attn_dist.max(1)
             one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1))
             context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)                    
         else:                
             context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
         pred = self.vocab_layer(torch.cat([dec_h.squeeze(1), context], 1))
         _, next_token = pred.max(1)
         y.append(next_token)
     self.attn = torch.stack(self.attn, 0).transpose(0, 1)
     return torch.stack(y, 0).transpose(0, 1)
Example #14
    def forward(self, input_):

        #init hidden state with xavier
        vert_state = torch.zeros(input_[0].size(1), self.vert_state_dim).cuda()
        edge_state = torch.zeros(input_[1].size(1), self.edge_state_dim).cuda()

        '''if self.gpu_mode >= 0:
            vert_state = torch.tensor(vert_state.cuda())
            edge_state = torch.tensor(edge_state.cuda())'''

        batch_size = input_[0].size(0)
        vert_input = input_[0]
        edge_input = input_[1]
        #print('vert and edge input', vert_input.size(), edge_input.size())
        vert_state_list = []
        edge_state_list = []
        #todo: can this be parallelized?
        for i in range(batch_size):
            torch.nn.init.xavier_uniform(vert_state)
            torch.nn.init.xavier_uniform(edge_state)
            vert_state = self.vert_gru(vert_input[i], vert_state)
            edge_state = self.edge_gru(edge_input[i], edge_state)

            #todo: check whether this way is correct, TF code uses a separate global var to keep hidden state
            for _ in range(self.num_steps):  # avoid shadowing the batch index i
                edge_context = self.get_edge_context(edge_state, vert_state)
                vert_context = self.get_vert_context(vert_state, edge_state)

                edge_state = self.edge_gru(edge_context, edge_state)
                vert_state = self.vert_gru(vert_context, vert_state)

            vert_state_list.append(vert_state)
            edge_state_list.append(edge_state)

        return torch.stack(vert_state_list), torch.stack(edge_state_list)
    def rollouts_batch(self, batch):
        batch_size = batch.size()[0]
        batch_rest = batch.size()[1:]
        if batch_size == 1:
            obs_batch_v = batch.expand(batch_size * self.n_actions, *batch_rest)
        else:
            obs_batch_v = batch.unsqueeze(1)
            obs_batch_v = obs_batch_v.expand(batch_size, self.n_actions, *batch_rest)
            obs_batch_v = obs_batch_v.contiguous().view(-1, *batch_rest)
        actions = np.tile(np.arange(0, self.n_actions, dtype=np.int64), batch_size)
        step_obs, step_rewards = [], []

        for step_idx in range(self.rollout_steps):
            actions_t = torch.tensor(actions).to(batch.device)
            obs_next_v, reward_v = self.net_em(obs_batch_v, actions_t)
            step_obs.append(obs_next_v.detach())
            step_rewards.append(reward_v.detach())
            # don't need actions for the last step
            if step_idx == self.rollout_steps-1:
                break
            # combine the delta from EM into new observation
            cur_plane_v = obs_batch_v[:, 1:2]
            new_plane_v = cur_plane_v + obs_next_v
            obs_batch_v = torch.cat((cur_plane_v, new_plane_v), dim=1)
            # select actions
            logits_v, _ = self.net_policy(obs_batch_v)
            probs_v = F.softmax(logits_v, dim=1)
            probs = probs_v.data.cpu().numpy()
            actions = self.action_selector(probs)
        step_obs_v = torch.stack(step_obs)
        step_rewards_v = torch.stack(step_rewards)
        flat_enc_v = self.encoder(step_obs_v, step_rewards_v)
        return flat_enc_v.view(batch_size, -1)
Example #16
 def encode(self, article, art_lens=None):
     size = (
         self._init_enc_h.size(0),
         len(art_lens) if art_lens else 1,
         self._init_enc_h.size(1)
     )
     init_enc_states = (
         self._init_enc_h.unsqueeze(1).expand(*size),
         self._init_enc_c.unsqueeze(1).expand(*size)
     )
     enc_art, final_states = lstm_encoder(
         article, self._enc_lstm, art_lens,
         init_enc_states, self._embedding
     )
     if self._enc_lstm.bidirectional:
         h, c = final_states
         final_states = (
             torch.cat(h.chunk(2, dim=0), dim=2),
             torch.cat(c.chunk(2, dim=0), dim=2)
         )
     init_h = torch.stack([self._dec_h(s)
                           for s in final_states[0]], dim=0)
     init_c = torch.stack([self._dec_c(s)
                           for s in final_states[1]], dim=0)
     init_dec_states = (init_h, init_c)
     attention = torch.matmul(enc_art, self._attn_wm).transpose(0, 1)
     init_attn_out = self._projection(torch.cat(
         [init_h[-1], sequence_mean(attention, art_lens, dim=1)], dim=1
     ))
     return attention, (init_dec_states, init_attn_out)
Example #17
    def forward(self, y_pred, y_true, eps=1e-6):
        raise NotImplementedError

        torch.nn.modules.loss._assert_no_grad(y_true)

        assert y_pred.shape[1] == 2

        same_left = torch.stack([y_true[:, 0], y_pred[:, 0]], dim=1)
        same_left, _ = torch.max(same_left, dim=1)

        same_right = torch.stack([y_true[:, 1], y_pred[:, 1]], dim=1)
        same_right, _ = torch.min(same_right, dim=1)

        same_len = same_right - same_left + 1   # (batch_size,)
        same_len = torch.stack([same_len, torch.zeros_like(same_len)], dim=1)
        same_len, _ = torch.max(same_len, dim=1)

        same_len = same_len.type(torch.float)

        pred_len = (y_pred[:, 1] - y_pred[:, 0] + 1).type(torch.float)
        true_len = (y_true[:, 1] - y_true[:, 0] + 1).type(torch.float)

        pre = same_len / (pred_len + eps)
        rec = same_len / (true_len + eps)

        f1 = 2 * pre * rec / (pre + rec + eps)

        return -torch.mean(f1)
Example #18
    def setUp(self, size=(2, 5), batch=3, dtype=torch.float64, device=None,
              seed=None, mu=None, cov=None, A=None, b=None):
        '''Test the correctness of batch implementation of mean().

        This function will stack `[1 * mu, 2 * mu, ..., batch * mu]`.
        Then, it will see whether the batch output is accurate or not.

        Args:
            size: Tuple size of matrix A.
            batch: The batch size > 0.
            dtype: data type.
            device: In which device.
            seed: Seed for the random number generator.
            mu: To test a specific mean mu.
            cov: To test a specific covariance matrix.
            A: To test a specific A matrix.
            b: To test a specific bias b.
        '''
        if seed is not None:
            torch.manual_seed(seed)
        if A is None:
            A = torch.rand(size, dtype=dtype, device=device)
        if b is None:
            b = torch.rand(size[0], dtype=dtype, device=device)
        if mu is None:
            mu = torch.rand(size[1], dtype=dtype, device=device)
        if cov is None:
            cov = rand.definite(size[1], dtype=dtype, device=device,
                                positive=True, semi=False, norm=10**2)
        self.A = A
        self.b = b
        var = torch.diag(cov)
        self.batch_mean = torch.stack([(i + 1) * mu for i in range(batch)])
        self.batch_cov = torch.stack([(i + 1) * cov for i in range(batch)])
        self.batch_var = torch.stack([(i + 1) * var for i in range(batch)])
Example #19
def plot_rec(x, netEC, netEP, netD):
    x_c = x[0]
    x_p = x[np.random.randint(1, opt.max_step)]

    h_c = netEC(x_c)
    h_p = netEP(x_p)

    # print('h_c shape: ', h_c.shape)
    # print('h p shape: ', h_p.shape)
    rec = netD([h_c, h_p])

    x_c, x_p, rec = x_c.data, x_p.data, rec.data
    fname = '%s/rec/rec_test.png' % (opt.log_dir)

    comparison = None
    for i in range(len(x_c)):
        if comparison is None:
            comparison = torch.stack([x_c[i], x_p[i], rec[i]])
        else:
            new_comparison = torch.stack([x_c[i], x_p[i], rec[i]])
            comparison = torch.cat([comparison, new_comparison])
    print('comparison: ', comparison.shape)

    # row_sz = 5
    # nplot = 20
    # for i in range(0, nplot - row_sz, row_sz):
    #     row = [[xc, xp, xr] for xc, xp, xr in zip(x_c[i:i + row_sz], x_p[i:i + row_sz], rec[i:i + row_sz])]
    #     print('row: ', row)
    #     to_plot.append(list(itertools.chain(*row)))
    # print(len(to_plot[0]))
    # utils.save_tensors_image(fname, comparison)
    if not os.path.exists(os.path.dirname(fname)):
        os.makedirs(os.path.dirname(fname))
    save_image(comparison.cpu(), fname, nrow=3)
Example #20
    def collate_fn(self, batch):
        '''Pad images and encode targets.

        Since images are of different sizes, we need to pad them to the same size.

        Args:
          batch: (list) of images, cls_targets, loc_targets.

        Returns:
          padded images, stacked cls_targets, stacked loc_targets.
        '''
        imgs = [x[0] for x in batch]
        boxes = [x[1] for x in batch]
        labels = [x[2] for x in batch]

        h = w = self.input_size
        num_imgs = len(imgs)
        inputs = torch.zeros(num_imgs, 3, h, w)

        loc_targets = []
        cls_targets = []
        for i in range(num_imgs):
            inputs[i] = imgs[i]
            loc_target, cls_target = self.encoder.encode(boxes[i], labels[i], input_size=(w,h))
            loc_targets.append(loc_target)
            cls_targets.append(cls_target)
        return inputs, torch.stack(loc_targets), torch.stack(cls_targets)
 def adpW(self,x):
     '''
        calculate the pairwise_att of every pair of inputs
        output_size: (x.size(0),x.size(1)/2)
     '''
     x = x.detach()
     x = self.adp_metric_embedding1(x)
     x = self.adp_metric_embedding1_bn(x)
     x = F.relu(x)
     x = self.adp_metric_embedding2(x)
     x = self.adp_metric_embedding2_bn(x)
     x = F.relu(x)
     x = self.adp_metric_embedding3(x)
     x = self.adp_metric_embedding3_bn(x)
     x = F.relu(x)
     pairwise_att = F.sigmoid(self.adp_metric_embedding4(x))
     # x = self.adp_metric_embedding2_bn(x)
     diag_matrix1 = []
     diag_matrix2 = []
     for i in range(x.size(0)):
         diag_matrix1.append(torch.diag(pairwise_att[i, :x.size(1) // 2]))
     for i in range(x.size(0)):
         diag_matrix2.append(torch.diag(pairwise_att[i, x.size(1) // 2:]))
     pairwise_att1 = torch.stack(diag_matrix1)
     pairwise_att2 = torch.stack(diag_matrix2)
     return pairwise_att1, pairwise_att2
    def forward(self, inputs=None, encoder_hidden=None, encoder_outputs=None,
                pg_encoder_states=None, function=F.log_softmax, teacher_forcing_ratio=0, context_embedding=None):
        ret_dict = dict()
        ret_dict[DecoderRNNFB.KEY_ATTN_SCORE] = list()

        inputs, batch_size, max_length = self._validate_args(inputs,
                                    encoder_hidden, encoder_outputs,
                                    function, teacher_forcing_ratio)

        decoder_hidden = self._init_state(encoder_hidden)

        use_teacher_forcing = True if teacher_forcing_ratio == 1 else False

        if use_teacher_forcing:
            decoder_input = inputs[:, :-1]
            decoder_outputs, decoder_output_states, decoder_hidden, attn = \
                self.forward_step(decoder_input, pg_encoder_states,
                                decoder_hidden, encoder_outputs, context_embedding)
        else:
            decoder_outputs = []
            decoder_output_states = []
            sequence_symbols = []
            lengths = np.array([max_length] * batch_size)

            def decode(step, step_output, step_output_state=None, step_attn=None):
                if step_output_state is not None:
                    decoder_outputs.append(step_output)
                    decoder_output_states.append(step_output_state)
                ret_dict[DecoderRNNFB.KEY_ATTN_SCORE].append(step_attn)
                symbols = step_output.topk(1)[1]
                sequence_symbols.append(symbols)

                eos_batches = symbols.data.eq(self.eos_id)
                if eos_batches.dim() > 0:
                    eos_batches = eos_batches.cpu().view(-1).numpy()
                    update_idx = ((lengths > step) & eos_batches) != 0
                    lengths[update_idx] = len(sequence_symbols)
                return symbols

            decoder_input = inputs[:, 0].unsqueeze(1)
            for di in range(max_length):

                decoder_output, decoder_output_state, decoder_hidden, step_attn = \
                    self.forward_step(decoder_input, pg_encoder_states, decoder_hidden,
                                      encoder_outputs, context_embedding)
                # do not allow the decoder to output UNK
                decoder_output[:, :, 3] = -float('inf')

                step_output = decoder_output.squeeze(1)
                step_output_state = decoder_output_state.squeeze(1)
                symbols = decode(di, step_output, step_output_state, step_attn)
                decoder_input = symbols

            decoder_outputs = torch.stack(decoder_outputs, dim=1)
            decoder_output_states = torch.stack(decoder_output_states, dim=1)
            ret_dict[DecoderRNNFB.KEY_SEQUENCE] = sequence_symbols
            ret_dict[DecoderRNNFB.KEY_LENGTH] = lengths.tolist()

        return decoder_outputs, decoder_output_states, ret_dict
 def backward(ctx, grad_output):
     input,         = ctx.saved_tensors
     grad_input     = torch.stack((grad_output, torch.zeros_like(grad_output)), dim=len(grad_output.shape))
     phase_input    = angle(input)
     phase_input    = torch.stack((torch.cos(phase_input), torch.sin(phase_input)), dim=len(grad_output.shape))
     grad_input     = multiply_complex(phase_input, grad_input)
     
     return 0.5*grad_input
Example #24
    def collate_fn(self, data):
        x, y, lens = zip(*data)
        max_len = max(lens)
        x = torch.stack(x)[:, :max_len]
        y = torch.stack(y)[:, :max_len]
        lens = torch.tensor(lens)

        return x, y, lens
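
# A minimal sketch of the trim-to-batch-max idea above (illustration only;
# assumes `torch` is imported and sequences were pre-padded to a common maximum length).
xs = [torch.tensor([1, 2, 3, 0, 0]), torch.tensor([4, 5, 0, 0, 0])]
lens = [3, 2]
trimmed = torch.stack(xs)[:, :max(lens)]  # shape (2, 3): trailing all-pad columns are dropped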
Example #25
def post_process_latents(latents):
    z_where, z_pres = latents
    z_where = [z.cpu() for z in z_where]
    z_pres = [z.cpu() for z in z_pres]
    z_where_t = torch.stack(z_where).transpose(0, 1)
    z_pres_t = torch.stack(z_pres).transpose(0, 1)
    out = []
    for z_where_i, z_pres_i in zip(z_where_t, z_pres_t):
        out.append([z_obj._make(torch.cat([zw.data, zp.data])) for zw, zp in zip(z_where_i, z_pres_i)])
    return out
Example #26
def predict_batchwise(model, dataloader):
    with torch.no_grad():
        X, Y = zip(*[
            [x, y] for X, Y in dataloader
                for x, y in zip(
                    model(X.cuda()).cpu(), 
                    Y
                )
        ])
    return torch.stack(X), torch.stack(Y)
Example #27
    def collate_fn(self, data):
        x, y, lens = zip(
            *sorted(data, key=lambda x: x[-1], reverse=True)
        )
        max_len = lens[0]
        x = torch.stack(x)[:, :max_len]
        y = torch.stack(y)[:, :max_len]
        lens = torch.tensor(lens)

        return x, y, lens
Example #28
    def update(self, done):
        # print(self.action_indices)
        if done:
            # without bootstrap
            R_rev = [Variable(T.from_numpy(np.zeros(1, dtype=np.float32)))]
            A_rev = [Variable(T.from_numpy(np.zeros(1, dtype=np.float32)))]
            self.V_preds.append(0)
        else:
            # with bootstrap
            R_rev = [self.V_preds[-1]]
            A_rev = [Variable(T.from_numpy(np.zeros(1, dtype=np.float32)))]
            self.R_preds = self.R_preds[:-1]  # delete bootstrap element
            self.rewards = self.rewards[:-1]
        
        # accumulated rewards
        r_rev = self.rewards[::-1]
        for r in r_rev:
            R_rev.append(r + GAMMA*R_rev[-1])
        R = T.stack(R_rev[1:][::-1])   # (TRAIN_INTERVAL, 1)

        # advantages
        N = len(r_rev)
        assert len(self.V_preds) == N+1
        for i in range(N):
            delta = r_rev[i] + GAMMA*self.V_preds[N-i] - self.V_preds[N-i-1]
            A_rev.append(delta + GAMMA*LAMBDA*A_rev[-1])
        A = T.stack(A_rev[1:][::-1])

        # MBP loss
        V_preds = T.stack(self.V_preds[:-1])
        R_preds = T.stack(self.R_preds)
        #assert len(R) == len(R_preds) == len(V_preds) == len(A)
        R_loss = (T.sum((V_preds - R)*(V_preds - R)) + T.sum((R_preds - R)*(R_preds - R))) / 2.
        self.mbp_loss = self.mbp_loss + (ALPHA_RETURN * R_loss)
        self.mbp_loss = self.mbp_loss * ETA_MBP

        # Policy gradient
        A_ = 0
        H = 0
        self.actions = self.actions[1:]  # delete initial action
        for i in range(N):
            log_pi = self.log_pies[i]
            # log_pi*T.from_numpy(np.array(self.actions[i]==1).astype("float32"))
            # A_ += A[i] * log_pi[self.actions[i]==1]
            _t = T.sum(log_pi*T.from_numpy(np.array(self.actions[i]==1).astype("float32"))).view(1)
            A_ = A_ + (A[i] * _t)
            H = H - T.matmul(T.exp(log_pi), log_pi)
        self.policy_loss = self.policy_loss + (A_[0] + ALPHA_ENTROPY*H )    # gradient ascend
        self.policy_loss = self.policy_loss * ETA_POLICY

        # update
        self.mbp_loss_log.append(self.mbp_loss.data)
        self.policy_loss_log.append(self.policy_loss.data)
        print("(mbp loss, policy loss): ", self.mbp_loss, self.policy_loss)
        return self.mbp_loss + self.policy_loss
Example #29
def tagLoss(tags, keypoints):
    """
    accumulate the tag loss for each image in the batch
    """
    pushes, pulls = [], []
    keypoints = keypoints.cpu().data.numpy()
    for i in range(tags.size()[0]):
        push, pull = singleTagLoss(tags[i], keypoints[i%len(keypoints)])
        pushes.append(push)
        pulls.append(pull)
    return torch.stack(pushes), torch.stack(pulls)
Example #30
    def custom_collate_fn(batch):
        batch = zip(*batch)  # transpose

        image, label, attributes, \
        num_nonzero_attributes = batch

        image = torch.stack(image)
        label = torch.LongTensor(label)
        attributes = torch.stack([torch.LongTensor(a) for a in attributes])
        padding_idx = torch.LongTensor(num_nonzero_attributes)

        return image, label, attributes, padding_idx
Example #31
    def forward(self, inputs):
        x = inputs

        # input shape: b,c,h,2w
        batch_size, c, h, w = x.size(0), x.size(1), x.size(2), x.size(3) // 2
        block_size = h // self.scale

        value = self.f_value(x)
        query = self.f_query(x)
        key = self.f_key(x)

        value = torch.stack([value[:, :, :, :w], value[:, :, :, w:]],
                            4)  # B*N*H*W*2
        query = torch.stack([query[:, :, :, :w], query[:, :, :, w:]],
                            4)  # B*N*H*W*2
        key = torch.stack([key[:, :, :, :w], key[:, :, :, w:]], 4)  # B*N*H*W*2

        v_list = torch.split(value, block_size, dim=2)
        v_locals = torch.cat(v_list, 0)
        v_list = torch.split(v_locals, block_size, dim=3)
        v_locals = torch.cat(v_list)

        q_list = torch.split(query, block_size, dim=2)
        q_locals = torch.cat(q_list, 0)
        q_list = torch.split(q_locals, block_size, dim=3)
        q_locals = torch.cat(q_list)

        k_list = torch.split(key, block_size, dim=2)
        k_locals = torch.cat(k_list, 0)
        k_list = torch.split(k_locals, block_size, dim=3)
        k_locals = torch.cat(k_list)

        #  self-attention func
        def func(value_local, query_local, key_local):
            batch_size_new = value_local.size(0)
            h_local, w_local = value_local.size(2), value_local.size(3)
            value_local = value_local.contiguous().view(
                batch_size_new, self.in_dim, -1)

            query_local = query_local.contiguous().view(
                batch_size_new, self.in_dim, -1)
            query_local = query_local.permute(0, 2, 1)
            key_local = key_local.contiguous().view(batch_size_new,
                                                    self.in_dim, -1)

            sim_map = torch.bmm(query_local, key_local)
            sim_map = self.softmax(sim_map)

            context_local = torch.bmm(value_local, sim_map.permute(0, 2, 1))
            context_local = context_local.view(batch_size_new, self.in_dim,
                                               h_local, w_local, 2)
            return context_local

        context_locals = func(v_locals, q_locals, k_locals)

        b, c, h, w, _ = context_locals.shape

        context_list = torch.split(context_locals, b // self.scale, 0)
        context = torch.cat(context_list, dim=3)
        context_list = torch.split(context, b // self.scale // self.scale, 0)
        context = torch.cat(context_list, dim=2)

        context = torch.cat([context[:, :, :, :, 0], context[:, :, :, :, 1]],
                            3)

        return context + x
def main():
    ################ load ###################
    actor_path = os.path.abspath(
        os.curdir) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD3_actor.pkl'
    critic_path = os.path.abspath(
        os.curdir
    ) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD3_critic.pkl'
    if os.path.exists(actor_path):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    if os.path.exists(critic_path):
        critic = Critic(state_size, action_size).to(device)
        critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    else:
        critic = Critic(state_size, action_size).to(device)
    critic_next = Critic(state_size, action_size).to(device)
    critic_next.load_state_dict(critic.state_dict())
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()

    episode = 2548  #540
    training_stage = 80  #100#80
    Decay = training_stage * 18

    lr = 0.0001
    sample_lr = [
        0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003,
        0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005,
        0.000004, 0.000003, 0.000002, 0.000001
    ]  #900 960 1020 1080 1140
    if episode >= training_stage:  #50 100
        try:
            lr = sample_lr[int(episode // training_stage)]
        except (IndexError):
            lr = 0.000001 * (0.9**((episode - Decay // training_stage))
                             )  #100-1800#80-1440#65-1170#570 -- 30

    optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))
    optimizerC = optim.Adam(critic.parameters(), lr,
                            betas=(0.95, 0.999))  #,weight_decay=0.0001

    test = "GAMA"
    state, reward, done, time_pass, over, _ = GAMA_connect(test)  #connect
    print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1

        # the first time
        else:
            print('Iteration:', episode, "lr:", lr)
            state = np.reshape(state, (1, len(state)))  #xxx
            state_img = generate_img()
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(device) / 255
            state = torch.DoubleTensor(state).reshape(1, state_size).to(device)

            for _ in range(Memory_size):
                memory.states.append(state)
                memory.states_img.append(tensor_cv)
            state = torch.stack(memory.states).to(device).detach()  ###
            tensor_cv = torch.stack(memory.states_img).to(device).detach()
            value, h_state_cv_c, h_state_n_c, h_state_3_c = critic(
                state,
                tensor_cv)  #dist,  # now is a tensoraction = dist.sample()
            action, log_prob, entropy = actor(
                state, tensor_cv)  #, h_state_cv_a,h_state_n_a,h_state_3_a
            print("acceleration: ", action.cpu().numpy())
            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
            log_prob = log_prob.unsqueeze(0)
            #entropy += entropy

        state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect(
            test)
    return None
# also use numpy-style advanced indexing
x = torch.arange(9).view(3, 3)
indices = torch.LongTensor([0, 2])

print(x[indices])
print("-" * 20)
print(x[indices, :])
print("-" * 20)
print(x[:, indices])
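
# For reference, with x = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] the indexing above selects:
assert torch.equal(x[indices], torch.tensor([[0, 1, 2], [6, 7, 8]]))       # rows 0 and 2
assert torch.equal(x[:, indices], torch.tensor([[0, 2], [3, 5], [6, 8]]))  # columns 0 and 2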

# We can combine tensors by concatenating them. First, concatenating on the rows
x = torch.arange(6).view(2, 3)
describe(x)
describe(torch.cat([x, x], dim=0))
describe(torch.cat([x, x], dim=1))
describe(torch.stack([x, x]))

# We can also concatenate along dim=1 (the column direction)
x = torch.arange(9).view(3, 3)
print(x)
print("-" * 20)
new_x = torch.cat([x, x, x], dim=1)
print(new_x.shape)
print(new_x)

# We can also concatenate on a new 0th dimension to "stack" the tensors
x = torch.arange(9).view(3, 3)
print(x)
print("-" * 20)
new_x = torch.stack([x, x, x])
print(new_x.shape)
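
# For comparison: with x of shape (3, 3), cat keeps the number of dimensions
# while stack adds a new leading one.
assert torch.cat([x, x, x], dim=0).shape == (9, 3)
assert torch.stack([x, x, x]).shape == (3, 3, 3)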
def dnee_ee_features(rels,
                     model,
                     config,
                     pred2idx,
                     argw2idx,
                     max_event_len,
                     rel2idx,
                     device=None):
    x1_idx = 0
    x2_idx = 0
    gold2e1xs = {}
    gold2e2xs = {}
    x1s, x2s = [], []
    arg_lens = [config['arg0_max_len'], config['arg1_max_len']]
    for i_rel, rel in enumerate(rels):
        s = rel['Sense'][0]
        if len(rel['Arg1']['Events']) == 0:
            continue

        e1s = unique_event_dict(rel['Arg1']['Events'], pred2idx).values()
        for e1 in e1s:
            e1r = get_raw_event_repr(e1, config, pred2idx, argw2idx, device)
            x1s.append(e1r)
            if i_rel in gold2e1xs:
                gold2e1xs[i_rel].append(x1_idx)
            else:
                gold2e1xs[i_rel] = [x1_idx]
            x1_idx += 1

        e2s = unique_event_dict(rel['Arg2']['Events'], pred2idx).values()
        for e2 in e2s:
            e2r = get_raw_event_repr(e2, config, pred2idx, argw2idx, device)
            x2s.append(e2r)
            if i_rel in gold2e2xs:
                gold2e2xs[i_rel].append(x2_idx)
            else:
                gold2e2xs[i_rel] = [x2_idx]
            x2_idx += 1

    x1s = torch.stack(x1s, dim=0).squeeze()
    x2s = torch.stack(x2s, dim=0).squeeze()
    if device:
        x1s = x1s.to(device)
        x2s = x2s.to(device)
    with torch.no_grad():
        x1ee = model.embed_event(x1s)
        x2ee = model.embed_event(x2s)

    x1_out = torch.zeros((len(rels), max_event_len, x1ee.shape[1]),
                         dtype=torch.float32)
    x2_out = torch.zeros((len(rels), max_event_len, x2ee.shape[1]),
                         dtype=torch.float32)
    y = torch.LongTensor(len(rels))
    if device:
        x1_out = x1_out.to(device)
        x2_out = x2_out.to(device)
        y = y.to(device)
    for i_rel, rel in enumerate(rels):
        s = rel['Sense'][0]
        y[i_rel] = rel2idx[s]

        # combine scores for multiple event pairs
        if i_rel in gold2e1xs:
            idxs = gold2e1xs[i_rel]
            fs = x1ee[idxs, :]
            if fs.shape[0] > max_event_len:
                fs = fs[:max_event_len, :]
            x1_out[i_rel, :fs.shape[0]] = fs

        if i_rel in gold2e2xs:
            idxs = gold2e2xs[i_rel]
            fs = x2ee[idxs, :]
            if fs.shape[0] > max_event_len:
                fs = fs[:max_event_len, :]
            x2_out[i_rel, :fs.shape[0]] = fs
    return x1_out, x2_out, y
Example #35
def spherical_to_cartesian(rtp):
    x = rtp[:, 0] * torch.sin(rtp[:, 1]) * torch.cos(rtp[:, 2])
    y = rtp[:, 0] * torch.sin(rtp[:, 1]) * torch.sin(rtp[:, 2])
    z = rtp[:, 0] * torch.cos(rtp[:, 1])
    return torch.stack((x, y, z), 1)
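
# A minimal usage sketch (illustration only; assumes `torch` and `math` are
# imported and that the columns of rtp are (r, theta, phi) as in the formulas above).
rtp = torch.tensor([[1.0, 0.0, 0.0],            # r=1, theta=0            -> (0, 0, 1)
                    [1.0, math.pi / 2, 0.0]])   # r=1, theta=pi/2, phi=0  -> (1, 0, 0)
xyz = spherical_to_cartesian(rtp)               # xyz is approximately [[0, 0, 1], [1, 0, 0]]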
Example #36
    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad:
            batch of padded character id sequence tensor (B, num_spkrs, Lmax)
        :return: ctc loss value
        :rtype: torch.Tensor
        :return: attention loss value
        :rtype: torch.Tensor
        :return: accuracy in attention decoder
        :rtype: float
        """
        # 0. Frontend
        if self.frontend is not None:
            hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
            if isinstance(hs_pad, list):
                hlens_n = [None] * self.num_spkrs
                for i in range(self.num_spkrs):
                    hs_pad[i], hlens_n[i] = self.feature_transform(
                        hs_pad[i], hlens)
                hlens = hlens_n
            else:
                hs_pad, hlens = self.feature_transform(hs_pad, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. Encoder
        if not isinstance(
                hs_pad, list
        ):  # single-channel input xs_pad (single- or multi-speaker)
            hs_pad, hlens, _ = self.enc(hs_pad, hlens)
        else:  # multi-channel multi-speaker input xs_pad
            for i in range(self.num_spkrs):
                hs_pad[i], hlens[i], _ = self.enc(hs_pad[i], hlens[i])

        # 2. CTC loss
        if self.mtlalpha == 0:
            loss_ctc, min_perm = None, None
        else:
            if not isinstance(hs_pad, list):  # single-speaker input xs_pad
                loss_ctc = torch.mean(self.ctc(hs_pad, hlens, ys_pad))
            else:  # multi-speaker input xs_pad
                ys_pad = ys_pad.transpose(0, 1)  # (num_spkrs, B, Lmax)
                loss_ctc_perm = torch.stack(
                    [
                        self.ctc(
                            hs_pad[i // self.num_spkrs],
                            hlens[i // self.num_spkrs],
                            ys_pad[i % self.num_spkrs],
                        ) for i in range(self.num_spkrs**2)
                    ],
                    dim=1,
                )  # (B, num_spkrs^2)
                loss_ctc, min_perm = self.pit.pit_process(loss_ctc_perm)
                logging.info("ctc loss:" + str(float(loss_ctc)))

        # 3. attention loss
        if self.mtlalpha == 1:
            loss_att = None
            acc = None
        else:
            if not isinstance(hs_pad, list):  # single-speaker input xs_pad
                loss_att, acc, _ = self.dec(hs_pad, hlens, ys_pad)
            else:
                for i in range(ys_pad.size(1)):  # B
                    ys_pad[:, i] = ys_pad[min_perm[i], i]
                rslt = [
                    self.dec(hs_pad[i], hlens[i], ys_pad[i], strm_idx=i)
                    for i in range(self.num_spkrs)
                ]
                loss_att = sum([r[0] for r in rslt]) / float(len(rslt))
                acc = sum([r[1] for r in rslt]) / float(len(rslt))
        self.acc = acc

        # 5. compute cer/wer
        if (self.training or not (self.report_cer or self.report_wer)
                or not isinstance(hs_pad, list)):
            cer, wer = 0.0, 0.0
            # oracle_cer, oracle_wer = 0.0, 0.0
        else:
            if self.recog_args.ctc_weight > 0.0:
                lpz = [
                    self.ctc.log_softmax(hs_pad[i]).data
                    for i in range(self.num_spkrs)
                ]
            else:
                lpz = None

            word_eds, char_eds, word_ref_lens, char_ref_lens = [], [], [], []
            nbest_hyps = [
                self.dec.recognize_beam_batch(
                    hs_pad[i],
                    torch.tensor(hlens[i]),
                    lpz[i],
                    self.recog_args,
                    self.char_list,
                    self.rnnlm,
                    strm_idx=i,
                ) for i in range(self.num_spkrs)
            ]
            # remove <sos> and <eos>
            y_hats = [[
                nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps[i]
            ] for i in range(self.num_spkrs)]
            for i in range(len(y_hats[0])):
                hyp_words = []
                hyp_chars = []
                ref_words = []
                ref_chars = []
                for ns in range(self.num_spkrs):
                    y_hat = y_hats[ns][i]
                    y_true = ys_pad[ns][i]

                    seq_hat = [
                        self.char_list[int(idx)] for idx in y_hat
                        if int(idx) != -1
                    ]
                    seq_true = [
                        self.char_list[int(idx)] for idx in y_true
                        if int(idx) != -1
                    ]
                    seq_hat_text = "".join(seq_hat).replace(
                        self.recog_args.space, " ")
                    seq_hat_text = seq_hat_text.replace(
                        self.recog_args.blank, "")
                    seq_true_text = "".join(seq_true).replace(
                        self.recog_args.space, " ")

                    hyp_words.append(seq_hat_text.split())
                    ref_words.append(seq_true_text.split())
                    hyp_chars.append(seq_hat_text.replace(" ", ""))
                    ref_chars.append(seq_true_text.replace(" ", ""))

                tmp_word_ed = [
                    editdistance.eval(hyp_words[ns // self.num_spkrs],
                                      ref_words[ns % self.num_spkrs])
                    for ns in range(self.num_spkrs**2)
                ]  # h1r1,h1r2,h2r1,h2r2
                tmp_char_ed = [
                    editdistance.eval(hyp_chars[ns // self.num_spkrs],
                                      ref_chars[ns % self.num_spkrs])
                    for ns in range(self.num_spkrs**2)
                ]  # h1r1,h1r2,h2r1,h2r2

                word_eds.append(
                    self.pit.min_pit_sample(torch.tensor(tmp_word_ed))[0])
                word_ref_lens.append(len(sum(ref_words, [])))
                char_eds.append(
                    self.pit.min_pit_sample(torch.tensor(tmp_char_ed))[0])
                char_ref_lens.append(len("".join(ref_chars)))

            wer = (0.0 if not self.report_wer else float(sum(word_eds)) /
                   sum(word_ref_lens))
            cer = (0.0 if not self.report_cer else float(sum(char_eds)) /
                   sum(char_ref_lens))

        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = None
        elif alpha == 1:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = float(loss_ctc)
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = float(loss_att)
            loss_ctc_data = float(loss_ctc)

        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(loss_ctc_data, loss_att_data, acc, cer, wer,
                                 loss_data)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss
Example #37
def run_mdnet(**opts):
    img_list = opts['img_list']
    gt = opts['gt']
    # init bounding box
    target_bb = np.array(opts['init_bb'])
    # a bounding box per image
    result = np.zeros((len(img_list), 4))
    result_bb = np.zeros((len(img_list), 4))
    # first image
    result[0] = np.copy(target_bb)
    result_bb[0] = np.copy(target_bb)

    iou_result = np.zeros((len(img_list), 1))

    # init model
    model = MDNet(opts['model_path'])
    model_g = NetG()
    if opts['adaptive_align']:
        align_h = model.roi_align_model.aligned_height
        align_w = model.roi_align_model.aligned_width
        spatial_s = model.roi_align_model.spatial_scale
        model.roi_align_model = RoIAlignAdaMax(align_h, align_w, spatial_s)
    if opts['use_gpu']:
        model = model.cuda()
        model_g = model_g.cuda()

    model.set_learnable_params(opts['ft_layers'])
    model_g.set_learnable_params(opts['ft_layers'])

    # init image crop model
    img_crop_model = ImgCropper(1.)
    if opts['use_gpu']:
        img_crop_model.gpu_enable()

    # init criterion and optimizer
    criterion = BinaryLoss()
    #criterion_g = nn.MSELoss(reduction='sum')
    criterion_g = nn.MSELoss(reduction='mean')
    init_optimizer = set_optimizer(model, opts['lr_init'], lr_mult=opts['lr_mult'], momentum=opts['momentum'],
                                   w_decay=opts['w_decay'])
    update_optimizer = set_optimizer(model, opts['lr_update'], lr_mult=opts['lr_mult'], momentum=opts['momentum'],
                                     w_decay=opts['w_decay'])

    tic = time.time()
    # Load first image
    cur_image = Image.open(img_list[0]).convert('RGB')
    cur_image = np.asarray(cur_image)

    # Draw pos/neg samples
    img_shape = cur_image.shape
    pos_examples = gen_samples(SampleGenerator('gaussian', (img_shape[1], img_shape[0]), 0.1, 1.2),
                               target_bb, opts['n_pos_init'], opts['overlap_pos_init'])
    neg_examples = gen_samples(SampleGenerator('uniform', (img_shape[1], img_shape[0]), 1, 2, 1.1),
                               target_bb, opts['n_neg_init'], opts['overlap_neg_init'])
    neg_examples = np.random.permutation(neg_examples)

    cur_bbreg_examples = gen_samples(SampleGenerator('uniform', (img_shape[1], img_shape[0]), 0.3, 1.5, 1.1),
                                     target_bb, opts['n_bbreg'], opts['overlap_bbreg'], opts['scale_bbreg'])

    # compute padded sample
    padded_x1 = (neg_examples[:, 0] - neg_examples[:, 2] * (opts['padding'] - 1.) / 2.).min()
    padded_y1 = (neg_examples[:, 1] - neg_examples[:, 3] * (opts['padding'] - 1.) / 2.).min()
    padded_x2 = (neg_examples[:, 0] + neg_examples[:, 2] * (opts['padding'] + 1.) / 2.).max()
    padded_y2 = (neg_examples[:, 1] + neg_examples[:, 3] * (opts['padding'] + 1.) / 2.).max()
    padded_scene_box = np.reshape(np.asarray((padded_x1, padded_y1, padded_x2 - padded_x1, padded_y2 - padded_y1)),
                                  (1, 4))

    scene_boxes = np.reshape(np.copy(padded_scene_box), (1, 4))
    if opts['jitter']:
        # horizontal shift
        jittered_scene_box_horizon = np.copy(padded_scene_box)
        jittered_scene_box_horizon[0, 0] -= 4.
        jitter_scale_horizon = 1.

        # vertical shift
        jittered_scene_box_vertical = np.copy(padded_scene_box)
        jittered_scene_box_vertical[0, 1] -= 4.
        jitter_scale_vertical = 1.

        jittered_scene_box_reduce1 = np.copy(padded_scene_box)
        jitter_scale_reduce1 = 1.1 ** (-1)

        # scale enlarge
        jittered_scene_box_enlarge1 = np.copy(padded_scene_box)
        jitter_scale_enlarge1 = 1.1 ** (1)

        # scale reduction
        jittered_scene_box_reduce2 = np.copy(padded_scene_box)
        jitter_scale_reduce2 = 1.1 ** (-2)

        # scale enlarge
        jittered_scene_box_enlarge2 = np.copy(padded_scene_box)
        jitter_scale_enlarge2 = 1.1 ** (2)

        scene_boxes = np.concatenate(
            [scene_boxes, jittered_scene_box_horizon, jittered_scene_box_vertical, jittered_scene_box_reduce1,
             jittered_scene_box_enlarge1, jittered_scene_box_reduce2, jittered_scene_box_enlarge2], axis=0)
        jitter_scale = [1., jitter_scale_horizon, jitter_scale_vertical, jitter_scale_reduce1, jitter_scale_enlarge1,
                        jitter_scale_reduce2, jitter_scale_enlarge2]
    else:
        jitter_scale = [1.]

    model.eval()
    for bidx in range(0, scene_boxes.shape[0]):
        crop_img_size = (scene_boxes[bidx, 2:4] * ((opts['img_size'], opts['img_size']) / target_bb[2:4])).astype(
            'int64') * jitter_scale[bidx]
        cropped_image, cur_image_var = img_crop_model.crop_image(cur_image, np.reshape(scene_boxes[bidx], (1, 4)),
                                                                 crop_img_size)
        cropped_image = cropped_image - 128.

        feat_map = model(cropped_image, out_layer='conv3')

        rel_target_bbox = np.copy(target_bb)
        rel_target_bbox[0:2] -= scene_boxes[bidx, 0:2]

        batch_num = np.zeros((pos_examples.shape[0], 1))
        cur_pos_rois = np.copy(pos_examples)
        cur_pos_rois[:, 0:2] -= np.repeat(np.reshape(scene_boxes[bidx, 0:2], (1, 2)), cur_pos_rois.shape[0], axis=0)
        scaled_obj_size = float(opts['img_size']) * jitter_scale[bidx]
        cur_pos_rois = samples2maskroi(cur_pos_rois, model.receptive_field, (scaled_obj_size, scaled_obj_size),
                                       target_bb[2:4], opts['padding'])
        cur_pos_rois = np.concatenate((batch_num, cur_pos_rois), axis=1)
        cur_pos_rois = Variable(torch.from_numpy(cur_pos_rois.astype('float32'))).cuda()
        cur_pos_feats = model.roi_align_model(feat_map, cur_pos_rois)
        cur_pos_feats = cur_pos_feats.view(cur_pos_feats.size(0), -1).data.clone()

        batch_num = np.zeros((neg_examples.shape[0], 1))
        cur_neg_rois = np.copy(neg_examples)
        cur_neg_rois[:, 0:2] -= np.repeat(np.reshape(scene_boxes[bidx, 0:2], (1, 2)), cur_neg_rois.shape[0], axis=0)
        cur_neg_rois = samples2maskroi(cur_neg_rois, model.receptive_field, (scaled_obj_size, scaled_obj_size),
                                       target_bb[2:4], opts['padding'])
        cur_neg_rois = np.concatenate((batch_num, cur_neg_rois), axis=1)
        cur_neg_rois = Variable(torch.from_numpy(cur_neg_rois.astype('float32'))).cuda()
        cur_neg_feats = model.roi_align_model(feat_map, cur_neg_rois)
        cur_neg_feats = cur_neg_feats.view(cur_neg_feats.size(0), -1).data.clone()

        # bbreg rois
        batch_num = np.zeros((cur_bbreg_examples.shape[0], 1))
        cur_bbreg_rois = np.copy(cur_bbreg_examples)
        cur_bbreg_rois[:, 0:2] -= np.repeat(np.reshape(scene_boxes[bidx, 0:2], (1, 2)), cur_bbreg_rois.shape[0], axis=0)
        scaled_obj_size = float(opts['img_size']) * jitter_scale[bidx]
        cur_bbreg_rois = samples2maskroi(cur_bbreg_rois, model.receptive_field, (scaled_obj_size, scaled_obj_size),
                                         target_bb[2:4], opts['padding'])
        cur_bbreg_rois = np.concatenate((batch_num, cur_bbreg_rois), axis=1)
        cur_bbreg_rois = Variable(torch.from_numpy(cur_bbreg_rois.astype('float32'))).cuda()
        cur_bbreg_feats = model.roi_align_model(feat_map, cur_bbreg_rois)
        cur_bbreg_feats = cur_bbreg_feats.view(cur_bbreg_feats.size(0), -1).data.clone()

        feat_dim = cur_pos_feats.size(-1)

        if bidx == 0:
            pos_feats = cur_pos_feats
            neg_feats = cur_neg_feats
            # bbreg feature
            bbreg_feats = cur_bbreg_feats
            bbreg_examples = cur_bbreg_examples
        else:
            pos_feats = torch.cat((pos_feats, cur_pos_feats), dim=0)
            neg_feats = torch.cat((neg_feats, cur_neg_feats), dim=0)
            # bbreg feature
            bbreg_feats = torch.cat((bbreg_feats, cur_bbreg_feats), dim=0)
            bbreg_examples = np.concatenate((bbreg_examples, cur_bbreg_examples), axis=0)

    if pos_feats.size(0) > opts['n_pos_init']:
        pos_idx = np.asarray(list(range(pos_feats.size(0))))
        np.random.shuffle(pos_idx)
        pos_feats = pos_feats[pos_idx[0:opts['n_pos_init']], :]
    if neg_feats.size(0) > opts['n_neg_init']:
        neg_idx = np.asarray(list(range(neg_feats.size(0))))
        np.random.shuffle(neg_idx)
        neg_feats = neg_feats[neg_idx[0:opts['n_neg_init']], :]

    # bbreg
    if bbreg_feats.size(0) > opts['n_bbreg']:
        bbreg_idx = np.asarray(list(range(bbreg_feats.size(0))))
        np.random.shuffle(bbreg_idx)
        bbreg_feats = bbreg_feats[bbreg_idx[0:opts['n_bbreg']], :]
        bbreg_examples = bbreg_examples[bbreg_idx[0:opts['n_bbreg']], :]
        # print bbreg_examples.shape

    # open images and crop patch from obj
    extra_obj_size = np.array((opts['img_size'], opts['img_size']))
    extra_crop_img_size = extra_obj_size * (opts['padding'] + 0.6)
    replicateNum = 100
    for iidx in range(replicateNum):
        extra_target_bbox = np.copy(target_bb)

        extra_scene_box = np.copy(extra_target_bbox)
        extra_scene_box_center = extra_scene_box[0:2] + extra_scene_box[2:4] / 2.
        extra_scene_box_size = extra_scene_box[2:4] * (opts['padding'] + 0.6)
        extra_scene_box[0:2] = extra_scene_box_center - extra_scene_box_size / 2.
        extra_scene_box[2:4] = extra_scene_box_size

        extra_shift_offset = np.clip(2. * np.random.randn(2), -4, 4)
        cur_extra_scale = 1.1 ** np.clip(np.random.randn(1), -2, 2)

        extra_scene_box[0] += extra_shift_offset[0]
        extra_scene_box[1] += extra_shift_offset[1]
        extra_scene_box[2:4] *= cur_extra_scale[0]

        scaled_obj_size = float(opts['img_size']) / cur_extra_scale[0]

        cur_extra_cropped_image, _ = img_crop_model.crop_image(cur_image, np.reshape(extra_scene_box, (1, 4)),
                                                               extra_crop_img_size)
        cur_extra_cropped_image = cur_extra_cropped_image.detach()

        cur_extra_pos_examples = gen_samples(SampleGenerator('gaussian', (img_shape[1], img_shape[0]), 0.1, 1.2),
                                             extra_target_bbox, opts['n_pos_init'] // replicateNum,
                                             opts['overlap_pos_init'])
        cur_extra_neg_examples = gen_samples(SampleGenerator('uniform', (img_shape[1], img_shape[0]), 0.3, 2, 1.1),
                                             extra_target_bbox, opts['n_neg_init'] // replicateNum // 4,
                                             opts['overlap_neg_init'])

        # bbreg sample
        cur_extra_bbreg_examples = gen_samples(SampleGenerator('uniform', (img_shape[1], img_shape[0]), 0.3, 1.5, 1.1),
                                               extra_target_bbox, opts['n_bbreg'] // replicateNum // 4,
                                               opts['overlap_bbreg'], opts['scale_bbreg'])

        batch_num = iidx * np.ones((cur_extra_pos_examples.shape[0], 1))
        cur_extra_pos_rois = np.copy(cur_extra_pos_examples)
        cur_extra_pos_rois[:, 0:2] -= np.repeat(np.reshape(extra_scene_box[0:2], (1, 2)),
                                                cur_extra_pos_rois.shape[0], axis=0)
        cur_extra_pos_rois = samples2maskroi(cur_extra_pos_rois, model.receptive_field,
                                             (scaled_obj_size, scaled_obj_size), extra_target_bbox[2:4],
                                             opts['padding'])
        cur_extra_pos_rois = np.concatenate((batch_num, cur_extra_pos_rois), axis=1)

        batch_num = iidx * np.ones((cur_extra_neg_examples.shape[0], 1))
        cur_extra_neg_rois = np.copy(cur_extra_neg_examples)
        cur_extra_neg_rois[:, 0:2] -= np.repeat(np.reshape(extra_scene_box[0:2], (1, 2)), cur_extra_neg_rois.shape[0],
                                                axis=0)
        cur_extra_neg_rois = samples2maskroi(cur_extra_neg_rois, model.receptive_field,
                                             (scaled_obj_size, scaled_obj_size), extra_target_bbox[2:4],
                                             opts['padding'])
        cur_extra_neg_rois = np.concatenate((batch_num, cur_extra_neg_rois), axis=1)

        # bbreg rois
        batch_num = iidx * np.ones((cur_extra_bbreg_examples.shape[0], 1))
        cur_extra_bbreg_rois = np.copy(cur_extra_bbreg_examples)
        cur_extra_bbreg_rois[:, 0:2] -= np.repeat(np.reshape(extra_scene_box[0:2], (1, 2)),
                                                  cur_extra_bbreg_rois.shape[0], axis=0)
        cur_extra_bbreg_rois = samples2maskroi(cur_extra_bbreg_rois, model.receptive_field,
                                               (scaled_obj_size, scaled_obj_size), extra_target_bbox[2:4],
                                               opts['padding'])
        cur_extra_bbreg_rois = np.concatenate((batch_num, cur_extra_bbreg_rois), axis=1)

        if iidx == 0:
            extra_cropped_image = cur_extra_cropped_image

            extra_pos_rois = np.copy(cur_extra_pos_rois)
            extra_neg_rois = np.copy(cur_extra_neg_rois)
            # bbreg rois
            extra_bbreg_rois = np.copy(cur_extra_bbreg_rois)
            extra_bbreg_examples = np.copy(cur_extra_bbreg_examples)
        else:
            extra_cropped_image = torch.cat((extra_cropped_image, cur_extra_cropped_image), dim=0)

            extra_pos_rois = np.concatenate((extra_pos_rois, np.copy(cur_extra_pos_rois)), axis=0)
            extra_neg_rois = np.concatenate((extra_neg_rois, np.copy(cur_extra_neg_rois)), axis=0)
            # bbreg rois
            extra_bbreg_rois = np.concatenate((extra_bbreg_rois, np.copy(cur_extra_bbreg_rois)), axis=0)
            extra_bbreg_examples = np.concatenate((extra_bbreg_examples, np.copy(cur_extra_bbreg_examples)), axis=0)

    extra_pos_rois = Variable(torch.from_numpy(extra_pos_rois.astype('float32'))).cuda()
    extra_neg_rois = Variable(torch.from_numpy(extra_neg_rois.astype('float32'))).cuda()
    # bbreg rois
    extra_bbreg_rois = Variable(torch.from_numpy(extra_bbreg_rois.astype('float32'))).cuda()

    extra_cropped_image -= 128.

    extra_feat_maps = model(extra_cropped_image, out_layer='conv3')
    # Extract pos/neg/bbreg features from the extra cropped images
    img_shape = cur_image.shape

    extra_pos_feats = model.roi_align_model(extra_feat_maps, extra_pos_rois)
    extra_pos_feats = extra_pos_feats.view(extra_pos_feats.size(0), -1).data.clone()

    extra_neg_feats = model.roi_align_model(extra_feat_maps, extra_neg_rois)
    extra_neg_feats = extra_neg_feats.view(extra_neg_feats.size(0), -1).data.clone()
    # bbreg feat
    extra_bbreg_feats = model.roi_align_model(extra_feat_maps, extra_bbreg_rois)
    extra_bbreg_feats = extra_bbreg_feats.view(extra_bbreg_feats.size(0), -1).data.clone()

    # concatenate extra features to original_features
    pos_feats = torch.cat((pos_feats, extra_pos_feats), dim=0)
    neg_feats = torch.cat((neg_feats, extra_neg_feats), dim=0)
    # concatenate extra bbreg feats to original_bbreg_feats
    bbreg_feats = torch.cat((bbreg_feats, extra_bbreg_feats), dim=0)
    bbreg_examples = np.concatenate((bbreg_examples, extra_bbreg_examples), axis=0)

    torch.cuda.empty_cache()
    model.zero_grad()

    # Initial training
    train(model, None, criterion, init_optimizer, pos_feats, neg_feats, opts['maxiter_init'], **opts)
    #del init_optimizer, neg_feats
    if opts['use_gpu']:
        torch.cuda.empty_cache()
    g_pretrain(model, model_g, criterion_g, pos_feats, **opts)
    if opts['use_gpu']:
        torch.cuda.empty_cache()

    # bbreg train
    if bbreg_feats.size(0) > opts['n_bbreg']:
        bbreg_idx = np.asarray(list(range(bbreg_feats.size(0))))
        np.random.shuffle(bbreg_idx)
        bbreg_feats = bbreg_feats[bbreg_idx[0:opts['n_bbreg']], :]
        bbreg_examples = bbreg_examples[bbreg_idx[0:opts['n_bbreg']], :]
    bbreg = BBRegressor((img_shape[1], img_shape[0]))
    bbreg.train(bbreg_feats, bbreg_examples, target_bb)

    if pos_feats.size(0) > opts['n_pos_update']:
        pos_idx = np.asarray(list(range(pos_feats.size(0))))
        np.random.shuffle(pos_idx)
        pos_feats_all = [pos_feats.index_select(0, torch.from_numpy(pos_idx[0:opts['n_pos_update']]).cuda())]
    else:
        pos_feats_all = [pos_feats]
    if neg_feats.size(0) > opts['n_neg_update']:
        neg_idx = np.asarray(list(range(neg_feats.size(0))))
        np.random.shuffle(neg_idx)
        neg_feats_all = [neg_feats.index_select(0, torch.from_numpy(neg_idx[0:opts['n_neg_update']]).cuda())]
    else:
        neg_feats_all = [neg_feats]

    spf_total = time.time() - tic
    # spf_total = 0. # no first frame

    # Visualize
    savefig = opts['savefig_dir'] != ''
    if opts['visualize'] or savefig:
        dpi = 80.0
        figsize = (cur_image.shape[1] / dpi, cur_image.shape[0] / dpi)

        fig = plt.figure(frameon=False, figsize=figsize, dpi=dpi)
        ax = plt.Axes(fig, [0., 0., 1., 1.])
        ax.set_axis_off()
        fig.add_axes(ax)
        im = ax.imshow(cur_image, aspect='auto')  # 'normal' is not a valid aspect in recent matplotlib

        if gt is not None:
            gt_rect = plt.Rectangle(tuple(gt[0, :2]), gt[0, 2], gt[0, 3],
                                    linewidth=3, edgecolor="#00ff00", zorder=1, fill=False)
            ax.add_patch(gt_rect)

        rect = plt.Rectangle(tuple(result_bb[0, :2]), result_bb[0, 2], result_bb[0, 3],
                             linewidth=3, edgecolor="#ff0000", zorder=1, fill=False)
        ax.add_patch(rect)

        if opts['visualize']:
            plt.pause(.01)
            plt.draw()
        if savefig:
            fig.savefig(os.path.join(opts['savefig_dir'], '0000.jpg'), dpi=dpi)

    # Main loop
    trans_f = opts['trans_f']
    for i in range(1, len(img_list)):

        tic = time.time()
        # Load image
        cur_image = Image.open(img_list[i]).convert('RGB')
        cur_image = np.asarray(cur_image)

        # Estimate target bbox
        img_shape = cur_image.shape
        samples = gen_samples(
            SampleGenerator('gaussian', (img_shape[1], img_shape[0]), trans_f, opts['scale_f'], valid=True),
            target_bb, opts['n_samples'])

        padded_x1 = (samples[:, 0] - samples[:, 2] * (opts['padding'] - 1.) / 2.).min()
        padded_y1 = (samples[:, 1] - samples[:, 3] * (opts['padding'] - 1.) / 2.).min()
        padded_x2 = (samples[:, 0] + samples[:, 2] * (opts['padding'] + 1.) / 2.).max()
        padded_y2 = (samples[:, 1] + samples[:, 3] * (opts['padding'] + 1.) / 2.).max()
        padded_scene_box = np.asarray((padded_x1, padded_y1, padded_x2 - padded_x1, padded_y2 - padded_y1))

        if padded_scene_box[0] > cur_image.shape[1]:
            padded_scene_box[0] = cur_image.shape[1] - 1
        if padded_scene_box[1] > cur_image.shape[0]:
            padded_scene_box[1] = cur_image.shape[0] - 1
        if padded_scene_box[0] + padded_scene_box[2] < 0:
            padded_scene_box[2] = -padded_scene_box[0] + 1
        if padded_scene_box[1] + padded_scene_box[3] < 0:
            padded_scene_box[3] = -padded_scene_box[1] + 1

        crop_img_size = (padded_scene_box[2:4] * ((opts['img_size'], opts['img_size']) / target_bb[2:4])).astype(
            'int64')
        cropped_image, cur_image_var = img_crop_model.crop_image(cur_image, np.reshape(padded_scene_box, (1, 4)),
                                                                 crop_img_size)
        cropped_image = cropped_image - 128.

        model.eval()
        feat_map = model(cropped_image, out_layer='conv3')

        # relative target bbox with padded_scene_box
        rel_target_bbox = np.copy(target_bb)
        rel_target_bbox[0:2] -= padded_scene_box[0:2]

        # Extract sample features and get target location
        batch_num = np.zeros((samples.shape[0], 1))
        sample_rois = np.copy(samples)
        sample_rois[:, 0:2] -= np.repeat(np.reshape(padded_scene_box[0:2], (1, 2)), sample_rois.shape[0], axis=0)
        sample_rois = samples2maskroi(sample_rois, model.receptive_field, (opts['img_size'], opts['img_size']),
                                      target_bb[2:4], opts['padding'])
        sample_rois = np.concatenate((batch_num, sample_rois), axis=1)
        sample_rois = Variable(torch.from_numpy(sample_rois.astype('float32'))).cuda()
        sample_feats = model.roi_align_model(feat_map, sample_rois)
        sample_feats = sample_feats.view(sample_feats.size(0), -1).clone()
        sample_scores = model(sample_feats, in_layer='fc4')
        top_scores, top_idx = sample_scores[:, 1].topk(5)
        top_idx = top_idx.data.cpu().numpy()
        target_score = top_scores.data.mean()
        target_bb = samples[top_idx].mean(axis=0)

        success = target_score > opts['success_thr']

        # Expand search area at failure
        if success:
            trans_f = opts['trans_f']
        else:
            trans_f = opts['trans_f_expand']

        # bb regression
        if success:
            bbreg_feats = sample_feats[top_idx, :]
            bbreg_samples = samples[top_idx]
            bbreg_samples = bbreg.predict(bbreg_feats.data, bbreg_samples)
            bbreg_bbox = bbreg_samples.mean(axis=0)
        else:
            bbreg_bbox = target_bb

        # Save result
        result[i] = target_bb
        result_bb[i] = bbreg_bbox
        iou_result[i] = 1.

        # Data collect
        if success:

            # Draw pos/neg samples
            pos_examples = gen_samples(
                SampleGenerator('gaussian', (img_shape[1], img_shape[0]), 0.1, 1.2), target_bb,
                opts['n_pos_update'],
                opts['overlap_pos_update'])
            neg_examples = gen_samples(
                SampleGenerator('uniform', (img_shape[1], img_shape[0]), 1.5, 1.2), target_bb,
                opts['n_neg_update'],
                opts['overlap_neg_update'])

            padded_x1 = (neg_examples[:, 0] - neg_examples[:, 2] * (opts['padding'] - 1.) / 2.).min()
            padded_y1 = (neg_examples[:, 1] - neg_examples[:, 3] * (opts['padding'] - 1.) / 2.).min()
            padded_x2 = (neg_examples[:, 0] + neg_examples[:, 2] * (opts['padding'] + 1.) / 2.).max()
            padded_y2 = (neg_examples[:, 1] + neg_examples[:, 3] * (opts['padding'] + 1.) / 2.).max()
            padded_scene_box = np.reshape(
                np.asarray((padded_x1, padded_y1, padded_x2 - padded_x1, padded_y2 - padded_y1)), (1, 4))

            scene_boxes = np.reshape(np.copy(padded_scene_box), (1, 4))
            jitter_scale = [1.]

            for bidx in range(0, scene_boxes.shape[0]):
                crop_img_size = (scene_boxes[bidx, 2:4] * (
                        (opts['img_size'], opts['img_size']) / target_bb[2:4])).astype('int64') * jitter_scale[
                                    bidx]
                cropped_image, cur_image_var = img_crop_model.crop_image(cur_image,
                                                                         np.reshape(scene_boxes[bidx], (1, 4)),
                                                                         crop_img_size)
                cropped_image = cropped_image - 128.

                feat_map = model(cropped_image, out_layer='conv3')

                rel_target_bbox = np.copy(target_bb)
                rel_target_bbox[0:2] -= scene_boxes[bidx, 0:2]

                batch_num = np.zeros((pos_examples.shape[0], 1))
                cur_pos_rois = np.copy(pos_examples)
                cur_pos_rois[:, 0:2] -= np.repeat(np.reshape(scene_boxes[bidx, 0:2], (1, 2)), cur_pos_rois.shape[0],
                                                  axis=0)
                scaled_obj_size = float(opts['img_size']) * jitter_scale[bidx]
                cur_pos_rois = samples2maskroi(cur_pos_rois, model.receptive_field, (scaled_obj_size, scaled_obj_size),
                                               target_bb[2:4], opts['padding'])
                cur_pos_rois = np.concatenate((batch_num, cur_pos_rois), axis=1)
                cur_pos_rois = Variable(torch.from_numpy(cur_pos_rois.astype('float32'))).cuda()
                cur_pos_feats = model.roi_align_model(feat_map, cur_pos_rois)
                cur_pos_feats = cur_pos_feats.view(cur_pos_feats.size(0), -1).data.clone()

                batch_num = np.zeros((neg_examples.shape[0], 1))
                cur_neg_rois = np.copy(neg_examples)
                cur_neg_rois[:, 0:2] -= np.repeat(np.reshape(scene_boxes[bidx, 0:2], (1, 2)), cur_neg_rois.shape[0],
                                                  axis=0)
                cur_neg_rois = samples2maskroi(cur_neg_rois, model.receptive_field, (scaled_obj_size, scaled_obj_size),
                                               target_bb[2:4], opts['padding'])
                cur_neg_rois = np.concatenate((batch_num, cur_neg_rois), axis=1)
                cur_neg_rois = Variable(torch.from_numpy(cur_neg_rois.astype('float32'))).cuda()
                cur_neg_feats = model.roi_align_model(feat_map, cur_neg_rois)
                cur_neg_feats = cur_neg_feats.view(cur_neg_feats.size(0), -1).data.clone()

                feat_dim = cur_pos_feats.size(-1)

                if bidx == 0:
                    pos_feats = cur_pos_feats  ##index select
                    neg_feats = cur_neg_feats
                else:
                    pos_feats = torch.cat((pos_feats, cur_pos_feats), dim=0)
                    neg_feats = torch.cat((neg_feats, cur_neg_feats), dim=0)

            if pos_feats.size(0) > opts['n_pos_update']:
                pos_idx = np.asarray(list(range(pos_feats.size(0))))
                np.random.shuffle(pos_idx)
                pos_feats = pos_feats.index_select(0, torch.from_numpy(pos_idx[0:opts['n_pos_update']]).cuda())
            if neg_feats.size(0) > opts['n_neg_update']:
                neg_idx = np.asarray(list(range(neg_feats.size(0))))
                np.random.shuffle(neg_idx)
                neg_feats = neg_feats.index_select(0, torch.from_numpy(neg_idx[0:opts['n_neg_update']]).cuda())

            pos_feats_all.append(pos_feats)
            neg_feats_all.append(neg_feats)

            if len(pos_feats_all) > opts['n_frames_long']:
                del pos_feats_all[0]
            if len(neg_feats_all) > opts['n_frames_short']:
                del neg_feats_all[0]

        # Short term update
        if not success:
            nframes = min(opts['n_frames_short'], len(pos_feats_all))
            pos_data = torch.stack(pos_feats_all[-nframes:], 0).view(-1, feat_dim)
            neg_data = torch.stack(neg_feats_all, 0).view(-1, feat_dim)
            train(model, None, criterion, update_optimizer, pos_data, neg_data, opts['maxiter_update'], **opts)

        # Long term update
        elif i % opts['long_interval'] == 0:
            pos_data = torch.stack(pos_feats_all, 0).view(-1, feat_dim)
            neg_data = torch.stack(neg_feats_all, 0).view(-1, feat_dim)
            train(model, model_g, criterion, update_optimizer, pos_data, neg_data, opts['maxiter_update'], **opts)

        spf = time.time() - tic
        spf_total += spf

        # Visualize
        if opts['visualize'] or savefig:
            im.set_data(cur_image)

            if gt is not None:
                gt_rect.set_xy(gt[i, :2])
                gt_rect.set_width(gt[i, 2])
                gt_rect.set_height(gt[i, 3])

            rect.set_xy(result_bb[i, :2])
            rect.set_width(result_bb[i, 2])
            rect.set_height(result_bb[i, 3])

            if opts['visualize']:
                plt.pause(.01)
                plt.draw()
            if savefig:
                fig.savefig(os.path.join(opts['savefig_dir'], '%04d.jpg' % i), dpi=dpi)

        if opts['visual_log']:
            if gt is None:
                print("Frame %d/%d, Score %.3f, Time %.3f" % (i, len(img_list), target_score, spf))
            else:
                print("Frame %d/%d, Overlap %.3f, Score %.3f, Time %.3f" % \
                      (i, len(img_list), overlap_ratio(gt[i], result_bb[i])[0], target_score, spf))
        if gt is not None:
            iou_result[i] = overlap_ratio(gt[i], result_bb[i])[0]

    fps = len(img_list) / spf_total
    # fps = (len(img_list)-1) / spf_total #no first frame
    return iou_result, result_bb, fps, result
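A minimal sketch of how a caller might consume the four values returned above (iou_result, result_bb, fps, result). The function name save_tracking_result, seq_name and result_dir are hypothetical and not part of the original tracker code; the per-frame boxes are assumed to be (x, y, w, h) rows as filled in above.

import json
import os

import numpy as np


def save_tracking_result(seq_name, result_dir, iou_result, result_bb, fps, result):
    # Mean overlap over the tracked frames (frame 0 is the ground-truth initialisation).
    mean_iou = float(np.mean(iou_result[1:])) if len(iou_result) > 1 else 1.0
    record = {
        'seq': seq_name,
        'mean_iou': mean_iou,
        'fps': float(fps),
        'bb_refined': np.asarray(result_bb).round(2).tolist(),  # bbreg-refined (x, y, w, h) per frame
        'bb_raw': np.asarray(result).round(2).tolist(),         # raw top-5 mean boxes per frame
    }
    os.makedirs(result_dir, exist_ok=True)
    with open(os.path.join(result_dir, seq_name + '.json'), 'w') as fp:
        json.dump(record, fp, indent=2)
    return mean_iou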
    def forward(
            self,  # type: ignore
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            answer: Dict[str, torch.LongTensor],
            dialog: Dict[str, torch.LongTensor],
            previous_answer_appended: Dict[str, torch.LongTensor],
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            p1_answer_marker: torch.IntTensor = None,
            p2_answer_marker: torch.IntTensor = None,
            p3_answer_marker: torch.IntTensor = None,
            yesno_list: torch.IntTensor = None,
            followup_list: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        question : Dict[str, torch.LongTensor]
            From a ``TextField``.
        passage : Dict[str, torch.LongTensor]
            From a ``TextField``.  The model assumes that this passage contains the answer to the
            question, and predicts the beginning and ending positions of the answer within the
            passage.
        span_start : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            beginning position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        span_end : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            ending position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        p1_answer_marker : ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 0.
            This is a tensor of shape [batch_size, max_qa_count, max_passage_length].
            Most passage tokens are assigned 'O'; the passage tokens that belong to the previous answer
            in the dialog are instead assigned labels such as <1_start>, <1_in>, <1_end>.
            For more details, look into dataset_readers/util/make_reading_comprehension_instance_quac
        p2_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 1.
            It is similar to p1_answer_marker, but marks the second-to-last answer in the passage.
        p3_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 2.
            It is similar to p1_answer_marker, but marks the third-to-last answer in the passage.
        yesno_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three-way classification (yes / no / not a yes-no question).
        followup_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three-way classification (follow up / maybe follow up / don't follow up).
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question ID, original passage text, and token
            offsets into the passage for each instance in the batch.  We use this for computing
            official metrics using the official SQuAD evaluation script.  The length of this list
            should be the batch size, and each dictionary should have the keys ``id``,
            ``original_passage``, and ``token_offsets``.  If you only want the best span string and
            don't care about official metrics, you can omit the ``id`` key.

        Returns
        -------
        An output dictionary consisting of the following entries.
        Each entry is a nested list: the outer list iterates over dialogs, the inner list over the questions in each dialog.

        qid : List[List[str]]
            A list of lists of question ids.
        followup : List[List[int]]
            A list of lists of continuation marker prediction indices
            (y: yes, m: maybe follow up, n: don't follow up).
        yesno : List[List[int]]
            A list of lists of affirmation marker prediction indices
            (y: yes, x: not a yes/no question, n: no).
        best_span_str : List[List[str]]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        #question = previous_answer_appended
        batch_size, max_qa_count, max_q_len, _ = question[
            'token_characters'].size()
        #logger.info("dialog shape token charcaters is %s %s", dialog['token_characters'].size(), dialog['elmo'].size())
        #logger.info("question shape token charcaters is %s %s", question['token_characters'].size(), question['elmo'].size())
        batch_size, max_dia_count, max_dia_len, _ = dialog[
            'token_characters'].size()
        total_qa_count = batch_size * max_qa_count
        qa_mask = torch.ge(followup_list, 0).view(total_qa_count)
        embedded_question = self._text_field_embedder(question,
                                                      num_wrapping_dims=1)
        #logger.info("11111111111 dialog is %s", dialog['token_characters'].shape)
        #logger.info("11111111111 dialog is %s", dialog['elmo'].shape)

        embedded_dialog = self._text_field_embedder(dialog,
                                                    num_wrapping_dims=1)
        embedded_question = embedded_question.reshape(
            total_qa_count, max_q_len,
            self._text_field_embedder.get_output_dim())
        embedded_dialog = embedded_dialog.reshape(
            total_qa_count, max_dia_len,
            self._text_field_embedder.get_output_dim())
        embedded_question = self._variational_dropout(embedded_question)
        embedded_dialog = self._variational_dropout(embedded_dialog)
        embedded_passage = self._variational_dropout(
            self._text_field_embedder(passage))
        passage_length = embedded_passage.size(1)
        #logger.info("embedded question has shape %s", embedded_question.shape)
        #logger.info("embedded dialog has shape %s", embedded_dialog.shape)
        question_mask = util.get_text_field_mask(question,
                                                 num_wrapping_dims=1).float()
        question_mask = question_mask.reshape(total_qa_count, max_q_len)
        dialog_mask = util.get_text_field_mask(dialog,
                                               num_wrapping_dims=1).float()
        dialog_mask = dialog_mask.reshape(total_qa_count, max_dia_len)
        passage_mask = util.get_text_field_mask(passage).float()
        #logger.info("dialog shape token charcaters is %s %s", dialog['token_characters'].size(), dialog['elmo'].size())
        #logger.info("answer shape token charcaters is %s %s", answer['token_characters'].size(), answer['elmo'].size())
        #logger.info("quesion shape token charcaters is %s %s", question['token_characters'].size(), question['elmo'].size())
        #logger.info("previous answer shape token charcaters is %s %s", previous_answer_appended['token_characters'].size(), previous_answer_appended['elmo'].size())
        repeated_passage_mask = passage_mask.unsqueeze(1).repeat(
            1, max_qa_count, 1)
        repeated_passage_mask = repeated_passage_mask.view(
            total_qa_count, passage_length)

        if self._num_context_answers > 0:
            # Encode question turn number inside the dialog into question embedding.
            question_num_ind = util.get_range_vector(
                max_qa_count, util.get_device_of(embedded_question))
            question_num_ind = question_num_ind.unsqueeze(-1).repeat(
                1, max_q_len)
            question_num_ind = question_num_ind.unsqueeze(0).repeat(
                batch_size, 1, 1)
            question_num_ind = question_num_ind.reshape(
                total_qa_count, max_q_len)
            question_num_marker_emb = self._question_num_marker(
                question_num_ind)
            embedded_question = torch.cat(
                [embedded_question, question_num_marker_emb], dim=-1)

            # Append dialog number for dialog
            question_num_ind = util.get_range_vector(
                max_dia_count, util.get_device_of(embedded_dialog))
            question_num_ind = question_num_ind.unsqueeze(-1).repeat(
                1, max_dia_len)
            question_num_ind = question_num_ind.unsqueeze(0).repeat(
                batch_size, 1, 1)
            question_num_ind = question_num_ind.reshape(
                total_qa_count, max_dia_len)
            question_num_marker_emb = self._question_num_marker(
                question_num_ind)
            embedded_dialog = torch.cat(
                [embedded_dialog, question_num_marker_emb], dim=-1)

            # Encode the previous answers in passage embedding.
            repeated_embedded_passage = embedded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1). \
                view(total_qa_count, passage_length, self._text_field_embedder.get_output_dim())
            # batch_size * max_qa_count, passage_length, word_embed_dim
            p1_answer_marker = p1_answer_marker.view(total_qa_count,
                                                     passage_length)
            p1_answer_marker_emb = self._prev_ans_marker(p1_answer_marker)
            repeated_embedded_passage = torch.cat(
                [repeated_embedded_passage, p1_answer_marker_emb], dim=-1)
            if self._num_context_answers > 1:
                p2_answer_marker = p2_answer_marker.view(
                    total_qa_count, passage_length)
                p2_answer_marker_emb = self._prev_ans_marker(p2_answer_marker)
                repeated_embedded_passage = torch.cat(
                    [repeated_embedded_passage, p2_answer_marker_emb], dim=-1)
                if self._num_context_answers > 2:
                    p3_answer_marker = p3_answer_marker.view(
                        total_qa_count, passage_length)
                    p3_answer_marker_emb = self._prev_ans_marker(
                        p3_answer_marker)
                    repeated_embedded_passage = torch.cat(
                        [repeated_embedded_passage, p3_answer_marker_emb],
                        dim=-1)

            repeated_encoded_passage = self._variational_dropout(
                self._phrase_layer(repeated_embedded_passage,
                                   repeated_passage_mask))
        else:
            encoded_passage = self._variational_dropout(
                self._phrase_layer(embedded_passage, passage_mask))
            repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(
                1, max_qa_count, 1, 1)
            repeated_encoded_passage = repeated_encoded_passage.view(
                total_qa_count, passage_length, self._encoding_dim)
        #logger.info("repeated encoded passage has shape %s", repeated_encoded_passage.shape)
        #logger.info("embedded question has shape %s", embedded_question.shape)
        #logger.info("question mask has shape %s",  question_mask.shape)
        #logger.info("embedded dialog has shape %s", embedded_dialog.shape)
        #logger.info("dialog mask has shape %s",  dialog_mask.shape)

        encoded_question = self._variational_dropout(
            self._phrase_layer(embedded_question, question_mask))
        encoded_dialog = self._variational_dropout(
            self._phrase_layer(embedded_dialog, dialog_mask))

        #logger.info("encoded_question is %s", encoded_question.shape)
        #logger.info("encoded_dialog is %s", encoded_dialog.shape)
        #logger.info("encoded_passage is %s", repeated_encoded_passage.shape)

        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_similarity = self._matrix_attention(
            repeated_encoded_passage, encoded_question)
        #logger.info("passage_question_similarity is %s", passage_question_similarity.shape)
        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        #logger.info("question_mask is %s", question_mask.shape)
        passage_question_attention = util.masked_softmax(
            passage_question_similarity, question_mask)
        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim)
        passage_question_vectors = util.weighted_sum(
            encoded_question, passage_question_attention)
        #logger.info("passage question vectors is %s", passage_question_vectors.shape)

        ############################# DIALOG-QUESTION ATTENTION #############################################################
        dialog_question_similarity = self._matrix_attention(
            encoded_question, encoded_dialog)
        #logger.info("dialog question similarity is %s", dialog_question_similarity.shape)
        #logger.info("dialog_mask is %s", dialog_mask.shape)
        # Shape: (batch_size * max_qa_count, question_length, dialog_length)
        dialog_question_attention = util.masked_softmax(
            dialog_question_similarity, dialog_mask)
        #logger.info("dialog question attention is %s", dialog_question_attention.shape)
        # Shape: (batch_size * max_qa_count, question_length, encoding_dim)
        question_dialog_vectors = util.weighted_sum(encoded_dialog,
                                                    dialog_question_attention)
        #logger.info("question_dialog_vectors is %s", question_dialog_vectors.shape)
        #logger.info("encoded_question 111  %s", encoded_question.shape)

        #logger.info("encoded_question 2222  %s", encoded_question.shape)
        #logger.info("self._encoding_dim  %s", self._encoding_dim)

        encoded_question = torch.cat(
            [encoded_question, question_dialog_vectors], dim=-1)

        encoded_question = F.relu(self.t(encoded_question))

        #logger.info("encoded_question 3333333  %s", encoded_question.shape)

        ######################################################################################################################

        # We replace masked values with something really negative here, so they don't affect the
        # max below.
        #if max_qa_count == 7 and batch_size == 21:
        #    sys.exit()
        masked_similarity = util.replace_masked_values(
            passage_question_similarity, question_mask.unsqueeze(1), -1e7)

        question_passage_similarity = masked_similarity.max(
            dim=-1)[0].squeeze(-1)
        question_passage_attention = util.masked_softmax(
            question_passage_similarity, repeated_passage_mask)
        # Shape: (batch_size * max_qa_count, encoding_dim)
        question_passage_vector = util.weighted_sum(
            repeated_encoded_passage, question_passage_attention)
        tiled_question_passage_vector = question_passage_vector.unsqueeze(
            1).expand(total_qa_count, passage_length, self._encoding_dim)

        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim * 4)
        final_merged_passage = torch.cat([
            repeated_encoded_passage, passage_question_vectors,
            repeated_encoded_passage * passage_question_vectors,
            repeated_encoded_passage * tiled_question_passage_vector
        ],
                                         dim=-1)

        final_merged_passage = F.relu(self._merge_atten(final_merged_passage))

        residual_layer = self._variational_dropout(
            self._residual_encoder(final_merged_passage,
                                   repeated_passage_mask))
        self_attention_matrix = self._self_attention(residual_layer,
                                                     residual_layer)

        mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \
               * repeated_passage_mask.reshape(total_qa_count, 1, passage_length)
        self_mask = torch.eye(passage_length,
                              passage_length,
                              device=self_attention_matrix.device)
        self_mask = self_mask.reshape(1, passage_length, passage_length)
        mask = mask * (1 - self_mask)
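        # The outer product of the passage mask keeps only (valid, valid) token pairs, and
        # subtracting the identity removes each token's attention to itself before the softmax.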

        self_attention_probs = util.masked_softmax(self_attention_matrix, mask)

        # (batch, passage_len, passage_len) * (batch, passage_len, dim) -> (batch, passage_len, dim)
        self_attention_vecs = torch.matmul(self_attention_probs,
                                           residual_layer)
        self_attention_vecs = torch.cat([
            self_attention_vecs, residual_layer,
            residual_layer * self_attention_vecs
        ],
                                        dim=-1)
        residual_layer = F.relu(
            self._merge_self_attention(self_attention_vecs))

        final_merged_passage = final_merged_passage + residual_layer
        # batch_size * maxqa_pair_len * max_passage_len * 200
        final_merged_passage = self._variational_dropout(final_merged_passage)
        start_rep = self._span_start_encoder(final_merged_passage,
                                             repeated_passage_mask)
        span_start_logits = self._span_start_predictor(start_rep).squeeze(-1)

        end_rep = self._span_end_encoder(
            torch.cat([final_merged_passage, start_rep], dim=-1),
            repeated_passage_mask)
        span_end_logits = self._span_end_predictor(end_rep).squeeze(-1)

        span_yesno_logits = self._span_yesno_predictor(end_rep).squeeze(-1)
        span_followup_logits = self._span_followup_predictor(end_rep).squeeze(
            -1)

        span_start_logits = util.replace_masked_values(span_start_logits,
                                                       repeated_passage_mask,
                                                       -1e7)
        # batch_size * maxqa_len_pair, max_document_len
        span_end_logits = util.replace_masked_values(span_end_logits,
                                                     repeated_passage_mask,
                                                     -1e7)

        best_span = self._get_best_span_yesno_followup(span_start_logits,
                                                       span_end_logits,
                                                       span_yesno_logits,
                                                       span_followup_logits,
                                                       self._max_span_length)

        output_dict: Dict[str, Any] = {}

        # Compute the loss.
        if span_start is not None:
            loss = nll_loss(util.masked_log_softmax(span_start_logits,
                                                    repeated_passage_mask),
                            span_start.view(-1),
                            ignore_index=-1)
            self._span_start_accuracy(span_start_logits,
                                      span_start.view(-1),
                                      mask=qa_mask)
            loss += nll_loss(util.masked_log_softmax(span_end_logits,
                                                     repeated_passage_mask),
                             span_end.view(-1),
                             ignore_index=-1)
            self._span_end_accuracy(span_end_logits,
                                    span_end.view(-1),
                                    mask=qa_mask)
            self._span_accuracy(best_span[:, 0:2],
                                torch.stack([span_start, span_end],
                                            -1).view(total_qa_count, 2),
                                mask=qa_mask.unsqueeze(1).expand(-1, 2).long())
            # Select the yes/no and follow-up logits at the gold answer-end position:
            # span_yesno_logits has shape (total_qa_count, passage_length, 3), so in the
            # flattened view the three logits for QA pair i sit at indices
            # i * passage_length * 3 + span_end[i] * 3 + {0, 1, 2}.
            gold_span_end_loc = []
            span_end = span_end.view(
                total_qa_count).squeeze().data.cpu().numpy()
            for i in range(0, total_qa_count):
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3, 0))
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3 + 1, 0))
                gold_span_end_loc.append(
                    max(span_end[i] * 3 + i * passage_length * 3 + 2, 0))
            gold_span_end_loc = span_start.new(gold_span_end_loc)

            pred_span_end_loc = []
            for i in range(0, total_qa_count):
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3, 0))
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0))
                pred_span_end_loc.append(
                    max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0))
            predicted_end = span_start.new(pred_span_end_loc)

            _yesno = span_yesno_logits.view(-1).index_select(
                0, gold_span_end_loc).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(
                0, gold_span_end_loc).view(-1, 3)
            loss += nll_loss(F.log_softmax(_yesno, dim=-1),
                             yesno_list.view(-1),
                             ignore_index=-1)
            loss += nll_loss(F.log_softmax(_followup, dim=-1),
                             followup_list.view(-1),
                             ignore_index=-1)

            _yesno = span_yesno_logits.view(-1).index_select(
                0, predicted_end).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(
                0, predicted_end).view(-1, 3)
            self._span_yesno_accuracy(_yesno,
                                      yesno_list.view(-1),
                                      mask=qa_mask)
            self._span_followup_accuracy(_followup,
                                         followup_list.view(-1),
                                         mask=qa_mask)
            output_dict["loss"] = loss

        # Compute F1 and preparing the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['qid'] = []
        output_dict['followup'] = []
        output_dict['yesno'] = []
        best_span_cpu = best_span.detach().cpu().numpy()
        for i in range(batch_size):
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            f1_score = 0.0
            per_dialog_best_span_list = []
            per_dialog_yesno_list = []
            per_dialog_followup_list = []
            per_dialog_query_id_list = []
            for per_dialog_query_index, (iid, answer_texts) in enumerate(
                    zip(metadata[i]["instance_id"],
                        metadata[i]["answer_texts_list"])):
                predicted_span = tuple(best_span_cpu[i * max_qa_count +
                                                     per_dialog_query_index])

                start_offset = offsets[predicted_span[0]][0]
                end_offset = offsets[predicted_span[1]][1]

                yesno_pred = predicted_span[2]
                followup_pred = predicted_span[3]
                per_dialog_yesno_list.append(yesno_pred)
                per_dialog_followup_list.append(followup_pred)
                per_dialog_query_id_list.append(iid)

                best_span_string = passage_str[start_offset:end_offset]
                per_dialog_best_span_list.append(best_span_string)
                if answer_texts:
                    if len(answer_texts) > 1:
                        t_f1 = []
                        # Compute F1 against each leave-one-out set of N-1 human references and average the scores.
                        for answer_index in range(len(answer_texts)):
                            idxes = list(range(len(answer_texts)))
                            idxes.pop(answer_index)
                            refs = [answer_texts[z] for z in idxes]
                            t_f1.append(
                                squad_eval.metric_max_over_ground_truths(
                                    squad_eval.f1_score, best_span_string,
                                    refs))
                        f1_score = 1.0 * sum(t_f1) / len(t_f1)
                    else:
                        f1_score = squad_eval.metric_max_over_ground_truths(
                            squad_eval.f1_score, best_span_string,
                            answer_texts)
                self._official_f1(100 * f1_score)
            output_dict['qid'].append(per_dialog_query_id_list)
            output_dict['best_span_str'].append(per_dialog_best_span_list)
            output_dict['yesno'].append(per_dialog_yesno_list)
            output_dict['followup'].append(per_dialog_followup_list)
        return output_dict
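A standalone sketch of the multi-reference F1 computed in the loop above: with N human answers, the predicted span string is scored against every leave-one-out set of N-1 references and the scores are averaged. The helper name leave_one_out_f1 is hypothetical; squad_eval is assumed to be the same official-SQuAD evaluation module used by the model.

def leave_one_out_f1(prediction, answer_texts):
    # Single reference: plain max-over-ground-truths F1.
    if len(answer_texts) <= 1:
        return squad_eval.metric_max_over_ground_truths(
            squad_eval.f1_score, prediction, answer_texts)
    scores = []
    for held_out in range(len(answer_texts)):
        refs = [a for z, a in enumerate(answer_texts) if z != held_out]
        scores.append(squad_eval.metric_max_over_ground_truths(
            squad_eval.f1_score, prediction, refs))
    return sum(scores) / len(scores)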
Beispiel #39
    def forward(self, batch):
        """
        :param batch: flat tuple of input tensors; each of the five fields
            (p_index, q_index, zhengli_index, fuli_index, wfqd_index) occupies three
            consecutive entries, as sliced below (zhengli / fuli / wfqd appear to be the
            positive, negative and "cannot be determined" answer options)
        :return: (batch_size, 3) scores, one per answer option
        """

        passage = batch[0:3]
        query = batch[3:6]
        zhengli = batch[6:9]  # (batch_size, zhengli_len)
        fuli = batch[9:12]
        wfqd = batch[12:15]

        # mask
        passage_mask = utils.get_mask(passage[0])
        query_mask = utils.get_mask(query[0])
        zhengli_mask = utils.get_mask(zhengli[0])
        fuli_mask = utils.get_mask(fuli[0])
        wfqd_mask = utils.get_mask(wfqd[0])

        # embedding
        passage_vec = self.embedding(passage)
        query_vec = self.embedding(query)
        zhengli_vec = self.embedding(zhengli)
        fuli_vec = self.embedding(fuli)
        wfqd_vec = self.embedding(wfqd)

        # encoder: p, q
        passage_vec = self.encoder(
            passage_vec, passage_mask)  # (p_len, batch_size, hidden_size*2)
        passage_vec = self.dropout(passage_vec)
        query_vec = self.encoder(query_vec, query_mask)
        query_vec = self.dropout(query_vec)

        # encoder: zhengli,fuli,wfqd
        zhengli_vec = self.encoder(zhengli_vec, zhengli_mask)
        zhengli_vec = self.dropout(zhengli_vec)
        fuli_vec = self.encoder(fuli_vec, fuli_mask)
        fuli_vec = self.dropout(fuli_vec)
        wfqd_vec = self.encoder(wfqd_vec, wfqd_mask)
        wfqd_vec = self.dropout(wfqd_vec)

        # answer build
        zhengli_vec = self.mean_a(zhengli_vec,
                                  zhengli_mask)  # (batch_size, hidden_size*2)
        fuli_vec = self.mean_a(fuli_vec, fuli_mask)
        wfqd_vec = self.mean_a(wfqd_vec, wfqd_mask)

        answer = torch.stack([zhengli_vec, fuli_vec, wfqd_vec
                              ]).transpose(0,
                                           1)  # (batch_size, 3, hidden_size)

        # merge q into p, get p prep
        align_ct = passage_vec
        for i in range(self.num_align_hops):
            qt_align_ct = self.aligner[i](align_ct, query_vec, query_mask)
            bar_ct = self.aligner_sfu[i](
                align_ct,
                torch.cat([
                    qt_align_ct, align_ct * qt_align_ct, align_ct - qt_align_ct
                ],
                          dim=2))

            ct_align_ct = self.self_aligner[i](bar_ct, passage_mask)
            hat_ct = self.self_aligner_sfu[i](
                bar_ct,
                torch.cat(
                    [ct_align_ct, bar_ct * ct_align_ct, bar_ct - ct_align_ct],
                    dim=2))
            align_ct = self.choose_agg[i](hat_ct, passage_mask)
        p_prep = align_ct  # (p_len, batch_size, hidden_size*2)
        q_prep = self.mean_q(query_vec, query_mask).unsqueeze(
            0)  # (1, batch_size, hidden_size*2)
        sj = self.vp(torch.tanh(self.wp1(p_prep) +
                                self.wp2(q_prep))).transpose(
                                    0, 1)  # (batch_size, p_len, 1)
        mask = passage_mask.eq(0).unsqueeze(2)
        sj.masked_fill_(mask, -float('inf'))
        sj = f.softmax(sj, dim=1).transpose(1, 2)  # (batch_size, 1, p_len)
        p_prep = torch.bmm(sj, p_prep.transpose(0, 1)).squeeze(
            1)  # (batch_size, hidden_size*2)

        # choosing
        p_prep = self.bi_linear(p_prep)  # (batch_size, hidden_size)
        outputs = torch.bmm(answer,
                            p_prep.unsqueeze(2)).squeeze(2)  # (batch_size, 3)

        return outputs
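The forward above returns unnormalised scores of shape (batch_size, 3) over the three answer options. A minimal sketch of how they might be turned into a loss and predictions; option_loss_and_prediction and the integer labels tensor are assumptions, not part of the original model.

import torch.nn.functional as f


def option_loss_and_prediction(outputs, labels):
    # outputs: (batch_size, 3) scores from forward(); labels: (batch_size,) gold option indices.
    loss = f.cross_entropy(outputs, labels)    # softmax + negative log-likelihood over the 3 options
    pred = outputs.argmax(dim=1)               # predicted option per example
    accuracy = (pred == labels).float().mean()
    return loss, pred, accuracy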
Beispiel #40
    def generate(self, mels, save_path, batched, target, overlap, mu_law,
                 trg_mel=None):
        device = next(self.parameters()).device  # use same device as parameters

        mu_law = mu_law if self.mode == 'RAW' else False

        self.eval()
        output = []
        start = time.time()
        rnn1 = self.get_gru_cell(self.rnn1)
        rnn2 = self.get_gru_cell(self.rnn2)

        with torch.no_grad():

            mels = torch.as_tensor(mels, device=device)
            wave_len = (mels.size(-1) - 1) * self.hop_length
            mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
            mels = mels.transpose(1, 2)
            if trg_mel is not None and hasattr(self, 'adaptnet'):
                mels = self.adaptnet(mels, trg_mel)
            mels, aux = self.upsample(mels)

            if batched:
                mels = self.fold_with_overlap(mels, target, overlap)
                aux = self.fold_with_overlap(aux, target, overlap)

            b_size, seq_len, _ = mels.size()

            h1 = torch.zeros(b_size, self.rnn_dims, device=device)
            h2 = torch.zeros(b_size, self.rnn_dims, device=device)
            x = torch.zeros(b_size, 1, device=device)

            d = self.aux_dims
            aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
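            # The four aux chunks above (each self.aux_dims wide) are injected at successive
            # stages of the per-sample loop below: a1_t with the RNN input, a2_t before rnn2,
            # a3_t before fc1 and a4_t before fc2.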

            for i in range(seq_len):

                m_t = mels[:, i, :]

                a1_t, a2_t, a3_t, a4_t = \
                    (a[:, i, :] for a in aux_split)

                x = torch.cat([x, m_t, a1_t], dim=1)
                x = self.I(x)
                h1 = rnn1(x, h1)

                x = x + h1
                inp = torch.cat([x, a2_t], dim=1)
                h2 = rnn2(inp, h2)

                x = x + h2
                x = torch.cat([x, a3_t], dim=1)
                x = F.relu(self.fc1(x))

                x = torch.cat([x, a4_t], dim=1)
                x = F.relu(self.fc2(x))

                logits = self.fc3(x)

                if self.mode == 'MOL':
                    sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
                    output.append(sample.view(-1))
                    # x = torch.FloatTensor([[sample]]).cuda()
                    x = sample.transpose(0, 1)

                elif self.mode == 'RAW':
                    posterior = F.softmax(logits, dim=1)
                    distrib = torch.distributions.Categorical(posterior)

                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                    output.append(sample)
                    x = sample.unsqueeze(-1)
                else:
                    raise RuntimeError("Unknown model mode value - %s" % self.mode)

                #if i % 100 == 0: self.gen_display(i, seq_len, b_size, start)

        output = torch.stack(output).transpose(0, 1)
        output = output.cpu().numpy()
        output = output.astype(np.float64)

        if batched:
            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]

        if mu_law:
            output = decode_mu_law(output, self.n_classes, False)

        # Fade-out at the end to avoid signal cutting out suddenly
        fade_out = np.linspace(1, 0, 20 * self.hop_length)
        output = output[:wave_len]
        output[-20 * self.hop_length:] *= fade_out

        save_wav(output, save_path)

        self.train()

        return output
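When mu_law is set in RAW mode, the generated samples are mu-law companded before being written out. A sketch of the standard mu-law expansion that decode_mu_law is assumed to perform; the exact signature in the original codebase may differ.

import numpy as np


def mu_law_expand(y, mu):
    # y: companded signal in [-1, 1]; mu: number of quantisation classes (e.g. 256).
    mu = mu - 1
    return np.sign(y) * ((1.0 + mu) ** np.abs(y) - 1.0) / mu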
    def stack(self, keys):
        data = [getattr(self, k)[:self.size] for k in keys]
        return map(lambda x: torch.stack(x, dim=0), data)

    def normalize(self, keys):
        for key in keys:
            k = torch.stack(getattr(self, key))
            k = (k - k.mean()) / (k.std() + 1e-10)
            setattr(self, key, [i for i in k])
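A minimal illustration of the normalize() helper above, assuming the attribute holds a list of same-shaped tensors (e.g. per-step returns in a rollout buffer); the _Buffer wrapper is hypothetical and only exists to make the snippet self-contained.

import torch


class _Buffer:
    def __init__(self, returns):
        self.returns = returns

    def normalize(self, keys):
        # Stack the stored list, standardise to zero mean / unit variance, unpack back to a list.
        for key in keys:
            k = torch.stack(getattr(self, key))
            k = (k - k.mean()) / (k.std() + 1e-10)
            setattr(self, key, [i for i in k])


buf = _Buffer([torch.tensor([1.0]), torch.tensor([2.0]), torch.tensor([3.0])])
buf.normalize(['returns'])
# buf.returns is now a list of three tensors with overall mean ~0 and std ~1.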
Beispiel #43
def get_tp_fp_fn_tn(net_output, gt, axes=None, mask=None, square=False):
    """
    net_output must be (b, c, x, y(, z))
    gt must be a label map (shape (b, 1, x, y(, z)) OR shape (b, x, y(, z))) or a one-hot encoding (b, c, x, y(, z))
    if mask is provided it must have shape (b, 1, x, y(, z))
    :param net_output:
    :param gt:
    :param axes: axes to sum over; an empty tuple () means no summation (defaults to all spatial axes)
    :param mask: mask must be 1 for valid pixels and 0 for invalid pixels
    :param square: if True then tp, fp, fn and tn will be squared before summation
    :return: tp, fp, fn, tn
    """
    if axes is None:
        axes = tuple(range(2, len(net_output.size())))

    shp_x = net_output.shape
    shp_y = gt.shape

    with torch.no_grad():
        if len(shp_x) != len(shp_y):
            gt = gt.view((shp_y[0], 1, *shp_y[1:]))

        if all([i == j for i, j in zip(net_output.shape, gt.shape)]):
            # if this is the case then gt is probably already a one hot encoding
            y_onehot = gt
        else:
            gt = gt.long()
            y_onehot = torch.zeros(shp_x)
            if net_output.device.type == "cuda":
                y_onehot = y_onehot.cuda(net_output.device.index)
            y_onehot.scatter_(1, gt, 1)

    tp = net_output * y_onehot
    fp = net_output * (1 - y_onehot)
    fn = (1 - net_output) * y_onehot
    tn = (1 - net_output) * (1 - y_onehot)

    if mask is not None:
        tp = torch.stack(tuple(x_i * mask[:, 0]
                               for x_i in torch.unbind(tp, dim=1)),
                         dim=1)
        fp = torch.stack(tuple(x_i * mask[:, 0]
                               for x_i in torch.unbind(fp, dim=1)),
                         dim=1)
        fn = torch.stack(tuple(x_i * mask[:, 0]
                               for x_i in torch.unbind(fn, dim=1)),
                         dim=1)
        tn = torch.stack(tuple(x_i * mask[:, 0]
                               for x_i in torch.unbind(tn, dim=1)),
                         dim=1)

    if square:
        tp = tp**2
        fp = fp**2
        fn = fn**2
        tn = tn**2

    if len(axes) > 0:
        tp = sum_tensor(tp, axes, keepdim=False)
        fp = sum_tensor(fp, axes, keepdim=False)
        fn = sum_tensor(fn, axes, keepdim=False)
        tn = sum_tensor(tn, axes, keepdim=False)

    return tp, fp, fn, tn
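# Hedged usage sketch (not part of the original snippet): turn the counts above
# into a soft Dice score.  Assumes get_tp_fp_fn_tn and softmax probabilities
# net_output of shape (b, c, x, y) are in scope.
def soft_dice_from_counts(net_output, gt, smooth=1e-5):
    tp, fp, fn, _ = get_tp_fp_fn_tn(net_output, gt)          # reduced over spatial axes
    dice = (2 * tp + smooth) / (2 * tp + fp + fn + smooth)    # shape (b, c)
    return dice.mean()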
Beispiel #44
0
    def forward(self,
                support_x,
                support_y,
                query_x,
                query_y,
                train=True,
                n_way=-1,
                curr_shot=-1):

        batch_sz, support_sz, _, h, w = support_x.size()
        query_sz = query_x.size(1)

        # FEATURE EXTRACTION
        support_x = self.repnet(support_x.view(batch_sz * support_sz, -1, h,
                                               w))
        query_x = self.repnet(query_x.view(batch_sz * query_sz, -1, h, w))
        # output [b, support_sz / query_sz, c, d, d]
        support_xf = support_x.view(batch_sz, support_sz, self.c, self.d,
                                    self.d)
        query_xf = query_x.view(batch_sz, query_sz, self.c, self.d, self.d)

        # SWAP
        if self.swap and train:
            support_xfs, query_xfs, support_ys, query_ys = \
                self._generate_multiple(support_xf, query_xf, support_y, query_y, n_way)
        else:
            # also for test
            support_xfs, query_xfs, support_ys, query_ys = \
                [support_xf], [query_xf], [support_y], [query_y]
        # SCORE
        expand_sz = n_way if self.opts.model.sum_supp_sample else support_sz
        score = torch.zeros(len(support_xfs), batch_sz, query_sz,
                            expand_sz).to(self.opts.ctrl.device)
        for i in range(len(support_xfs)):
            # expand both to [b, query_sz, support_sz/n_way, c, d, d]
            if self.opts.model.sum_supp_sample:
                support_xf = torch.sum(
                    support_xfs[i].view(batch_sz, n_way, -1, self.c, self.d,
                                        self.d), 2).squeeze(2)
            else:
                support_xf = support_xfs[i]
            support_xf = support_xf.unsqueeze(1).expand(
                -1, query_sz, -1, -1, -1, -1)
            query_xf = query_xfs[i].unsqueeze(2).expand(
                -1, -1, expand_sz, -1, -1, -1)

            # cat => [b, query_sz, support_sz/n_way, 2c, d, d]
            comb = torch.cat([support_xf, query_xf], dim=3)
            comb = comb.view(batch_sz * query_sz * expand_sz, 2 * self.c,
                             self.d, self.d)
            comb = self.relation2(self.relation1(comb))
            comb = F.avg_pool2d(comb, self.pool_size)
            # [b*query_sz*support_sz/n_way, 256] => [b, query_sz, support_sz/n_way, 1]
            # score: [b, query_sz, support_sz/n_way]
            score[i] = self.fc(comb.view(batch_sz * query_sz * expand_sz,
                                         -1)).view(batch_sz, query_sz,
                                                   expand_sz, 1).squeeze(3)

        # LOSS OR ACCURACY
        if train:
            loss = torch.zeros(len(support_xfs)).to(self.opts.ctrl.device)
            for i in range(len(support_xfs)):

                if self.CE_loss:
                    # reformat score output: N, n_way (being the number of classes)
                    curr_score = score[i].view(batch_sz * query_sz, n_way,
                                               -1).mean(dim=-1)
                    support_y_neat = support_ys[i][:, ::curr_shot]  # b, n_way
                    target = torch.stack([
                        torch.nonzero(
                            torch.eq(support_y_neat[b], query_ys[i][b, j]))
                        for b, query in enumerate(query_ys[i])
                        for j, _ in enumerate(query)
                    ])
                    target = target.view(-1)  # shape: N
                    loss[i] = F.cross_entropy(curr_score, target)
                else:
                    # build the label
                    if self.opts.model.sum_supp_sample:
                        support_y_neat = support_ys[i][:, ::
                                                       curr_shot]  # b, n_way
                        support_y_expand = support_y_neat.unsqueeze(1).expand(
                            batch_sz, query_sz, n_way)
                        query_y_expand = query_ys[i].unsqueeze(2).expand(
                            batch_sz, query_sz, n_way)
                    else:
                        # [b, support_sz] => [b, 1, support_sz] => [b, query_sz, support_sz]
                        support_y_expand = support_ys[i].unsqueeze(1).expand(
                            batch_sz, query_sz, support_sz)
                        # [b, query_sz] => [b, query_sz, 1] => [b, query_sz, support_sz]
                        query_y_expand = query_ys[i].unsqueeze(2).expand(
                            batch_sz, query_sz, support_sz)

                    # convert byte tensor to float tensor
                    label = torch.eq(support_y_expand, query_y_expand).float()
                    loss[i] = F.mse_loss(score[i], label)

            loss = (loss.sum() / len(support_xfs)).unsqueeze(0)

            return loss.unsqueeze(
                0)  # output size: 1 x 1 (or the number of losses)
        else:
            # TEST
            if self.opts.model.sum_supp_sample:
                score = score[0].unsqueeze(-1)
            else:
                # score shape: b, query_sz, n_way, k_shot
                score = score[0].view(batch_sz, query_sz, n_way, curr_shot)
            # pred_ind shape: b, query_sz
            if self.CE_loss:
                pred_ind = score.mean(dim=-1).argmax(dim=-1)
            else:
                pred_ind = score.sum(dim=-1).argmax(dim=-1)

            support_y_neat = support_ys[0][:, ::curr_shot]  # b, n_way
            pred = torch.stack([
                support_y_neat[b, ind] for b, query in enumerate(pred_ind)
                for ind in query
            ])
            pred = pred.view(batch_sz, -1)

            correct = torch.eq(pred, query_ys[0]).sum()
            correct = correct.unsqueeze(0)
            return pred, correct
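# Hedged illustration (not from the original code) of how the cross-entropy
# targets above are built: each query label is mapped to its index inside the
# per-episode support label list, so F.cross_entropy sees classes in [0, n_way).
import torch

support_y_neat = torch.tensor([[7, 3, 9]])     # b=1 episode, n_way=3 class ids
query_y = torch.tensor([[3, 9, 7, 3]])         # b=1, query_sz=4
target = torch.stack([
    torch.nonzero(torch.eq(support_y_neat[b], query_y[b, j]))
    for b, query in enumerate(query_y)
    for j, _ in enumerate(query)
]).view(-1)                                    # -> tensor([1, 2, 0, 1])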
Beispiel #45
0
            print('completed {:.0%} of epoch'.format(inter))
            inter += .10

        data = []
        result = []
        for sm in sample:
            imag, dat, res = sm
            data = dat
            result.append(res)

        for img in imag:
            preds = model.predict(img)
            preds = preds.astype('float').reshape(-1)
            preds = preds[0]

        target = torch.stack(result)
        target = target.view(-1)

        final_vars = torch.FloatTensor([[[abs(data - preds)]]])
        x = net(*final_vars)
        values, indices = x.max(1)
        loss = criterion(x, Variable(target))
        loss.backward()
        optimizer.step()
        net.zero_grad()

        if (i_batch > 10):
            break

    # &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&TESTING &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Beispiel #46
0
    def forward(self, x, temp=1):

        bs, N, T, ch, h, w = x.shape
        x = x.flatten(0, 1)

        if self.feature_dropout_prob > 0: # Note: Apply to q,k?
            x_att = self.feat_dropout(x)
        else:
            x_att = x

        # TODO: Check if normalization should also be done for the Value features. Maybe it doesn't matter.
        x_att = F.normalize(x_att, p=2, dim=-3) # Note: Layernorm

        # Option 1
        # qs = self.to_q(x_att[:, 1:].flatten(0, 1)).flatten(-2, -1) # [bsNT, ch, hw]
        # ks = self.to_k(x_att[:, :-1].flatten(0, 1)).flatten(-2, -1)
        # Option 2
        qs, ks = x_att[:, 1:].flatten(0, 1).flatten(-2, -1), x_att[:, :-1].flatten(0, 1).flatten(-2, -1)

        # Feature linear transformation + stacked positional encoding
        vs_pe = self.to_v(x.flatten(0, 1)).reshape(bs*N, T, -1, h*w)

        # As = self.affinity(ks, qs) # We track backwards!
        energy = torch.einsum('bcn,bcm->bnm', qs, ks).reshape(bs*N, T-1, h*w, h*w) * self.scale
        As = [self.stoch_mat(energy[:, t], temp=temp, do_dropout=True) for t in range(T-1)]

        # vs_list = torch.split(vs_pe, 1, dim=1)

        acc_state = vs_pe[:, :-self.n_timesteps+1, self.s_sta_dim:] # Note: Make sure it keeps the temporal encoding
        attn_vec = torch.stack(As, dim=1)
        for t in range(self.n_timesteps-1):
            if t + 2 < self.n_timesteps:
                curr_state, curr_attn = vs_pe[:, t+1:-self.n_timesteps+2+t, self.s_sta_dim:], attn_vec[:, t:-self.n_timesteps+2+t]
            else: curr_state, curr_attn = vs_pe[:, t+1:], attn_vec[:, t:]
            #  attn_vec has one less timestep, so the range is slightly different.

            acc_state = torch.cat([curr_state, torch.einsum('btcm,btnm->btcn', acc_state, curr_attn)], dim=2)

            # Note: IGNORE. For testing purposes: reconstruct t without (t) features
            # if t + 2 < self.n_timesteps:
            #     acc_state = torch.cat([curr_state, torch.einsum('btcm,btnm->btcn', acc_state, curr_attn)], dim=2)
            # else:
            #     acc_state = torch.einsum('btcm,btnm->btcn', acc_state, curr_attn)

        # # Note: IGNORE. For testing purposes: Simple version of the function
        # acc_state = vs_list[0]
        # for t in range(T-1):
        #
        #     if t < T-3: # Note: Test attention. If we reverse the dimensionality it works poorly. Difficult to know why
        #         acc_state = torch.cat([vs_list[t+1], torch.einsum('bcm,bnm->bcn', acc_state.squeeze(1), As[t]).unsqueeze(1)], dim=2)
        #     else:
        #         acc_state = torch.einsum('bcm,bnm->bcn', acc_state.squeeze(1), As[t]).unsqueeze(1)

        # Option: Self-attention from SAGAN
        # m_batchsize,C,width ,height = x.size()
        # proj_query  = self.query_conv(x).view(m_batchsize,-1,width*height).permute(0,2,1) # B X CX(N)
        # proj_key =  self.key_conv(x).view(m_batchsize,-1,width*height) # B X C x (*W*H)
        # energy =  torch.bmm(proj_query,proj_key) # transpose check
        # attention = self.softmax(energy) # BX (N) X (N)
        # proj_value = self.value_conv(x).view(m_batchsize,-1,width*height) # B X C X N
        #
        # out = torch.bmm(proj_value,attention.permute(0,2,1) )
        # out = out.view(m_batchsize,C,width,height)
        # out = self.gamma*out + x

        return acc_state
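# Hedged shape illustration for the propagation step above: acc_state holds
# (b, t, c, m) features and curr_attn holds (b, t, n, m) affinities, so the
# einsum 'btcm,btnm->btcn' warps features from m source to n target locations.
import torch

acc = torch.randn(2, 3, 8, 16)                             # (b, t, c, m)
attn = torch.softmax(torch.randn(2, 3, 16, 16), dim=-1)    # (b, t, n, m)
warped = torch.einsum('btcm,btnm->btcn', acc, attn)        # (2, 3, 8, 16)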
Beispiel #47
0
    def validation_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        #         tensorboard_logs = {"Validation/Loss": avg_loss}
        return {"avg_val_loss": avg_loss}  # "log": tensorboard_logs
Beispiel #48
0
    def forward(self, xh, xp_list):
        xp_att_list = [self.node_att(xp) for xp in xp_list]
        com_att = torch.max(torch.stack(xp_att_list, dim=1), dim=1, keepdim=False)[0]
        xph_message = sum([self.conv_ch(torch.cat([xh, xp*com_att], dim=1)) for xp in xp_list])
        return xph_message
cut_size = 44
total_epoch = 250

path = os.path.join(opt.dataset + '_' + opt.model)

# Data
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(44),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

transform_test = transforms.Compose([
    transforms.TenCrop(cut_size),
    transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])),
])

trainset = FER2013(split = 'Training', transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=opt.bs, shuffle=True, num_workers=1)
PublicTestset = FER2013(split = 'PublicTest', transform=transform_test)
PublicTestloader = torch.utils.data.DataLoader(PublicTestset, batch_size=opt.bs, shuffle=False, num_workers=1)
PrivateTestset = FER2013(split = 'PrivateTest', transform=transform_test)
PrivateTestloader = torch.utils.data.DataLoader(PrivateTestset, batch_size=opt.bs, shuffle=False, num_workers=1)

# Model
if opt.model == 'VGG19':
    net = VGG('VGG19')
elif opt.model  == 'Resnet18':
    net = ResNet18()
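# Hedged evaluation sketch (not part of the original script): with TenCrop,
# each loader batch has shape (bs, ncrops, c, h, w); fold the crops into the
# batch dimension and average the per-crop outputs.
for inputs, targets in PublicTestloader:
    bs, ncrops, c, h, w = inputs.size()
    outputs = net(inputs.view(-1, c, h, w))                # (bs*ncrops, n_classes)
    outputs_avg = outputs.view(bs, ncrops, -1).mean(1)     # (bs, n_classes)
    break  # one batch only, for illustration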
Beispiel #50
0
def initProb(sData, nTrain, nVal, var0,  alph, cvt):
    """
    initialize the OC problem that we want to solve

    :param sData:  str, name of the problem
    :param nTrain: int, number of samples in a batch, drawn from rho_0
    :param nVal:   int, number of validation samples to draw from rho_0
    :param var0: float, variance of rho_0
    :param alph:  list, 6-value list of parameters/hyperparameters
    :param cvt:   func, conversion function for typing and device
    :return:
        prob:  the problem Object
        x0:    nTrain -by- d tensor, training batch
        x0v:   nVal -by- d tensor, validation batch
        xInit: 1 -by- d tensor, center of rho_0
    """
    if sData == 'softcorridor':
        d = 4
        xtarget = cvt(torch.tensor([[2, 2, -2, 2]]))
        xInit   = cvt(torch.tensor([[-2, -2, 2, -2]]))
        x0      = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v     = xInit + cvt(var0 * torch.randn(nVal, d))
        prob    = Cross2D(xtarget, obstacle='softcorridor', alph_Q=alph[1], alph_W=alph[2], r=0.5)

    elif sData == 'swarm':

        nAgents = 32
        d = nAgents*3
        xtarget = cvt(torch.tensor([-2., 2., 8.,
                                    -1., 2., 8.,
                                     0., 2., 8.,
                                     1., 2., 8.,
                                     2., 2., 8.,
                                    -2.5, 3., 8.,
                                    -1.5, 3., 8.,
                                    -0.5, 3., 8.,
                                     0.5, 3., 8.,
                                     1.5, 3., 8.,
                                     2.5, 3., 8.,
                                    -2., 4., 8.,
                                    -1., 4., 8.,
                                     0., 4., 8.,
                                     1., 4., 8.,
                                     2., 4., 8.]))

        xtarget = torch.cat((xtarget.view(-1, 3), cvt(torch.tensor([0, -0.5, -3])) + xtarget.view(-1, 3)), dim=0).view(-1)


        halfTrain = nTrain // 2

        xInit = cvt(torch.tensor([1,-1,-1])) * xtarget.view(-1,3) + cvt(torch.tensor([0,0,10]))
        xInit = xInit.view(1,-1)

        x0     = xInit  + cvt( var0 * torch.randn(halfTrain, d))
        xmore  = xtarget + cvt(var0 * torch.randn(halfTrain, d))
        x0 = torch.cat((x0, xmore), dim=0)


        # validation samples from rho_0
        x0v     = xInit  + cvt( var0 * torch.randn(halfTrain, d))
        for j in range(nAgents):
            x0[ :,3*j+3:3*(j+1)] = 0. *  x0[ :,3*j+3:3*(j+1)]
            x0v[:,3*j+3:3*(j+1)] = 0. *  x0v[:,3*j+3:3*(j+1)]

        prob = SwarmTraj(xtarget, obstacle='blocks', alph_Q=alph[1],  alph_W=alph[2], r= 0.2)


    elif sData == 'swarm50':

        nAgents = 50
        d = nAgents*3

        xtarget = cvt(torch.tensor([-2., 2., 6.,
                                    -1., 2., 6.,
                                     0., 2., 6.,
                                     1., 2., 6.,
                                     2., 2., 6.,
                                     3., 2., 6.,
                                     4., 2., 6.,
                                    -2.5, 3., 7.,
                                    -1.5, 3., 7.,
                                    -0.5, 3., 7.,
                                     0.5, 3., 7.,
                                     1.5, 3., 7.,
                                     2.5, 3., 7.,
                                     3.5, 3., 7.,
                                    -2., 4., 8.,
                                    -1., 4., 8.,
                                     0., 4., 8.,
                                     1., 4., 8.,
                                     2., 4., 8.,
                                     3., 4., 8.,
                                     4., 4., 8.,
                                    -2., 3., 5.,
                                    -1., 3., 5.,
                                     1., 3., 5.,
                                     2., 3., 5.]))


        xtarget = torch.cat((xtarget.view(-1, 3), cvt(torch.tensor([0, -0.5, -3])) + xtarget.view(-1, 3)), dim=0).view(-1)

        halfTrain = nTrain // 2

        xInit = cvt(torch.tensor([1,-1,-1])) * xtarget.view(-1,3) + cvt(torch.tensor([0,0,10]))
        xInit = xInit.view(1,-1)

        x0     = xInit  + cvt( var0 * torch.randn(halfTrain, d))
        xmore  = xtarget + cvt(var0 * torch.randn(halfTrain, d))
        x0 = torch.cat((x0, xmore), dim=0)

        # validation samples from rho_0
        x0v     = xInit  + cvt( var0 * torch.randn(halfTrain, d))
        for j in range(nAgents):
            x0[ :,3*j+3:3*(j+1)] = 0. *  x0[ :,3*j+3:3*(j+1)]
            x0v[:,3*j+3:3*(j+1)] = 0. *  x0v[:,3*j+3:3*(j+1)]

        prob = SwarmTraj(xtarget, obstacle='blocks', alph_Q=alph[1],  alph_W=alph[2], r= 0.1)

    elif sData == 'singlequad':

        d = 12
        xtarget = cvt(torch.tensor([2., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

        init = -1.5
        xInit  = cvt(torch.tensor([init, init, init]))
        x0 = xInit + cvt(var0 * torch.randn(nTrain, 3))
        x0 = pad(x0, [0, d - 3, 0, 0], value=0)
        xInit = pad(xInit.view(1,-1), [0, d - 3, 0, 0], value=0)

        # validation samples from rho_0
        x0v = cvt(torch.tensor([init, init, init]) + var0 * torch.randn(nVal, 3))
        x0v = pad(x0v, [0, d - 3, 0, 0], value=0)

        prob = Quadcopter(xtarget,obstacle=None, alph_Q = 0.0, alph_W = 0.0)

    elif sData=='midcross2':
        nAgents = 2
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([2,2,-2,2]))
        xInit   = cvt(torch.tensor([-2,-2,2,-2])).view(1,-1)
        x0      = cvt(torch.tensor([-2,-2,2,-2]) + var0 * torch.randn(nTrain, d))
        x0v     = cvt(torch.tensor([-2,-2,2,-2]) + var0 * torch.randn(nVal, d))
        prob    = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2])

    elif sData == 'midcross4':
        nAgents = 4
        d = 2 * nAgents
        xx = torch.linspace(-2, 2, nAgents)
        xtarget = cvt(torch.stack((xx.flip(dims=[0]), 2 * torch.ones(nAgents)), dim=1).reshape(1,-1))
        xInit = cvt(torch.stack((xx, -2 * torch.ones(nAgents)), dim=1).reshape(1,-1))

        x0  = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.4)
    elif sData == 'midcross20':
        nAgents = 20
        d = 2 * nAgents
        xx = torch.linspace(-6, 6, nAgents)
        xtarget = cvt(torch.stack((xx.flip(dims=[0]), 6 * torch.ones(nAgents)), dim=1).reshape(1, -1))
        xInit = cvt(torch.stack((xx, -6 * torch.ones(nAgents)), dim=1).reshape(1, -1))

        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.15)
    elif sData == 'midcross30':
        nAgents = 30
        d = 2 * nAgents
        xx = torch.linspace(-6, 6, nAgents) 
        tmp = torch.tensor([6,4,2])
        tmp = tmp.view(-1,1).repeat(nAgents//3,1).view(-1)
        xtarget = cvt(torch.stack((xx.flip(dims=[0]), tmp), dim=1).reshape(1, -1))
        
        tmp = torch.tensor([-6,-4,-2])
        tmp = tmp.view(-1,1).repeat(nAgents//3,1).view(-1)
        xInit = cvt(torch.stack((xx, tmp), dim=1).reshape(1, -1))

        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.2)
    elif sData == 'swap2':
        nAgents = 2
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([10., 0., -10., 0.]))
        xInit = cvt(torch.tensor([-10., 0., 10., 0.])).reshape(1, -1)
        x0  = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle='hardcorridor', alph_Q=alph[1], alph_W=alph[2], r=1.0)
    elif sData == 'swap12':
        nAgents = 12
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([ 2,2, 0,0,  10,0, -10,0,   5,5, -5,-5,  -4,2, -6,-1,   5,-5, -5,5,   2,-2, -2,-2 ]))
        xInit   = cvt(torch.tensor([0,0, 2,2,  -10,0, 10,0,   -5,-5, 5,5,  -6,-1, -4,2,  -5,5, 5,-5,  -2,-2, 2,-2 ])).reshape(1,-1)
        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.5)
    elif sData == 'swap12_5pair':
        nAgents = 10
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([ 2,2, 0,0,  10,0, -10,0,   5,5, -5,-5,  -4,2, -6,-1,   5,-5, -5,5 ]))
        xInit   = cvt(torch.tensor([0,0, 2,2,  -10,0, 10,0,   -5,-5, 5,5,  -6,-1, -4,2,  -5,5, 5,-5 ])).reshape(1,-1)
        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.5)
    elif sData == 'swap12_4pair':
        nAgents = 8
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([ 2,2, 0,0,  10,0, -10,0,   5,5, -5,-5,  -4,2, -6,-1 ]))
        xInit   = cvt(torch.tensor([0,0, 2,2,  -10,0, 10,0,   -5,-5, 5,5,  -6,-1, -4,2])).reshape(1,-1)
        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.5)
    elif sData == 'swap12_3pair':
        nAgents = 6
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([ 2,2, 0,0,  10,0, -10,0,   5,5, -5,-5 ]))
        xInit   = cvt(torch.tensor([0,0, 2,2,  -10,0, 10,0,   -5,-5, 5,5 ])).reshape(1,-1)
        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.5)
    elif sData == 'swap12_2pair':
        nAgents = 4
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([ 2,2, 0,0,  10,0, -10,0 ]))
        xInit   = cvt(torch.tensor([0,0, 2,2,  -10,0, 10,0 ])).reshape(1,-1)
        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.5)
    elif sData == 'swap12_1pair':
        nAgents = 2
        d = 2 * nAgents
        xtarget = cvt(torch.tensor([ 2,2, 0,0 ]))
        xInit   = cvt(torch.tensor([0,0, 2,2 ])).reshape(1,-1)
        x0 = xInit + cvt(var0 * torch.randn(nTrain, d))
        x0v = xInit + cvt(var0 * torch.randn(nVal, d))
        prob = Cross2D(xtarget, obstacle=None, alph_Q=alph[1], alph_W=alph[2], r=0.5)
    else:
        print("incorrect value passed to --data")
        exit(1)


    return prob, x0, x0v, xInit
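# Hedged usage sketch (not from the original file).  `cvt` is assumed to be the
# dtype/device conversion closure described in the docstring, and the problem
# classes imported by the original module (e.g. Cross2D) are assumed available.
device = torch.device('cpu')
cvt = lambda x: x.to(device=device, dtype=torch.float32)
alph = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]          # placeholder 6-value list
prob, x0, x0v, xInit = initProb('midcross4', nTrain=64, nVal=64, var0=0.5,
                                alph=alph, cvt=cvt)
print(x0.shape, x0v.shape, xInit.shape)        # (64, 8), (64, 8), (1, 8)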
Beispiel #51
0
    def compute_JacobianM(self, pts2d1, pts2d2, pts2d1_normed, pts2d2_normed, fixM1, fixM2, intrinsic, t, ang, lagr):
        R = self.rot_from_axisangle(ang)
        T = self.t2T(t)

        derT0, derT1, derT2 = self.derivative_translate(intrinsic.device)
        rotxd, rotyd, rotzd = self.derivative_angle(ang)

        samplenum = pts2d1.shape[1]

        r_bias = (torch.norm(t) - 1)
        J_t0_bias = 2 * lagr * r_bias / torch.norm(t) * t[0]
        J_t1_bias = 2 * lagr * r_bias / torch.norm(t) * t[1]
        J_t2_bias = 2 * lagr * r_bias / torch.norm(t) * t[2]

        ## ============Compute DerivM2============ ##
        planeparam2 = torch.inverse(fixM2 @ intrinsic).T @ T @ R @ torch.inverse(intrinsic) @ pts2d1
        planeparam2_norm = torch.norm(planeparam2, dim=0, keepdim=True)
        planeparam2_normed = planeparam2 / planeparam2_norm
        rdist_2 = torch.sum(planeparam2_normed * pts2d2_normed, dim=0, keepdim=True)

        deriv_tonorm2 = 2 * rdist_2 * pts2d2_normed
        dtonx2, dtony2, dtonz2 = torch.split(deriv_tonorm2, 1, dim=0)
        px2, py2, pz2 = torch.split(planeparam2, 1, dim=0)

        deriv_px2 = dtonx2 / planeparam2_norm - torch.sum(px2 * planeparam2_normed * deriv_tonorm2, dim=0, keepdim=True) / (planeparam2_norm ** 2)
        deriv_py2 = dtony2 / planeparam2_norm - torch.sum(py2 * planeparam2_normed * deriv_tonorm2, dim=0, keepdim=True) / (planeparam2_norm ** 2)
        deriv_pz2 = dtonz2 / planeparam2_norm - torch.sum(pz2 * planeparam2_normed * deriv_tonorm2, dim=0, keepdim=True) / (planeparam2_norm ** 2)
        deriv_norm2 = torch.cat([deriv_px2, deriv_py2, deriv_pz2], dim=0)
        deriv_M2 = (deriv_norm2.T).unsqueeze(2) @ (pts2d1.T).unsqueeze(1) @ torch.inverse(intrinsic).T
        deriv_M2 = (torch.inverse(fixM2 @ intrinsic)).unsqueeze(0).expand([samplenum, -1, -1]) @ deriv_M2

        ## ============Compute DerivM1============ ##
        planeparam1 = (pts2d2.T @ torch.inverse(intrinsic).T @ T @ R @ torch.inverse(fixM1 @ intrinsic)).T
        planeparam1_norm = torch.norm(planeparam1, dim=0, keepdim=True)
        planeparam1_normed = planeparam1 / planeparam1_norm
        rdist_1 = torch.sum(planeparam1_normed * pts2d1_normed, dim=0, keepdim=True)

        deriv_tonorm1 = 2 * rdist_1 * pts2d1_normed
        dtonx1, dtony1, dtonz1 = torch.split(deriv_tonorm1, 1, dim=0)
        px1, py1, pz1 = torch.split(planeparam1, 1, dim=0)

        deriv_px1 = dtonx1 / planeparam1_norm - torch.sum(px1 * planeparam1_normed * deriv_tonorm1, dim=0, keepdim=True) / (planeparam1_norm ** 2)
        deriv_py1 = dtony1 / planeparam1_norm - torch.sum(py1 * planeparam1_normed * deriv_tonorm1, dim=0, keepdim=True) / (planeparam1_norm ** 2)
        deriv_pz1 = dtonz1 / planeparam1_norm - torch.sum(pz1 * planeparam1_normed * deriv_tonorm1, dim=0, keepdim=True) / (planeparam1_norm ** 2)
        deriv_norm1 = torch.cat([deriv_px1, deriv_py1, deriv_pz1], dim=0)
        deriv_M1 = (pts2d2.T).unsqueeze(2) @ (deriv_norm1.T).unsqueeze(1)
        deriv_M1 = deriv_M1 @ torch.inverse(fixM1 @ intrinsic).T
        deriv_M1 = (torch.inverse(intrinsic)).unsqueeze(0).expand([samplenum, -1, -1]) @ deriv_M1

        ## ============== ##
        J_t0 = torch.sum((deriv_M2 + deriv_M1) * (derT0 @ R), dim=[1, 2]) / samplenum + J_t0_bias / samplenum
        J_t1 = torch.sum((deriv_M2 + deriv_M1) * (derT1 @ R), dim=[1, 2]) / samplenum + J_t1_bias / samplenum
        J_t2 = torch.sum((deriv_M2 + deriv_M1) * (derT2 @ R), dim=[1, 2]) / samplenum + J_t2_bias / samplenum

        J_ang0 = torch.sum((deriv_M2 + deriv_M1) * (T @ rotxd), dim=[1, 2]) / samplenum
        J_ang1 = torch.sum((deriv_M2 + deriv_M1) * (T @ rotyd), dim=[1, 2]) / samplenum
        J_ang2 = torch.sum((deriv_M2 + deriv_M1) * (T @ rotzd), dim=[1, 2]) / samplenum

        JacobM = torch.stack([J_ang0, J_ang1, J_ang2, J_t0, J_t1, J_t2], dim=1)
        residual = (rdist_2 ** 2 / samplenum + rdist_1 ** 2 / samplenum + lagr * r_bias ** 2 / samplenum)
        return JacobM, residual.T
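# Hedged sketch of a damped Gauss-Newton step consuming the (JacobM, residual)
# pair returned above; the damping constant and update rule are assumptions,
# not taken from this repository.
def gauss_newton_step(JacobM, residual, damping=1e-4):
    # JacobM: (samplenum, 6) Jacobian, residual: (samplenum, 1) residuals
    JtJ = JacobM.T @ JacobM + damping * torch.eye(6, device=JacobM.device)
    Jtr = JacobM.T @ residual
    delta = torch.linalg.solve(JtJ, Jtr)       # (6, 1) update for (ang, t)
    return -delta.squeeze(1)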
def fit_tau(X_t,X_tp1, tau, tau_t, vee, opt_tau, opt_vee, device, epoch):
    """
    Meant to be called on a batch of states to estimate the batch gradient wrt tau and v.
    """
    alpha_t = 1. / np.sqrt(epoch+1)
    lam = 10

    tau_xt = tau(X_t)
    tau_xtp1 = tau(X_tp1)
    tau_t_xt = tau_t(X_t)
    tau_t_xtp1 = tau_t(X_tp1)
    v = vee

    grad_theta_tau_xt, grad_theta_tau_xtp1 = defaultdict(list),defaultdict(list)

    for i in range(len(X_t)):
        opt_tau.zero_grad()
        tau_xt.backward([torch.FloatTensor([[1] if i==j else [0] for j in range(len(tau_xt))]).to(device)],retain_graph=True)

        for param in tau.named_parameters():
            grad_theta_tau_xt[param[0]].append(param[1].grad.clone())

        opt_tau.zero_grad()
        tau_xtp1.backward([torch.FloatTensor([[1] if i==j else [0] for j in range(len(tau_xtp1))]).to(device)],retain_graph=True)
        for param in tau.named_parameters():
            grad_theta_tau_xtp1[param[0]].append(param[1].grad.clone())
        
    opt_tau.zero_grad()
    opt_vee.zero_grad()

    avg_grad_J_tau = []
    avg_grad_J_v = []

    for param in tau.named_parameters():
        """
        grad_theta: n_batch x n_out x n_in (matrix)
                    n_batch x n_out (bias)
        """
        grad_theta_tau_xt_MAT = torch.stack(grad_theta_tau_xt[param[0]])
        grad_theta_tau_xtp1_MAT = torch.stack(grad_theta_tau_xtp1[param[0]])

        """
        Defined both gradients as in Eq.17
        """

        if len(grad_theta_tau_xt_MAT.shape) == 3: # Matrix
            tiled_tau_xt =  tau_xt.repeat(grad_theta_tau_xt_MAT.shape[1],1,grad_theta_tau_xt_MAT.shape[2]).permute(1,0,2)
            tiled_tau_xtp1 =  tau_xtp1.repeat(grad_theta_tau_xtp1_MAT.shape[1],1,grad_theta_tau_xtp1_MAT.shape[2]).permute(1,0,2)
            tiled_tau_t_xt =  tau_t_xt.repeat(grad_theta_tau_xt_MAT.shape[1],1,grad_theta_tau_xt_MAT.shape[2]).permute(1,0,2)
            tiled_tau_t_xtp1 =  tau_t_xtp1.repeat(grad_theta_tau_xtp1_MAT.shape[1],1,grad_theta_tau_xtp1_MAT.shape[2]).permute(1,0,2)
        else: # Bias
            tiled_tau_xt =  tau_xt.repeat(1,grad_theta_tau_xt_MAT.shape[1])
            tiled_tau_xtp1 =  tau_xtp1.repeat(1,grad_theta_tau_xtp1_MAT.shape[1])
            tiled_tau_t_xt =  tau_t_xt.repeat(1,grad_theta_tau_xt_MAT.shape[1])
            tiled_tau_t_xtp1 =  tau_t_xtp1.repeat(1,grad_theta_tau_xtp1_MAT.shape[1])

        grad_J_tau = (tiled_tau_xt * grad_theta_tau_xt_MAT).mean(0) - (1 - alpha_t) * (tiled_tau_t_xt * grad_theta_tau_xt_MAT).mean(0) - alpha_t * (tiled_tau_t_xtp1 * grad_theta_tau_xtp1_MAT).mean(0) + 2*lam*v*grad_theta_tau_xt_MAT.mean(0)
        grad_J_v = - (2 * lam * (tau_xt.mean() - 1 - v))
        param[1].grad = grad_J_tau
        vee.grad = grad_J_v

        avg_grad_J_tau.append( grad_J_tau.mean().item() )
        avg_grad_J_v.append( grad_J_v.mean().item() )
        

    opt_tau.step()
    opt_vee.step()

    tau_t.load_state_dict(tau.state_dict())

    return np.mean(avg_grad_J_tau), np.mean(avg_grad_J_v)
    def validation_epoch_end(self, outputs):
        acc = torch.stack([x['acc'] for x in outputs]).mean()
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_ce_loss': val_loss, 'val_acc': acc}
        progress_bar_metrics = tensorboard_logs
        return {'val_loss': val_loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}

    def test_epoch_end(self, outputs):
        acc = torch.stack([x['acc'] for x in outputs]).mean()
        test_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        tensorboard_logs = {'test_ce_loss': test_loss, 'test_acc': acc}
        progress_bar_metrics = tensorboard_logs
        return {'test_loss': test_loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}
Beispiel #55
0
    def forward(self, img, save_path=None, return_prob=False):
        """Run MTCNN face detection on a PIL image. This method performs both detection and
        extraction of faces, returning tensors representing detected faces rather than the bounding
        boxes. To access bounding boxes, see the MTCNN.detect() method below.

        Arguments:
            img {PIL.Image} -- A PIL image.

        Keyword Arguments:
            save_path {str} -- An optional save path for the cropped image. Note that when
                self.prewhiten=True, although the returned tensor is prewhitened, the saved face
                image is not, so it is a true representation of the face in the input image.
                (default: {None})
            return_prob {bool} -- Whether or not to return the detection probability.
                (default: {False})

        Returns:
            Union[torch.Tensor, tuple(torch.tensor, float)] -- If detected, cropped image of a face
                with dimensions 3 x image_size x image_size. Optionally, the probability that a
                face was detected. If self.keep_all is True, n detected faces are returned in an
                n x 3 x image_size x image_size tensor with an optional list of detection
                probabilities.
        Example:
        >>> from facenet_pytorch import MTCNN
        >>> mtcnn = MTCNN()
        >>> face_tensor, prob = mtcnn(img, save_path='face.png', return_prob=True)
        """

        with torch.no_grad():
            boxes, probs = self.detect(img)

            if boxes is None:
                if return_prob:
                    return None, [None] if self.keep_all else None
                else:
                    return None

            if not self.keep_all:
                boxes = boxes[[0]]

            faces = []
            for i, box in enumerate(boxes):
                face_path = save_path
                if save_path is not None and i > 0:
                    save_name, ext = os.path.splitext(save_path)
                    face_path = save_name + '_' + str(i + 1) + ext

                face = extract_face(img, box, self.image_size, self.margin, face_path)
                if self.prewhiten:
                    face = prewhiten(face)
                faces.append(face)

            if self.keep_all:
                faces = torch.stack(faces)
            else:
                faces = faces[0]
                probs = probs[0]

            if return_prob:
                return faces, probs
            else:
                return faces
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["labels"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}
Beispiel #57
0
    def get_protein_foreground_probability(
        self,
        adata: Optional[AnnData] = None,
        indices: Optional[Sequence[int]] = None,
        transform_batch: Optional[Sequence[Union[Number, str]]] = None,
        protein_list: Optional[Sequence[str]] = None,
        n_samples: int = 1,
        batch_size: Optional[int] = None,
        return_mean: bool = True,
        return_numpy: Optional[bool] = None,
    ):
        r"""
        Returns the foreground probability for proteins.

        This is denoted as :math:`(1 - \pi_{nt})` in the totalVI paper.

        Parameters
        ----------
        adata
            AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
            AnnData object used to initialize the model.
        indices
            Indices of cells in adata to use. If `None`, all cells are used.
        transform_batch
            Batch to condition on.
            If transform_batch is:

            - None, then real observed batch is used
            - int, then batch transform_batch is used
            - List[int], then average over batches in list
        protein_list
            Return protein expression for a subset of genes.
            This can save memory when working with large datasets and few genes are
            of interest.
        n_samples
            Number of posterior samples to use for estimation.
        batch_size
            Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
        return_mean
            Whether to return the mean of the samples.
        return_numpy
            Return a :class:`~numpy.ndarray` instead of a :class:`~pandas.DataFrame`. DataFrame includes
            gene names as columns. If either `n_samples=1` or `return_mean=True`, defaults to `False`.
            Otherwise, it defaults to `True`.

        Returns
        -------
        - **foreground_probability** - probability foreground for each protein

        If `n_samples` > 1 and `return_mean` is False, then the shape is `(samples, cells, genes)`.
        Otherwise, shape is `(cells, genes)`. In this case, return type is :class:`~pandas.DataFrame` unless `return_numpy` is True.
        """
        adata = self._validate_anndata(adata)
        post = self._make_scvi_dl(adata=adata,
                                  indices=indices,
                                  batch_size=batch_size)

        if protein_list is None:
            protein_mask = slice(None)
        else:
            all_proteins = adata.uns["scvi_protein_names"]
            protein_mask = [
                True if p in protein_list else False for p in all_proteins
            ]

        if n_samples > 1 and return_mean is False:
            if return_numpy is False:
                logger.warning(
                    "return_numpy must be True if n_samples > 1 and return_mean is False, returning np.ndarray"
                )
            return_numpy = True
        if indices is None:
            indices = np.arange(adata.n_obs)

        py_mixings = []
        if not isinstance(transform_batch, IterableClass):
            transform_batch = [transform_batch]

        transform_batch = _get_batch_code_from_category(adata, transform_batch)
        for tensors in post:
            x = tensors[_CONSTANTS.X_KEY]
            y = tensors[_CONSTANTS.PROTEIN_EXP_KEY]
            batch_index = tensors[_CONSTANTS.BATCH_KEY]
            label = tensors[_CONSTANTS.LABELS_KEY]
            py_mixing = torch.zeros_like(y[..., protein_mask])
            if n_samples > 1:
                py_mixing = torch.stack(n_samples * [py_mixing])
            for b in transform_batch:
                outputs = self.model.inference(
                    x,
                    y,
                    batch_index=batch_index,
                    label=label,
                    n_samples=n_samples,
                    transform_batch=b,
                )
                py_mixing += torch.sigmoid(
                    outputs["py_"]["mixing"])[..., protein_mask]
            py_mixing /= len(transform_batch)
            py_mixings += [py_mixing.cpu()]
        if n_samples > 1:
            # concatenate along batch dimension -> result shape = (samples, cells, features)
            py_mixings = torch.cat(py_mixings, dim=1)
            # (cells, features, samples)
            py_mixings = py_mixings.permute(1, 2, 0)
        else:
            py_mixings = torch.cat(py_mixings, dim=0)

        if return_mean is True and n_samples > 1:
            py_mixings = torch.mean(py_mixings, dim=-1)

        py_mixings = py_mixings.cpu().numpy()

        if return_numpy is True:
            return 1 - py_mixings
        else:
            pro_names = self.adata.uns["scvi_protein_names"]
            foreground_prob = pd.DataFrame(
                1 - py_mixings,
                columns=pro_names[protein_mask],
                index=adata.obs_names[indices],
            )
            return foreground_prob
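# Hedged usage sketch: `model` and `adata` stand for a trained totalVI-style
# model and its AnnData object (assumptions, not defined in this snippet).
fg_prob = model.get_protein_foreground_probability(adata, n_samples=25,
                                                   return_mean=True)
print(fg_prob.head())   # (cells x proteins) DataFrame of 1 - pi_nt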
Beispiel #58
0
    def forward(self, in_modalities):
        umask = in_modalities[-1]
        in_modalities = in_modalities[:-2]

        batch_size = in_modalities[0].shape[0]
        time_stamps = in_modalities[0].shape[1]

        #Unimodal
        all_h = []

        for modality, dim, lstm, dropout, fc in zip(in_modalities,
                                                    self.hidden_dims,
                                                    self.lstms, self.drop_outs,
                                                    self.fcs):

            self.h = torch.zeros(batch_size, dim).to(self.device)
            self.c = torch.zeros(batch_size, dim).to(self.device)

            h = []
            for t in range(time_stamps):
                #Apply the mask directly on the data
                input_u = modality[:, t, :] * umask[:, t].unsqueeze(dim=-1)
                self.h, self.c = lstm(input_u, (self.h, self.c))
                self.h = torch.tanh(self.h)
                self.h = dropout(self.h)
                h.append(torch.tanh(fc(self.h)))

            all_h.append(h)

        #Multimodal
        utterance_features = [torch.stack(h, dim=-2) for h in all_h]

        dialogue_utterance_feature = torch.cat(utterance_features, dim=-1)

        self.h_dialogue = torch.zeros(batch_size,
                                      self.dialogue_hidden_dim).to(self.device)
        self.c_dialogue = torch.zeros(batch_size,
                                      self.dialogue_hidden_dim).to(self.device)

        all_h_dialogue = []
        for t in range(time_stamps):
            input_m = dialogue_utterance_feature[:, t, :] * umask[:,
                                                                  t].unsqueeze(
                                                                      dim=-1)
            self.h_dialogue, self.c_dialogue = self.dialogue_lstm(
                input_m, (self.h_dialogue, self.c_dialogue))
            self.h_dialogue = self.drop_out(self.h_dialogue)
            all_h_dialogue.append(torch.tanh(self.fc_out(self.h_dialogue)))

        output_emo = [self.smax_fc_emo(_h) for _h in all_h_dialogue]

        output_act = [self.smax_fc_act(_h) for _h in all_h_dialogue]

        #Stack hidden states
        output_emo = torch.stack(output_emo, dim=-2)

        output_act = torch.stack(output_act, dim=-2)

        log_prob_emo = F.log_softmax(output_emo,
                                     2)  # batch, seq_len,  n_classes

        log_prob_act = F.log_softmax(output_act,
                                     2)  # batch, seq_len,  n_classes

        return log_prob_emo, log_prob_act
Beispiel #59
0
            # Add to buffer.
            instruction_data_cuda = [
                torch.tensor(t, dtype=torch.float, device=device)
                for t in instruction_data
            ]
            replay_buffer.append(instruction_data_cuda)

            # Check for minimum replay size.
            if len(replay_buffer) < REPLAY_MIN:
                print('Waiting for minimum buffer size ... {}/{}'.format(
                    len(replay_buffer), REPLAY_MIN))
                continue

            # Sample training mini-batch.
            sampled_evaluations = replay_buffer.sample(REPLAY_SAMPLE_SIZE)
            sampled_contexts = torch.stack([t[0] for t in sampled_evaluations])
            sampled_states = torch.stack([t[1] for t in sampled_evaluations])
            sampled_params = torch.stack([t[2] for t in sampled_evaluations])
            sampled_values = torch.stack([t[3] for t in sampled_evaluations])

            # Update critic.
            critic_loss = torch.distributions.Normal(*critic_model(sampled_contexts, sampled_states, sampled_params)) \
                    .log_prob(sampled_values).mean(dim=-1)
            critic_model_optimizer.zero_grad()
            gen_model_optimizer.zero_grad()
            (-critic_loss).backward()
            torch.nn.utils.clip_grad_norm_(critic_model.parameters(), 1.0)
            critic_model_optimizer.step()

            # Update params model.
            (macro_actions, macro_actions_entropy) = gen_model.rsample(
Beispiel #60
0
    def __init__(self, opts):
        super(CTMNet, self).__init__()

        self.opts = opts
        if self.opts.fsl.ctm:
            # use forward_CTM method
            self.epsilon = .0001
            self.L = 5
            self.no_bp_P_L = False
            self.deactivate_CE = self.opts.ctmnet.deactivate_CE
            self.use_OT_net = self.opts.ctmnet.use_OT
            self.pred_source = self.opts.ctmnet.pred_source  # 'both'   # 'score', 'dist', 'both'

            self.use_relation_net = self.opts.ctmnet.CE_use_relation
            self.dnet = self.opts.ctmnet.dnet  # dnet or baseline
            self.dnet_out_c = self.opts.ctmnet.dnet_out_c  # define the reshaper
            try:
                self.dnet_supp_manner = self.opts.ctmnet.dnet_supp_manner
                self.mp_mean = self.opts.ctmnet.dnet_mp_mean
                self.delete_mp = self.opts.ctmnet.dnet_delete_mp
                self.use_discri_loss = self.opts.ctmnet.use_discri_loss
                self.discri_random_target = self.opts.ctmnet.discri_random_target
                self.discri_random_weight = self.opts.ctmnet.discri_random_weight
                self.discri_test_update = self.opts.ctmnet.discri_test_update
                self.discri_test_update_fac = self.opts.ctmnet.discri_test_update_fac
                self.discri_see_weights = self.opts.ctmnet.discri_see_weights
                self.discri_zz = self.opts.ctmnet.zz
            except AttributeError:
                self.use_discri_loss = False
            try:
                self.baseline_manner = self.opts.ctmnet.baseline_manner
            except AttributeError:
                self.baseline_manner = ''
        else:
            self.CE_loss = opts.fsl.CE_loss
            self.swap = opts.fsl.swap
            if self.swap:
                self.swap_num = opts.fsl.swap_num

        _logger = opts.logger
        _logger('Building up models ...')
        # feature extractor
        in_c = 1 if opts.dataset.name == 'omniglot' else 3
        print("-----------------CNN ENCODER-----------------")
        self.repnet = feat_extract(self.opts.model.resnet_pretrain,
                                   opts=opts,
                                   structure=opts.model.structure,
                                   in_c=in_c)

        input_bs = opts.fsl.n_way[0] * opts.fsl.k_shot[0]
        random_input = torch.rand(input_bs, in_c, opts.data.im_size,
                                  opts.data.im_size)
        repnet_out = self.repnet(random_input)
        repnet_sz = repnet_out.size()
        assert repnet_sz[2] == repnet_sz[3]
        _logger('\trepnet output sz: {} (assume bs=n_way*k_shot)'.format(
            repnet_sz))
        self.c = repnet_sz[1]  # supposed to be 64
        self.d = repnet_sz[2]

        if self.opts.fsl.ctm:
            if self.use_OT_net:
                self.inplanes = 4 * self.c
                self.critic_sup = nn.Sequential(
                    self._make_layer(Bottleneck, 128, 4, stride=1),
                    self._make_layer(Bottleneck, 64, 3, stride=1))
                self.inplanes = 4 * self.c
                self.critic_que = nn.Sequential(
                    self._make_layer(Bottleneck, 128, 4, stride=1),
                    self._make_layer(Bottleneck, 64, 3, stride=1))
            _embedding = repnet_out

            if self.baseline_manner == 'sample_wise_similar':
                assert self.opts.model.structure == 'shallow'
                input_c = _embedding.size(1)
                self.additional_repnet = nn.Sequential(
                    nn.Conv2d(input_c, input_c, kernel_size=3, padding=1),
                    nn.BatchNorm2d(input_c, momentum=1, affine=True),
                    nn.ReLU())

            # RESHAPER
            if not (not self.dnet and self.baseline_manner == 'no_reshaper'):
                assert np.mod(self.dnet_out_c, 4) == 0
                out_size = int(self.dnet_out_c / 4)
                self.inplanes = _embedding.size(1)
                if self.opts.model.structure.startswith('resnet'):
                    self.reshaper = nn.Sequential(
                        self._make_layer(Bottleneck, out_size * 2, 3,
                                         stride=1),
                        self._make_layer(Bottleneck, out_size, 2, stride=1))
                else:
                    print("-----------------RESHAPER-----------------")
                    self.reshaper = self._make_layer(Bottleneck,
                                                     out_size,
                                                     4,
                                                     stride=1,
                                                     name="reshaper")
                _out_downsample = self.reshaper(_embedding)

            # CONCENTRATOR AND PROJECTOR
            if self.dnet:
                if self.mp_mean:  ## mp = main_component
                    self.inplanes = _embedding.size(1)
                else:
                    # concatenate along the channel for all samples in each class
                    self.inplanes = self.opts.fsl.k_shot[0] * _embedding.size(
                        1)
                if self.opts.model.structure.startswith('resnet'):
                    self.main_component = nn.Sequential(
                        self._make_layer(Bottleneck, out_size * 2, 3,
                                         stride=1),
                        self._make_layer(Bottleneck, out_size, 2, stride=1))
                else:
                    print("-----------------CONCENTRATOR-----------------")
                    tmp_inplaces = self.inplanes
                    self.main_component1 = self._make_layer(
                        Bottleneck,
                        out_size,
                        4,
                        stride=1,
                        name="concentrator2",
                        change_inplanes=True)
                    self.inplanes = tmp_inplaces
                    self.main_component2 = self._make_layer(
                        Bottleneck_k5,
                        out_size,
                        4,
                        stride=1,
                        name="concentrator1",
                        change_inplanes=True)

                # projector
                if self.delete_mp:  ## mp = main_component
                    # for k=1 one-shot learning the concentrator is not needed
                    assert self.opts.fsl.k_shot[0] == 1
                    del self.main_component
                    # input_c for Projector, no mp
                    self.inplanes = self.opts.fsl.n_way[0] * _embedding.size(1)
                else:
                    # input_c for Projector, has mp
                    self.inplanes = self.opts.fsl.n_way[0] * out_size * 4

                if self.opts.model.structure.startswith('resnet'):
                    self.projection = nn.Sequential(
                        self._make_layer(Bottleneck, out_size * 2, 3,
                                         stride=1),
                        self._make_layer(Bottleneck, out_size, 2, stride=1))
                else:
                    print("-----------------PROJECTOR-----------------")
                    self.projection = self._make_layer(Bottleneck,
                                                       out_size,
                                                       4,
                                                       stride=1,
                                                       name="projector")

                # deprecated; kept for legacy
                if self.use_discri_loss:
                    # 40 x 19 x 19 = 14440
                    input_c = _out_downsample.size(1) * _out_downsample.size(
                        2) * _out_downsample.size(2)
                    if self.discri_zz:
                        self.disc_fc = nn.ModuleList([
                            nn.Sequential(
                                nn.Linear(input_c, int(input_c / 8)),
                                nn.BatchNorm1d(int(input_c / 8)),
                                nn.ReLU(),
                            ),
                            MyLinear(int(input_c / 8),
                                     256,
                                     True,
                                     reset_each_iter=self.discri_random_weight)
                        ])
                    else:
                        self.disc_fc = nn.ModuleList([
                            nn.Sequential(nn.Linear(input_c, int(input_c / 8)),
                                          nn.BatchNorm1d(int(input_c / 8)),
                                          nn.ReLU(),
                                          nn.Linear(int(input_c / 8), 256),
                                          nn.BatchNorm1d(256), nn.ReLU()),
                            # nn.Linear(int(input_c/8), self.opts.fsl.n_way[0]),
                            MyLinear(256,
                                     self.opts.fsl.n_way[0],
                                     bias=(not self.discri_random_weight),
                                     reset_each_iter=self.discri_random_weight)
                        ])

            # RELATION METRIC
            if self.use_relation_net:
                # relation sub_net
                if hasattr(self, 'reshaper'):
                    _input = _out_downsample
                else:
                    _input = _embedding

                if self.opts.model.relation_net == 'res_block':
                    # (256); it is "2" because combining two embedding
                    self.inplanes = 2 * _input.size(1)
                    self.relation1 = self._make_layer(Bottleneck,
                                                      32,
                                                      2,
                                                      stride=2)
                    self.relation2 = self._make_layer(Bottleneck,
                                                      16,
                                                      2,
                                                      stride=1)

                    _combine = torch.stack([_input, _input],
                                           dim=1).view(_input.size(0), -1,
                                                       _input.size(2),
                                                       _input.size(3))
                    _out = self.relation2(self.relation1(_combine))
                    self.fc_input_c = _out.size(1) * _out.size(2) * _out.size(
                        3)
                    _half = int(self.fc_input_c / 2)
                    self.fc = nn.Sequential(nn.Linear(self.fc_input_c, _half),
                                            nn.BatchNorm1d(_half),
                                            nn.ReLU(inplace=True),
                                            nn.Linear(_half, 1))
                elif self.opts.model.relation_net == 'simple':
                    input_c = 2 * _input.size(1)
                    self.relation1 = nn.Sequential(
                        nn.Conv2d(input_c, 64, kernel_size=3, padding=1),
                        nn.BatchNorm2d(64, momentum=1, affine=True), nn.ReLU(),
                        nn.MaxPool2d(2))
                    self.relation2 = nn.Sequential(
                        nn.Conv2d(64, 64, kernel_size=3, padding=1),
                        nn.BatchNorm2d(64, momentum=1, affine=True),
                        nn.ReLU(),
                        # nn.MaxPool2d(2)
                    )

                    _combine = torch.stack([_input, _input],
                                           dim=1).view(_input.size(0), -1,
                                                       _input.size(2),
                                                       _input.size(3))
                    _out = self.relation2(self.relation1(_combine))
                    self.fc_input_c = _out.size(1) * _out.size(2) * _out.size(
                        3)
                    _half = int(self.fc_input_c / 2)
                    self.fc = nn.Sequential(
                        nn.Linear(self.fc_input_c, _half),
                        nn.ReLU(),
                        nn.Linear(_half,
                                  1),  # except no sigmoid since we use CE
                    )
        else:
            # the original relation network
            self.inplanes = 2 * self.c
            # the original network in the relation net
            # after the relation module (three layers)
            self.relation1 = self._make_layer(Bottleneck, 128, 4, stride=2)
            self.relation2 = self._make_layer(Bottleneck, 64, 3, stride=2)

            if self.CE_loss:
                self.fc = nn.Sequential(nn.Linear(256, 64), nn.BatchNorm1d(64),
                                        nn.ReLU(inplace=True),
                                        nn.Linear(64, 1))
            else:
                self.fc = nn.Sequential(
                    nn.Linear(256, 64),
                    nn.BatchNorm1d(64),
                    nn.ReLU(inplace=True),
                    nn.Linear(64, 1),
                    nn.Sigmoid()  # the only difference
                )
            combine = torch.stack([repnet_out, repnet_out],
                                  dim=1).view(repnet_out.size(0), -1,
                                              repnet_out.size(2),
                                              repnet_out.size(3))
            out = self.relation2(self.relation1(combine))
            _logger('\tafter layer5 sz: {} (assume bs=2)\n'.format(out.size()))
            self.pool_size = out.size(2)

        self._initialize_weights()