def get_triplet_batch(self, cross_modal_pairs, gpu=False):
    """Yield shuffled mini-batches of (anchor, positive, negative) triplets.

    This is a generator: nothing in the body, including argument
    validation, runs until the first batch is requested.

    Args:
        cross_modal_pairs: list of keys into ``self.triplets``. Element 0
            of each key is compared with ``self.x_to_y_name[0]`` /
            ``self.y_to_x_name[0]`` to pick the anchor data, and element 1
            is compared with ``self.x_to_y_name[1]`` to pick the data the
            positives and negatives are drawn from.
        gpu: forwarded unchanged to ``to_tensor``.

    Yields:
        dict mapping every requested pair to ``(anch, pos, neg)``. ``anch``
        and ``pos`` are outputs of ``to_tensor``. ``neg`` is an output of
        ``to_tensor`` when ``self.sampling_method`` is ``'random'``;
        otherwise it is the repeated NumPy array taken from
        ``triplets[2]``.

    Raises:
        AssertionError: if ``cross_modal_pairs`` is not a list.
        Exception: if no pair starts with a known modality name.
    """
    assert isinstance(cross_modal_pairs, list)

    modalities = {pair[0] for pair in cross_modal_pairs}
    has_x = self.x_to_y_name[0] in modalities
    has_y = self.y_to_x_name[0] in modalities
    if has_x and has_y:
        longest_modality = max(self.x_len, self.y_len)
    elif has_x:
        longest_modality = self.x_len
    elif has_y:
        longest_modality = self.y_len
    else:
        # Report the argument itself; the per-pair loop variable is not
        # bound yet at this point.
        raise Exception(
            'No modalities found in cross_modal_pairs: {}.'.format(
                cross_modal_pairs))

    # Enough batches to cover the longer of the requested modalities.
    num_batches = int(np.ceil(longest_modality / self.batch_size))

    # Independent random visiting order for each modality.
    x_idxs = np.arange(self.x_len)
    y_idxs = np.arange(self.y_len)
    np.random.shuffle(x_idxs)
    np.random.shuffle(y_idxs)

    for i in range(num_batches):
        x_start, x_end = self._get_triplet_batch_start_end(i, self.x_len)
        y_start, y_end = self._get_triplet_batch_start_end(i, self.y_len)

        batch_dict = {}
        for cross_modal_pair in cross_modal_pairs:
            # Anchor side is chosen by the first element of the pair.
            if cross_modal_pair[0] == self.x_to_y_name[0]:
                anch_vals = self._x
                start, end = x_start, x_end
                idxs = x_idxs
            else:
                anch_vals = self._y
                start, end = y_start, y_end
                idxs = y_idxs

            # Positive/negative side is chosen by the second element.
            if cross_modal_pair[1] == self.x_to_y_name[1]:
                other_vals = self._y
            else:
                other_vals = self._x

            triplets = self.triplets[cross_modal_pair]
            batch_sel = idxs[start:end]

            # Each anchor is repeated num_triplets times so it lines up
            # with the flattened positive indices.
            batch_anchors_idxs = np.repeat(triplets[0][batch_sel],
                                           self.num_triplets)
            batch_pos_idxs = triplets[1][batch_sel].flatten()

            anch = to_tensor(anch_vals[batch_anchors_idxs, :], gpu=gpu)
            pos = to_tensor(other_vals[batch_pos_idxs, :], gpu=gpu)

            if self.sampling_method in ['random']:
                batch_neg_idxs = triplets[2][batch_sel].flatten()
                neg = to_tensor(other_vals[batch_neg_idxs, :], gpu=gpu)
            else:
                # NOTE(review): in this mode `neg` stays a NumPy array and
                # is not passed through to_tensor -- presumably resolved
                # by the consumer; confirm against the caller.
                neg = np.repeat(triplets[2][batch_sel], self.num_triplets)

            batch_dict[cross_modal_pair] = (anch, pos, neg)
        yield batch_dict
def _prepare_data(img, img_transform, cfg, device):
    """Turn one image into the single-sample input dict used for testing.

    Args:
        img: image object exposing ``.shape``; passed to ``img_transform``.
        img_transform: callable returning
            ``(img, img_shape, pad_shape, scale_factor)``.
        cfg: config object; ``cfg.data.test.img_scale`` and the optional
            ``resize_keep_ratio`` entry (default ``True``) are read.
        device: target passed to ``.to(...)`` on the converted image.

    Returns:
        dict with ``img`` (a one-element list holding the converted image
        with a leading dimension added by ``unsqueeze(0)``) and
        ``img_meta`` (a list containing a one-element list of metadata).
    """
    # Capture the shape before the transform rebinds the image.
    original_shape = img.shape
    test_cfg = cfg.data.test

    transformed, img_shape, pad_shape, scale_factor = img_transform(
        img,
        scale=test_cfg.img_scale,
        keep_ratio=test_cfg.get('resize_keep_ratio', True))

    batched = to_tensor(transformed).to(device).unsqueeze(0)

    meta = {
        'ori_shape': original_shape,
        'img_shape': img_shape,
        'pad_shape': pad_shape,
        'scale_factor': scale_factor,
        'flip': False,
    }
    return {'img': [batched], 'img_meta': [[meta]]}
def forward(self, x): x1 = self.fc(x) if self.add_batch_norm: x1 = self.batch_norm(x1) x = th.cat((x, x1), 1) return F.glu(x, 1) if __name__ == '__main__': sme = Single_Modality_Embedding([1024, 256], 4) from datasets import to_tensor sme(to_tensor(np.random.rand(64, 1024))) modality_dict = { 'text': { 'layer_sizes': [200, 256], 'num_layers': 2 }, 'visual': { 'layer_sizes': [1024, 256], 'num_layers': 2 } } mmen = MMEN(modality_dict) xs = mmen([{ 'text': to_tensor(np.random.rand(64, 200)) }, { 'visual': to_tensor(np.random.rand(64, 1024))
't': { 'num_layers': 2, 'layer_sizes': [200, 256] }, 'v': { 'num_layers': 2, 'layer_sizes': [1024, 256] } }, 'noun': { 't': { 'num_layers': 2, 'layer_sizes': [200, 256] }, 'v': { 'num_layers': 2, 'layer_sizes': [1024, 256] } } } jpose = JPOSE(modality_dict) from datasets import to_tensor v = to_tensor(np.zeros((64, 1024))) t = to_tensor(np.zeros((64, 200))) verb = {'verb': [{'v': v}, {'t': t}, {'t': t + 1}]} noun = {'noun': [{'v': v}, {'t': t}, {'t': t + 1}]} jpose(verb) jpose(noun) action = {**verb, **noun} jpose(action, action_output=True)
def get_eval_batch(self, gpu=False):
    """Return all of ``self._x`` and ``self._y`` passed through ``to_tensor``.

    Args:
        gpu: forwarded unchanged to ``to_tensor`` for both arrays.

    Returns:
        Tuple ``(x, y)`` of the two ``to_tensor`` results, in that order.
    """
    x_all = to_tensor(self._x, gpu=gpu)
    y_all = to_tensor(self._y, gpu=gpu)
    return x_all, y_all
def forward(self, x, pos, neg):
    """Weighted triplet margin loss on squared Euclidean distances.

    Args:
        x: anchor batch.
        pos: positive batch, same shape as ``x``.
        neg: negative batch, same shape as ``x``.

    Returns:
        ``self.weight * self.reduction(losses)``, where ``losses`` holds
        one hinge value ``relu(margin + d_pos - d_neg)`` per row and the
        distances are squared differences summed over dimension 1.
    """
    sq_dist_pos = ((x - pos) ** 2).sum(1)
    sq_dist_neg = ((x - neg) ** 2).sum(1)
    # Same left-to-right evaluation as the textbook form:
    # (margin + d_pos) - d_neg.
    hinge = F.relu(self.margin + sq_dist_pos - sq_dist_neg)
    reduced = self.reduction(hinge)
    return self.weight * reduced


if __name__ == '__main__':
    # Self-check: the reduction modes and the weight must agree with
    # each other on one random batch.
    from datasets import to_tensor

    triplet_loss_m = TripletLoss(0.1, 1.0)
    triplet_loss_m_0_1 = TripletLoss(0.1, 0.1)
    triplet_loss_s = TripletLoss(0.1, 1.0, 'sum')
    triplet_loss_n = TripletLoss(0.1, 1.0, 'none')

    xs = to_tensor(np.random.rand(64, 256))
    pos = to_tensor(np.random.rand(64, 256))
    neg = to_tensor(np.random.rand(64, 256))

    loss_m = triplet_loss_m(xs, pos, neg)
    loss_m_0_1 = triplet_loss_m_0_1(xs, pos, neg)
    loss_s = triplet_loss_s(xs, pos, neg)
    loss_n = triplet_loss_n(xs, pos, neg)

    # Weight scales the reduced loss linearly.
    assert 0.1 * loss_m == loss_m_0_1
    # 'none' keeps per-sample values that reduce to the other modes.
    assert loss_n.mean() == loss_m
    assert loss_n.sum() == loss_s
    # Reduced losses share a shape; unreduced has one entry per sample.
    assert loss_m.shape == loss_s.shape
    assert loss_n.shape == th.Size([64])