def forward(self, en, cn):
    # Encode `en` with embedding + LSTM; project `cn` with a linear layer.
    en_embed = self.embedding(en)
    cn = self.linear(cn)
    # Keep the LSTM's final hidden state as the sentence embedding.
    _, (en_embed, _) = self.lstm(en_embed)
    en_embed = en_embed.squeeze(0)
    return l2norm(en_embed), l2norm(cn)
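# Every forward pass in this excerpt returns l2norm(...), so the dot products
# in the evaluation functions below reduce to cosine similarity. The helper
# itself is not part of this excerpt; the following is a minimal sketch,
# assuming row-wise normalization of a 2-D (batch, dim) tensor, not the
# repository's actual implementation.
import torch

def l2norm(x, eps=1e-8):
    # Normalize each row of x to unit L2 norm; eps guards against
    # division by zero for all-zero rows.
    norm = torch.sqrt(torch.pow(x, 2).sum(dim=1, keepdim=True)) + eps
    return x / norm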
def forward(self, en, en_lengths, en_index, cn, cn_lengths, cn_index):
    """
    Input:
        en, cn: Variables of size (B, W), where B is the batch size and W is
            the longest sequence length in the batch
        en_lengths, cn_lengths: the length of each sequence in the batch
        en_index, cn_index: indices that restore the original batch order
            after length-sorting
    Output:
        a pair of L2-normalized sentence embeddings of size (B, hidden_size),
        one for each input batch
    """
    en = self.sorted_forward(en, en_lengths, en_index)
    cn = self.sorted_forward(cn, cn_lengths, cn_index)
    return l2norm(en), l2norm(cn)
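# `sorted_forward` is not shown in this excerpt. Given the calling convention
# above (batch, lengths, restore-index), a plausible sketch is: the batch
# arrives sorted by decreasing length (as packing requires), single_forward
# encodes it (see the sketch at the end of this section), and the index puts
# the rows back in the caller's original order. This body is an assumption,
# not the repository's code.
def sorted_forward(self, x, x_lengths, x_index):
    out = self.single_forward(x, x_lengths)  # (B, hidden_size), length-sorted order
    return out[x_index]                      # restore the original row order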
def txt2image(captions, images, npts=None, verbose=False):
    """
    Text -> Image retrieval: rank all images for each caption query.
    :param captions: (5N, K) matrix of caption embeddings
    :param images: (N, K) matrix of image embeddings
    :param npts: number of image/caption groups; defaults to N
    :param verbose: print the scores when True
    :return: (R@1, R@5, R@10, median rank)
    """
    if npts is None:
        npts = images.size()[0]
    ranks = np.zeros(5 * npts)
    images = l2norm(images)
    for index in range(npts):
        # Get the 5 query captions for this image
        queries = captions[5 * index:5 * index + 5]
        # Compute scores against all images
        d = torch.mm(queries, images.t())
        for i in range(d.size()[0]):
            _, inds = torch.sort(d[i], descending=True)
            inds = inds.data.squeeze(0).cpu().numpy()
            # Rank of the ground-truth image for this caption
            ranks[5 * index + i] = np.where(inds == index)[0][0]
    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    if verbose:
        print(" * Text to image scores: R@1: %.1f, R@5: %.1f, R@10: %.1f, Medr: %.1f"
              % (r1, r5, r10, medr))
    return (r1, r5, r10, medr)
def image2txt(images, captions, npts=None, verbose=False):
    """
    Image -> Text retrieval: rank all captions for each image query
    (each image is paired with 5 captions).
    :param images: (N, K) matrix of image embeddings
    :param captions: (5N, K) matrix of caption embeddings
    :param npts: number of image/caption groups; defaults to N
    :param verbose: print the scores when True
    :return: (R@1, R@5, R@10, median rank)
    """
    if npts is None:
        npts = images.size()[0]
    ranks = np.zeros(npts)  # best caption rank for each image
    captions = l2norm(captions)
    for index in range(npts):
        # Get the query image
        im = images[index]
        im = im.unsqueeze(0)
        # Compute scores against all captions
        im = l2norm(im)
        d = torch.mm(im, captions.t())
        _, inds = torch.sort(d, descending=True)  # scores and indices, largest first
        inds = inds.data.squeeze(0).cpu().numpy()
        # Keep the best (lowest) rank among the image's 5 ground-truth captions
        rank = 1e20
        for i in range(5 * index, 5 * index + 5):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    if verbose:
        print(" * Image to text scores: R@1: %.1f, R@5: %.1f, R@10: %.1f, Medr: %.1f"
              % (r1, r5, r10, medr))
    return (r1, r5, r10, medr)
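# A usage sketch for the two evaluators above, with random stand-in
# embeddings; the only contract the functions rely on is that images are
# (N, K), captions are (5N, K), and caption rows 5*i .. 5*i+4 belong to
# image row i. In practice the embeddings would come from the model's
# forward pass on a test split.
import torch

N, K = 100, 512
im_emb = torch.randn(N, K)       # stand-in image embeddings
cap_emb = torch.randn(5 * N, K)  # stand-in caption embeddings

r1, r5, r10, medr = txt2image(cap_emb, im_emb, verbose=True)
r1, r5, r10, medr = image2txt(im_emb, cap_emb, verbose=True)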
def forward_sens(self, x):
    x_emb = self.embedding(x)
    _, (x_emb, _) = self.lstm(x_emb)
    x_cat = x_emb.squeeze(0)
    return l2norm(x_cat)
def forward_sens(self, x, x_lengths):
    x_cat = self.single_forward(x, x_lengths)
    return l2norm(x_cat)
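# `single_forward`, shared by `sorted_forward` and `forward_sens`, is also
# absent from this excerpt. A minimal sketch, assuming a one-layer,
# unidirectional, batch_first=False LSTM and a batch already sorted by
# decreasing length as pack_padded_sequence expects; this is an assumption,
# not the repository's implementation.
from torch.nn.utils.rnn import pack_padded_sequence

def single_forward(self, x, x_lengths):
    x_emb = self.embedding(x)                        # (W, B, embed_size)
    packed = pack_padded_sequence(x_emb, x_lengths)  # skip padding inside the LSTM
    _, (h_n, _) = self.lstm(packed)                  # h_n: (num_layers*dirs, B, hidden)
    return h_n.squeeze(0)                            # (B, hidden_size)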