def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    video_id = getVideoId(cap_id)

    # video: gather the per-frame visual features of this video
    frame_list = self.video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        frame_vecs.append(self.visual_feat.read_one(frame_id))
    frames_tensor = torch.Tensor(frame_vecs)

    # text: bag-of-words vector of the caption
    caption = self.captions[cap_id]
    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    # text: word-index sequence wrapped with <start>/<end> markers
    if self.vocab is not None:
        tokens = clean_str(caption)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id
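
# frames_tensor and cap_tensor returned above have a different length for every
# sample, so a plain DataLoader cannot stack them directly. The function below
# is a minimal collate_fn sketch (not part of the original repo; the name and
# return layout are made up) that zero-pads both sequences and keeps their true
# lengths. It assumes cap_tensor is not None, i.e. a vocabulary was supplied.
import torch

def collate_video_text(batch):
    # batch: list of (frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id)
    batch.sort(key=lambda x: len(x[1]), reverse=True)  # longest caption first
    frames, captions, cap_bows, indices, cap_ids, video_ids = zip(*batch)

    # pad frame features to the longest clip in the batch
    frame_lengths = [f.shape[0] for f in frames]
    padded_frames = torch.zeros(len(frames), max(frame_lengths), frames[0].shape[1])
    for i, f in enumerate(frames):
        padded_frames[i, :f.shape[0]] = f

    # pad caption word-index sequences to the longest caption in the batch
    cap_lengths = [len(c) for c in captions]
    padded_caps = torch.zeros(len(captions), max(cap_lengths)).long()
    for i, c in enumerate(captions):
        padded_caps[i, :len(c)] = c.long()

    cap_bows = torch.stack(cap_bows, 0) if cap_bows[0] is not None else None
    return (padded_frames, frame_lengths, padded_caps, cap_lengths,
            cap_bows, list(indices), list(cap_ids), list(video_ids))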
def preprocess(self, query, clear):
    # With clear=True the query is normalised by clean_str;
    # otherwise it is only stripped and split on whitespace.
    if clear:
        words = clean_str(query)
    else:
        words = query.strip().split()
    return words
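
# Quick illustration of the two modes above. clean_str is this repo's text
# cleaner; the stand-in below only approximates it (lowercase, drop
# punctuation, split on whitespace) and is here purely for the example.
import re

def clean_str_standin(string):
    # rough stand-in for the repo's clean_str
    return re.sub(r"[^A-Za-z0-9 ]", " ", string).lower().split()

query = "A man is Playing GUITAR!"
print(clean_str_standin(query))   # clear=True path  -> ['a', 'man', 'is', 'playing', 'guitar']
print(query.strip().split())      # clear=False path -> ['A', 'man', 'is', 'Playing', 'GUITAR!']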
def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    video_id = getVideoId(cap_id)

    # video: gather the per-frame visual features of this video
    frame_list = self.video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        frame_vecs.append(self.visual_feat.read_one(frame_id))
    frames_tensor = torch.Tensor(frame_vecs)

    # text: bag-of-words vector of the caption
    caption = self.captions[cap_id]
    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    # text: word-index sequence wrapped with <start>/<end> markers
    if self.vocab is not None:
        tokens = clean_str(caption)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    # tags: score-weighted one-hot vector over the tag vocabulary
    if self.tag_path is not None:
        vid_tags = self.vid2tags[video_id]  # list of (tag, score) pairs
        # filter tags and scores together so the two lists stay aligned
        tag_list = [self.tag2idx[tag] for tag, _ in vid_tags if tag in self.tag2idx]
        score_list = [score for tag, score in vid_tags if tag in self.tag2idx]
        tag_one_hot = torch.zeros(self.tag_vocab_size)
        for idx, tag_idx in enumerate(tag_list):
            tag_one_hot[tag_idx] = score_list[idx]
    else:
        tag_one_hot = torch.zeros(self.tag_vocab_size)
    vid_tag = torch.Tensor(np.array(tag_one_hot))

    return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id, vid_tag
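
# Standalone illustration of the tag encoding above, with made-up values:
# each video maps to (tag, score) pairs, the tag vocabulary maps tags to
# indices, and the result is a score-weighted multi-hot vector.
import torch

tag2idx = {'dog': 0, 'run': 1, 'grass': 2, 'ball': 3}
vid_tags = [('dog', 0.9), ('run', 0.6), ('frisbee', 0.4)]  # 'frisbee' is out of vocabulary

tag_one_hot = torch.zeros(len(tag2idx))
for tag, score in vid_tags:
    if tag in tag2idx:
        tag_one_hot[tag2idx[tag]] = score
print(tag_one_hot)  # tensor([0.9000, 0.6000, 0.0000, 0.0000])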
def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    caption = self.captions[cap_id]

    # bag-of-words vector of the caption
    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    # word-index sequence wrapped with <start>/<end> markers
    if self.vocab is not None:
        tokens = clean_str(caption)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    return cap_tensor, cap_bow, index, cap_id
def dataLoadedVideoText_one(video2frames, video_id, visual_feats, query,
                            bow2vec, vocab, tokenizer, options):
    data = []

    # video: gather (optionally L2-normalised) per-frame visual features
    frame_list = video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        if options.do_visual_feas_norm:
            frame_vecs.append(do_L2_norm(visual_feats.read_one(frame_id)))
        else:
            frame_vecs.append(visual_feats.read_one(frame_id))

    # text: clean the query and map it through the BERT tokenizer vocabulary
    caption_text = ' '.join(clean_str(query[:]))
    caption_text = text2Berttext(caption_text, tokenizer)
    caption_text = caption_text.encode("utf-8")

    # bag-of-words vector
    if bow2vec is not None:
        cap_bow = bow2vec.mapping(caption_text)
        if cap_bow is None:
            cap_bow = torch.zeros(bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    # word-index sequence wrapped with <start>/<end> markers
    if vocab is not None:
        tokens = clean_str(caption_text)
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    # BERT inputs: token ids and segment ids for "[CLS] query [SEP]"
    caption_text = ' '.join(clean_str(query))
    marked_text = "[CLS] " + caption_text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    # convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor(indexed_tokens)
    segments_tensors = torch.tensor(segments_ids)
    caption_text = caption_text.encode("utf-8")

    data.append([torch.Tensor(frame_vecs), cap_tensor, cap_bow,
                 tokens_tensor, segments_tensors, caption_text])
    return data
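
# do_L2_norm above length-normalises each frame feature vector before it is
# stacked; a minimal stand-in is sketched below (the repo's own helper may
# differ in detail, e.g. in how it handles list inputs).
import numpy as np

def do_L2_norm_standin(vec):
    vec = np.asarray(vec, dtype=np.float32)
    return vec / (np.linalg.norm(vec) + 1e-12)  # unit L2 length, safe for zero vectors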
def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    video_id = getVideoId(cap_id)

    # video: gather (optionally L2-normalised) per-frame visual features
    frame_list = self.video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        if self.do_visual_feas_norm:
            frame_vecs.append(do_L2_norm(self.visual_feat.read_one(frame_id)))
        else:
            frame_vecs.append(self.visual_feat.read_one(frame_id))
    frames_tensor = torch.Tensor(frame_vecs)

    # text: clean the caption and map it through the BERT tokenizer vocabulary
    cap_text = self.captions[cap_id]
    caption_text = ' '.join(clean_str(cap_text[:]))
    caption_text = text2Berttext(caption_text, self.tokenizer)
    caption_text = caption_text.encode("utf-8")

    # bag-of-words vector
    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption_text)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    # word-index sequence wrapped with <start>/<end> markers
    if self.vocab is not None:
        tokens = clean_str(caption_text)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    # BERT inputs: token ids and segment ids for "[CLS] caption [SEP]"
    caption_text = ' '.join(clean_str(cap_text[:]))
    marked_text = "[CLS] " + caption_text + " [SEP]"
    tokenized_text = self.tokenizer.tokenize(marked_text)
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    # convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor(indexed_tokens)
    segments_tensors = torch.tensor(segments_ids)
    caption_text = caption_text.encode("utf-8")

    return (frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id,
            tokens_tensor, segments_tensors, caption_text)
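
# What the BERT inputs built above look like for one caption. The snippet
# assumes a standard WordPiece BertTokenizer loaded from 'bert-base-uncased'
# (via the transformers package here; the repo itself may use
# pytorch_pretrained_bert, whose tokenize/convert_tokens_to_ids calls behave the same).
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
marked_text = "[CLS] a man is playing guitar [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
# e.g. ['[CLS]', 'a', 'man', 'is', 'playing', 'guitar', '[SEP]']
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [1] * len(tokenized_text)

tokens_tensor = torch.tensor(indexed_tokens)    # shape: [num_tokens]
segments_tensors = torch.tensor(segments_ids)   # shape: [num_tokens]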