import json

import numpy as np
import torch
from transformers import BertTokenizer  # the original py2-era code may use pytorch_pretrained_bert instead

# Helpers such as getVideoId, clean_str, do_L2_norm and text2Berttext are
# assumed to come from this repo's own utility modules (imports not shown in
# this excerpt).


# __getitem__ of the basic caption-video Dataset (its class statement is not
# part of this excerpt).
def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    video_id = getVideoId(cap_id)

    # video: read one feature vector per frame and stack them
    frame_list = self.video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        frame_vecs.append(self.visual_feat.read_one(frame_id))
    frames_tensor = torch.Tensor(frame_vecs)

    # text: bag-of-words representation
    caption = self.captions[cap_id]
    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    # text: token-id sequence wrapped in <start>/<end> markers
    if self.vocab is not None:
        tokens = clean_str(caption)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id
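# --- Illustrative only: batching the variable-length outputs above. ---
# frames_tensor and cap_tensor have sample-dependent lengths, so a DataLoader
# needs a custom collate_fn. This is a minimal sketch assuming both vocab and
# bow2vec were set (so cap_tensor and cap_bow are never None); it is not the
# repo's actual collate function.
import torch
import torch.utils.data as data


def collate_frames_captions(batch):
    # Sort by caption length, longest first, as packed-RNN text encoders expect.
    batch.sort(key=lambda x: len(x[1]), reverse=True)
    frames, captions, bows, idxs, cap_ids, video_ids = zip(*batch)

    # Zero-pad frame sequences to the longest video in the batch.
    frame_lens = [f.shape[0] for f in frames]
    padded_frames = torch.zeros(len(frames), max(frame_lens), frames[0].shape[1])
    for i, f in enumerate(frames):
        padded_frames[i, :f.shape[0], :] = f

    # Zero-pad caption token-id sequences likewise.
    cap_lens = [len(c) for c in captions]
    padded_caps = torch.zeros(len(captions), max(cap_lens), dtype=torch.long)
    for i, c in enumerate(captions):
        padded_caps[i, :len(c)] = c.long()

    return (padded_frames, frame_lens, padded_caps, cap_lens,
            torch.stack(bows), idxs, cap_ids, video_ids)


# Usage sketch:
#   loader = data.DataLoader(dataset, batch_size=64, shuffle=True,
#                            collate_fn=collate_frames_captions)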
def __init__(self, cap_file, visual_feat, bow2vec, vocab, n_caption=None,
             video2frames=None):
    # Captions
    self.captions = {}
    self.cap_ids = []
    self.video_ids = set()
    self.video2frames = video2frames
    with open(cap_file, 'r') as cap_reader:
        for line in cap_reader.readlines():
            cap_id, caption = line.strip().split(' ', 1)
            video_id = getVideoId(cap_id)
            self.captions[cap_id] = caption
            self.cap_ids.append(cap_id)
            self.video_ids.add(video_id)
    self.visual_feat = visual_feat
    self.bow2vec = bow2vec
    self.vocab = vocab
    self.length = len(self.cap_ids)
    if n_caption is not None:
        assert len(self.video_ids) * n_caption == self.length, \
            "%d != %d" % (len(self.video_ids) * n_caption, self.length)
# Variant of the Dataset __init__ that tolerates malformed caption lines.
def __init__(self, cap_file, visual_feat, bow2vec, vocab, n_caption=None,
             video2frames=None):
    # Captions
    self.captions = {}
    self.cap_ids = []
    self.video_ids = set()
    self.video2frames = video2frames
    with open(cap_file, 'r') as cap_reader:
        for line in cap_reader.readlines():
            if len(line.strip().split(' ')) < 2:
                continue  # skip lines that carry no caption text
            cap_id, caption = line.strip().split(' ', 1)
            video_id = getVideoId(cap_id)
            self.captions[cap_id] = caption
            self.cap_ids.append(cap_id)
            self.video_ids.add(video_id)
    self.visual_feat = visual_feat
    self.bow2vec = bow2vec
    self.vocab = vocab
    self.length = len(self.cap_ids)
# Variant of the Dataset __init__ with optional L2-normalization of visual
# features and a BERT tokenizer for the text side.
def __init__(self, cap_file, visual_feat, bow2vec, vocab, do_visual_feas_norm,
             n_caption=None, video2frames=None):
    # Captions
    self.captions = {}
    self.cap_ids = []
    self.video_ids = set()
    self.video2frames = video2frames
    self.do_visual_feas_norm = do_visual_feas_norm
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                   do_lower_case=True)
    with open(cap_file, 'r') as cap_reader:
        for line in cap_reader.readlines():
            cap_id, caption = line.strip().split(' ', 1)
            video_id = getVideoId(cap_id)
            self.captions[cap_id] = caption
            self.cap_ids.append(cap_id)
            self.video_ids.add(video_id)
    self.visual_feat = visual_feat
    self.bow2vec = bow2vec
    self.vocab = vocab
    self.length = len(self.cap_ids)
    if n_caption is not None:
        assert len(self.video_ids) * n_caption == self.length, \
            "%d != %d" % (len(self.video_ids) * n_caption, self.length)
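# --- Illustrative only: the caption-file format the constructors above parse. ---
# Each cap_file line is "<cap_id> <caption>", where cap_id embeds the video id,
# e.g. "video7021#4 a man is playing a guitar". getVideoId comes from this
# repo's utilities; a plausible minimal version (an assumption, not the repo's
# actual rule) is:
def getVideoId_sketch(cap_id):
    # "video7021#4" -> "video7021"
    return cap_id.split('#', 1)[0]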
def read_video_ids(cap_file):
    video_ids_list = []
    with open(cap_file, 'r') as cap_reader:
        for line in cap_reader.readlines():
            cap_id, caption = line.strip().split(' ', 1)
            video_id = getVideoId(cap_id)
            if video_id not in video_ids_list:
                video_ids_list.append(video_id)
    return video_ids_list
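# Design note: the "video_id not in video_ids_list" test above is O(n) per
# line. For large caption files, an order-preserving dedup via dict keys is a
# cheaper equivalent; a minimal sketch:
def read_video_ids_fast(cap_file):
    with open(cap_file, 'r') as cap_reader:
        ids = (getVideoId(line.strip().split(' ', 1)[0])
               for line in cap_reader)
        return list(dict.fromkeys(ids))  # insertion-ordered since Python 3.7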
# __init__ of the tag-aware Dataset: besides captions it loads a tag vocabulary
# and per-video weighted concept tags.
def __init__(self, cap_file, visual_feat, tag_path, tag_vocab_path, bow2vec,
             vocab, video2frames=None):
    # Captions
    self.captions = {}
    self.cap_ids = []
    self.video_ids = set()
    self.video2frames = video2frames
    self.tag_path = tag_path
    with open(cap_file, 'r') as cap_reader:
        for line in cap_reader.readlines():
            cap_id, caption = line.strip().split(' ', 1)
            video_id = getVideoId(cap_id)
            self.captions[cap_id] = caption
            self.cap_ids.append(cap_id)
            self.video_ids.add(video_id)
    self.visual_feat = visual_feat
    self.bow2vec = bow2vec
    self.vocab = vocab
    self.length = len(self.cap_ids)

    # Tag vocabulary and tag-to-index lookup
    self.tag_vocab_list = json.load(open(tag_vocab_path, 'r'))
    self.tag_vocab_size = len(self.tag_vocab_list)
    self.tag2idx = dict(zip(self.tag_vocab_list, range(self.tag_vocab_size)))

    # Per-video tags: each line is "<video_id>\t<tag>:<score> ..."
    self.vid2tags = {}
    if tag_path is not None:
        for line in open(tag_path).readlines():
            if len(line.strip().split("\t", 1)) < 2:
                # no tag available for this video
                vid = line.strip().split("\t", 1)[0]
                self.vid2tags[vid] = []
            else:
                vid, or_tags = line.strip().split("\t", 1)
                tags = [x.split(':')[0] for x in or_tags.strip().split()]
                # concept scores, normalized by the per-video maximum
                scores = [float(x.split(':')[1])
                          for x in or_tags.strip().split()]
                scores = np.array(scores) / max(scores)
                self.vid2tags[vid] = list(zip(tags, scores))
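# --- Illustrative only: the tag-file format parsed above, on toy data. ---
# Each tag_path line is "<video_id>\t<tag1>:<score1> <tag2>:<score2> ...";
# scores are divided by the per-video maximum, so the strongest tag gets 1.0.
line = "video7021\tman:12.0 guitar:7.5 music:3.0"
vid, or_tags = line.strip().split("\t", 1)
tags = [x.split(':')[0] for x in or_tags.split()]
scores = [float(x.split(':')[1]) for x in or_tags.split()]
m = max(scores)
scores = [s / m for s in scores]
assert list(zip(tags, scores)) == [('man', 1.0), ('guitar', 0.625), ('music', 0.25)]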
# __getitem__ of the tag-aware Dataset: also returns a score-weighted one-hot
# tag vector for the video.
def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    video_id = getVideoId(cap_id)

    # video
    frame_list = self.video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        frame_vecs.append(self.visual_feat.read_one(frame_id))
    frames_tensor = torch.Tensor(frame_vecs)

    # text
    caption = self.captions[cap_id]
    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    if self.vocab is not None:
        tokens = clean_str(caption)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    # tags: turn the video's (tag, score) pairs into a score-weighted one-hot
    # vector over the tag vocabulary; scores are filtered in step with tags so
    # the two lists stay aligned when a tag is missing from the vocabulary
    if self.tag_path is not None:
        vid_tag_pairs = self.vid2tags[video_id]  # list of (tag, score) pairs
        tag_list = [self.tag2idx[tag] for tag, _ in vid_tag_pairs
                    if tag in self.tag2idx]  # index representation
        score_list = [score for tag, score in vid_tag_pairs
                      if tag in self.tag2idx]
        tag_one_hot = torch.zeros(self.tag_vocab_size)
        for idx, tag_idx in enumerate(tag_list):
            tag_one_hot[tag_idx] = score_list[idx]
    else:
        tag_one_hot = torch.zeros(self.tag_vocab_size)
    vid_tag = tag_one_hot  # already a torch.Tensor; no conversion needed

    return (frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id,
            vid_tag)
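# --- Illustrative only: one way such a tag vector can be consumed. ---
# vid_tag is a (tag_vocab_size,) vector of max-normalized scores in [0, 1],
# which fits a multi-label objective like BCE. A minimal sketch; the linear
# "tag head" and all shapes here are assumptions, not this repo's model:
import torch
import torch.nn as nn

tag_vocab_size, feat_dim = 512, 2048       # toy sizes
video_emb = torch.randn(4, feat_dim)       # a batch of video embeddings
vid_tags = torch.rand(4, tag_vocab_size)   # stand-in for batched vid_tag

tag_head = nn.Linear(feat_dim, tag_vocab_size)  # hypothetical tag predictor
loss = nn.BCEWithLogitsLoss()(tag_head(video_emb), vid_tags)
loss.backward()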
# __getitem__ of the BERT-enabled Dataset: returns BoW, token-id, and BERT
# input representations of the caption.
def __getitem__(self, index):
    cap_id = self.cap_ids[index]
    video_id = getVideoId(cap_id)

    # video: optionally L2-normalize each frame feature before stacking
    frame_list = self.video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        if self.do_visual_feas_norm:
            frame_vecs.append(do_L2_norm(self.visual_feat.read_one(frame_id)))
        else:
            frame_vecs.append(self.visual_feat.read_one(frame_id))
    frames_tensor = torch.Tensor(frame_vecs)

    # text: clean the caption and map it through the BERT wordpiece vocabulary;
    # kept as str here (the original encoded to bytes first, which breaks
    # clean_str/bow2vec under Python 3)
    cap_text = self.captions[cap_id]
    caption_text = ' '.join(clean_str(cap_text))
    caption_text = text2Berttext(caption_text, self.tokenizer)

    if self.bow2vec is not None:
        cap_bow = self.bow2vec.mapping(caption_text)
        if cap_bow is None:
            cap_bow = torch.zeros(self.bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    if self.vocab is not None:
        tokens = clean_str(caption_text)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    # BERT inputs: wordpiece ids plus single-segment (all-ones) segment ids
    caption_text = ' '.join(clean_str(cap_text))
    marked_text = "[CLS] " + caption_text + " [SEP]"
    tokenized_text = self.tokenizer.tokenize(marked_text)
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor(indexed_tokens)
    segments_tensors = torch.tensor(segments_ids)

    # returned as bytes for py2-era downstream code; drop the encode if a str
    # is expected
    caption_text = caption_text.encode("utf-8")

    return (frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id,
            tokens_tensor, segments_tensors, caption_text)
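# --- Illustrative only: encoding the BERT inputs produced above. ---
# A minimal sketch with the Hugging Face transformers BertModel (the original
# py2-era code may pair with pytorch_pretrained_bert instead); batching,
# padding and attention masks are omitted for brevity:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

tokens = tokenizer.tokenize("[CLS] a man is playing a guitar [SEP]")
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
segments_tensors = torch.ones_like(tokens_tensor)  # all-ones, as in __getitem__

with torch.no_grad():
    outputs = model(input_ids=tokens_tensor, token_type_ids=segments_tensors)
word_feats = outputs.last_hidden_state  # (1, seq_len, 768) per-token features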