Example #1
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]
        video_id = getVideoId(cap_id)

        # video
        frame_list = self.video2frames[video_id]
        frame_vecs = []
        for frame_id in frame_list:
            frame_vecs.append(self.visual_feat.read_one(frame_id))
        frames_tensor = torch.Tensor(frame_vecs)

        # text
        caption = self.captions[cap_id]
        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id
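The snippet assumes two helpers, getVideoId and clean_str, that are not shown on this page. A minimal sketch of what they plausibly do (the '#'-delimited cap_id layout is an assumption, not confirmed by the snippet):

import re

def getVideoId(cap_id):
    # assumed cap_id layout: 'video7021#enc#2' -> video id 'video7021'
    return cap_id.split('#')[0]

def clean_str(string):
    # lower-case, replace punctuation with spaces, and tokenize
    return re.sub(r"[^A-Za-z0-9]", " ", string).lower().split()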
Example #2
 def __init__(self,
              cap_file,
              visual_feat,
              bow2vec,
              vocab,
              n_caption=None,
              video2frames=None):
     # Captions
     self.captions = {}
     self.cap_ids = []
     self.video_ids = set()
     self.video2frames = video2frames
     with open(cap_file, 'r') as cap_reader:
         for line in cap_reader.readlines():
             cap_id, caption = line.strip().split(' ', 1)
             video_id = getVideoId(cap_id)
             self.captions[cap_id] = caption
             self.cap_ids.append(cap_id)
             self.video_ids.add(video_id)
     self.visual_feat = visual_feat
     self.bow2vec = bow2vec
     self.vocab = vocab
     self.length = len(self.cap_ids)
     if n_caption is not None:
         assert len(
             self.video_ids) * n_caption == self.length, "%d != %d" % (
                 len(self.video_ids) * n_caption, self.length)
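A self-contained sketch of the cap_file format this constructor parses, assuming the '#'-delimited ids from Example #1 (file name, ids, and captions below are made up):

sample = """video1#0 a man is playing a guitar
video1#1 someone strums an acoustic guitar
video2#0 a cat chases a laser pointer
video2#1 a kitten plays with a red dot"""

with open("caps.txt", "w") as f:
    f.write(sample)

captions, video_ids = {}, set()
with open("caps.txt") as f:
    for line in f:
        cap_id, caption = line.strip().split(" ", 1)
        captions[cap_id] = caption
        video_ids.add(cap_id.split("#")[0])

assert len(video_ids) * 2 == len(captions)  # the n_caption=2 check passes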
Example #3
 def __init__(self,
              cap_file,
              visual_feat,
              bow2vec,
              vocab,
              n_caption=None,
              video2frames=None):
     # Captions
     self.captions = {}
     self.cap_ids = []
     self.video_ids = set()
     self.video2frames = video2frames
     with open(cap_file, 'r') as cap_reader:
         for line in cap_reader.readlines():
             if len(line.strip().split(' ')) < 2:
                 continue
             cap_id, caption = line.strip().split(' ', 1)
             video_id = getVideoId(cap_id)
             self.captions[cap_id] = caption
             self.cap_ids.append(cap_id)
             self.video_ids.add(video_id)
     self.visual_feat = visual_feat
     self.bow2vec = bow2vec
     self.vocab = vocab
     self.length = len(self.cap_ids)
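This variant drops the n_caption consistency check and instead skips malformed lines. For example, a line holding a cap_id but no caption text is silently ignored:

line = "video9#0\n"
print(len(line.strip().split(" ")) < 2)  # True -> the line is skipped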
Example #4
 def __init__(self,
              cap_file,
              visual_feat,
              bow2vec,
              vocab,
              do_visual_feas_norm,
              n_caption=None,
              video2frames=None):
     # Captions
     self.captions = {}
     self.cap_ids = []
     self.video_ids = set()
     self.video2frames = video2frames
     self.do_visual_feas_norm = do_visual_feas_norm
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                    do_lower_case=True)
     with open(cap_file, 'r') as cap_reader:
         for line in cap_reader.readlines():
             cap_id, caption = line.strip().split(' ', 1)
             video_id = getVideoId(cap_id)
             self.captions[cap_id] = caption
             self.cap_ids.append(cap_id)
             self.video_ids.add(video_id)
     self.visual_feat = visual_feat
     self.bow2vec = bow2vec
     self.vocab = vocab
     self.length = len(self.cap_ids)
     if n_caption is not None:
         assert len(
             self.video_ids) * n_caption == self.length, "%d != %d" % (
                 len(self.video_ids) * n_caption, self.length)
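Example #4 relies on a BertTokenizer import that is not shown; with the Hugging Face transformers package (one possible source; the original may use the older pytorch_pretrained_bert) the construction looks like:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
print(tokenizer.tokenize("A man is playing a guitar"))
# ['a', 'man', 'is', 'playing', 'a', 'guitar']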
Example #5
def read_video_ids(cap_file):
    video_ids_list = []
    with open(cap_file, 'r') as cap_reader:
        for line in cap_reader.readlines():
            cap_id, caption = line.strip().split(' ', 1)
            video_id = getVideoId(cap_id)
            if video_id not in video_ids_list:
                video_ids_list.append(video_id)
    return video_ids_list
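A usage sketch on the hypothetical caps.txt from Example #2, again assuming the getVideoId sketch from Example #1. The list-membership test above is O(n) per line; a set-based variant with identical output is:

def read_video_ids_fast(cap_file):
    seen, ordered = set(), []
    with open(cap_file) as reader:
        for line in reader:
            video_id = line.strip().split(" ", 1)[0].split("#")[0]
            if video_id not in seen:  # keep first-seen order, O(1) lookup
                seen.add(video_id)
                ordered.append(video_id)
    return ordered

print(read_video_ids_fast("caps.txt"))  # ['video1', 'video2']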
Example #6
    def __init__(self,
                 cap_file,
                 visual_feat,
                 tag_path,
                 tag_vocab_path,
                 bow2vec,
                 vocab,
                 video2frames=None):
        # Captions
        self.captions = {}
        self.cap_ids = []
        self.video_ids = set()
        self.video2frames = video2frames
        self.tag_path = tag_path
        with open(cap_file, 'r') as cap_reader:
            for line in cap_reader.readlines():
                cap_id, caption = line.strip().split(' ', 1)
                video_id = getVideoId(cap_id)
                self.captions[cap_id] = caption
                self.cap_ids.append(cap_id)
                self.video_ids.add(video_id)
        self.visual_feat = visual_feat
        self.bow2vec = bow2vec
        self.vocab = vocab
        self.length = len(self.cap_ids)

        self.tag_vocab_list = json.load(open(tag_vocab_path, 'r'))
        self.tag_vocab_size = len(self.tag_vocab_list)
        self.tag2idx = dict(
            zip(self.tag_vocab_list, range(self.tag_vocab_size)))

        # map each video id to (tag, normalized score) pairs read from tag_path
        self.vid2tags = {}
        if tag_path is not None:
            with open(tag_path) as tag_reader:
                for line in tag_reader:
                    parts = line.strip().split("\t", 1)
                    if len(parts) < 2:  # no tags available for this video
                        self.vid2tags[parts[0]] = []
                    else:
                        vid, raw_tags = parts
                        # entries look like 'tag:score'; scores are normalized
                        # by the per-video maximum
                        tags = [x.split(':')[0] for x in raw_tags.split()]
                        scores = [float(x.split(':')[1]) for x in raw_tags.split()]
                        scores = np.array(scores) / max(scores)
                        self.vid2tags[vid] = list(zip(tags, scores))
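The parsing above implies a tag file layout of one video per line: a video id, a tab, then space-separated tag:score pairs, with tag_vocab_path holding a JSON list of tag words. A made-up example of both files:

import json

with open("tags.txt", "w") as f:
    f.write("video1\tguitar:9.0 man:6.0 music:3.0\n")
    f.write("video2\n")  # a video with no tags

with open("tag_vocab.json", "w") as f:
    json.dump(["guitar", "man", "music"], f)

# after parsing: vid2tags["video1"] == [('guitar', 1.0), ('man', ~0.67), ('music', ~0.33)]
# and vid2tags["video2"] == []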
Example #7
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]
        video_id = getVideoId(cap_id)

        # video
        frame_list = self.video2frames[video_id]
        frame_vecs = []
        for frame_id in frame_list:
            frame_vecs.append(self.visual_feat.read_one(frame_id))
        frames_tensor = torch.Tensor(frame_vecs)

        # text
        caption = self.captions[cap_id]
        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        if self.tag_path is not None:
            # iterate the (tag, score) pairs jointly so an out-of-vocabulary
            # tag cannot shift the scores of the tags that follow it
            tag_one_hot = torch.zeros(self.tag_vocab_size)
            for tag, score in self.vid2tags[video_id]:
                if tag in self.tag2idx:
                    tag_one_hot[self.tag2idx[tag]] = score  # weighted one-hot
        else:
            tag_one_hot = torch.zeros(self.tag_vocab_size)
        vid_tag = tag_one_hot

        return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id, vid_tag
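A standalone sketch of the weighted one-hot construction with a made-up three-word tag vocabulary; the out-of-vocabulary tag 'drums' is ignored without disturbing the other scores:

import torch

tag2idx = {"guitar": 0, "man": 1, "music": 2}
vid_tags = [("drums", 0.9), ("guitar", 1.0), ("man", 0.5)]
one_hot = torch.zeros(len(tag2idx))
for tag, score in vid_tags:
    if tag in tag2idx:
        one_hot[tag2idx[tag]] = score
print(one_hot)  # tensor([1.0000, 0.5000, 0.0000])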
Example #8
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]
        video_id = getVideoId(cap_id)

        # video
        frame_list = self.video2frames[video_id]
        frame_vecs = []
        for frame_id in frame_list:
            # optionally L2-normalize each frame feature vector
            if self.do_visual_feas_norm:
                frame_vecs.append(
                    do_L2_norm(self.visual_feat.read_one(frame_id)))
            else:
                frame_vecs.append(self.visual_feat.read_one(frame_id))
        frames_tensor = torch.Tensor(frame_vecs)

        # text
        cap_text = self.captions[cap_id]
        # keep the caption as a str here: bow2vec.mapping and clean_str below
        # expect text, and encoding to bytes at this point would break them
        # under Python 3
        caption_text = ' '.join(clean_str(cap_text))
        caption_text = text2Berttext(caption_text, self.tokenizer)

        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption_text)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption_text)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        # BERT
        caption_text = ' '.join(clean_str(cap_text))
        marked_text = "[CLS] " + caption_text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        segments_ids = [1] * len(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor(indexed_tokens)
        segments_tensors = torch.tensor(segments_ids)

        # return the caption as UTF-8 bytes, as in the original interface
        caption_text = caption_text.encode("utf-8")

        return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id, tokens_tensor, segments_tensors, caption_text
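Example #8 additionally assumes do_L2_norm and text2Berttext helpers. Plausible sketches (the names come from the snippet; the bodies are guesses, not the authors' code):

import numpy as np

def do_L2_norm(vec):
    # scale a feature vector to unit L2 norm
    vec = np.asarray(vec, dtype=np.float32)
    return vec / (np.linalg.norm(vec) + 1e-12)

def text2Berttext(text, tokenizer):
    # round-trip text through the BERT wordpiece tokenizer so the caption
    # matches what BERT actually sees
    return " ".join(tokenizer.tokenize(text))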