Example #1
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]
        video_id = getVideoId(cap_id)

        # video
        frame_list = self.video2frames[video_id]
        frame_vecs = []
        for frame_id in frame_list:
            frame_vecs.append(self.visual_feat.read_one(frame_id))

        frames_tensor = torch.Tensor(frame_vecs)

        # text
        caption = self.captions[cap_id]
        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id
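
The frames_tensor and cap_tensor returned here vary in length from item to item, so PyTorch's default collate function cannot batch them directly. The sketch below shows one way to pad them inside a custom collate_fn and attach it to a DataLoader; the function name and the assumption that both vocab and bow2vec are set are mine, not part of the original code.

import torch
from torch.utils.data import DataLoader

def collate_frames_captions(batch):
    # batch: a list of tuples as returned by __getitem__ above
    frames, captions, cap_bows, idxs, cap_ids, video_ids = zip(*batch)

    # pad frame features to the longest clip in the batch
    frame_lengths = [f.shape[0] for f in frames]
    padded_frames = torch.zeros(len(frames), max(frame_lengths), frames[0].shape[1])
    for i, f in enumerate(frames):
        padded_frames[i, :f.shape[0], :] = f

    # pad caption word-index sequences to the longest caption in the batch
    cap_lengths = [len(c) for c in captions]
    padded_caps = torch.zeros(len(captions), max(cap_lengths)).long()
    for i, c in enumerate(captions):
        padded_caps[i, :len(c)] = c.long()

    cap_bows = torch.stack(cap_bows, 0) if cap_bows[0] is not None else None
    return (padded_frames, frame_lengths, padded_caps, cap_lengths,
            cap_bows, list(idxs), list(cap_ids), list(video_ids))

# loader = DataLoader(dataset, batch_size=64, shuffle=True,
#                     collate_fn=collate_frames_captions)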
Example #2
    def preprocess(self, query, clear):
        if clear:
            words = clean_str(query)
        else:
            words = query.strip().split()
        # print(words)
        return words
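
clean_str is called in every example here but its body is not shown; in this kind of video-text retrieval code it typically strips punctuation, lowercases, and returns a token list. The following is only a sketch of that assumed behaviour, not the repository's exact implementation.

import re

def clean_str(string):
    # assumed behaviour: keep alphanumerics, collapse whitespace, lowercase, tokenize
    string = re.sub(r"[^A-Za-z0-9]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower().split()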
Example #3
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]
        video_id = getVideoId(cap_id)

        # video
        frame_list = self.video2frames[video_id]
        frame_vecs = []
        for frame_id in frame_list:
            frame_vecs.append(self.visual_feat.read_one(frame_id))
        frames_tensor = torch.Tensor(frame_vecs)

        # text
        caption = self.captions[cap_id]
        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        if self.tag_path is not None:
            vid_tag_pairs = self.vid2tags[video_id]  # list of (tag, score) pairs for this video
            # keep only in-vocabulary tags and keep each tag aligned with its own score
            tag_list = [(self.tag2idx[tag], score) for tag, score in vid_tag_pairs
                        if tag in self.tag2idx]
            # zero vector over the tag vocabulary; each tag's relevance score is
            # written at that tag's index (a score-weighted "one-hot" encoding)
            tag_one_hot = torch.zeros(self.tag_vocab_size)
            for tag_idx, score in tag_list:
                tag_one_hot[tag_idx] = score
        else:
            tag_one_hot = torch.zeros(self.tag_vocab_size)
        # print tag_one_hot
        vid_tag = tag_one_hot  # already a FloatTensor over the tag vocabulary
        # print ('%s:' %video_id, vid_tag)

        return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id, vid_tag
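
The tag branch above maps a video's (tag, score) pairs onto a fixed-length vector over the tag vocabulary, storing each tag's relevance score at that tag's index. A self-contained toy version of that mapping, with made-up tags and scores, looks like this:

import torch

tag2idx = {"dog": 0, "run": 1, "park": 2}               # toy tag vocabulary
vid_tags = [("dog", 0.9), ("park", 0.4), ("sky", 0.7)]  # "sky" is out of vocabulary

tag_vec = torch.zeros(len(tag2idx))
for tag, score in vid_tags:
    if tag in tag2idx:
        tag_vec[tag2idx[tag]] = score

print(tag_vec)  # tensor([0.9000, 0.0000, 0.4000])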
Example #4
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]

        caption = self.captions[cap_id]
        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        return cap_tensor, cap_bow, index, cap_id
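
bow2vec.mapping converts a caption into a bag-of-words vector of length bow2vec.ndims and returns None when no caption word is in the vocabulary, which is why the code above falls back to torch.zeros. A minimal sketch of such a mapper, with an assumed class name and a plain lowercase split instead of the repository's clean_str, could be:

class BowVectorizer(object):
    """Sketch of a bag-of-words mapper; the repository's real class may differ."""

    def __init__(self, word2idx):
        self.word2idx = word2idx      # word -> dimension index
        self.ndims = len(word2idx)    # vector length expected by the caller

    def mapping(self, caption):
        counts = [0.0] * self.ndims
        hit = False
        for token in caption.lower().split():
            if token in self.word2idx:
                counts[self.word2idx[token]] += 1.0
                hit = True
        return counts if hit else None

# bow2vec = BowVectorizer({"dog": 0, "runs": 1})
# bow2vec.mapping("a dog runs")  # [1.0, 1.0]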
Example #5
def dataLoadedVideoText_one(video2frames, video_id, visual_feats, query,
                            bow2vec, vocab, tokenizer, options):
    data = []

    videos = []

    frame_list = video2frames[video_id]
    frame_vecs = []
    for frame_id in frame_list:
        # visual_feats.read_one(frame_id)
        if options.do_visual_feas_norm:
            frame_vecs.append(do_L2_norm(visual_feats.read_one(frame_id)))
        else:
            frame_vecs.append(visual_feats.read_one(frame_id))
    # Text encoding
    cap_tensors = []
    cap_bows = []

    caption_text = query[:]
    caption_text = ' '.join(clean_str(caption_text))
    caption_text = text2Berttext(caption_text, tokenizer)
    caption_text = caption_text.encode("utf-8")

    if bow2vec is not None:
        cap_bow = bow2vec.mapping(caption_text)
        if cap_bow is None:
            cap_bow = torch.zeros(bow2vec.ndims)
        else:
            cap_bow = torch.Tensor(cap_bow)
    else:
        cap_bow = None

    if vocab is not None:
        tokens = clean_str(caption_text)
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        cap_tensor = torch.Tensor(caption)
    else:
        cap_tensor = None

    # BERT
    caption_text = query[:]
    caption_text = ' '.join(clean_str(query))
    marked_text = "[CLS] " + caption_text + " [SEP]"
    # print marked_text
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids = [1] * len(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor(indexed_tokens)
    segments_tensors = torch.tensor(segments_ids)

    caption_text = caption_text.encode("utf-8")

    data.append([
        torch.Tensor(frame_vecs), cap_tensor, cap_bow, tokens_tensor,
        segments_tensors, caption_text
    ])

    return data
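
do_L2_norm is applied to each frame feature when options.do_visual_feas_norm is set, but it is not defined in these examples. A sketch of the assumed helper, which rescales a feature vector to unit L2 norm, is:

import numpy as np

def do_L2_norm(feature):
    # scale the feature vector to unit L2 norm; leave an all-zero vector unchanged
    feature = np.asarray(feature, dtype=np.float32)
    norm = np.linalg.norm(feature)
    return feature / norm if norm > 0 else feature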
Example #6
    def __getitem__(self, index):
        cap_id = self.cap_ids[index]
        video_id = getVideoId(cap_id)

        # video
        frame_list = self.video2frames[video_id]
        frame_vecs = []
        for frame_id in frame_list:
            # L2-normalize each frame feature if requested
            if self.do_visual_feas_norm:
                frame_vecs.append(
                    do_L2_norm(self.visual_feat.read_one(frame_id)))
            else:
                frame_vecs.append(self.visual_feat.read_one(frame_id))
        frames_tensor = torch.Tensor(frame_vecs)

        # text
        # print video_id
        cap_text = self.captions[cap_id]
        caption_text = cap_text[:]
        caption_text = ' '.join(clean_str(caption_text))
        caption_text = text2Berttext(caption_text, self.tokenizer)
        caption_text = caption_text.encode("utf-8")

        if self.bow2vec is not None:
            cap_bow = self.bow2vec.mapping(caption_text)
            if cap_bow is None:
                cap_bow = torch.zeros(self.bow2vec.ndims)
            else:
                cap_bow = torch.Tensor(cap_bow)
        else:
            cap_bow = None

        if self.vocab is not None:
            tokens = clean_str(caption_text)
            caption = []
            caption.append(self.vocab('<start>'))
            caption.extend([self.vocab(token) for token in tokens])
            caption.append(self.vocab('<end>'))
            cap_tensor = torch.Tensor(caption)
        else:
            cap_tensor = None

        # BERT
        caption_text = cap_text[:]
        caption_text = ' '.join(clean_str(caption_text))
        marked_text = "[CLS] " + caption_text + " [SEP]"
        # print marked_text
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # tmptext = self.tokenizer.convert_tokens_to_string(tokenized_text)

        segments_ids = [1] * len(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor(indexed_tokens)
        segments_tensors = torch.tensor(segments_ids)

        # caption_text = cap_text[:]
        # caption_text = text2Berttext(caption_text, self.tokenizer)
        caption_text = caption_text.encode("utf-8")

        # print tokens_tensor.shape
        # print caption.__len__()
        return frames_tensor, cap_tensor, cap_bow, index, cap_id, video_id, tokens_tensor, segments_tensors, caption_text
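
tokens_tensor and segments_tensors are returned as 1-D tensors; to turn them into contextual embeddings they still need a batch dimension and a BERT encoder. The snippet below shows one way to do that with the Hugging Face transformers BertModel (a recent transformers version is assumed; the original repository may use the older pytorch-pretrained-bert API instead). The segment ids are set to 1 only to mirror the example above.

import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")
bert.eval()

marked_text = "[CLS] a man is riding a horse [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)                # add batch dim
segments_tensors = torch.tensor([1] * len(tokenized_text)).unsqueeze(0)

with torch.no_grad():
    outputs = bert(input_ids=tokens_tensor, token_type_ids=segments_tensors)

token_embeddings = outputs.last_hidden_state  # shape (1, seq_len, 768)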