Exemple #1
0
    def get_MC_result(self, chunk):
        """Assemble one multiple-choice batch for the keys in ``chunk``.

        For every sample the correct candidate is moved into slot 0 (the
        candidate matrix, the candidate mask and the raw sentences are
        swapped together), so the returned ``'answer'`` array is all zeros.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'candidates'``, ``'raw_sentences'``, ``'answer'``,
            ``'video_mask'``, ``'candidates_mask'`` and ``'row_indices'``.
        """
        batch_size = len(chunk)
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(self.get_video_feature_dimension()),
            dtype=np.float32)
        # Five candidate sentences per sample, each up to max_length tokens.
        batch_candidates = np.zeros([batch_size, 5, self.max_length], dtype=np.uint32)
        batch_candidates_mask = np.zeros([batch_size, 5, self.max_length], dtype=np.uint32)
        # Deliberately left at zero: the right candidate always ends up in slot 0.
        batch_answer = np.zeros([batch_size], dtype=np.uint32)

        batch_video_mask = np.zeros([batch_size, self.max_vid_length], dtype=np.uint32)

        batch_raw_sentences = np.asarray([[None] * 5 for _ in range(batch_size)])
        batch_row_indices = np.asarray([-1] * batch_size)

        for k, key in enumerate(chunk):
            MC_dict = self.get_MC_dict(key)
            candidates = MC_dict['candidates']
            raw_sentences = MC_dict['raw_sentences']
            answer = MC_dict['answer']

            video_feature = self.get_video_feature(key)
            candidates_matrix = self.get_MC_matrix(candidates)

            video_mask = self.get_video_mask(video_feature)
            candidates_mask = self.get_MC_mask(candidates)

            batch_video_feature_convmap[k] = data_util.pad_video(
                video_feature, self.get_video_feature_dimension())
            batch_candidates[k] = candidates_matrix
            batch_raw_sentences[k, :] = raw_sentences
            batch_video_mask[k] = video_mask
            batch_candidates_mask[k] = candidates_mask
            batch_row_indices[k] = MC_dict['row_indices']

            if answer != 0:
                # Swap the correct candidate with slot 0. The fancy-indexed
                # right-hand side is a copy, so the in-place swap is safe.
                batch_candidates[k, [0, answer], :] = batch_candidates[k, [answer, 0], :]
                batch_candidates_mask[k, [0, answer], :] = batch_candidates_mask[k, [answer, 0], :]
                batch_raw_sentences[k, [0, answer]] = batch_raw_sentences[k, [answer, 0]]

        return {
            'ids': chunk,
            'video_features': batch_video_feature_convmap,
            'candidates': batch_candidates,
            'raw_sentences': batch_raw_sentences,
            'answer': batch_answer,
            'video_mask': batch_video_mask,
            'candidates_mask': batch_candidates_mask,
            'row_indices': batch_row_indices,
        }
Exemple #2
0
    def get_FIB_result(self, chunk):
        """Assemble one fill-in-the-blank batch for the keys in ``chunk``.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'blank_sent'``, ``'answer'``, ``'video_mask'``,
            ``'blank_sent_mask'``, ``'debug_sent'`` and
            ``'reverse_blank_sent_mask'``.
        """
        batch_size = len(chunk)
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(self.get_video_feature_dimension()),
            dtype=np.float32)
        batch_blank_sent = np.zeros([batch_size, self.max_length],
                                    dtype=np.uint32)
        # One row of length word_matrix.shape[0] per sample
        # (presumably one slot per vocabulary word -- TODO confirm).
        batch_answer = np.zeros([batch_size, self.word_matrix.shape[0]],
                                dtype=np.uint32)

        batch_video_mask = np.zeros([batch_size, self.max_vid_length],
                                    dtype=np.uint32)
        batch_blank_sent_mask = np.zeros([batch_size, self.max_length],
                                         dtype=np.uint32)
        batch_reverse_blank_sent_mask = np.zeros([batch_size, self.max_length],
                                                 dtype=np.uint32)
        batch_debug_sent = np.asarray([None] * batch_size)

        for k, key in enumerate(chunk):
            video_feature = self.get_video_feature(key)
            blank_sent = self.get_blank_sentence(key)
            answer = self.get_blank_answer(key)

            video_mask = self.get_video_mask(video_feature)
            blank_sent_mask = self.get_blank_sent_mask(blank_sent)
            reverse_blank_sent_mask = self.get_reverse_blank_sent_mask(
                blank_sent)

            batch_video_feature_convmap[k] = data_util.pad_video(
                video_feature, self.get_video_feature_dimension())

            # Left-aligned; the remainder of the row stays zero.
            batch_blank_sent[k, :len(blank_sent)] = blank_sent
            batch_answer[k] = answer

            batch_video_mask[k] = video_mask
            batch_blank_sent_mask[k] = blank_sent_mask
            batch_reverse_blank_sent_mask[k] = reverse_blank_sent_mask

            # Raw sentence kept alongside the encoded one for inspection.
            batch_debug_sent[k] = self.data_df.iloc[key]['sentence']

        # 'answer' used to be listed twice in this literal; the duplicate
        # was a no-op and has been dropped.
        return {
            'ids': chunk,
            'video_features': batch_video_feature_convmap,
            'blank_sent': batch_blank_sent,
            'answer': batch_answer,
            'video_mask': batch_video_mask,
            'blank_sent_mask': batch_blank_sent_mask,
            'debug_sent': batch_debug_sent,
            'reverse_blank_sent_mask': batch_reverse_blank_sent_mask,
        }
Exemple #3
0
    def get_Count_result(self, chunk):
        """Assemble one counting-task batch for the keys in ``chunk``.

        Each question is stored twice: left-aligned in ``'question_words'``
        and right-aligned in ``'question_words_right'``. Answers smaller
        than 1 are raised to 1.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'question_words'``, ``'question_words_right'``,
            ``'video_mask'``, ``'question_mask'``, ``'answer'`` and
            ``'debug_sent'``.
        """
        batch_size = len(chunk)
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(self.get_video_feature_dimension()),
            dtype=np.float32)

        # Question, left aligned and right-most aligned
        batch_question = np.zeros([batch_size, self.max_length],
                                  dtype=np.uint32)
        batch_question_right = np.zeros([batch_size, self.max_length],
                                        dtype=np.uint32)
        # NOTE(review): the video mask is sized with max_length, the same
        # bound used for questions -- confirm get_video_mask agrees.
        batch_video_mask = np.zeros([batch_size, self.max_length],
                                    dtype=np.uint32)
        batch_question_mask = np.zeros([batch_size, self.max_length],
                                       dtype=np.uint32)

        batch_debug_sent = np.asarray([None] * batch_size)
        batch_answer = np.zeros([batch_size, 1])

        for k, key in enumerate(chunk):
            video_feature = self.get_video_feature(key)
            video_mask = self.get_video_mask(video_feature)

            batch_video_feature_convmap[k, :] = data_util.pad_video(
                video_feature, self.get_video_feature_dimension())

            batch_video_mask[k] = video_mask

            # Clamp so the stored answer is never below 1.
            answer = max(self.get_Count_answer(key), 1)

            question = self.get_Count_question(key)
            question_mask = self.get_Count_question_mask(question)
            n_words = len(question)
            # Left align
            batch_question[k, :n_words] = question
            # Right align. An explicit start offset is used instead of
            # ``-n_words:`` because ``-0:`` selects the whole row and would
            # make an empty question fail with a broadcast error.
            batch_question_right[k, self.max_length - n_words:] = question
            batch_question_mask[k] = question_mask
            batch_answer[k] = answer
            batch_debug_sent[k] = self.data_df.loc[key, 'question']

        return {
            'ids': chunk,
            'video_features': batch_video_feature_convmap,
            'question_words': batch_question,
            'question_words_right': batch_question_right,
            'video_mask': batch_video_mask,
            'question_mask': batch_question_mask,
            'answer': batch_answer,
            'debug_sent': batch_debug_sent,
        }
Exemple #4
0
    def get_CAP_result(self, chunk):
        """Assemble one captioning batch for the keys in ``chunk``.

        When ``self.dataset_name`` is ``'blind'`` no caption is loaded: the
        caption arrays stay zero and the ``'debug_sent'`` entries stay
        ``None``.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'caption_words'``, ``'video_mask'``, ``'caption_mask'`` and
            ``'debug_sent'``.

        Raises:
            SystemExit: with status 1 if the caption or its mask cannot be
                built for a key; the traceback and the key are printed
                first.
        """
        import traceback

        batch_size = len(chunk)
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(self.get_video_feature_dimension()),
            dtype=np.float32)
        batch_caption = np.zeros([batch_size, self.max_length],
                                 dtype=np.uint32)

        batch_video_mask = np.zeros([batch_size, self.max_vid_length],
                                    dtype=np.uint32)
        batch_caption_mask = np.zeros([batch_size, self.max_length],
                                      dtype=np.uint32)

        batch_debug_sent = np.asarray([None] * batch_size)

        for k, key in enumerate(chunk):
            video_feature = self.get_video_feature(key)
            video_mask = self.get_video_mask(video_feature)

            batch_video_feature_convmap[k] = data_util.pad_video(
                video_feature, self.get_video_feature_dimension())
            batch_video_mask[k] = video_mask

            if self.dataset_name == 'blind':
                continue

            try:
                caption = self.get_description(key)
                caption_mask = self.get_sentence_mask(caption)
            except Exception:
                # Still abort the run, but show what actually went wrong,
                # report the offending key, and signal failure through a
                # non-zero exit status. KeyboardInterrupt is no longer
                # intercepted here.
                traceback.print_exc()
                print(key)
                sys.exit(1)
            # Left-aligned; the remainder of the row stays zero.
            batch_caption[k, :len(caption)] = caption
            batch_caption_mask[k] = caption_mask
            batch_debug_sent[k] = self.data_df.iloc[key]['description']

        return {
            'ids': chunk,
            'video_features': batch_video_feature_convmap,
            'caption_words': batch_caption,
            'video_mask': batch_video_mask,
            'caption_mask': batch_caption_mask,
            'debug_sent': batch_debug_sent,
        }
Exemple #5
0
    def get_RET_result(self, y_keys, x_keys, neg=True):
        """Build a retrieval batch pairing videos with captions.

        Position ``k`` of the batch combines the video feature looked up
        with ``y_keys[k]`` and the caption looked up with ``x_keys[k]``.

        Args:
            y_keys: Keys used to fetch video features; its length is the
                batch size.
            x_keys: Keys used to fetch captions, indexed in step with
                ``y_keys``.
            neg: Not read by this method.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'caption_words'``, ``'video_mask'``, ``'caption_mask'`` and
            ``'debug_sent'``.
        """
        n_pairs = len(y_keys)
        text_shape = [n_pairs, self.max_length]

        video_batch = np.zeros(
            [n_pairs] + list(self.get_video_feature_dimension()),
            dtype=np.float32)
        caption_batch = np.zeros(text_shape, dtype=np.uint32)
        video_mask_batch = np.zeros(text_shape, dtype=np.uint32)
        caption_mask_batch = np.zeros(text_shape, dtype=np.uint32)
        debug_sentences = np.asarray([None] * n_pairs)

        for k, y_key in enumerate(y_keys):
            x_key = x_keys[k]

            feature = self.get_video_feature(y_key)
            feature_mask = self.get_video_mask(feature)
            words = self.get_description(x_key)
            words_mask = self.get_sentence_mask(words)

            video_batch[k, :] = data_util.pad_video(
                feature, self.get_video_feature_dimension())
            # Caption is left-aligned; the remainder of the row stays zero.
            caption_batch[k, :len(words)] = words
            video_mask_batch[k] = feature_mask
            caption_mask_batch[k] = words_mask

            debug_sentences[k] = self.data_df.iloc[x_key]['description']

        # NOTE(review): 'ids' carries only the last video key of the loop,
        # not the whole y_keys sequence -- confirm this is what callers
        # expect (it also requires a non-empty batch).
        result = {}
        result['ids'] = y_key
        result['video_features'] = video_batch
        result['caption_words'] = caption_batch
        result['video_mask'] = video_mask_batch
        result['caption_mask'] = caption_mask_batch
        result['debug_sent'] = debug_sentences
        return result
Exemple #6
0
    def get_Trans_result(self, chunk):
        """Build a batch for the transition multiple-choice task.

        Every sample carries five candidate sentences, stored once via
        ``get_Trans_matrix(candidates)`` and once via
        ``get_Trans_matrix(candidates, is_left=False)``.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'candidates'``, ``'candidates_right'``, ``'answer'``,
            ``'raw_sentences'``, ``'video_mask'``, ``'candidates_mask'``,
            ``'debug_sent'``, ``'row_indices'`` and ``'question'``.
        """
        n_samples = len(chunk)
        cand_shape = [n_samples, 5, self.max_length]

        video_batch = np.zeros(
            [n_samples] + list(self.get_video_feature_dimension()),
            dtype=np.float32)

        cand_batch = np.zeros(cand_shape, dtype=np.uint32)
        cand_right_batch = np.zeros(cand_shape, dtype=np.uint32)
        answer_batch = np.zeros([n_samples], dtype=np.uint32)

        video_mask_batch = np.zeros([n_samples, self.max_length], dtype=np.uint32)
        cand_mask_batch = np.zeros(cand_shape, dtype=np.uint32)

        debug_sentences = np.asarray([None] * n_samples)
        # [n_samples, 5] grid of raw candidate strings
        raw_sentence_batch = np.asarray([[None] * 5 for _ in range(n_samples)])
        row_index_batch = np.asarray([-1] * n_samples)

        question_list = []

        for k, key in enumerate(chunk):
            entry = self.get_Trans_dict(key)
            cands = entry['candidates']
            raw = entry['raw_sentences']
            correct = int(entry['answer'])
            question_text = entry['question']

            feature = self.get_video_feature(key)
            cand_matrix = self.get_Trans_matrix(cands)
            cand_matrix_right = self.get_Trans_matrix(cands, is_left=False)

            feature_mask = self.get_video_mask(feature)
            cand_mask = self.get_Trans_mask(cands)

            video_batch[k, :] = data_util.pad_video(
                feature, self.get_video_feature_dimension())

            cand_batch[k] = cand_matrix
            cand_right_batch[k] = cand_matrix_right
            raw_sentence_batch[k, :] = raw
            answer_batch[k] = correct
            video_mask_batch[k] = feature_mask
            cand_mask_batch[k] = cand_mask
            row_index_batch[k] = entry['row_indices']
            question_list.append(question_text)

            # The answer index is 0-based while the looked-up columns are
            # named 'a1', 'a2', ...
            answer_column = 'a' + str(correct + 1)
            debug_sentences[k] = self.data_df.loc[key, answer_column]

        result = {}
        result['ids'] = chunk
        result['video_features'] = video_batch
        result['candidates'] = cand_batch
        result['candidates_right'] = cand_right_batch
        result['answer'] = answer_batch
        result['raw_sentences'] = raw_sentence_batch
        result['video_mask'] = video_mask_batch
        result['candidates_mask'] = cand_mask_batch
        result['debug_sent'] = debug_sentences
        result['row_indices'] = row_index_batch
        result['question'] = question_list
        return result
Exemple #7
0
    def get_FrameQA_result(self, chunk):
        """Assemble one FrameQA batch for the keys in ``chunk``.

        Each question is stored twice: left-aligned in ``'question_words'``
        and right-aligned in ``'question_words_right'``. Answers are mapped
        to indices through ``self.ans2idx``; answers missing from that
        mapping get index 1.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'question_words'``, ``'question_words_right'``,
            ``'video_mask'``, ``'question_mask'``, ``'answer'``,
            ``'answer_type'`` and ``'debug_sent'``.
        """
        # Index stored for answers that are not in self.ans2idx.
        unknown_answer_idx = 1

        batch_size = len(chunk)
        # Shape is [batch_size] + list(self.get_video_feature_dimension()).
        # Original author's note: this ends up as (batch_size, 7, 7, 1028)
        # -- TODO confirm; a later note on the raw feature mentions
        # [video_length, 7, 7, 2048].
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(self.get_video_feature_dimension()), dtype=np.float32)

        # Question, left aligned and right-most aligned
        batch_question = np.zeros([batch_size, self.max_length], dtype=np.uint32)
        batch_question_right = np.zeros([batch_size, self.max_length], dtype=np.uint32)
        batch_video_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
        batch_question_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
        # Holds the raw 'question' cell of data_df for each sample.
        batch_debug_sent = np.asarray([None] * batch_size)
        # Original author's note: an answer made of several words is not
        # tokenised; it is looked up as a single entry.
        batch_answer = np.zeros([batch_size, 1])
        batch_answer_type = np.zeros([batch_size, 1])

        for k, key in enumerate(chunk):
            # Original author's note: the raw feature is
            # [video_length, 7, 7, 2048] and video_length still differs
            # from video to video at this point -- TODO confirm.
            video_feature = self.get_video_feature(key)
            video_mask = self.get_video_mask(video_feature)

            batch_video_feature_convmap[k, :] = data_util.pad_video(
                video_feature, self.get_video_feature_dimension())
            # Original author's note: the mask records whether the video
            # was padded or cut.
            batch_video_mask[k] = video_mask

            # Original author's note: the CSV classifies answers by type;
            # 2 stands for colour and 0 is probably object -- TODO confirm.
            answer, answer_type = self.get_answer(key)
            # Look up with the same str() key that a membership test would
            # use; indexing with the raw value raised KeyError whenever the
            # answer was not already a string.
            answer = self.ans2idx.get(str(answer), unknown_answer_idx)

            question = self.get_question(key)
            # Original author's note: questions vary in length as well;
            # max_sequence_length is 35 here -- TODO confirm.
            question_mask = self.get_question_mask(question)
            n_words = len(question)
            # Left align
            batch_question[k, :n_words] = question
            # Right align. An explicit start offset is used instead of
            # ``-n_words:`` because ``-0:`` selects the whole row and would
            # make an empty question fail with a broadcast error.
            batch_question_right[k, self.max_length - n_words:] = question
            batch_question_mask[k] = question_mask
            batch_answer[k] = answer
            batch_answer_type[k] = float(int(answer_type))
            batch_debug_sent[k] = self.data_df.loc[key, 'question']

        return {
            'ids': chunk,
            'video_features': batch_video_feature_convmap,
            'question_words': batch_question,
            'question_words_right': batch_question_right,
            'video_mask': batch_video_mask,
            'question_mask': batch_question_mask,
            'answer': batch_answer,
            'answer_type': batch_answer_type,
            'debug_sent': batch_debug_sent,
        }
Exemple #8
0
    def get_FrameQA_result(self, chunk):
        """Assemble one FrameQA batch for the keys in ``chunk``.

        Each question is stored twice: left-aligned in ``'question_words'``
        and right-aligned in ``'question_words_right'``. Answers are mapped
        to indices through ``self.ans2idx``; answers missing from that
        mapping get index 1.

        Args:
            chunk: Sequence of dataset keys; its length is the batch size.

        Returns:
            dict with the keys ``'ids'``, ``'video_features'``,
            ``'question_words'``, ``'question_words_right'``,
            ``'video_mask'``, ``'question_mask'``, ``'answer'``,
            ``'answer_type'`` and ``'debug_sent'``.
        """
        # Index stored for answers that are not in self.ans2idx.
        unknown_answer_idx = 1

        batch_size = len(chunk)
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(self.get_video_feature_dimension()),
            dtype=np.float32)

        # Question, left aligned and right-most aligned
        batch_question = np.zeros([batch_size, self.max_length],
                                  dtype=np.uint32)
        batch_question_right = np.zeros([batch_size, self.max_length],
                                        dtype=np.uint32)
        batch_video_mask = np.zeros([batch_size, self.max_length],
                                    dtype=np.uint32)
        batch_question_mask = np.zeros([batch_size, self.max_length],
                                       dtype=np.uint32)

        batch_debug_sent = np.asarray([None] * batch_size)
        batch_answer = np.zeros([batch_size, 1])
        batch_answer_type = np.zeros([batch_size, 1])

        for k, key in enumerate(chunk):
            video_feature = self.get_video_feature(key)
            video_mask = self.get_video_mask(video_feature)

            batch_video_feature_convmap[k, :] = data_util.pad_video(
                video_feature, self.get_video_feature_dimension())
            batch_video_mask[k] = video_mask

            answer, answer_type = self.get_answer(key)
            # Look up with the same str() key that a membership test would
            # use; indexing with the raw value raised KeyError whenever the
            # answer was not already a string.
            answer = self.ans2idx.get(str(answer), unknown_answer_idx)

            question = self.get_question(key)
            question_mask = self.get_question_mask(question)
            n_words = len(question)
            # Left align
            batch_question[k, :n_words] = question
            # Right align. An explicit start offset is used instead of
            # ``-n_words:`` because ``-0:`` selects the whole row and would
            # make an empty question fail with a broadcast error.
            batch_question_right[k, self.max_length - n_words:] = question
            batch_question_mask[k] = question_mask
            batch_answer[k] = answer
            batch_answer_type[k] = float(int(answer_type))
            # Raw question text kept alongside the encoded one.
            batch_debug_sent[k] = self.data_df.loc[key, 'question']

        return {
            'ids': chunk,
            'video_features': batch_video_feature_convmap,
            'question_words': batch_question,
            'question_words_right': batch_question_right,
            'video_mask': batch_video_mask,
            'question_mask': batch_question_mask,
            'answer': batch_answer,
            'answer_type': batch_answer_type,
            'debug_sent': batch_debug_sent,
        }