Beispiel #1
0
class FvqaTestDataset(Dataset):
    """Test split of the FVQA data, served one question at a time.

    All resources are loaded eagerly in ``__init__`` from the paths found under
    ``config['dataset']['test']``. Each item returned by ``__getitem__`` is a
    dict holding the padded question indices, the image region features, the
    pairwise geometric relations between the image boxes, and the fact /
    semantic graph data stored for that question.

    Answers, ground-truth facts, captions and labels are loaded and kept as
    attributes but are not part of the returned item.
    """

    def __init__(self, config, overfit=False):
        """Load every test resource referenced by ``config``.

        Args:
            config: Nested dict. Reads ``config['dataset']['word2id_path']``
                and the file paths under ``config['dataset']['test']``;
                ``__getitem__`` / ``pad_sequences`` later read
                ``config['dataset']['img_norm']`` and
                ``config['dataset']['max_sequence_lengtn']``.
            overfit: When true, keep only the first 100 entries of each
                per-question resource (``test_labels`` is left unsliced, as
                in the original code).
        """
        super().__init__()

        self.que_vocabulary = Vocabulary(config["dataset"]["word2id_path"])

        # These containers are never filled by this class; they are kept so
        # that code reading the attributes keeps working.
        self.image_features = []
        self.image_boxes = []
        self.image_labels = []
        self.image_captions = []

        self.questions = []
        self.questions_ids = []
        self.questions_vecs = []
        self.answers = []
        self.config = config

        paths = config['dataset']['test']

        self.test_qids = self._load_json(paths['test_qids'])
        self.test_questions = self._load_json(paths['test_questions'])
        self.test_answers = self._load_json(paths['test_answers'])
        self.test_gt_facts = self._load_json(paths['test_gt_facts'])
        self.test_captions = self._load_json(paths['test_captions'])
        self.semantic_graphs = self._load_pickle(paths['test_semantic_graph'])
        self.test_labels = self._load_json(paths['test_labels'])
        self.test_whs = self._load_json(paths['test_whs'])
        self.test_top_facts = self._load_pickle(
            paths['test_top100_facts_graph'])

        self.test_features = np.load(paths['test_features'])
        self.test_bboxes = np.load(paths['test_bboxes'])

        if overfit:
            # Small fixed subset for overfitting / smoke runs.
            self.test_qids = self.test_qids[:100]
            self.test_questions = self.test_questions[:100]
            self.test_answers = self.test_answers[:100]
            self.test_gt_facts = self.test_gt_facts[:100]

            self.test_top_facts = self.test_top_facts[:100]
            self.test_captions = self.test_captions[:100]
            self.test_bboxes = self.test_bboxes[:100]
            self.test_features = self.test_features[:100]
            self.test_whs = self.test_whs[:100]
            self.semantic_graphs = self.semantic_graphs[:100]

    @staticmethod
    def _load_json(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path, 'r') as f:
            return json.load(f)

    @staticmethod
    def _load_pickle(path):
        """Return the object unpickled from ``path``.

        NOTE: unpickling can execute arbitrary code; only point the config at
        trusted files.
        """
        with open(path, 'rb') as f:
            return pickle.load(f, encoding='iso-8859-1')

    @staticmethod
    def _box_relations(bboxes, img_w, img_h):
        """Compute the 7 pairwise geometric features between image boxes.

        Each row of ``bboxes`` is read as ``[x, y, w, h]``. For a reference
        box ``i`` and another box ``j`` the features are::

            (xj - xi) / sqrt(wi * hi)
            (yj - yi) / sqrt(wi * hi)
            wj / wi
            hj / hi
            (wj * hj) / (wi * hi)
            sqrt((xj - xi)**2 + (yj - yi)**2) / sqrt(img_w**2 + img_h**2)
            atan2(yj - yi, xj - xi)

        Args:
            bboxes: Array-like of shape (N, 4).
            img_w: Image width used for the diagonal normalisation.
            img_h: Image height used for the diagonal normalisation.

        Returns:
            A float32 tensor of shape (N, N, 7); entry ``[i, j]`` describes
            box ``j`` relative to box ``i``. Zero-sized boxes or images yield
            inf/nan entries rather than raising.
        """
        boxes = np.asarray(bboxes, dtype=np.float64)
        x, y, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]

        # Broadcasting: axis 0 is the reference box i, axis 1 the other box j.
        dx = x[None, :] - x[:, None]
        dy = y[None, :] - y[:, None]
        area = w * h
        ref_scale = np.sqrt(area)[:, None]
        img_diag = np.sqrt(float(img_w) ** 2 + float(img_h) ** 2)

        relations = np.stack(
            [
                dx / ref_scale,
                dy / ref_scale,
                w[None, :] / w[:, None],
                h[None, :] / h[:, None],
                # The whole reference area is the divisor; the previous
                # expression `(wj * hj) / wi * hi` multiplied by hi instead.
                area[None, :] / area[:, None],
                np.sqrt(dx ** 2 + dy ** 2) / img_diag,
                np.arctan2(dy, dx),
            ],
            axis=-1,
        )
        return torch.from_numpy(relations).float()

    def __getitem__(self, index):
        """Build the feature dict for the question at position ``index``.

        Returns:
            dict with keys ``id``, ``question`` (padded indices),
            ``question_length`` (token count before truncation/padding),
            ``features``, ``img_relations``, ``facts_num_nodes``,
            ``facts_features``, ``facts_e1ids``, ``facts_e2ids``,
            ``facts_answer``, ``facts_answer_id``, ``semantic_num_nodes``,
            ``semantic_n_features``, ``semantic_e1ids``, ``semantic_e2ids``
            and ``semantic_e_features``.
        """
        question_tokens = self.test_questions[index].split()
        top_facts = self.test_top_facts[index]
        semantic_graph = self.semantic_graphs[index]
        features = torch.tensor(self.test_features[index])

        # test_whs entries are read as [width, height].
        img_w = self.test_whs[index][0]
        img_h = self.test_whs[index][1]
        img_relations = self._box_relations(
            self.test_bboxes[index], img_w, img_h)

        # Optional L2 normalisation of the image features.
        # NOTE(review): dim=0 normalises along the first axis (across boxes
        # if features are (boxes, channels)) — confirm this is intended.
        if self.config['dataset']["img_norm"]:
            features = normalize(features, dim=0, p=2)

        # Map question tokens to vocabulary indices, then pad/truncate.
        q_indices = self.que_vocabulary.to_indices(question_tokens)

        return {
            # question
            'id': self.test_qids[index],
            'question': self.pad_sequences(q_indices),
            'question_length': len(question_tokens),
            # image
            'features': features,
            'img_relations': img_relations,
            # fact graph
            'facts_num_nodes': len(top_facts['nodes']),
            'facts_features': top_facts['features'],
            'facts_e1ids': top_facts['e1ids'],
            'facts_e2ids': top_facts['e2ids'],
            'facts_answer': top_facts['answer'],
            'facts_answer_id': top_facts['answer_id'],
            # semantic graph
            'semantic_num_nodes': len(semantic_graph['nodes']),
            'semantic_n_features': semantic_graph['n_features'],
            'semantic_e1ids': semantic_graph['e1ids'],
            'semantic_e2ids': semantic_graph['e2ids'],
            'semantic_e_features': semantic_graph['e_features'],
        }

    def __len__(self):
        """Return the number of test questions."""
        return len(self.test_qids)

    def pad_sequences(self, sequence):
        """Truncate or zero-pad ``sequence`` to the configured length.

        The length is read from ``config['dataset']['max_sequence_lengtn']``
        (the key spelling must match the config file).

        Returns:
            A 1-D tensor of that length; because the buffer comes from
            ``np.zeros`` the values are floating point.
        """
        max_len = self.config['dataset']['max_sequence_lengtn']
        clipped = sequence[:max_len]

        padded = np.zeros(max_len)
        padded[:len(clipped)] = np.array(clipped)
        return torch.tensor(padded)
Beispiel #2
0
class FvqaTrainDataset(Dataset):
    """Training split of the FVQA data, served one question at a time.

    All resources are loaded eagerly in ``__init__`` from the paths found under
    ``config['dataset']['train']``. Each item returned by ``__getitem__`` is a
    dict holding the padded question indices, the image region features, the
    pairwise geometric relations between the image boxes, and the fact /
    semantic graph data stored for that question.

    Answers, ground-truth facts, captions and labels are loaded and kept as
    attributes but are not part of the returned item.
    """

    def __init__(self, config, overfit=False):
        """Load every training resource referenced by ``config``.

        Args:
            config: Nested dict. Reads ``config['dataset']['word2id_path']``
                and the file paths under ``config['dataset']['train']``;
                ``__getitem__`` / ``pad_sequences`` later read
                ``config['dataset']['img_norm']`` and
                ``config['dataset']['max_sequence_lengtn']``.
            overfit: When true, keep only the first 100 entries of each
                per-question resource (``train_labels`` is left unsliced, as
                in the original code).
        """
        super().__init__()

        self.que_vocabulary = Vocabulary(config["dataset"]["word2id_path"])

        # These containers are never filled by this class; they are kept so
        # that code reading the attributes keeps working.
        self.image_features = []
        self.image_boxes = []
        self.image_labels = []
        self.image_captions = []

        self.questions = []
        self.questions_ids = []
        self.questions_vecs = []
        self.answers = []
        self.config = config

        paths = config['dataset']['train']

        # Question ids
        self.train_qids = self._load_json(paths['train_qids'])
        # Question texts
        self.train_questions = self._load_json(paths['train_questions'])
        # Answers
        self.train_answers = self._load_json(paths['train_answers'])
        # Ground-truth facts
        self.train_gt_facts = self._load_json(paths['train_gt_facts'])
        # Captions
        self.train_captions = self._load_json(paths['train_captions'])
        # Semantic graphs
        self.semantic_graphs = self._load_pickle(paths['train_semantic_graph'])
        # Labels of the image bounding boxes
        self.train_labels = self._load_json(paths['train_labels'])
        # Image widths and heights
        self.train_whs = self._load_json(paths['train_whs'])
        # Retrieved facts (the "train100" fact-graph file)
        self.train_top_facts = self._load_pickle(paths['train100_facts_graph'])
        # Features of the image bounding boxes
        self.train_features = np.load(paths['train_features'])
        # Geometry of the image bounding boxes
        self.train_bboxes = np.load(paths['train_bboxes'])

        if overfit:
            # Small fixed subset for overfitting / smoke runs.
            self.train_qids = self.train_qids[:100]
            self.train_questions = self.train_questions[:100]
            self.train_answers = self.train_answers[:100]
            self.train_gt_facts = self.train_gt_facts[:100]
            self.train_top_facts = self.train_top_facts[:100]
            self.train_captions = self.train_captions[:100]
            self.train_bboxes = self.train_bboxes[:100]
            self.train_features = self.train_features[:100]
            self.train_whs = self.train_whs[:100]
            self.semantic_graphs = self.semantic_graphs[:100]

    @staticmethod
    def _load_json(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path, 'r') as f:
            return json.load(f)

    @staticmethod
    def _load_pickle(path):
        """Return the object unpickled from ``path``.

        NOTE: unpickling can execute arbitrary code; only point the config at
        trusted files.
        """
        with open(path, 'rb') as f:
            return pickle.load(f, encoding='iso-8859-1')

    @staticmethod
    def _box_relations(bboxes, img_w, img_h):
        """Compute the 7 pairwise geometric features between image boxes.

        Each row of ``bboxes`` is read as ``[x, y, w, h]``. For a reference
        box ``i`` and another box ``j`` the features are::

            (xj - xi) / sqrt(wi * hi)
            (yj - yi) / sqrt(wi * hi)
            wj / wi
            hj / hi
            (wj * hj) / (wi * hi)
            sqrt((xj - xi)**2 + (yj - yi)**2) / sqrt(img_w**2 + img_h**2)
            atan2(yj - yi, xj - xi)

        Args:
            bboxes: Array-like of shape (N, 4).
            img_w: Image width used for the diagonal normalisation.
            img_h: Image height used for the diagonal normalisation.

        Returns:
            A float32 tensor of shape (N, N, 7); entry ``[i, j]`` describes
            box ``j`` relative to box ``i``. Zero-sized boxes or images yield
            inf/nan entries rather than raising.
        """
        boxes = np.asarray(bboxes, dtype=np.float64)
        x, y, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]

        # Broadcasting: axis 0 is the reference box i, axis 1 the other box j.
        dx = x[None, :] - x[:, None]
        dy = y[None, :] - y[:, None]
        area = w * h
        ref_scale = np.sqrt(area)[:, None]
        img_diag = np.sqrt(float(img_w) ** 2 + float(img_h) ** 2)

        relations = np.stack(
            [
                dx / ref_scale,
                dy / ref_scale,
                w[None, :] / w[:, None],
                h[None, :] / h[:, None],
                # The whole reference area is the divisor; the previous
                # expression `(wj * hj) / wi * hi` multiplied by hi instead.
                area[None, :] / area[:, None],
                np.sqrt(dx ** 2 + dy ** 2) / img_diag,
                np.arctan2(dy, dx),
            ],
            axis=-1,
        )
        return torch.from_numpy(relations).float()

    def __getitem__(self, index):
        """Build the feature dict for the question at position ``index``.

        Returns:
            dict with keys ``id``, ``question`` (padded indices),
            ``question_length`` (token count before truncation/padding),
            ``features``, ``img_relations``, ``facts_num_nodes``,
            ``facts_features``, ``facts_e1ids``, ``facts_e2ids``,
            ``facts_answer``, ``facts_answer_id``, ``semantic_num_nodes``,
            ``semantic_n_features``, ``semantic_e1ids``, ``semantic_e2ids``
            and ``semantic_e_features``.
        """
        question_tokens = self.train_questions[index].split()
        top_facts = self.train_top_facts[index]
        semantic_graph = self.semantic_graphs[index]
        features = torch.tensor(self.train_features[index])

        # Relation info between boxes; train_whs entries are read as
        # [width, height].
        img_w = self.train_whs[index][0]
        img_h = self.train_whs[index][1]
        img_relations = self._box_relations(
            self.train_bboxes[index], img_w, img_h)

        # Optional L2 normalisation of the image features.
        # NOTE(review): dim=0 normalises along the first axis (across boxes
        # if features are (boxes, channels)) — confirm this is intended.
        if self.config['dataset']["img_norm"]:
            features = normalize(features, dim=0, p=2)

        # Map question tokens to vocabulary indices, then pad/truncate.
        q_indices = self.que_vocabulary.to_indices(question_tokens)

        return {
            # question
            'id': self.train_qids[index],
            'question': self.pad_sequences(q_indices),
            'question_length': len(question_tokens),
            # image
            'features': features,
            'img_relations': img_relations,
            # fact graph
            'facts_num_nodes': len(top_facts['nodes']),
            'facts_features': top_facts['features'],
            'facts_e1ids': top_facts['e1ids'],
            'facts_e2ids': top_facts['e2ids'],
            'facts_answer': top_facts['answer'],
            'facts_answer_id': top_facts['answer_id'],
            # semantic graph
            'semantic_num_nodes': len(semantic_graph['nodes']),
            'semantic_n_features': semantic_graph['n_features'],
            'semantic_e1ids': semantic_graph['e1ids'],
            'semantic_e2ids': semantic_graph['e2ids'],
            'semantic_e_features': semantic_graph['e_features'],
        }

    def __len__(self):
        """Return the number of training questions."""
        return len(self.train_qids)

    def pad_sequences(self, sequence):
        """Truncate or zero-pad ``sequence`` to the configured length.

        The length is read from ``config['dataset']['max_sequence_lengtn']``
        (the key spelling must match the config file).

        Returns:
            A 1-D tensor of that length; because the buffer comes from
            ``np.zeros`` the values are floating point.
        """
        max_len = self.config['dataset']['max_sequence_lengtn']
        # Cut off anything beyond the maximum length ...
        clipped = sequence[:max_len]
        # ... and zero-pad whatever is shorter.
        padded = np.zeros(max_len)
        padded[:len(clipped)] = np.array(clipped)
        return torch.tensor(padded)