Example #1
	def get_item(self, idx):

		data = self.vqamb_data[idx]

		current_sample = Sample()

		# store question and image id
		current_sample.img_id = data['id']
		current_sample.qa_id = data['qa_id']

		# process question
		question = data["question"]
		tokens = tokenize(question, remove=["?"], keep=["'s"])

		processed = self.text_processor({"tokens": tokens})
		current_sample.text = processed["text"]

		# process answers
		processed = self.answer_processor({"answers": [data['answer']]})
		current_sample.answers = processed["answers"]
		current_sample.targets = processed["answers_scores"][1:] # remove unknown index
		# Detectron features ----------------
		# TODO: read in the Detectron image instead if Detectron features are to be built on the fly
		detectron_path = self.detectron_folder + str(data['id'])
		if self.config.spatial:
			point = data['point']
			# current_sample.point = point
			detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
		detectron_path += '.pt'
		
		detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

		# Pad features to fixed length
		if self.config.pad_detectron:
			if detectron_feat.shape[0] > 100:
				detectron_feat = detectron_feat[:100]
			elif detectron_feat.shape[0] < 100:
				pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
				detectron_feat = torch.cat([detectron_feat, pad], dim=0)

		current_sample.image_feature_0 = detectron_feat
		# ---------------------------------------------

		return current_sample
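
A side note on the truncate-or-zero-pad step above, which recurs in several of these examples: pulled out on its own it is just the following. This is only an illustrative sketch; the helper name pad_or_truncate, the fixed length of 100 and the 2048-dim features are modeled on the snippets here, not taken from any shared utility in the original code.

import torch

def pad_or_truncate(feat, max_len=100):
    # Clamp a (num_regions, dim) feature tensor to exactly max_len rows,
    # zero-padding when there are too few regions.
    if feat.shape[0] > max_len:
        return feat[:max_len]
    if feat.shape[0] < max_len:
        pad = torch.zeros(max_len - feat.shape[0], feat.shape[1])
        return torch.cat([feat, pad], dim=0)
    return feat

# e.g. a 37-region feature map becomes a fixed 100 x 2048 tensor
assert pad_or_truncate(torch.randn(37, 2048)).shape == (100, 2048)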
Example #2
    def extract(self):
        os.makedirs(self.out_dir, exist_ok=True)

        word_count = Counter()

        texts = self.get_text()
        text_lengths = [None] * len(texts)

        for inx, text in enumerate(texts):
            words = tokenize(text)
            text_lengths[inx] = len(words)
            word_count.update(words)

        # UNK token will be added on the fly if you use the Vocab class in core/text
        vocabulary = [w[0] for w in word_count.items() if w[1] >= self.min_freq]
        vocabulary.sort()

        self.save_vocabulary(vocabulary)

        print("min text len=", min(text_lengths))
        print("max text len=", max(text_lengths))
Example #3
 def build(self, annotations):
     targets = []
     for idx, annotation in enumerate(annotations):
         image_id = annotation[0].split('.')[0]
         image_name = image_id
         caption_str = annotation[1]
         caption_tokens = tokenize(caption_str)
         caption_tokens = ['<s>'] + caption_tokens + ['</s>']
         reference_tokens = [caption_tokens]
         feature_path = image_id + '.npy'
         target = {
             # 'image_id': image_id,
             'image_id': idx,
             'image_name': image_name,
             'caption_str': caption_str,
             'caption_tokens': caption_tokens,
             'reference_tokens': reference_tokens,
             'feature_path': feature_path
         }
         targets.append(target)
     return targets
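
For orientation, here is what one target entry would roughly look like, assuming each annotation is an (image_filename, caption) pair and that tokenize lowercases and splits on whitespace (both assumptions; the file name is made up):

annotations = [("COCO_0001.jpg", "A dog runs on the beach")]

# build(annotations)[0] would then be approximately:
# {
#     'image_id': 0,
#     'image_name': 'COCO_0001',
#     'caption_str': 'A dog runs on the beach',
#     'caption_tokens': ['<s>', 'a', 'dog', 'runs', 'on', 'the', 'beach', '</s>'],
#     'reference_tokens': [['<s>', 'a', 'dog', 'runs', 'on', 'the', 'beach', '</s>']],
#     'feature_path': 'COCO_0001.npy',
# }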
Example #4
    def get_item(self, idx):
        data = self.questions[idx]

        # Each call to get_item from the dataloader returns a Sample object, which is
        # collated by our special batch collator into a SampleList; a SampleList is
        # basically an attribute-based batch, in layman's terms
        current_sample = Sample()

        question = data["question"]
        tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        processed = self.answer_processor({"answers": [data["answer"]]})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"]

        image_path = os.path.join(self.image_path, data["image_filename"])
        image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
        image = image.astype(np.float32)
        current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

        return current_sample
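
The image branch above boils down to: load RGB, scale to [0, 1], and move channels first. A minimal stand-alone sketch of just that step (only PIL, NumPy and torch; the file name in the usage line is hypothetical):

import numpy as np
import torch
from PIL import Image

def load_image_chw(path):
    # HWC uint8 -> HWC float32 in [0, 1] -> CHW torch tensor
    image = np.true_divide(Image.open(path).convert("RGB"), 255).astype(np.float32)
    return torch.from_numpy(image.transpose(2, 0, 1))

# tensor = load_image_chw("CLEVR_val_000000.png")  # shape (3, H, W)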
Example #5
 def load_item(self, idx):
     sample = Sample()
     image_id = self.annotations[idx][0]
     image_folder = image_id.split('_')[0]
     caption = self.annotations[idx][1]
     tokens = tokenize(caption)
     tokens = ['<s>'] + tokens + ['</s>']
     # use text_processor to process the caption:
     # it pads the sequence, converts tokens to indices and adds SOS/EOS tokens
     # (text_processor already contains a pre-processor to tokenize the caption)
     caption_p = self.text_processor({'tokens': tokens})
     sample.text = caption_p['text']
     sample.caption_len = torch.tensor(len(tokens), dtype=torch.int)
     # sample.target = caption_p['text']
     sample.answers = torch.stack([caption_p['text']])
     # generate image features
     image_path = os.path.join(self.image_dir, image_folder, image_id)
     image, image_scale = self._image_transform(image_path)
     with torch.no_grad():
         image_features = self.feature_extractor([image], [image_scale])
     image_features = image_features[0]
     sample.image_feature_0 = image_features.cpu()
     return sample
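
The feature-extraction call above runs under torch.no_grad(), since it is inference only. A generic, self-contained sketch of that pattern with a stand-in module (nothing below is the actual feature_extractor from the example):

import torch
import torch.nn as nn

# stand-in extractor: any frozen module is used the same way
extractor = nn.Sequential(nn.Conv2d(3, 8, 3), nn.AdaptiveAvgPool2d(1), nn.Flatten())
extractor.eval()

image = torch.rand(1, 3, 224, 224)
with torch.no_grad():        # no autograd graph is built, saving memory
    feats = extractor(image)
print(feats.shape)           # torch.Size([1, 8])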
Example #6
	def get_item(self, idx):

		data = self.vqamb_data[idx]

		current_sample = Sample()

		# store question and image id
		current_sample.img_id = data['id']
		# current_sample.qa_id = data['qa_id']

		# store points
		current_sample.point = data['point'] # data['points']
		bbox = data['bbox']
		current_sample.gt_bbox = torch.Tensor([bbox['x'], bbox['y'], bbox['x'] + bbox['w'], bbox['y'] + bbox['h']])

		# process question
		question = data["pt_question"]
		tokens = tokenize(question, remove=["?"], keep=["'s"])

		processed = self.text_processor({"tokens": tokens})
		current_sample.text = processed["text"]

		# process answers
		processed = self.answer_processor({"answers": [data['ans']]})
		current_sample.answers = processed["answers"]
		current_sample.targets = processed["answers_scores"][1:] # remove unknown index

		# Detectron features ----------------
		# TODO: read in the Detectron image instead if Detectron features are to be built on the fly
		detectron_path = self.detectron_folder + str(data['id'])
		point = data['point'] # point = data['points'][0]
		if 'pt' in self.detectron_folder:
			detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
		detectron_path += '.pt'
		
		detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

		# Pad features to fixed length
		if self.config.pad_detectron:
			if detectron_feat.shape[0] > 100:
				detectron_feat = detectron_feat[:100]
			elif detectron_feat.shape[0] < 100:
				pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
				detectron_feat = torch.cat([detectron_feat, pad], dim=0)

		current_sample.image_feature_0 = detectron_feat
		# ---------------------------------------------

		# read in bounding boxes (hardcoded for now)
		
		bbox_path = ''
		bbox_path += str(data['id']) + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
		bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

		if bboxes.shape[0] > 100:
			bboxes = bboxes[:100]
		elif bboxes.shape[0] < 100:
			pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
			bboxes = torch.cat([bboxes, pad], dim=0)

		current_sample.pt_bbox = bboxes

		# read in image bounding boxes
		bbox_path = ''
		bbox_path += str(data['id']) + '.pt' # + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'
		bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

		if bboxes.shape[0] > 100:
			bboxes = bboxes[:100]
		elif bboxes.shape[0] < 100:
			pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
			bboxes = torch.cat([bboxes, pad], dim=0)

		current_sample.img_bbox = bboxes
		
		# Context features --------------------
		if self.config.use_context:
			context_path = self.context_folder + str(data['id'])
			context_path += ',' + str(point['x']) + ',' + str(point['y'])
			context_path += '.pt'

			context_feat = torch.load(context_path, map_location=torch.device('cpu'))
			context_feat = context_feat.squeeze()
			orig_dim = context_feat.shape[0]

			if self.config.pad_context:
				if context_feat.shape[0] > 100:
					context_feat = context_feat[:100]
				elif context_feat.shape[0] < 100:
					pad = torch.zeros(100 - context_feat.shape[0], context_feat.shape[1])
					context_feat = torch.cat([context_feat, pad], dim=0)

			current_sample.context_feature_0 = context_feat
		# ---------------------------------------------

		return current_sample
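
The ground-truth box in this example is converted from a {'x', 'y', 'w', 'h'} dict to corner coordinates. Pulled out as a tiny helper (the name xywh_to_xyxy is just illustrative, not from the original code):

import torch

def xywh_to_xyxy(bbox):
    # convert {'x', 'y', 'w', 'h'} to [xmin, ymin, xmax, ymax]
    return torch.Tensor([bbox['x'], bbox['y'],
                         bbox['x'] + bbox['w'], bbox['y'] + bbox['h']])

# xywh_to_xyxy({'x': 10, 'y': 20, 'w': 30, 'h': 40}) -> tensor([10., 20., 40., 60.])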
Example #7
    def build(self):
        annotations_file = self.args.data_file
        image_dir = self.args.image_root
        
        # visdial_json_file = os.path.join(
        #     self.args.data_dir,
        #     "visdial_%.1f_%s.json" % (self.args.version, self.args.set_type),
        # )
        data = None

        with open(annotations_file, "r") as f:
            data = json.load(f)

        # final_questions = self.get_tokens(data["questions"])
        # final_answers = self.get_tokens(data["answers"])
        # dialogs = data["dialogs"]
        # dialogs_with_features = self.parse_dialogs(dialogs)

        # reference_tokens = []
        # caption_tokens = []
        # image_names = []
        # feature_paths = []
        # image_ids = []
        # caption_ids = []
        # caption_strs = []
               
        all_data = []
        # training_data = []
        # validation_data = []
        all_data.append({"metadata": 'youcookII'})
        # training_data.append({"metadata": 'youcookII', "subset": 'training'})
        # validation_data.append({"metadata": 'youcookII', "subset": 'validation'})
        counter = 0
        for video in data["database"]:
            for i in data["database"][video]["annotations"]:
                aDict = {}
                vid_seg = str(video) + "_" + str(i['id'])
                feature_path = glob.glob(os.path.join(image_dir, vid_seg + "*.npy"))
                if len(feature_path) != 0:
                    feature_path = os.path.basename(feature_path[0])
                    print("DEBUG feature_path:", feature_path)
                    # sys.exit()
                    # rstrip strips characters, not a suffix; use splitext to drop ".npy"
                    image_name = os.path.splitext(feature_path)[0]
                    image_id = counter
                    caption_id = counter
                    caption_str = i["sentence"]
                    # caption_token_list = []
                    caption_token_list = tokenize(caption_str)
                    caption_token_list.insert(0, "<s>")
                    caption_token_list.append("</s>")
          
                    # reference_tokens.append([caption_token_list])
                    # caption_tokens.append(caption_token_list)
                    # caption_strs.append(caption_str)
                    # caption_ids.append(caption_id)
                    # image_ids.append(image_id)
                    # image_names.append(image_name)
                    # feature_paths.append(feature_path)

                    aDict["reference_tokens"] = [caption_token_list]
                    aDict["caption_tokens"] = caption_token_list
                    aDict["caption_str"] = caption_str
                    aDict["caption_id"] = caption_id
                    aDict["image_id"] = image_id
                    aDict["image_name"] = image_name
                    aDict["feature_path"] = feature_path

                    # print("DEBUG subset:", data["database"][video]["subset"])
                    # sys.exit()
            
                    # if str(data["database"][video]["subset"]) == "training":
                    #     training_data.append(aDict)
                    # elif str(data["database"][video]["subset"]) == "validation":
                    #     validation_data.append(aDict)

                    all_data.append(aDict)
                    counter += 1
                    
        """
        imdb = {
            # "questions": final_questions,
            # "answers": final_answers,
            # "dialogs": dialogs_with_features,
            "reference_tokens": reference_tokens,
            "caption_tokens": caption_tokens,
            "image_name": image_names,
            "feature_path": feature_paths,
            "image_id": image_ids,
            "caption_id": caption_ids,
            "caption_str": caption_strs,
        }

        np_data = np.array(list(zip( \
            list(zip(["reference_tokens"]*len(reference_tokens), reference_tokens)),
            list(zip(["caption_tokens"]*len(caption_tokens), caption_tokens)),
            list(zip(["image_name"]*len(image_names), image_names)),
            list(zip(["feature_path"]*len(feature_paths), feature_paths)),
            list(zip(["image_id"]*len(image_ids), image_ids)),
            list(zip(["caption_id"]*len(caption_ids), caption_ids)),
            list(zip(["caption_str"]*len(caption_strs), caption_strs)),
            )))
        """

        np.save(self.args.out_file, np.array(all_data))
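
Because all_data is a list of dicts, np.save stores it as a pickled object array, so reading it back later needs allow_pickle=True. A minimal sketch (the file name is a placeholder for whatever args.out_file pointed to):

import numpy as np

imdb = np.load("out_file.npy", allow_pickle=True)
print(imdb[0])                  # {'metadata': 'youcookII'}
print(imdb[1]["caption_str"])   # first caption entry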
Example #8
    def test_tokenize(self):
        tokens = text_utils.tokenize(self.TOKENIZE_EXAMPLE)

        self.assertEqual(list(tokens), self.TOKENS)
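
The fixture values are not shown in this excerpt. One plausible shape for them, assuming the tokenizer lowercases and strips the trailing question mark (an assumption about tokenize, not taken from the source; the import path is also assumed):

import unittest
from mmf.utils import text as text_utils  # assumed import path

class TestTextUtils(unittest.TestCase):
    TOKENIZE_EXAMPLE = "Who is standing under the tree?"
    # expected output under the assumption above
    TOKENS = ["who", "is", "standing", "under", "the", "tree"]

    def test_tokenize(self):
        tokens = text_utils.tokenize(self.TOKENIZE_EXAMPLE)
        self.assertEqual(list(tokens), self.TOKENS)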
Example #9
    def get_item(self, idx):

        data = self.vqamb_data[idx]

        current_sample = Sample()

        # store question and image id
        current_sample.img_id = data['id']
        current_sample.qa_id = data['qa_index']

        # store points
        current_sample.points = data['points']

        obj = data['all_objs'][0]
        xmin, ymin, xmax, ymax = (obj['x'], obj['y'],
                                  obj['x'] + obj['w'], obj['y'] + obj['h'])
        current_sample.gt_bbox = torch.Tensor([xmin, ymin, xmax, ymax])

        # process question
        question = data["question"]
        tokens = tokenize(question, remove=["?"])

        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        # process answers
        processed = self.answer_processor({"answers": data['all_ans']})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"][
            1:]  # remove unknown index

        # Detectron features ----------------
        # TODO: read in the Detectron image instead if Detectron features are to be built on the fly
        detectron_path = self.detectron_folder + str(data['id'])
        bbox_path = self.bbox_folder + str(data['id'])
        if 'pt' in self.detectron_folder:
            point = data['points'][0]
            detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
            bbox_path += ',' + str(point['x']) + ',' + str(point['y'])

        detectron_path += '.pt'
        bbox_path += '.pt'

        detectron_feat = torch.load(
            detectron_path, map_location=torch.device('cpu')).squeeze()
        # bbox_feat = torch.load(bbox_path, map_location=torch.device('cpu')).squeeze()
        # if detectron_feat.shape[0] == 2048:
        #     detectron_feat = detectron_feat.unsqueeze(0)
        #     bbox_feat = bbox_feat.unsqueeze(0)

        # if self.config.grid:
        #     detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T
        # x_down = max(int(round(pt['x']/600)), 18)
        # y_down = int(round(pt['y']/800), 25)

        # preprocessing for grid features only
        # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

        # Pad features to fixed length
        if self.config.grid:
            MAX_FEAT = 608
        else:
            MAX_FEAT = 100

        if self.config.pad_detectron:
            if detectron_feat.shape[0] > MAX_FEAT:
                detectron_feat = detectron_feat[:MAX_FEAT]
                # bbox_feat = bbox_feat[:MAX_FEAT]
            elif detectron_feat.shape[0] < MAX_FEAT:
                pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0],
                                  detectron_feat.shape[1])
                detectron_feat = torch.cat([detectron_feat, pad], dim=0)
                # bbox_feat is not loaded above (its torch.load is commented out),
                # so skip padding it here as well
                # pad = torch.zeros(MAX_FEAT - bbox_feat.shape[0],
                #                   bbox_feat.shape[1])
                # bbox_feat = torch.cat([bbox_feat, pad], dim=0)
        # else:
        #     if detectron_feat.dim() > 1:
        #         detectron_feat = torch.zeros(2048)
        # current_sample.bbox = bbox_feat
        current_sample.image_feature_0 = detectron_feat
        # ---------------------------------------------

        return current_sample
Example #10
    def get_item(self, idx):

        data = self.objpart_data[idx]

        current_sample = Sample()

        # store question and image id
        current_sample.img_id = data['id']
        # current_sample.qa_id = data['qa_id']

        current_sample.part = 1 if data['ans'] == 'part' else 0

        # store points
        current_sample.point = data['point']

        # process question
        question = data["question"]
        tokens = tokenize(question, remove=["?"])

        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        # process answers
        processed = self.answer_processor({"answers": [data['ans']]})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"][
            1:]  # remove unknown index

        # Detectron features ----------------
        # TODO: read in the Detectron image instead if Detectron features are to be built on the fly
        detectron_path = self.detectron_folder + str(data['id'])
        if 'pt' in self.detectron_folder:  # hacky way of checking for point supervision
            point = data['point']
            detectron_path += ',' + str(point['x']) + ',' + str(point['y'])

        detectron_path += '.pt'

        detectron_feat = torch.load(
            detectron_path, map_location=torch.device('cpu')).squeeze()

        # hardcode bounding box and read it

        # x_down = max(int(round(pt['x']/600)), 18)
        # y_down = int(round(pt['y']/800), 25)

        # preprocessing for grid features only
        # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

        # Pad features to fixed length
        MAX_FEAT = 100

        if self.config.pad_detectron:
            if detectron_feat.shape[0] > MAX_FEAT:
                detectron_feat = detectron_feat[:MAX_FEAT]
            elif detectron_feat.shape[0] < MAX_FEAT:
                pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0],
                                  detectron_feat.shape[1])
                detectron_feat = torch.cat([detectron_feat, pad], dim=0)
        # else:
        #     if detectron_feat.dim() > 1:
        #         detectron_feat = torch.zeros(2048)
        current_sample.image_feature_0 = detectron_feat
        # ---------------------------------------------

        return current_sample
Example #11
    def test_tokenize(self):
        tokens = text_utils.tokenize(self.SENTENCE)

        self.assertEqual(list(tokens), self.TOKENS)