def get_scalars_with_smooth(self, window_size=30):
    """Return the latest scalars, median-smoothing the ones hinted for it.

    For each scalar whose entry in ``self._smoothing_hints`` is truthy, the
    raw value is replaced by the median of its history buffer over the last
    ``window_size`` entries; other scalars keep their latest value.
    """
    smoothed = imixEasyDict()
    for name, latest in self._scalars.items():
        if self._smoothing_hints[name]:
            smoothed[name] = self._history[name].median(window_size)
        else:
            smoothed[name] = latest
    return smoothed
def __init__(self, optimizer, *args, **kwargs):
    """Initialize the scheduler around ``optimizer``.

    Every keyword argument is captured into ``self._global_config`` under
    the ``lr_config`` key and deliberately NOT forwarded to the parent
    constructor, which receives only ``optimizer``, the lr-lambda callable
    and the positional ``args``.
    """
    self._lambda_func = PythiaScheduler.lr_lambda_update
    from imix.utils.config import imixEasyDict
    # Stash all kwargs as the lr config; the parent must not see them.
    self._global_config = imixEasyDict({'lr_config': dict(kwargs)})
    kwargs = {}
    super().__init__(optimizer, self.lr_lambda, *args, **kwargs)
def get_param() -> imixEasyDict:
    """Split the enclosing ``cfg`` into constructor type, paramwise cfg, and the rest.

    Operates on a deep copy, so the original ``cfg`` is left untouched.
    """
    result = imixEasyDict()
    result.optimizer_cfg = copy.deepcopy(cfg)
    # Pop the meta keys out so optimizer_cfg keeps only optimizer kwargs.
    result.type = result.optimizer_cfg.pop('constructor',
                                           'DefaultOptimizerConstructor')
    result.paramwise_cfg = result.optimizer_cfg.pop('paramwise_cfg', None)
    return result
def process(image_feature):
    """Pad or truncate ``image_feature`` to exactly ``self.max_loc`` rows.

    Returns an imixEasyDict with:
      - ``image_feature``: float32 tensor of shape (self.max_loc, image_dim),
        zero-padded past the valid rows;
      - ``max_features``: long scalar tensor holding the number of VALID rows.
    """
    image_info = imixEasyDict()
    image_loc, image_dim = image_feature.shape
    tmp_image_feat = np.zeros((self.max_loc, image_dim), dtype=np.float32)
    # numpy slicing clamps silently, so this is safe even when
    # image_loc > self.max_loc: only the first max_loc rows are copied.
    tmp_image_feat[0:image_loc, ] = image_feature[:self.max_loc, :]
    image_info.image_feature = torch.from_numpy(tmp_image_feat)
    # BUGFIX: clamp the reported count to max_loc — rows past max_loc were
    # dropped above, so reporting the raw image_loc would let downstream
    # attention masks index past the padded feature tensor.
    image_info.max_features = torch.tensor(
        min(image_loc, self.max_loc), dtype=torch.long)
    return image_info
def forward_train(self, examples, **kwargs): train_features = [ convert_example_to_features(example, self.max_seq_length, self.tokenizer) for example in examples ] # language Inputs input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long).cuda() input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long).cuda() segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long).cuda() # Visual Inputs feats = torch.from_numpy( np.stack([f.visual_feats[0] for f in train_features])).cuda() pos = torch.from_numpy( np.stack([f.visual_feats[1] for f in train_features])).cuda() # Language Prediction lm_labels = torch.tensor([f.lm_label_ids for f in train_features], dtype=torch.long).cuda() # Visual Prediction obj_labels = {} for key in ('obj', 'attr', 'feat'): visn_labels = torch.from_numpy( np.stack([f.obj_labels[key][0] for f in train_features])).cuda() visn_mask = torch.from_numpy( np.stack([f.obj_labels[key][1] for f in train_features])).cuda() assert visn_labels.size(0) == visn_mask.size( 0) and visn_labels.size(1) == visn_mask.size(1) obj_labels[key] = (visn_labels, visn_mask) # Joint Prediction matched_labels = torch.tensor([f.is_matched for f in train_features], dtype=torch.long).cuda() ans = torch.from_numpy(np.stack([f.ans for f in train_features])).cuda() loss, losses, answer_score_logit = self.model(input_ids, segment_ids, input_mask, lm_labels, feats, pos, obj_labels, matched_labels, ans) output = imixEasyDict() output.loss = loss # total loss output.losses = losses # every loss output.scores = answer_score_logit return output
def _paramwise_cfg_params(self):
    """Collect param-wise optimizer options from ``self.paramwise_cfg``.

    Missing options fall back to the defaults below; ``sorted_keys`` lists
    the custom keys longest-first so the most specific key matches first.
    """
    # (option name, default) pairs; rebuilt per call so the {} default
    # for custom_keys is always a fresh dict.
    option_defaults = (
        ('bias_lr_mult', 1.0),
        ('bias_decay_mult', 1.0),
        ('norm_decay_mult', 1.0),
        ('dwconv_decay_mult', 1.0),
        ('bypass_duplicate', False),
        ('custom_keys', {}),
    )
    paramwise = imixEasyDict()
    for option, fallback in option_defaults:
        paramwise[option] = getattr(self.paramwise_cfg, option, fallback)
    # Longest key first; equal-length keys stay alphabetical because the
    # outer stable sort runs over an alphabetically pre-sorted list.
    paramwise.sorted_keys = sorted(
        sorted(paramwise.custom_keys.keys()), key=len, reverse=True)
    return paramwise
def get_cfg_param(data_cfg):
    """Translate a dataset config node into DataLoader-style parameters.

    ``samples_per_gpu`` and ``workers_per_gpu`` are mandatory (getattr
    without a default raises AttributeError if they are missing); the
    remaining options fall back to conservative defaults.
    """
    params = imixEasyDict()
    params.batch_size = getattr(data_cfg, 'samples_per_gpu')
    params.num_workers = getattr(data_cfg, 'workers_per_gpu')
    # (destination name, source attribute, default) for the optional fields.
    optional = (
        ('drop_last', 'drop_last', False),
        ('pin_memory', 'pin_memory', False),
        ('sampler_cfg', 'sampler', None),
        ('batch_sampler_cfg', 'batch_sampler', None),
        ('shuffle', 'shuffle', False),
        ('collate_fn', 'collate_fn', None),
    )
    for dest, src, default in optional:
        params[dest] = getattr(data_cfg, src, default)
    params.worker_init_fn = worker_init_fn
    return params
def build_data_loader_by_epoch(dataset, cfg, is_training=True):
    """Build an epoch-based DataLoader for ``dataset`` from ``cfg``.

    Reads ``cfg.train_data`` or ``cfg.test_data`` depending on
    ``is_training``. A configured batch_sampler takes precedence and is
    mutually exclusive with sampler/batch_size/shuffle/drop_last.
    """

    def get_cfg_param(data_cfg):
        """Map a dataset config node onto DataLoader keyword parameters."""
        params = imixEasyDict()
        params.batch_size = getattr(data_cfg, 'samples_per_gpu')
        params.num_workers = getattr(data_cfg, 'workers_per_gpu')
        optional = (
            ('drop_last', 'drop_last', False),
            ('pin_memory', 'pin_memory', False),
            ('sampler_cfg', 'sampler', None),
            ('batch_sampler_cfg', 'batch_sampler', None),
            ('shuffle', 'shuffle', False),
            ('collate_fn', 'collate_fn', None),
        )
        for dest, src, default in optional:
            params[dest] = getattr(data_cfg, src, default)
        params.worker_init_fn = worker_init_fn
        return params

    data_cfg = cfg.train_data if is_training else cfg.test_data
    params = get_cfg_param(data_cfg)
    loader_kwargs = {
        'dataset': dataset,
        'pin_memory': params.pin_memory,
        'num_workers': params.num_workers,
        # SECURITY: eval() of a config-supplied string executes arbitrary
        # code — only safe if the config file is fully trusted. Consider a
        # registry lookup instead of eval.
        'collate_fn': eval(params.collate_fn) if params.collate_fn else None,
    }

    if params.batch_sampler_cfg:
        # batch_sampler yields whole batches itself, so batch_size /
        # shuffle / drop_last / sampler must not be passed alongside it.
        loader_kwargs['batch_sampler'] = build_batch_sampler(
            params.batch_sampler_cfg, default_args={'dataset': dataset})
    else:
        sampler_cfg = params.sampler_cfg
        if sampler_cfg:
            if isinstance(sampler_cfg, str):
                sampler_cfg = imixEasyDict({'type': sampler_cfg})
            loader_kwargs['sampler'] = build_sampler(
                sampler_cfg, default_args={'dataset': dataset})
        loader_kwargs['batch_size'] = params.batch_size
        loader_kwargs['drop_last'] = params.drop_last
        loader_kwargs['shuffle'] = params.shuffle

    return DataLoader(**loader_kwargs)
def add_ocr_info(self, item_feature: ItemFeature, sample: ItemFeature):
    """Attach OCR tokens, embeddings and box info from ``item_feature`` to ``sample``.

    Mutates BOTH arguments in place and returns ``sample``. When
    ``self.use_ocr`` is False, all OCR fields are blanked instead.
    Processors (``ocr_token_processor``, ``phoc_processor``,
    ``copy_processor``) are optional; each is probed with ``hasattr``.
    """
    sample_info = item_feature
    if not self.use_ocr:
        # remove all OCRs from the sample
        # (i.e. make an empty OCR list)
        sample_info['ocr_tokens'] = []
        sample_info['ocr_info'] = []
        if 'ocr_normalized_boxes' in sample_info:
            sample_info['ocr_normalized_boxes'] = np.zeros((0, 4), np.float32)

        # clear OCR visual features
        if 'image_feature_1' in sample:
            sample.image_feature_1 = torch.zeros_like(sample.image_feature_1)
        return sample

    # Preprocess OCR tokens (normalization); fall back to the raw tokens
    # when no token processor is configured.
    if hasattr(self, 'ocr_token_processor'):
        ocr_tokens = [
            self.ocr_token_processor({'text': token})['text']
            for token in sample_info['ocr_tokens']
        ]
    else:
        ocr_tokens = sample_info['ocr_tokens']

    # Get FastText embeddings for OCR tokens
    context = self.context_processor({'tokens': ocr_tokens})
    sample.context = context['text']
    sample.ocr_tokens = context['tokens']
    sample.context_tokens = object_to_byte_tensor(context['tokens'])
    sample.context_feature_0 = context['text']
    sample.context_info_0 = imixEasyDict()
    sample.context_info_0.max_features = context['length']

    # Get PHOC embeddings for OCR tokens.
    # Three-way: (a) processor attribute present but None -> load/reuse
    # pre-computed PHOC features; (b) processor present -> compute now;
    # (c) no attribute -> skip PHOC entirely.
    if hasattr(self, 'phoc_processor'):
        if self.phoc_processor is None:
            if item_feature.context_phoc is None:
                # Pre-computed PHOC file keyed by split name + question id.
                phoc_file_name = f'{item_feature.set_name}_qid_{item_feature.question_id}.json'
                context_phoc = self.get_phoc_feature(file_name=phoc_file_name)
            else:
                context_phoc = item_feature.context_phoc
            sample.context_feature_1 = torch.Tensor(context_phoc['text'])
            sample.context_info_1 = imixEasyDict()
            sample.context_info_1.max_features = torch.tensor(
                context_phoc['length'])
        else:
            context_phoc = self.phoc_processor({'tokens': ocr_tokens})
            sample.context_feature_1 = context_phoc['text']
            sample.context_info_1 = imixEasyDict()
            sample.context_info_1.max_features = context_phoc['length']

    # OCR order vectors: one-hot position encoding, zeroed past the
    # number of real tokens reported by the context processor.
    if self.cfg.get('use_order_vectors', False):
        order_vectors = np.eye(len(sample.ocr_tokens), dtype=np.float32)
        order_vectors = torch.from_numpy(order_vectors)
        order_vectors[context['length']:] = 0
        sample.order_vectors = order_vectors

    # OCR bounding box information
    if 'ocr_normalized_boxes' in sample_info and hasattr(
            self, 'copy_processor'):
        # New imdb format: OCR bounding boxes are already pre-computed
        max_len = self.cfg.answer_processor.config.max_length
        sample.ocr_bbox_coordinates = self.copy_processor(
            {'blob': sample_info['ocr_normalized_boxes']})['blob'][:max_len]
    elif self.use_ocr_info and 'ocr_info' in sample_info:
        # Old imdb format: OCR bounding boxes are computed on-the-fly
        # from ocr_info
        sample.ocr_bbox_coordinates = self.bbox_processor(
            {'info': sample_info['ocr_info']})['bbox'].coordinates

    return sample
def clear_scalars(self):
    """Drop all currently stored scalar values.

    Only ``self._scalars`` is reset; history buffers and smoothing
    hints are left untouched.
    """
    self._scalars = imixEasyDict()
def __init__(self):
    """Set up empty storage for scalars, their history, and smoothing hints."""
    self._scalars = imixEasyDict()  # latest value per scalar name
    self._history = defaultdict(HistoryBuffer)  # per-name value history
    self._smoothing_hints = imixEasyDict()  # per-name flag: smooth on read?
# answer_table = json.load(open('/home/datasets/mix_data/lxmert/vqa/trainval_label2ans.json')) model_root_path = openchat_path + '/model_pth/' model_vqa_path = dict( lxmert=dict( model_weight=model_root_path + 'lxmert_vqa.pth', answer_table=dataset_root + 'lxmert/vqa/trainval_label2ans.json'), vilbert=dict( model_weight=model_root_path + 'vilbert_vqa.pth', answer_table=dataset_root + 'vilbert/datasets/VQA/cache/trainval_label2ans.json', token=dict(pretrained_model_name_or_path='bert-base-uncased', do_lower_case=True)), oscar=dict( model_weight=model_root_path + 'oscar_vqa.pth', answer_table=dataset_root + 'vilbert/datasets/VQA/cache/trainval_label2ans.json', token=dict( pretrained_model_name_or_path=dataset_root + 'model/oscar/base-vg-labels/ep_107_1192087', do_lower_case=True)), uniter=dict( model_weight=model_root_path + 'uniter_vqa.pth', answer_table=dataset_root + 'vilbert/datasets/VQA/cache/trainval_label2ans.json', token=dict(pretrained_model_name_or_path='bert-base-uncased', do_lower_case=True)), vinvl=dict( model_weight=model_root_path + 'vinvl_vqa.pth', answer_table=dataset_root + 'vilbert/datasets/VQA/cache/trainval_label2ans.json', token=dict( pretrained_model_name_or_path=dataset_root + 'model/oscar/base-vg-labels/ep_107_1192087', do_lower_case=True)), ) model_vqa_path = imixEasyDict(model_vqa_path)
def forward(self,
            input_ids,
            token_type_ids=None,
            attention_mask=None,
            masked_lm_labels=None,
            visual_feats=None,
            pos=None,
            obj_labels=None,
            matched_label=None,
            ans=None):
    """Compute LXMERT pre-training losses for the enabled tasks.

    Each loss (masked LM, image-text matching, visual prediction, QA) is
    added only when both its labels are supplied AND its ``self.task_*``
    flag is on. Returns ``(total_loss, losses, answer_score.detach())``
    where ``losses`` is an imixEasyDict of the individual loss terms.
    """
    (lang_output, visn_output), pooled_output = self.bert(
        input_ids,
        token_type_ids,
        attention_mask,
        visual_feats=(visual_feats, pos),
    )

    lang_prediction_scores, cross_relationship_score = self.cls(
        lang_output, pooled_output)
    if self.task_qa:
        answer_score = self.answer_head(pooled_output)
    else:
        # This answer_score would not be used anywhere,
        # just to keep a constant return function signature.
        answer_score = pooled_output[0][0]

    total_loss = 0.
    # ignore_index=-1: positions labeled -1 contribute no loss.
    loss_fct = CrossEntropyLoss(ignore_index=-1)
    losses = imixEasyDict()
    if masked_lm_labels is not None and self.task_mask_lm:
        masked_lm_loss = loss_fct(
            lang_prediction_scores.view(-1, self.config.vocab_size),
            masked_lm_labels.view(-1))
        total_loss += masked_lm_loss
        # losses += (masked_lm_loss.detach(),)
        losses.masked_lm_loss = masked_lm_loss
    if matched_label is not None and self.task_matched:
        matched_loss = loss_fct(cross_relationship_score.view(-1, 2),
                                matched_label.view(-1))
        total_loss += matched_loss
        # losses += (matched_loss.detach(),)
        losses.matched_loss = matched_loss
    if obj_labels is not None and self.task_obj_predict:
        # Per-task loss functions; reduction='none' so the confidence mask
        # can be applied element-wise before averaging.
        loss_fcts = {
            'l2': SmoothL1Loss(reduction='none'),
            'ce': CrossEntropyLoss(ignore_index=-1, reduction='none')
        }
        total_visn_loss = 0.
        visn_prediction_scores_dict = self.obj_predict_head(visn_output)
        for key in VISUAL_CONFIG.visual_losses:
            label, mask_conf = obj_labels[key]
            # Config row: (output dim, loss name, label reshape, weight).
            output_dim, loss_fct_name, label_shape, weight = VISUAL_CONFIG.visual_loss_config[
                key]
            visn_loss_fct = loss_fcts[loss_fct_name]
            visn_prediction_scores = visn_prediction_scores_dict[key]
            visn_loss = visn_loss_fct(
                visn_prediction_scores.view(-1, output_dim),
                label.view(*label_shape),
            )
            if visn_loss.dim() > 1:  # Regression Losses
                visn_loss = visn_loss.mean(1)
            # Weight each element by its confidence mask, then average.
            visn_loss = (visn_loss * mask_conf.view(-1)).mean() * weight
            total_visn_loss += visn_loss
            # losses += (visn_loss.detach(),)
            losses[f'{key}_visn_loss'] = visn_loss
        total_loss += total_visn_loss
    if ans is not None and self.task_qa:
        answer_loss = loss_fct(answer_score.view(-1, self.num_answers),
                               ans.view(-1))
        # Since this Github version pre-trains with QA loss from the beginning,
        # I exclude "*2" here to match the effect of QA losses.
        # Previous: (loss *0) for 6 epochs, (loss *2) for 6 epochs.   (Used 10 instead of 6 in EMNLP paper)
        # Now     : (loss *1) for 12 epochs
        #
        # * 2       # Multiply by 2 because > half of the data will not have label
        total_loss += answer_loss
        # losses += (answer_loss.detach(),)
        losses.answer_loss = answer_loss
    return total_loss, losses, answer_score.detach()