def _build_ocr_encoding(self):
    """Create the flags and sub-modules used to encode OCR tokens.

    Reads five optional ``remove_ocr_*`` switches from ``self.config.ocr``
    (each falls back to ``False`` when the config does not define it), then
    builds, in this order: the Faster R-CNN fc7 feature encoder, the linear
    projections into ``self.mmt_config.hidden_size``, the two layer norms
    and the dropout layer.  The fc7 encoder is additionally appended to
    ``self.finetune_modules`` together with ``self.config.lr_scale_frcn``.
    """
    # Optional switches; a missing entry in the OCR config means False.
    for flag_name in (
        "remove_ocr_fasttext",
        "remove_ocr_phoc",
        "remove_ocr_frcn",
        "remove_ocr_semantics",
        "remove_ocr_bbox",
    ):
        setattr(self, flag_name, getattr(self.config.ocr, flag_name, False))

    # OCR appearance feature: Faster R-CNN
    fc7_options = {
        "encoder_type": "finetune_faster_rcnn_fpn_fc7",
        "in_dim": 2048,
        "weights_file": "models/detectron.defaults/fc7_w.pkl",
        "bias_file": "models/detectron.defaults/fc7_b.pkl",
        "model_data_dir": self.config.model_data_dir,
    }
    self.ocr_faster_rcnn_fc7 = ImageFeatureEncoder(**fc7_options)
    self.finetune_modules.append(
        {
            "module": self.ocr_faster_rcnn_fc7,
            "lr_scale": self.config.lr_scale_frcn,
        }
    )

    hidden_size = self.mmt_config.hidden_size
    self.linear_ocr_feat_to_mmt_in = nn.Linear(
        self.config.ocr.mmt_in_dim, hidden_size
    )

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, hidden_size)

    self.ocr_feat_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)
def build_image_encoder(config, direct_features=False, **kwargs):
    """Build an image encoder from ``config`` and return its ``.module``.

    Args:
        config: Encoder configuration.  With ``direct_features`` set, its
            ``type`` and ``params`` entries are used; otherwise the whole
            object is handed to ``ImageEncoder``.
        direct_features: When truthy, build an ``ImageFeatureEncoder``
            instead of an ``ImageEncoder``.
        **kwargs: Accepted but not used by this function.

    Returns:
        The ``module`` attribute of the encoder wrapper that was built.
    """
    # Imported inside the function, as in the original.
    from mmf.modules.encoders import ImageEncoder, ImageFeatureEncoder

    if not direct_features:
        return ImageEncoder(config).module
    return ImageFeatureEncoder(config.type, **config.params).module
def _init_feature_encoders(self, attr: str):
    """Build the single feature encoder configured for ``attr``.

    Reads ``<attr>_feature_encodings`` and ``<attr>_feature_dim`` from
    ``self.config``.  The encoder params are deep-copied before
    ``model_data_dir`` and ``cond_features`` are added, so the config
    itself is not modified.

    Side effects on ``self``:
        * ``<attr>_feature_dim`` is first set to the configured input
          dimension and then replaced by the encoder's ``out_dim``.
        * ``<attr>_feature_encoders`` is set to the new encoder.
    """
    encoding_cfg = self.config[attr + "_feature_encodings"]
    dim_attr = attr + "_feature_dim"
    input_dim = self.config[dim_attr]
    setattr(self, dim_attr, input_dim)

    kind = encoding_cfg.type
    # Work on a copy so the extra entries below never leak into the config.
    params = copy.deepcopy(encoding_cfg.params)
    params.model_data_dir = self.config.model_data_dir
    params.cond_features = self.text_embeddings_out_dim

    encoder = ImageFeatureEncoder(kind, input_dim, **params)
    setattr(self, dim_attr, encoder.out_dim)
    setattr(self, attr + "_feature_encoders", encoder)
def _init_feature_projectors(self, attr):
    """Build one ``ImageFeatureEncoder`` per ``<attr>_feature_projections`` entry.

    Every projector is constructed with the input dimension that
    ``self.<attr>_feature_dim`` holds when this method is called.  After
    each construction that attribute is overwritten with the new
    projector's ``out_dim``, so it ends up holding the ``out_dim`` of the
    last entry (and stays untouched when the list is empty).

    The projectors are stored on ``self.<attr>_feature_projectors`` as an
    ``nn.ModuleList``.
    """
    projection_cfgs = self.config[attr + "_feature_projections"]
    dim_attr = attr + "_feature_dim"
    # Read once: all projectors share this input dimension.
    input_dim = getattr(self, dim_attr)

    projectors = []
    for cfg in projection_cfgs:
        kind = cfg.type
        params = cfg.params
        projector = ImageFeatureEncoder(kind, input_dim, **params)
        projectors.append(projector)
        setattr(self, dim_attr, projector.out_dim)

    setattr(self, attr + "_feature_projectors", nn.ModuleList(projectors))
def _init_feature_encoders(self, attr):
    """Build one ``ImageFeatureEncoder`` per ``<attr>_feature_encodings`` entry.

    Each encoder receives the input dimension read from
    ``self.config["<attr>_feature_dim"]`` and a deep copy of its configured
    params, extended with ``model_data_dir`` from ``self.config``.

    Side effects on ``self``:
        * ``<attr>_feature_dim`` is first set to the configured input
          dimension, then overwritten with each new encoder's ``out_dim``
          (so it ends up holding the last one).
        * ``<attr>_feature_encoders`` is set to an ``nn.ModuleList`` of
          the encoders, in config order.
    """
    encoding_cfgs = self.config[attr + "_feature_encodings"]
    dim_attr = attr + "_feature_dim"
    input_dim = self.config[dim_attr]
    setattr(self, dim_attr, input_dim)

    encoders = []
    for cfg in encoding_cfgs:
        kind = cfg.type
        # Copy before adding model_data_dir so the config stays unmodified.
        params = copy.deepcopy(cfg.params)
        params.model_data_dir = self.config.model_data_dir
        encoder = ImageFeatureEncoder(kind, input_dim, **params)
        encoders.append(encoder)
        setattr(self, dim_attr, encoder.out_dim)

    setattr(self, attr + "_feature_encoders", nn.ModuleList(encoders))
def _build_obj_encoding(self):
    """Create the sub-modules used to encode detected objects.

    Builds, in this order: the Faster R-CNN fc7 feature encoder, the linear
    projections into ``self.mmt_config.hidden_size``, the two layer norms
    and the dropout layer.  The fc7 encoder is additionally appended to
    ``self.finetune_modules`` together with ``self.config.lr_scale_frcn``.
    """
    # object appearance feature: Faster R-CNN
    fc7_options = {
        "encoder_type": "finetune_faster_rcnn_fpn_fc7",
        "in_dim": 2048,
        "weights_file": "models/detectron.defaults/fc7_w.pkl",
        "bias_file": "models/detectron.defaults/fc7_b.pkl",
        "model_data_dir": self.config.model_data_dir,
    }
    self.obj_faster_rcnn_fc7 = ImageFeatureEncoder(**fc7_options)

    # apply smaller lr to pretrained Faster R-CNN fc7
    self.finetune_modules.append(
        {
            "module": self.obj_faster_rcnn_fc7,
            "lr_scale": self.config.lr_scale_frcn,
        }
    )

    hidden_size = self.mmt_config.hidden_size
    self.linear_obj_feat_to_mmt_in = nn.Linear(
        self.config.obj.mmt_in_dim, hidden_size
    )

    # object location feature: relative bounding box coordinates (4-dim)
    self.linear_obj_bbox_to_mmt_in = nn.Linear(4, hidden_size)

    self.obj_feat_layer_norm = BertLayerNorm(hidden_size)
    self.obj_bbox_layer_norm = BertLayerNorm(hidden_size)
    self.obj_drop = nn.Dropout(self.config.obj.dropout_prob)