def _build_obj_encoding(self):
    """Create the sub-modules used to encode detected-object features.

    In order, this sets up: a Faster R-CNN fc7 ``ImageEncoder`` (also
    registered in ``self.finetune_modules`` with the ``lr_scale_frcn``
    factor from the config), a linear projection of the appearance feature
    and one of the 4-dim bounding box into the MMT hidden size, a
    ``BertLayerNorm`` for each projection, and a dropout layer.
    """
    # Object appearance feature: Faster R-CNN fc7, configured with the
    # detectron fc7 weight and bias files.
    fc7_kwargs = {
        'encoder_type': 'finetune_faster_rcnn_fpn_fc7',
        'in_dim': 2048,
        'weights_file': 'detectron/fc6/fc7_w.pkl',
        'bias_file': 'detectron/fc6/fc7_b.pkl',
        'model_data_dir': self.config["model_data_dir"],
    }
    self.obj_faster_rcnn_fc7 = ImageEncoder(**fc7_kwargs)

    # The pretrained fc7 is fine-tuned with its own (smaller) lr scale.
    self.finetune_modules.append(
        dict(
            module=self.obj_faster_rcnn_fc7,
            lr_scale=self.config.lr_scale_frcn,
        )
    )

    obj_cfg = self.config.obj
    hidden_size = self.mmt_config.hidden_size

    # Appearance feature -> MMT hidden size.
    self.linear_obj_feat_to_mmt_in = nn.Linear(obj_cfg.mmt_in_dim, hidden_size)
    # Object location feature: relative bounding box coordinates (4-dim).
    self.linear_obj_bbox_to_mmt_in = nn.Linear(4, hidden_size)

    self.obj_feat_layer_norm = BertLayerNorm(hidden_size)
    self.obj_bbox_layer_norm = BertLayerNorm(hidden_size)
    self.obj_drop = nn.Dropout(obj_cfg.dropout_prob)
def _build_ocr_encoding(self):
    """Create the sub-modules used to encode OCR token features.

    Copies the optional ``remove_ocr_*`` flags from ``self.config.ocr``
    onto ``self`` (each defaults to ``False`` when absent), then sets up a
    Faster R-CNN fc7 ``ImageEncoder`` (also registered in
    ``self.finetune_modules``), linear projections of the OCR feature and
    of the 4-dim bounding box into the MMT hidden size, a
    ``BertLayerNorm`` for each projection, and a dropout layer.
    """
    ocr_cfg = self.config.ocr

    # Optional switches; a flag missing from the config is treated as False.
    flag_names = (
        'remove_ocr_fasttext',
        'remove_ocr_phoc',
        'remove_ocr_frcn',
        'remove_ocr_semantics',
        'remove_ocr_bbox',
    )
    for flag_name in flag_names:
        setattr(self, flag_name, getattr(ocr_cfg, flag_name, False))

    # OCR appearance feature: Faster R-CNN fc7, configured with the
    # detectron fc7 weight and bias files.
    fc7_kwargs = {
        'encoder_type': 'finetune_faster_rcnn_fpn_fc7',
        'in_dim': 2048,
        'weights_file': 'detectron/fc6/fc7_w.pkl',
        'bias_file': 'detectron/fc6/fc7_b.pkl',
        'model_data_dir': self.config["model_data_dir"],
    }
    self.ocr_faster_rcnn_fc7 = ImageEncoder(**fc7_kwargs)

    # Fine-tune the fc7 encoder with the lr scale taken from the config.
    self.finetune_modules.append(
        dict(
            module=self.ocr_faster_rcnn_fc7,
            lr_scale=self.config.lr_scale_frcn,
        )
    )

    hidden_size = self.mmt_config.hidden_size

    # OCR feature -> MMT hidden size.
    self.linear_ocr_feat_to_mmt_in = nn.Linear(ocr_cfg.mmt_in_dim, hidden_size)
    # OCR location feature: relative bounding box coordinates (4-dim).
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, hidden_size)

    self.ocr_feat_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_drop = nn.Dropout(ocr_cfg.dropout_prob)
def _build_ocr_encoding(self):
    """Create the sub-modules used to encode OCR token features.

    Records the fixed feature widths on ``self``, sets up a Faster R-CNN
    fc7 ``ImageEncoder`` (also registered in ``self.finetune_modules`` with
    an lr scale of 0.1), and builds linear projections, layer norms and a
    dropout layer that all operate at ``self.ocr_fastext_dim`` (300).
    """
    # Fixed widths of the individual OCR feature types.
    self.ocr_fastext_dim = 300
    self.ocr_phoc_dim = 604
    self.ocr_RCNN_dim = 2048
    self.transformer_cnn_dim = 512

    # OCR appearance feature: Faster R-CNN fc7, configured with the
    # detectron fc7 weight and bias files.
    fc7_kwargs = {
        'encoder_type': 'finetune_faster_rcnn_fpn_fc7',
        'in_dim': 2048,
        'weights_file': 'detectron/fc6/fc7_w.pkl',
        'bias_file': 'detectron/fc6/fc7_b.pkl',
        'model_data_dir': self.config["model_data_dir"],
    }
    self.ocr_faster_rcnn_fc7 = ImageEncoder(**fc7_kwargs)
    self.finetune_modules.append(
        dict(module=self.ocr_faster_rcnn_fc7, lr_scale=0.1)
    )

    out_dim = self.ocr_fastext_dim

    # The appearance projection takes an input as wide as the sum of the
    # FastText, Faster R-CNN, PHOC and transformer-CNN widths and maps it
    # down to the FastText width.
    appearance_in_dim = (
        self.ocr_fastext_dim
        + self.ocr_RCNN_dim
        + self.ocr_phoc_dim
        + self.transformer_cnn_dim
    )
    self.linear_ocr_appear_to_mmt_in = nn.Linear(appearance_in_dim, out_dim)

    # OCR location feature: relative bounding box coordinates (4-dim).
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, out_dim)

    self.ocr_feat_layer_norm = nn.LayerNorm(out_dim)
    self.ocr_bbox_layer_norm = nn.LayerNorm(out_dim)
    self.ocr_drop = nn.Dropout(0.1)
def _init_feature_encoders(self, attr):
    """Build the feature encoders configured for ``attr``.

    Reads ``self.config[attr + "_feature_encodings"]`` (an iterable of
    entries with ``"type"`` and ``"params"`` keys) and
    ``self.config[attr + "_feature_dim"]``, creates one ``ImageEncoder``
    per entry, and stores them on ``self`` as ``<attr>_feature_encoders``
    (an ``nn.ModuleList``).

    ``<attr>_feature_dim`` is first set to the configured input dimension
    and then overwritten with each encoder's ``out_dim`` in turn, so it
    ends up as the last encoder's output dimension, or stays at the
    configured value when no encoders are listed.

    Args:
        attr: Prefix of the config keys read and of the attributes set on
            ``self``.
    """
    encodings_config = self.config[attr + "_feature_encodings"]
    feature_dim = self.config[attr + "_feature_dim"]
    setattr(self, attr + "_feature_dim", feature_dim)

    feat_encoders = []
    for feat_encoder in encodings_config:
        encoder_type = feat_encoder["type"]
        # Copy the params before injecting ``model_data_dir``: writing into
        # ``feat_encoder["params"]`` directly would mutate the shared config
        # as a side effect of building the model.
        encoder_kwargs = dict(feat_encoder["params"])
        encoder_kwargs["model_data_dir"] = self.config["model_data_dir"]

        # Every encoder receives the configured input dimension, not the
        # output dimension of the encoder built before it.
        feat_model = ImageEncoder(encoder_type, feature_dim, **encoder_kwargs)
        feat_encoders.append(feat_model)
        setattr(self, attr + "_feature_dim", feat_model.out_dim)

    setattr(self, attr + "_feature_encoders", nn.ModuleList(feat_encoders))
def _build_ocr_encoding(self):
    """Create the sub-modules used to encode OCR token features.

    Sets up a Faster R-CNN fc7 ``ImageEncoder`` (also registered in
    ``self.finetune_modules`` with an lr scale of 0.1), then linear
    projections of the OCR feature, the 4-dim bounding box and a 1-dim
    confidence value into a 768-dim space, a ``BertLayerNorm`` for each
    projection, and a dropout layer.
    """
    hidden_size = 768

    # OCR appearance feature: Faster R-CNN fc7, configured with the
    # detectron fc7 weight and bias files.
    fc7_kwargs = {
        'encoder_type': 'finetune_faster_rcnn_fpn_fc7',
        'in_dim': 2048,
        'weights_file': 'detectron/fc6/fc7_w.pkl',
        'bias_file': 'detectron/fc6/fc7_b.pkl',
        'model_data_dir': self.config["model_data_dir"],
    }
    self.ocr_faster_rcnn_fc7 = ImageEncoder(**fc7_kwargs)
    self.finetune_modules.append(
        dict(module=self.ocr_faster_rcnn_fc7, lr_scale=0.1)
    )

    # NOTE(review): 3002 is presumably the width of the concatenated OCR
    # feature vector -- confirm against the code that builds the input.
    self.linear_ocr_feat_to_mmt_in = nn.Linear(3002, hidden_size)
    # OCR location feature: relative bounding box coordinates (4-dim).
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, hidden_size)
    # Single scalar input, projected like the other OCR features.
    self.linear_ocr_conf_to_mmt_in = nn.Linear(1, hidden_size)

    self.ocr_feat_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_conf_layer_norm = BertLayerNorm(hidden_size)
    self.ocr_drop = nn.Dropout(0.1)