Esempio n. 1
0
    def _build_obj_encoding(self):
        """Create the layers used to embed detected-object features.

        Registers on ``self``: a fine-tunable Faster R-CNN fc7 encoder, one
        linear projection for the appearance feature and one for the 4-dim
        bounding-box coordinates, a layer norm for each projection, and a
        dropout layer. The fc7 encoder is also appended to
        ``self.finetune_modules`` together with its learning-rate scale.
        """
        # Appearance branch: fc7 layer loaded from the Faster R-CNN weight and
        # bias pickles found under the configured model data directory.
        fc7_encoder = ImageEncoder(
            encoder_type='finetune_faster_rcnn_fpn_fc7',
            in_dim=2048,
            weights_file='detectron/fc6/fc7_w.pkl',
            bias_file='detectron/fc6/fc7_b.pkl',
            model_data_dir=self.config["model_data_dir"],
        )
        self.obj_faster_rcnn_fc7 = fc7_encoder

        # The pretrained fc7 gets its own lr scale (intended to be smaller
        # than the rest of the model's), read from the config.
        finetune_entry = {
            'module': fc7_encoder,
            'lr_scale': self.config.lr_scale_frcn,
        }
        self.finetune_modules.append(finetune_entry)

        feat_in_dim = self.config.obj.mmt_in_dim
        hidden_size = self.mmt_config.hidden_size
        self.linear_obj_feat_to_mmt_in = nn.Linear(feat_in_dim, hidden_size)

        # Location branch: relative bounding box coordinates (4-dim).
        self.linear_obj_bbox_to_mmt_in = nn.Linear(4, hidden_size)

        # One layer norm per branch, both at the transformer hidden size.
        self.obj_feat_layer_norm = BertLayerNorm(hidden_size)
        self.obj_bbox_layer_norm = BertLayerNorm(hidden_size)

        dropout_prob = self.config.obj.dropout_prob
        self.obj_drop = nn.Dropout(dropout_prob)
Esempio n. 2
0
    def _build_ocr_encoding(self):
        """Create the layers used to embed OCR-token features.

        First copies five optional ``remove_ocr_*`` switches from
        ``self.config.ocr`` onto ``self`` (each defaults to ``False`` when the
        config does not define it). Then registers a fine-tunable Faster R-CNN
        fc7 encoder, linear projections for the OCR feature and the 4-dim
        bounding box, a layer norm per projection, and a dropout layer.
        """
        ocr_cfg = self.config.ocr

        # Ablation switches; a missing entry in the config means "keep".
        for flag_name in (
            'remove_ocr_fasttext',
            'remove_ocr_phoc',
            'remove_ocr_frcn',
            'remove_ocr_semantics',
            'remove_ocr_bbox',
        ):
            setattr(self, flag_name, getattr(ocr_cfg, flag_name, False))

        # Appearance branch: fc7 layer loaded from the Faster R-CNN weight and
        # bias pickles found under the configured model data directory.
        fc7_encoder = ImageEncoder(
            encoder_type='finetune_faster_rcnn_fpn_fc7',
            in_dim=2048,
            weights_file='detectron/fc6/fc7_w.pkl',
            bias_file='detectron/fc6/fc7_b.pkl',
            model_data_dir=self.config["model_data_dir"],
        )
        self.ocr_faster_rcnn_fc7 = fc7_encoder
        self.finetune_modules.append({
            'module': fc7_encoder,
            'lr_scale': self.config.lr_scale_frcn,
        })

        feat_in_dim = ocr_cfg.mmt_in_dim
        hidden_size = self.mmt_config.hidden_size
        self.linear_ocr_feat_to_mmt_in = nn.Linear(feat_in_dim, hidden_size)

        # Location branch: relative bounding box coordinates (4-dim).
        self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, hidden_size)

        # One layer norm per branch, both at the transformer hidden size.
        self.ocr_feat_layer_norm = BertLayerNorm(hidden_size)
        self.ocr_bbox_layer_norm = BertLayerNorm(hidden_size)

        self.ocr_drop = nn.Dropout(ocr_cfg.dropout_prob)
Esempio n. 3
0
    def _build_ocr_encoding(self):
        """Create the layers used to embed OCR-token features.

        Stores four fixed feature widths on ``self``, registers a fine-tunable
        Faster R-CNN fc7 encoder (lr scale 0.1), and builds linear projections
        for the concatenated appearance feature and for the 4-dim bounding
        box. Both projections, and their layer norms, use the FastText width
        (300) as the output size. Dropout probability is fixed at 0.1.
        """
        # Fixed per-feature widths, kept as attributes for use outside this
        # method.
        fasttext_dim, phoc_dim, rcnn_dim, cnn_dim = 300, 604, 2048, 512
        self.ocr_fastext_dim = fasttext_dim
        self.ocr_phoc_dim = phoc_dim
        self.ocr_RCNN_dim = rcnn_dim
        self.transformer_cnn_dim = cnn_dim

        # Appearance encoder: fc7 layer loaded from the Faster R-CNN weight
        # and bias pickles found under the configured model data directory.
        fc7_encoder = ImageEncoder(
            encoder_type='finetune_faster_rcnn_fpn_fc7',
            in_dim=2048,
            weights_file='detectron/fc6/fc7_w.pkl',
            bias_file='detectron/fc6/fc7_b.pkl',
            model_data_dir=self.config["model_data_dir"],
        )
        self.ocr_faster_rcnn_fc7 = fc7_encoder
        self.finetune_modules.append({'module': fc7_encoder, 'lr_scale': 0.1})

        # Appearance projection: the input width is the sum of all four
        # feature widths (the transformer-CNN width is included).
        appearance_in_dim = fasttext_dim + rcnn_dim + phoc_dim + cnn_dim
        self.linear_ocr_appear_to_mmt_in = nn.Linear(
            appearance_in_dim, fasttext_dim
        )

        # Location projection: relative bounding box coordinates (4-dim).
        self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, fasttext_dim)

        self.ocr_feat_layer_norm = nn.LayerNorm(fasttext_dim)
        self.ocr_bbox_layer_norm = nn.LayerNorm(fasttext_dim)
        self.ocr_drop = nn.Dropout(0.1)
Esempio n. 4
0
    def _init_feature_encoders(self, attr):
        """Build the feature encoders configured for ``attr``.

        Reads ``self.config[attr + "_feature_encodings"]`` (an iterable of
        entries with a ``"type"`` and a ``"params"`` mapping) and
        ``self.config[attr + "_feature_dim"]``. One ``ImageEncoder`` is built
        per entry; every encoder receives the same configured input dim.

        Sets on ``self``:
            ``<attr>_feature_encoders``: ``nn.ModuleList`` of the encoders.
            ``<attr>_feature_dim``: the configured input dim, overwritten by
                the ``out_dim`` of each encoder as it is built, so it ends as
                the last encoder's ``out_dim`` (or the configured value when
                no encoders are listed).

        Args:
            attr: Prefix used both for the config keys and for the attribute
                names written on ``self``.
        """
        encoder_specs = self.config[attr + "_feature_encodings"]
        feature_dim = self.config[attr + "_feature_dim"]
        setattr(self, attr + "_feature_dim", feature_dim)

        feat_encoders = []
        for spec in encoder_specs:
            encoder_type = spec["type"]
            # Shallow-copy the params before adding ``model_data_dir``:
            # writing into ``spec["params"]`` directly would modify the
            # shared config object as a side effect of building the model.
            encoder_kwargs = dict(spec["params"])
            encoder_kwargs["model_data_dir"] = self.config["model_data_dir"]

            feat_model = ImageEncoder(encoder_type, feature_dim, **encoder_kwargs)

            feat_encoders.append(feat_model)
            setattr(self, attr + "_feature_dim", feat_model.out_dim)

        setattr(self, attr + "_feature_encoders", nn.ModuleList(feat_encoders))
Esempio n. 5
0
File: cnmt.py Progetto: wzk1015/CNMT
    def _build_ocr_encoding(self):
        """Create the layers used to embed OCR-token features.

        Registers a fine-tunable Faster R-CNN fc7 encoder (lr scale 0.1) and
        three linear projections into a 768-wide space: one for the OCR
        feature (input width 3002), one for the 4-dim bounding box, and one
        for a single-value confidence input. Each projection has its own
        layer norm. Dropout probability is fixed at 0.1.
        """
        hidden_size = 768

        # Appearance encoder: fc7 layer loaded from the Faster R-CNN weight
        # and bias pickles found under the configured model data directory.
        fc7_encoder = ImageEncoder(
            encoder_type='finetune_faster_rcnn_fpn_fc7',
            in_dim=2048,
            weights_file='detectron/fc6/fc7_w.pkl',
            bias_file='detectron/fc6/fc7_b.pkl',
            model_data_dir=self.config["model_data_dir"],
        )
        self.ocr_faster_rcnn_fc7 = fc7_encoder
        self.finetune_modules.append({'module': fc7_encoder, 'lr_scale': 0.1})

        # NOTE(review): 3002 is presumably the width of the concatenated OCR
        # feature vector -- confirm against the forward pass.
        self.linear_ocr_feat_to_mmt_in = nn.Linear(3002, hidden_size)

        # Location projection: relative bounding box coordinates (4-dim).
        self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, hidden_size)

        # Confidence projection: a single scalar per token.
        self.linear_ocr_conf_to_mmt_in = nn.Linear(1, hidden_size)

        self.ocr_feat_layer_norm = BertLayerNorm(hidden_size)
        self.ocr_bbox_layer_norm = BertLayerNorm(hidden_size)
        self.ocr_conf_layer_norm = BertLayerNorm(hidden_size)
        self.ocr_drop = nn.Dropout(0.1)