Example 1
    def _build_ocr_encoding(self):
        self.remove_ocr_fasttext = getattr(
            self.config.ocr, "remove_ocr_fasttext", False
        )
        self.remove_ocr_phoc = getattr(self.config.ocr, "remove_ocr_phoc", False)
        self.remove_ocr_frcn = getattr(self.config.ocr, "remove_ocr_frcn", False)
        self.remove_ocr_semantics = getattr(
            self.config.ocr, "remove_ocr_semantics", False
        )
        self.remove_ocr_bbox = getattr(self.config.ocr, "remove_ocr_bbox", False)

        # OCR appearance feature: Faster R-CNN
        self.ocr_faster_rcnn_fc7 = build_image_encoder(
            self._build_encoder_config(), direct_features=True
        )
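        # apply smaller lr to pretrained Faster R-CNN fc7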
        self.finetune_modules.append(
            {"module": self.ocr_faster_rcnn_fc7, "lr_scale": self.config.lr_scale_frcn}
        )

        self.linear_ocr_feat_to_mmt_in = nn.Linear(
            self.config.ocr.mmt_in_dim, self.mmt_config.hidden_size
        )

        # OCR location feature: relative bounding box coordinates (4-dim)
        self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

        self.ocr_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.ocr_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)
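For context, here is a minimal sketch of the config namespace this method reads. The field names follow the getattr calls and attribute accesses above; the values are placeholders, not MMF's real defaults.

from omegaconf import OmegaConf

ocr_config = OmegaConf.create({
    "lr_scale_frcn": 0.1,       # placeholder lr scale for the fc7 module
    "ocr": {
        "mmt_in_dim": 3002,     # placeholder width of the fused OCR feature
        "dropout_prob": 0.1,
        # ablation flags; getattr(..., False) supplies the default when absent
        "remove_ocr_fasttext": False,
        "remove_ocr_phoc": False,
        "remove_ocr_frcn": False,
        "remove_ocr_semantics": False,
        "remove_ocr_bbox": False,
    },
})
print(ocr_config.ocr.dropout_prob)  # 0.1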
Example 2
    def build(self):
        self._is_direct_features_input = self.config.direct_features_input
        # Encoders
        self.text_encoder = build_text_encoder(self.config.text_encoder)
        self.image_encoder = build_image_encoder(
            self.config.image_encoder, self._is_direct_features_input
        )

        # Projectors
        image_proj_config = deepcopy(self.config.image_projection)
        self.image_proj = build_classifier_layer(image_proj_config)

        text_proj_config = deepcopy(self.config.text_projection)
        self.text_proj = build_classifier_layer(text_proj_config)

        # Aggregators
        self.image_pool = AttnPool1d(self.config.final_hidden_size, 1)
        self.text_pool = AttnPool1d(self.config.final_hidden_size, 1)

        # Shared transformer
        transformer_layer = torch.nn.TransformerEncoderLayer(
            self.config.final_hidden_size, 4, 2048, dropout=0.1, activation="relu"
        )
        self.shared_transformer = torch.nn.TransformerEncoder(
            transformer_layer, num_layers=2
        )

        # Position embeddings - Image
        self.image_pos_emb = PositionEmbeddingSine(self.config.final_hidden_size // 2)
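As a standalone sketch of the shared transformer: the positional arguments above give PyTorch's TransformerEncoderLayer nhead=4 and dim_feedforward=2048, and by default it expects inputs shaped (seq_len, batch, hidden). The hidden size below is a stand-in for config.final_hidden_size and must be divisible by the head count.

import torch

hidden = 512  # stands in for config.final_hidden_size
layer = torch.nn.TransformerEncoderLayer(
    hidden, 4, 2048, dropout=0.1, activation="relu"
)
encoder = torch.nn.TransformerEncoder(layer, num_layers=2)

tokens = torch.randn(50, 8, hidden)  # (seq_len, batch, hidden)
out = encoder(tokens)                # same shape: (50, 8, 512)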
Example 3
    def build(self):
        self.vision_module = build_image_encoder(self.config.image_encoder)
        self.classifier = build_classifier_layer(self.config.classifier)
        self.language_module = ProjectionEmbedding(
            **self.config.text_encoder.params)
        self.dropout = torch.nn.Dropout(self.config.dropout)
        self.fusion = torch.nn.Linear(**self.config.fusion.params)
Example 4
    def build(self):

        # to be further set
        self.image_feature_module = build_image_encoder(
            self.config.image_feature_processor, direct_features=True
        )
        if self.config.concate_trace:
            self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)

        if self.config.base_model_name == "bert-base-uncased":
            self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
                "bert-base-uncased", "bert-base-uncased"
            )
        elif self.config.base_model_name == "2layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.max_position_embeddings = 1090
            config_encoder.num_hidden_layers = 2
            config_decoder.num_hidden_layers = 2
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder
            )
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        elif self.config.base_model_name == "3layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 3
            config_decoder.num_hidden_layers = 3
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder
            )
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        if self.config.loop_contrastive:
            self.trace_caption_contrastive = TraceCaptionContrastiveModel(
                self.config.tc_contrastive_aggregate_method
            )
        if (
            hasattr(self.config, "pretrans_attention")
            and self.config.pretrans_attention
        ):

            tempconf = self.encoderdecoder.config.encoder
            num_heads = tempconf.num_attention_heads
            num_layers = tempconf.num_hidden_layers
            self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
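        # 101 is the [CLS] token id in the bert-base-uncased vocab, used as BOS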
        self.BOS_ID = 101
        self.vae = OpenAIDiscreteVAE()
        image_code_dim = 768
        image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
        self.image_seq_len = image_fmap_size ** 2
        self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
        self.image_pos_emb = AxialPositionalEmbedding(
            image_code_dim, axial_shape=(image_fmap_size, image_fmap_size)
        )
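For reference, the "2layer-base" branch can be reproduced standalone with Hugging Face transformers; this sketch uses only the snippet's own settings.

from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

config_encoder = BertConfig(num_hidden_layers=2, max_position_embeddings=1090)
config_decoder = BertConfig(num_hidden_layers=2)

# from_encoder_decoder_configs marks the decoder config as a decoder and
# adds cross-attention before combining the two configs
codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
    config_encoder, config_decoder
)
model = EncoderDecoderModel(config=codec_config)
print(model.config.decoder.is_decoder)  # True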
Example 5
    def _init_feature_encoders(self, attr: str):
        feat_encoder = self.config[attr + "_feature_encodings"]
        feature_dim = self.config[attr + "_feature_dim"]
        setattr(self, attr + "_feature_dim", feature_dim)

        feat_encoder_config = copy.deepcopy(feat_encoder)
        feat_encoder_config.params.model_data_dir = self.config.model_data_dir
        feat_encoder_config.params.in_dim = feature_dim
        feat_model = build_image_encoder(feat_encoder_config,
                                         direct_features=True)

        setattr(self, attr + "_feature_dim", feat_model.out_dim)
        setattr(self, attr + "_feature_encoders", feat_model)
Example 6
    def _init_feature_encoders(self, attr):
        feat_encoders = []
        feat_encoders_list_config = self.config[attr + "_feature_encodings"]
        feature_dim = self.config[attr + "_feature_dim"]
        setattr(self, attr + "_feature_dim", feature_dim)

        for feat_encoder in feat_encoders_list_config:
            feat_encoder_config = copy.deepcopy(feat_encoder)
            feat_encoder_config.params.model_data_dir = self.config.model_data_dir
            feat_model = build_image_encoder(feat_encoder_config,
                                             direct_features=True)
            feat_encoders.append(feat_model)
            setattr(self, attr + "_feature_dim", feat_model.out_dim)

        setattr(self, attr + "_feature_encoders", nn.ModuleList(feat_encoders))
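Examples 5, 6 and 8 all rely on storing per-modality modules under computed attribute names such as image_feature_encoders. A self-contained sketch of that pattern, with nn.Linear standing in for the built encoders:

import torch.nn as nn

class MultiModalBase(nn.Module):
    def __init__(self, modalities):
        super().__init__()
        for attr in modalities:
            # stand-in for build_image_encoder(...); out_dim would normally
            # come from the built encoder, not be hard-coded
            encoders = nn.ModuleList([nn.Linear(2048, 512)])
            setattr(self, attr + "_feature_encoders", encoders)
            setattr(self, attr + "_feature_dim", 512)

model = MultiModalBase(["image", "context"])
print(model.image_feature_dim)         # 512
print(model.context_feature_encoders)  # ModuleList with one Linear layer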
Example 7
    def build(self):
        """
        Config's image_encoder attribute will be used to build an MMF image
        encoder. In YAML, this config will look like:

        # "type" parameter specifies the type of encoder we are using here.
        # In this particular case, we are using resnet152
        type: resnet152

        # Parameters are passed to the underlying encoder class by
        # build_image_encoder
        params:
          # Specifies whether to use a pretrained version
          pretrained: true
          # Pooling type, use max to use AdaptiveMaxPool2D
          pool_type: avg

          # Number of output features from the encoder, -1 for the original
          # encoder output; otherwise values between 1 and 9 are supported
          num_output_features: 1
        """
        self.vision_module = build_image_encoder(self.config.image_encoder)
        """
        For the classifier, the configuration would look like:
        # Specifies the type of the classifier, in this case mlp
        type: mlp
        # Parameters passed to the classifier through build_classifier_layer
        params:
          # Dimension of the tensor coming into the classifier
          in_dim: 512
          # Dimension of the tensor going out of the classifier
          out_dim: 2
          # Number of MLP layers in the classifier
          num_layers: 0
        """
        self.classifier = build_classifier_layer(self.config.classifier)

        # ProjectionEmbedding takes in params directly as it is a module,
        # so pass in kwargs, which are in_dim, out_dim and module,
        # where module is "linear" since we want a linear layer
        self.language_module = ProjectionEmbedding(
            **self.config.text_encoder.params)
        # The dropout value comes from the config
        self.dropout = torch.nn.Dropout(self.config.dropout)
        # As with ProjectionEmbedding, the fusion layer's params (which are
        # the params for a linear layer) come from the config
        self.fusion = torch.nn.Linear(**self.config.fusion.params)
        self.relu = torch.nn.ReLU()
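The tutorial snippet stops at build(). A hypothetical forward pass wiring these modules together might look like the sketch below; the input keys and the concatenation order are assumptions, not part of the original example.

    def forward(self, sample_list):
        # Assumed input keys; MMF models usually consume a SampleList
        text = self.relu(self.language_module(sample_list["text"]))
        image = self.relu(self.vision_module(sample_list["image"]))

        # Fuse the two modalities, then classify
        fused = self.dropout(
            torch.cat([text, image.flatten(start_dim=1)], dim=1)
        )
        logits = self.classifier(self.fusion(fused))
        return {"scores": logits}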
Example 8
    def _init_feature_projectors(self, attr):
        feature_projectors = []
        feat_encoders_list_config = self.config[attr + "_feature_projections"]
        feat_dim = getattr(self, attr + "_feature_dim")

        for feat_encoder in feat_encoders_list_config:
            feat_encoder_config = copy.deepcopy(feat_encoder)
            feat_encoder_config.params.in_dim = feat_dim
            feat_model = build_image_encoder(feat_encoder_config,
                                             direct_features=True)

            feature_projectors.append(feat_model)
            setattr(self, attr + "_feature_dim", feat_model.out_dim)

        setattr(self, attr + "_feature_projectors",
                nn.ModuleList(feature_projectors))
Example 9
    def _build_obj_encoding(self):
        # object appearance feature: Faster R-CNN
        self.obj_faster_rcnn_fc7 = build_image_encoder(
            self._build_encoder_config(), direct_features=True
        )
        # apply smaller lr to pretrained Faster R-CNN fc7
        self.finetune_modules.append(
            {"module": self.obj_faster_rcnn_fc7, "lr_scale": self.config.lr_scale_frcn}
        )
        self.linear_obj_feat_to_mmt_in = nn.Linear(
            self.config.obj.mmt_in_dim, self.mmt_config.hidden_size
        )

        # object location feature: relative bounding box coordinates (4-dim)
        self.linear_obj_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

        self.obj_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.obj_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
        self.obj_drop = nn.Dropout(self.config.obj.dropout_prob)
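How these layers combine downstream is not shown in the snippet; a plausible standalone sketch, with nn.LayerNorm standing in for BertLayerNorm and placeholder dimensions, is:

import torch
import torch.nn as nn

hidden, mmt_in_dim = 768, 2048  # placeholder dims for the sketch
linear_feat = nn.Linear(mmt_in_dim, hidden)
linear_bbox = nn.Linear(4, hidden)
feat_ln = nn.LayerNorm(hidden)
bbox_ln = nn.LayerNorm(hidden)
drop = nn.Dropout(0.1)

obj_feat = torch.randn(8, 100, mmt_in_dim)  # (batch, num_objects, feat_dim)
obj_bbox = torch.rand(8, 100, 4)            # relative bbox coordinates

# project, layer-norm, sum appearance and location, then apply dropout
obj_mmt_in = drop(feat_ln(linear_feat(obj_feat)) + bbox_ln(linear_bbox(obj_bbox)))
print(obj_mmt_in.shape)  # torch.Size([8, 100, 768])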
Example 10
    def build(self):

        # to be further set
        self.image_feature_module = build_image_encoder(
            self.config.image_feature_processor, direct_features=True)
        if self.config.concate_trace:
            self.trace_feature_module = build_encoder(
                self.config.trace_feature_encoder)

        if self.config.base_model_name == "bert-base-uncased":
            self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
                "bert-base-uncased", "bert-base-uncased")
        elif self.config.base_model_name == "2layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 2
            config_decoder.num_hidden_layers = 2
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        elif self.config.base_model_name == "3layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 3
            config_decoder.num_hidden_layers = 3
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        if self.config.loop_contrastive:
            self.trace_caption_contrastive = TraceCaptionContrastiveModel(
                self.config.tc_contrastive_aggregate_method)
        if (hasattr(self.config, "pretrans_attention")
                and self.config.pretrans_attention):

            tempconf = self.encoderdecoder.config.encoder
            num_heads = tempconf.num_attention_heads
            num_layers = tempconf.num_hidden_layers
            self.attention_trans = AttentionTransform(num_layers, num_heads,
                                                      100)
        self.BOS_ID = 101
Example 11
    def _build_modal_encoder(self, config):
        return build_image_encoder(
            config, direct_features=self._is_direct_features_input)
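Finally, a minimal sketch of calling build_image_encoder directly, reusing the resnet152 config documented in Example 7. The import path follows current MMF and should be treated as an assumption for other versions.

from omegaconf import OmegaConf
from mmf.utils.build import build_image_encoder

config = OmegaConf.create({
    "type": "resnet152",
    "params": {
        "pretrained": True,
        "pool_type": "avg",
        "num_output_features": 1,
    },
})
# direct_features=False (the default) builds the full CNN; True would
# expect precomputed features instead, as in several examples above
encoder = build_image_encoder(config, direct_features=False)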