Code Example #1
        def init_data(self, use_cuda):
            test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)
            self.cfg.output_attentions = True
            torch_attention = BertAttention(self.cfg)
            torch_attention.eval()
            if use_cuda:
                torch_attention.to(test_device)

            # Build the turbo_transformers attention from the torch layer's weights
            turbo_attention = turbo_transformers.BertAttention.from_torch(
                torch_attention)

            turbo_decoder_attention = turbo_transformers.MultiHeadedAttention.from_torch(
                torch_attention, is_trans_weight=False)

            hidden_size = self.cfg.hidden_size
            input_tensor = torch.rand(size=(batch_size, seq_length,
                                            hidden_size),
                                      dtype=torch.float32,
                                      device=test_device)
            attention_mask = torch.ones((batch_size, seq_length),
                                        dtype=torch.float32,
                                        device=test_device)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = (1.0 - attention_mask) * -10000.0
            return torch_attention, turbo_attention, turbo_decoder_attention, input_tensor, attention_mask
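A minimal sketch of how these fixtures might be exercised (hedged: the turbo_transformers attention is assumed to accept the same (hidden_states, attention_mask) arguments as the HuggingFace layer, and batch_size/seq_length come from the enclosing test scope):

# Sketch only, not the project's own test code.
torch_attn, turbo_attn, _, inp, mask = self.init_data(use_cuda=False)
torch_out = torch_attn(inp, mask)[0]     # HF BertAttention returns a tuple
turbo_out = turbo_attn(inp, mask)[0]     # assumed to mirror the torch call
assert torch.allclose(torch_out, turbo_out, atol=1e-3)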
Code Example #2
    def _build_txt_encoding(self):
        TEXT_BERT_HIDDEN_SIZE = 768

        self.text_bert_config = BertConfig(**self.config.text_bert)
        if self.config.text_bert_init_from_bert_base:
            self.text_bert = TextBert.from_pretrained(
                "bert-base-uncased", config=self.text_bert_config)
            # Use a smaller learning rate on text bert when initializing
            # from BERT_BASE
            self.finetune_modules.append({
                "module": self.text_bert,
                "lr_scale": self.config.lr_scale_text_bert,
            })
        else:
            logger.info("NOT initializing text_bert from BERT_BASE")
            self.text_bert = TextBert(self.text_bert_config)

        # if the text bert output dimension doesn't match the
        # multimodal transformer (mmt) hidden dimension,
        # add a linear projection layer between the two
        if self.mmt_config.hidden_size != TEXT_BERT_HIDDEN_SIZE:
            logger.info(
                f"Projecting text_bert output to {self.mmt_config.hidden_size} dim"
            )

            self.text_bert_out_linear = nn.Linear(TEXT_BERT_HIDDEN_SIZE,
                                                  self.mmt_config.hidden_size)
        else:
            self.text_bert_out_linear = nn.Identity()
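For context, a hypothetical forward-path counterpart to _build_txt_encoding (txt_inds and txt_mask are placeholder names, not taken from the snippet):

# Sketch only: whatever TextBert produces is projected (or passed through
# nn.Identity) so its last dimension matches the mmt hidden size.
text_bert_out = self.text_bert(txt_inds, txt_mask)        # placeholder call
mmt_txt_input = self.text_bert_out_linear(text_bert_out)  # (B, L, mmt hidden size)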
Code Example #3
    def init_data(self, use_cuda) -> None:
        test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(1)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig()

        self.torch_encoder_layer = BertEncoder(self.cfg)
        self.torch_encoder_layer.eval()

        if use_cuda:
            self.torch_encoder_layer.to(test_device)

        self.batch_size = 1
        self.seq_length = 40
        self.hidden_size = self.cfg.hidden_size
        self.input_tensor = torch.rand(size=(self.batch_size, self.seq_length,
                                             self.hidden_size),
                                       dtype=torch.float32,
                                       device=test_device)

        self.attention_mask = torch.ones((self.batch_size, self.seq_length),
                                         dtype=torch.float32,
                                         device=test_device)
        self.attention_mask = self.attention_mask[:, None, None, :]
        self.attention_mask = (1.0 - self.attention_mask) * -10000.0

        self.turbo_bert_encoder = turbo_transformers.BertEncoder.from_torch(
            self.torch_encoder_layer)
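A hedged smoke test built on these fixtures (assuming the turbo encoder mirrors the HuggingFace positional signature, which this snippet does not confirm):

# Sketch only: run both encoder stacks on the same input and compare the
# hidden states (the first element of the returned tuple/ModelOutput).
self.init_data(use_cuda=False)
torch_out = self.torch_encoder_layer(self.input_tensor, self.attention_mask)[0]
turbo_out = self.turbo_bert_encoder(self.input_tensor, self.attention_mask)[0]  # assumed call
assert torch.max(torch.abs(torch_out - turbo_out)) < 1e-2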
Code Example #4
        def init_data(self, use_cuda) -> None:
            test_device = torch.device('cuda:0') if use_cuda else \
                    torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig()
            self.intermediate_size = self.cfg.intermediate_size  # 3072
            self.hidden_size = self.cfg.hidden_size  # 768
            self.torch_bertout = BertOutput(self.cfg)
            self.torch_bertout.eval()
            if use_cuda:
                self.torch_bertout.to(test_device)

            self.turbo_bertout = turbo_transformers.BertOutput.from_torch(
                self.torch_bertout)

            self.intermediate_output = torch.rand(
                size=(batch_size, seq_length, self.intermediate_size),
                dtype=torch.float32,
                device=test_device)
            self.attention_output = torch.rand(size=(batch_size, seq_length,
                                                     self.hidden_size),
                                               dtype=torch.float32,
                                               device=test_device)
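A minimal sketch of how the two random tensors would be consumed (the HuggingFace BertOutput takes the intermediate activation first and the attention output as the residual input; the turbo call signature is assumed to match):

# Sketch only: BertOutput projects the 3072-dim intermediate activation back
# to hidden_size and adds the attention output as a residual.
torch_result = self.torch_bertout(self.intermediate_output, self.attention_output)
turbo_result = self.turbo_bertout(self.intermediate_output,
                                  self.attention_output)  # assumed same call
assert torch.allclose(torch_result, turbo_result, atol=1e-3)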
Code Example #5
        def init_data(self, use_cuda: bool):
            test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')

            torch.set_grad_enabled(False)
            cfg = BertConfig()
            self.torch_embedding = BertEmbeddings(cfg)

            self.torch_embedding.eval()

            if use_cuda:
                self.torch_embedding.to(test_device)

            self.turbo_embedding = turbo_transformers.BertEmbeddings.from_torch(
                self.torch_embedding)

            input_ids = torch.randint(low=0,
                                      high=cfg.vocab_size - 1,
                                      size=(batch_size, seq_length),
                                      dtype=torch.long,
                                      device=test_device)
            position_ids = torch.arange(seq_length,
                                        dtype=torch.long,
                                        device=input_ids.device)

            position_ids = position_ids.repeat(batch_size, 1)
            token_type_ids = torch.zeros_like(input_ids, dtype=torch.long)

            return input_ids, position_ids, token_type_ids
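A minimal sketch of feeding the returned ids through both embedding implementations (the turbo argument order is an assumption, not taken from the snippet):

# Sketch only: both implementations should map the ids to a
# (batch_size, seq_length, hidden_size) tensor.
input_ids, position_ids, token_type_ids = self.init_data(use_cuda=False)
torch_emb = self.torch_embedding(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 position_ids=position_ids)
turbo_emb = self.turbo_embedding(input_ids, position_ids, token_type_ids)  # assumed ordering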
Code Example #6
 def __init__(self, model_name: str) -> None:
     super().__init__()
     config = BertConfig.from_pretrained(model_name)
     self.input_dim = config.hidden_size
     self.output_dim = config.vocab_size
     # TODO(mattg): It's possible that we could use some kind of cache like we have in
     # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel.  That way, we
     # would only load the BERT weights once.  Though, it's not clear how to do that here, as we
     # need to load `BertForMaskedLM`, not just `BertModel`...
     bert_model = BertForMaskedLM.from_pretrained(model_name)
     self.bert_lm_head = bert_model.cls
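For illustration, a hedged sketch of applying the extracted head (the zero tensor is a placeholder for real encoder output):

# Sketch only: the LM head maps encoder states of shape
# (batch, seq_len, hidden_size) to vocabulary logits.
hidden_states = torch.zeros(1, 5, self.input_dim)  # placeholder encoder output
logits = self.bert_lm_head(hidden_states)          # shape (1, 5, self.output_dim)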
Code Example #7
    def _build_word_embedding(self):
        self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
        if self.config.pretrained_bert:
            bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
            self.word_embedding = bert_model.bert.embeddings
            self.pooler = bert_model.bert.pooler
            self.pooler.apply(self.init_weights)

        else:
            self.pooler = BertPooler(self.bert_config)
            self.word_embedding = BertEmbeddings(self.bert_config)
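A hypothetical downstream use of the two pieces built here (input_ids and sequence_output are placeholders):

# Sketch only: the embeddings turn token ids into (B, L, hidden_size) vectors;
# the pooler condenses a sequence output into one (B, hidden_size) vector
# taken from the first ([CLS]) position.
embedded = self.word_embedding(input_ids)   # input_ids: LongTensor of shape (B, L)
pooled = self.pooler(sequence_output)       # sequence_output: (B, L, hidden_size)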
Code Example #8
File: visual_bert.py  Project: naykun/mmf
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.output_attentions = self.config.output_attentions
        self.output_hidden_states = self.config.output_hidden_states

        # If bert_model_name is not specified, all of the required BertConfig
        # parameters must be provided explicitly, and no pretrained model
        # will be loaded.
        self.bert_model_name = getattr(self.config, "bert_model_name", None)
        self.bert_config = BertConfig.from_dict(
            OmegaConf.to_container(self.config, resolve=True))
        if self.bert_model_name is None:
            self.bert = VisualBERTBase(
                self.bert_config,
                visual_embedding_dim=self.config.visual_embedding_dim,
                embedding_strategy=self.config.embedding_strategy,
                bypass_transformer=self.config.bypass_transformer,
                output_attentions=self.config.output_attentions,
                output_hidden_states=self.config.output_hidden_states,
            )
        else:
            self.bert = VisualBERTBase.from_pretrained(
                self.config.bert_model_name,
                config=self.bert_config,
                cache_dir=os.path.join(get_mmf_cache_dir(),
                                       "distributed_{}".format(-1)),
                visual_embedding_dim=self.config.visual_embedding_dim,
                embedding_strategy=self.config.embedding_strategy,
                bypass_transformer=self.config.bypass_transformer,
                output_attentions=self.config.output_attentions,
                output_hidden_states=self.config.output_hidden_states,
            )

        self.vocab_size = self.bert.config.vocab_size

        # TODO: Once omegaconf fixes int keys issue, bring this back
        # See https://github.com/omry/omegaconf/issues/149
        # with omegaconf.open_dict(self.config):
        #     # Add bert config such as hidden_state to our main config
        #     self.config.update(self.bert.config.to_dict())
        if self.bert_model_name is None:
            bert_masked_lm = BertForPreTraining(self.bert.config)
        else:
            bert_masked_lm = BertForPreTraining.from_pretrained(
                self.config.bert_model_name,
                config=self.bert.config,
                cache_dir=os.path.join(get_mmf_cache_dir(),
                                       "distributed_{}".format(-1)),
            )
        self.cls = deepcopy(bert_masked_lm.cls)
        self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
        self.init_weights()
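A hedged sketch of how the copied heads and the loss might combine (sequence_output, pooled_output, and masked_lm_labels are placeholders):

# Sketch only: BertPreTrainingHeads returns MLM logits plus next-sentence
# logits; ignore_index=-1 above means positions labelled -1 add no loss.
prediction_scores, seq_relationship = self.cls(sequence_output, pooled_output)
mlm_loss = self.loss_fct(prediction_scores.view(-1, self.vocab_size),
                         masked_lm_labels.view(-1))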
Code Example #9
File: lca_glove.py  Project: mohit-sh/LC-ABSA
 def __init__(self, embedding_matrix, opt):
     super(LCA_GLOVE, self).__init__()
     # Only few of the parameters are necessary in the config.json, such as hidden_size, num_attention_heads
     self.config = BertConfig.from_json_file("modules/utils/bert_config.json")
     self.opt = opt
     self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
     self.lc_embed = nn.Embedding(2, opt.embed_dim)
     self.global_encoder1 = SelfAttention(self.config, opt)
     self.local_encoder1 = SelfAttention(self.config, opt)
     self.local_encoder2 = SelfAttention(self.config, opt)
     self.mha = SelfAttention(self.config, opt)
     self.pool = BertPooler(self.config)
     self.dropout = nn.Dropout(opt.dropout)
     self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
     self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
     self.classifier = nn.Linear(opt.embed_dim, 2)
Code Example #10
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")
Code Example #11
        def init_data(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                    torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig()

            self.torch_pooler = BertPooler(self.cfg)
            if torch.cuda.is_available():
                self.torch_pooler.to(self.test_device)
            self.torch_pooler.eval()

            self.turbo_pooler = turbo_transformers.BertPooler.from_torch(
                self.torch_pooler)
Code Example #12
File: visual_bert.py  Project: naykun/mmf
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.output_attentions = self.config.output_attentions
        self.output_hidden_states = self.config.output_hidden_states
        self.pooler_strategy = self.config.get("pooler_strategy", "default")

        # If bert_model_name is not specified, all of the required BertConfig
        # parameters must be provided explicitly, and no pretrained model
        # will be loaded.
        self.bert_model_name = getattr(self.config, "bert_model_name", None)
        self.bert_config = BertConfig.from_dict(
            OmegaConf.to_container(self.config, resolve=True))
        if self.bert_model_name is None:
            self.bert = VisualBERTBase(
                self.bert_config,
                visual_embedding_dim=self.config.visual_embedding_dim,
                embedding_strategy=self.config.embedding_strategy,
                bypass_transformer=self.config.bypass_transformer,
                output_attentions=self.config.output_attentions,
                output_hidden_states=self.config.output_hidden_states,
            )
        else:
            self.bert = VisualBERTBase.from_pretrained(
                self.config.bert_model_name,
                config=self.bert_config,
                cache_dir=os.path.join(get_mmf_cache_dir(),
                                       "distributed_{}".format(-1)),
                visual_embedding_dim=self.config.visual_embedding_dim,
                embedding_strategy=self.config.embedding_strategy,
                bypass_transformer=self.config.bypass_transformer,
                output_attentions=self.config.output_attentions,
                output_hidden_states=self.config.output_hidden_states,
            )

        self.training_head_type = self.config.training_head_type
        self.num_labels = self.config.num_labels
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        if self.config.training_head_type == "nlvr2":
            self.bert.config.hidden_size *= 2
        self.classifier = nn.Sequential(
            BertPredictionHeadTransform(self.bert.config),
            nn.Linear(self.bert.config.hidden_size, self.config.num_labels),
        )

        self.init_weights()
Code Example #13
    def setup_method(self):

        self.monkeypatch = MonkeyPatch()
        # monkeypatch the PretrainedBertModel to return the tiny test fixture model
        config_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "config.json"
        vocab_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "vocab.txt"
        config = BertConfig.from_json_file(config_path)
        self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
        self.monkeypatch.setattr(
            BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
        )

        super().setup_method()
        self.set_up_model(
            FIXTURES_ROOT / "structured_prediction" / "srl" / "bert_srl.jsonnet",
            FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012",
        )
Code Example #14
File: bert_backbone.py  Project: lgessler/embur
    def __init__(
        self,
        vocab: Vocabulary,
        embedding_dim: int,
        feedforward_dim: int,
        num_layers: int,
        num_attention_heads: int,
        position_embedding_dim: int,
        tokenizer_path: str,
        position_embedding_type: str = "absolute",
        activation: str = "gelu",
        hidden_dropout: float = 0.1,
    ) -> None:
        super().__init__()
        # TODO:
        # - Need to apply corrections in pretrained_transformer_mismatched_embedder

        tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        vocab.add_transformer_vocab(tokenizer, "tokens")
        # "tokens" is padded by default--undo that
        del vocab._token_to_index["tokens"]["@@PADDING@@"]
        del vocab._token_to_index["tokens"]["@@UNKNOWN@@"]
        assert len(vocab._token_to_index["tokens"]) == len(vocab._index_to_token["tokens"])

        cfg = BertConfig(
            vocab_size=vocab.get_vocab_size("tokens"),
            hidden_size=embedding_dim,
            num_hidden_layers=num_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=feedforward_dim,
            hidden_act=activation,
            hidden_dropout_prob=hidden_dropout,
            max_position_embeddings=position_embedding_dim,
            position_embedding_type=position_embedding_type,
            use_cache=True,
        )
        self.cfg = cfg
        self._vocab = vocab
        self._namespace = "tokens"
        self.bert = BertModel(cfg)
        self.masking_collator = DataCollatorForWholeWordMask(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        )
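A minimal sketch of using the whole-word-mask collator set up above (batch_of_id_lists is a placeholder for already tokenized examples):

# Sketch only: the collator masks roughly 15% of whole words and returns the
# masked input_ids together with MLM labels (-100 at unmasked positions).
batch = self.masking_collator([{"input_ids": ids} for ids in batch_of_id_lists])
encoder_out = self.bert(input_ids=batch["input_ids"])  # labels stay in batch["labels"]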
Code Example #15
        def init_bert_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_model = BertModel(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
                self.torch_model)
Code Example #16
 def __init__(self, embedding_matrix, opt):
     super(LCF_GLOVE, self).__init__()
     self.config = BertConfig.from_json_file(
         "modules/utils/bert_config.json")
     self.opt = opt
     self.embed = nn.Embedding.from_pretrained(
         torch.tensor(embedding_matrix, dtype=torch.float))
     self.mha_global = SelfAttention(self.config, opt)
     self.mha_local = SelfAttention(self.config, opt)
     self.ffn_global = PositionwiseFeedForward(self.opt.embed_dim,
                                               dropout=self.opt.dropout)
     self.ffn_local = PositionwiseFeedForward(self.opt.embed_dim,
                                              dropout=self.opt.dropout)
     self.mha_local_SA = SelfAttention(self.config, opt)
     self.mha_global_SA = SelfAttention(self.config, opt)
     self.pool = BertPooler(self.config)
     self.dropout = nn.Dropout(opt.dropout)
     self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
     self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
Code Example #17
        def init_attn_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            # The torch reference model here is a HuggingFace BertAttention; an
            # ONMT-style MultiHeadedAttention could be used instead:
            # self.torch_model = MultiHeadedAttention(self.cfg.num_attention_heads, self.cfg.hidden_size)
            self.torch_model = BertAttention(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            # self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
            #     self.torch_model)
            self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_torch(
                self.torch_model)
Code Example #18
 def __init__(self, config):
     super().__init__(config)
     self.mmt_config = BertConfig(**self.config.mmt)
     self._datasets = registry.get("config").datasets.split(",")
Code Example #19
File: vilt_module.py  Project: Jxu-Thu/virtex
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()

        bert_config = BertConfig(
            vocab_size=config["vocab_size"],
            hidden_size=config["hidden_size"],
            num_hidden_layers=config["num_layers"],
            num_attention_heads=config["num_heads"],
            intermediate_size=config["hidden_size"] * config["mlp_ratio"],
            max_position_embeddings=config["max_text_len"],
            hidden_dropout_prob=config["drop_rate"],
            attention_probs_dropout_prob=config["drop_rate"],
        )
        self.tempeture_max_OT = config['tempeture_max_OT']
        self.text_embeddings = BertEmbeddings(bert_config)
        self.text_embeddings.apply(objectives.init_weights)

        self.token_type_embeddings = nn.Embedding(2, config["hidden_size"])
        self.token_type_embeddings.apply(objectives.init_weights)

        import vilt.modules.vision_transformer as vit

        if self.hparams.config["load_path"] == "":
            self.transformer = getattr(vit, self.hparams.config["vit"])(
                pretrained=config["pretrained_flag"], config=self.hparams.config)
        else:
            self.transformer = getattr(vit, self.hparams.config["vit"])(
                pretrained=False, config=self.hparams.config
            )

        self.pooler = heads.Pooler(config["hidden_size"])
        self.pooler.apply(objectives.init_weights)

        if config["loss_names"]["mlm"] > 0:
            self.mlm_score = heads.MLMHead(bert_config)
            self.mlm_score.apply(objectives.init_weights)

        if config["loss_names"]["itm"] > 0:
            self.itm_score = heads.ITMHead(config["hidden_size"])
            self.itm_score.apply(objectives.init_weights)

        if config["loss_names"]["mpp"] > 0:
            self.mpp_score = heads.MPPHead(bert_config)
            self.mpp_score.apply(objectives.init_weights)

        # ===================== Downstream ===================== #
        if (
            self.hparams.config["load_path"] != ""
            and not self.hparams.config["test_only"]
        ):
            ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
            state_dict = ckpt["state_dict"]
            self.load_state_dict(state_dict, strict=False)
            print(f'Loading checkpoint from {self.hparams.config["load_path"]}')

        hs = self.hparams.config["hidden_size"]

        if self.hparams.config["loss_names"]["vqa"] > 0:
            vs = self.hparams.config["vqav2_label_size"]
            self.vqa_classifier = nn.Sequential(
                nn.Linear(hs, hs * 2),
                nn.LayerNorm(hs * 2),
                nn.GELU(),
                nn.Linear(hs * 2, vs),
            )
            self.vqa_classifier.apply(objectives.init_weights)

        if self.hparams.config["loss_names"]["nlvr2"] > 0:
            self.nlvr2_classifier = nn.Sequential(
                nn.Linear(hs * 2, hs * 2),
                nn.LayerNorm(hs * 2),
                nn.GELU(),
                nn.Linear(hs * 2, 2),
            )
            self.nlvr2_classifier.apply(objectives.init_weights)
            emb_data = self.token_type_embeddings.weight.data
            self.token_type_embeddings = nn.Embedding(3, hs)
            self.token_type_embeddings.apply(objectives.init_weights)
            self.token_type_embeddings.weight.data[0, :] = emb_data[0, :]
            self.token_type_embeddings.weight.data[1, :] = emb_data[1, :]
            self.token_type_embeddings.weight.data[2, :] = emb_data[1, :]

        if self.hparams.config["loss_names"]["irtr"] > 0:
            self.rank_output = nn.Linear(hs, 1)
            self.rank_output.weight.data = self.itm_score.fc.weight.data[1:, :]
            self.rank_output.bias.data = self.itm_score.fc.bias.data[1:]
            self.margin = 0.2
            for p in self.itm_score.parameters():
                p.requires_grad = False

        vilt_utils.set_metrics(self)
        self.current_tasks = list()

        # ===================== load downstream (test_only) ======================

        if self.hparams.config["load_path"] != "" and self.hparams.config["test_only"]:
            ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
            state_dict = ckpt["state_dict"]
            self.load_state_dict(state_dict, strict=False)
            print(f'Loading checkpoint from {self.hparams.config["load_path"]}')
Code Example #20
params = {
    'tokenizer_config': {
        'type': 'bert-base-uncased',
        'params': {
            'do_lower_case': True
        }
    },
    'mask_probability': 0,
    'max_seq_length': 128
}
mmf_tok = MMFTokenizer(OmegaConf.create(params))
mmf_tok._tokenizer = BertTokenizer(vocab_file="vocabulary.txt")

config = BertConfig.from_pretrained('bert-large-uncased',
                                    num_labels=2,
                                    vocab_size=len(vocabulary),
                                    num_hidden_layers=3)
net = VisualBertModel(config, visual_embedding_dim=2048).cuda()

out_txt = mmf_tok({'text': report})

input_ids = torch.tensor(out_txt['input_ids']).unsqueeze(0)
input_mask = torch.tensor(out_txt['input_mask']).unsqueeze(0)
img = torch.zeros(1, 14, 2048)

out = net(input_ids=input_ids.cuda(),
          text_mask=input_mask.cuda(),
          visual_embeddings=img.cuda())

print(net.config.add_cross_attention)
print(out.keys())