def setUp(self):
        super().setUp()
        self.monkeypatch = MonkeyPatch()

        # Monkeypatch BertModel.from_pretrained (used by PretrainedBertModel) to return the tiny test fixture model
        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig.from_json_file(str(config_path))
        self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
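Patches applied through MonkeyPatch stay active until explicitly undone, so a matching tearDown is needed; a minimal companion sketch, assuming the same test class (undo() is the standard pytest MonkeyPatch method):

def tearDown(self):
    # Revert the from_pretrained patch applied in setUp.
    self.monkeypatch.undo()
    super().tearDown()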
Example No. 2
    def setUp(self):
        super().setUp()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        self.token_indexer = PretrainedBertIndexer(str(vocab_path))

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig.from_json_file(str(config_path))
        self.bert_model = BertModel(config)
        self.token_embedder = BertEmbedder(self.bert_model)
Example No. 3
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
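A hedged usage sketch; the paths are illustrative placeholders pointing at a downloaded Google BERT checkpoint directory:

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path='uncased_L-12_H-768_A-12/bert_model.ckpt',
    bert_config_file='uncased_L-12_H-768_A-12/bert_config.json',
    pytorch_dump_path='uncased_L-12_H-768_A-12/pytorch_model.bin')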
Example No. 4
    def test_sliding_window(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig.from_json_file(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]["bert"]

        # 16 = [CLS], 17 = [SEP]
        # 11 wordpieces, 6 content pieces per window ([CLS] + 6 + [SEP] = 8),
        # sliding with a stride of half a window: three overlapping windows,
        # each wrapped in its own [CLS]/[SEP] pair
        assert tokens["input_ids"].tolist() == [[
            16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
            14, 12, 17
        ]]
        assert tokens["offsets"].tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

        bert_vectors = token_embedder(tokens["input_ids"])
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["input_ids"],
                                      offsets=tokens["offsets"])
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["input_ids"],
                                      offsets=tokens["offsets"],
                                      token_type_ids=tokens["token_type_ids"])
        assert list(bert_vectors.shape) == [1, 10, 12]
Example No. 5
 def __init__(self, embedding_matrix, opt):
     super(LCA_GLOVE, self).__init__()
     # Only a few of the parameters in the config.json are needed here, such as hidden_size and num_attention_heads
     self.config = BertConfig.from_json_file("utils/bert_config.json")
     self.opt = opt
     self.embed = nn.Embedding.from_pretrained(
         torch.tensor(embedding_matrix, dtype=torch.float))
     self.lc_embed = nn.Embedding(2, opt.embed_dim)
     self.global_encoder1 = SelfAttention(self.config, opt)
     self.local_encoder1 = SelfAttention(self.config, opt)
     self.local_encoder2 = SelfAttention(self.config, opt)
     self.mha = SelfAttention(self.config, opt)
     self.pool = BertPooler(self.config)
     self.dropout = nn.Dropout(opt.dropout)
     self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
     self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
     self.classifier = nn.Linear(opt.embed_dim, 2)
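Since only those few fields are read, an equivalent config can also be built in code; a minimal sketch assuming a transformers-style BertConfig, with illustrative values not taken from the source (every other field keeps its default):

from transformers import BertConfig

# hidden_size must be divisible by num_attention_heads (300 / 6 = 50).
config = BertConfig(hidden_size=300, num_attention_heads=6)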
Example No. 6
    def setUp(self):

        self.monkeypatch = MonkeyPatch()
        # Monkeypatch BertModel.from_pretrained (used by PretrainedBertModel) to return the tiny test fixture model
        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        config = BertConfig.from_json_file(config_path)
        self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
        self.monkeypatch.setattr(
            BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
        )

        super().setUp()
        self.set_up_model(
            self.FIXTURES_ROOT / "bert_srl" / "experiment.jsonnet",
            self.FIXTURES_ROOT / "conll_2012",
        )
Example No. 7
 def __init__(self, embedding_matrix, opt):
     super(LCF_GLOVE, self).__init__()
     self.config = BertConfig.from_json_file("utils/bert_config.json")
     self.opt = opt
     self.embed = nn.Embedding.from_pretrained(
         torch.tensor(embedding_matrix, dtype=torch.float))
     self.mha_global = SelfAttention(self.config, opt)
     self.mha_local = SelfAttention(self.config, opt)
     self.ffn_global = PositionwiseFeedForward(self.opt.embed_dim,
                                               dropout=self.opt.dropout)
     self.ffn_local = PositionwiseFeedForward(self.opt.embed_dim,
                                              dropout=self.opt.dropout)
     self.mha_local_SA = SelfAttention(self.config, opt)
     self.mha_global_SA = SelfAttention(self.config, opt)
     self.pool = BertPooler(self.config)
     self.dropout = nn.Dropout(opt.dropout)
     self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
     self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
Example No. 8
    def test_sliding_window_with_batch(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig.from_json_file(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({
            "tokens":
            TextField(tokens + tokens + tokens, {"bert": token_indexer})
        })

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]["bert"]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["input_ids"],
                                      offsets=tokens["offsets"])
        assert bert_vectors is not None

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["input_ids"],
                                      offsets=tokens["offsets"],
                                      token_type_ids=tokens["token_type_ids"])
        assert bert_vectors is not None
Example No. 9
import torch
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers.modeling_bert import BertForMaskedLM, BertConfig
import MeCab

# Load the models
tokenizer = BertJapaneseTokenizer.from_pretrained('model/')
config = BertConfig.from_json_file('model/bert_base_32k_config.json')
model = BertForMaskedLM.from_pretrained('model/model.ckpt-580000_pytorch.bin',
                                        config=config)
m = MeCab.Tagger("-Ochasen")


def sent_emb(text):
    print('text:', text)
    input_ids = tokenizer.encode(text, return_tensors='pt')
    print('tokenizer.convert:',
          tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))

    masked_index = torch.where(
        input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    print('masked index:', masked_index)
    result = model(input_ids)
    pred_ids = result[0][:, masked_index].topk(10).indices.tolist()[0]

    output = []
    for pred_id in pred_ids:
        # Substitute each of the top-10 predictions at the [MASK] position.
        output_ids = input_ids.tolist()[0]
        output_ids[masked_index] = pred_id
        # decode already returns a string, so no join is needed
        text = tokenizer.decode(output_ids)
        output.append(text)
    return output
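A hedged usage sketch; the Japanese sentence is illustrative, and '[MASK]' matches tokenizer.mask_token for BertJapaneseTokenizer:

if __name__ == '__main__':
    # "Today's weather is [MASK]." -- prints diagnostics and returns the
    # ten decoded candidate sentences.
    candidates = sent_emb('今日の天気は[MASK]です。')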
Example No. 10
"""
This file contains implementation of transformation tensorflow Bert model to pytorch representation.

"""

import torch
from transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert

# This script converts a publicly available TensorFlow BERT checkpoint to its
# PyTorch representation.

# path to the model directory
bert_dir = '/mnt/data/xkloco00_pc5/external/multi_cased_L-12_H-768_A-12'

# important files
tf_checkpoint_path = bert_dir + '/' + "bert_model.ckpt"
bert_config_file = bert_dir + '/' + "bert_config.json"
pytorch_dump_path = bert_dir + '/' + "pytorch_model.bin"

config = BertConfig.from_json_file(bert_config_file)
print("Building PyTorch model from configuration: {}".format(str(config)))
model = BertForPreTraining(config)

# Load weights from tf checkpoint
load_tf_weights_in_bert(model, config, tf_checkpoint_path)

# Save pytorch-model
print("Save PyTorch model to {}".format(pytorch_dump_path))
torch.save(model.state_dict(), pytorch_dump_path)
Example No. 11
    def __init__(self, config: Config, output_encoded_layers: bool, *args,
                 **kwargs) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # Load config
        config_file = os.path.join(config.bert_cpt_dir, "bert_config.json")
        bert_config = BertConfig.from_json_file(config_file)
        print("Bert model config {}".format(bert_config))
        # Instantiate model.
        model = BertModel(bert_config)
        weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")
        # load pre-trained weights if weights_path exists
        if config.load_weights and PathManager.isfile(weights_path):
            state_dict = torch.load(weights_path)

            missing_keys: List[str] = []
            unexpected_keys: List[str] = []
            error_msgs: List[str] = []
            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, "_metadata", None)
            tmp_state_dict = {}
            for key, value in state_dict.items():
                if key.endswith(
                        "LayerNorm.gamma"):  # compatibility with v0.5 models
                    key = key.replace("LayerNorm.gamma", "LayerNorm.weight")
                if key.endswith(
                        "LayerNorm.beta"):  # compatibility with v0.5 models
                    key = key.replace("LayerNorm.beta", "LayerNorm.bias")
                tmp_state_dict[key] = value
            state_dict = tmp_state_dict

            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=""):
                local_metadata = ({} if metadata is None else metadata.get(
                    prefix[:-1], {}))
                module._load_from_state_dict(
                    state_dict,
                    prefix,
                    local_metadata,
                    True,
                    missing_keys,
                    unexpected_keys,
                    error_msgs,
                )
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            load(model, prefix="" if hasattr(model, "bert") else "bert.")
            if len(missing_keys) > 0:
                print(
                    "Weights of {} not initialized from pretrained model: {}".
                    format(model.__class__.__name__, missing_keys))
            if len(unexpected_keys) > 0:
                print(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys))

        self.bert = model
        log_class_usage(__class__)