        def init_data(self, use_cuda: bool):
            test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')

            torch.set_grad_enabled(False)
            cfg = AlbertConfig()
            self.torch_embedding = AlbertEmbeddings(cfg)

            self.torch_embedding.eval()

            if use_cuda:
                self.torch_embedding.to(test_device)

            self.turbo_embedding = turbo_transformers.AlbertEmbeddings.from_torch(
                self.torch_embedding)

            input_ids = torch.randint(low=0,
                                      high=cfg.vocab_size - 1,
                                      size=(batch_size, seq_length),
                                      dtype=torch.long,
                                      device=test_device)
            position_ids = torch.arange(seq_length,
                                        dtype=torch.long,
                                        device=input_ids.device)

            position_ids = position_ids.repeat(batch_size, 1)
            token_type_ids = torch.zeros_like(input_ids, dtype=torch.long)

            return input_ids, position_ids, token_type_ids
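
A hedged sketch of how the tensors returned above could be used: run the same inputs through both the torch and the turbo embeddings and check that they agree. The turbo call's argument order is an assumption (mirroring the turbo_transformers test suite), not something shown here.

        def check_torch_and_turbo(self, use_cuda: bool):
            # Hypothetical follow-up to init_data above: compare the two
            # implementations on identical inputs.
            input_ids, position_ids, token_type_ids = self.init_data(use_cuda)

            torch_result = self.torch_embedding(input_ids=input_ids,
                                                token_type_ids=token_type_ids,
                                                position_ids=position_ids)
            # Assumed argument order for the turbo wrapper.
            turbo_result = self.turbo_embedding(input_ids, position_ids,
                                                token_type_ids)

            # Both paths should agree up to floating-point tolerance.
            assert torch.allclose(torch_result.cpu(), turbo_result.cpu(),
                                  atol=1e-3)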
Example #2
    @classmethod
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a language model by supplying

        * the name of a remote model on s3 ("albert-base" ...)
        * or a local path of a model trained via transformers ("some_dir/huggingface_model")
        * or a local path of a model trained via FARM ("some_dir/farm_model")

        :param pretrained_model_name_or_path: name or path of a model
        :param language: (Optional) Name of language the model was trained for (e.g. "german").
                         If not supplied, FARM will try to infer it from the model name.
        :return: Language Model

        """
        albert = cls()
        if "farm_lm_name" in kwargs:
            albert.name = kwargs["farm_lm_name"]
        else:
            albert.name = pretrained_model_name_or_path
        # We need to differentiate between loading a model in FARM format and in Transformers format
        farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            config = AlbertConfig.from_pretrained(farm_lm_config)
            farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
            albert.model = AlbertModel.from_pretrained(farm_lm_model, config=config, **kwargs)
            albert.language = albert.model.config.language
        else:
            # Hugging Face Transformers style
            albert.model = AlbertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
            albert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
        return albert
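
A minimal usage sketch for the loader above, assuming the enclosing class is FARM's Albert language-model wrapper; the model name and local path are illustrative.

# Illustrative only: `Albert` stands for the class that defines `load` above.
albert = Albert.load("albert-base-v2")                           # remote model name
albert = Albert.load("some_dir/farm_model", language="english")  # FARM-format directory
print(albert.name, albert.language)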
Example #3
        def init_data(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = AlbertConfig()

            self.torch_layer = AlbertLayer(self.cfg)
            if torch.cuda.is_available():
                self.torch_layer.to(self.test_device)
            self.torch_layer.eval()
            self.hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                                 self.hidden_size),
                                           dtype=torch.float32,
                                           device=self.test_device)

            self.attention_mask = torch.ones((batch_size, seq_length),
                                             dtype=torch.float32,
                                             device=self.test_device)
            self.attention_mask = self.attention_mask[:, None, None, :]
            self.attention_mask = (1.0 - self.attention_mask) * -10000.0

            self.turbo_layer = turbo_transformers.AlbertLayer.from_torch(
                self.torch_layer)
Example #4
        def init_data(self, use_cuda):
            test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            cfg = AlbertConfig(attention_probs_dropout_prob=0.0,
                               hidden_dropout_prob=0.0)
            torch_attention = AlbertAttention(cfg)
            torch_attention.eval()
            if use_cuda:
                torch_attention.to(test_device)

            # Get FT Attention
            turbo_attention = turbo_transformers.AlbertAttention.from_torch(
                torch_attention)
            hidden_size = cfg.hidden_size
            input_tensor = torch.rand(size=(batch_size, seq_length,
                                            hidden_size),
                                      dtype=torch.float32,
                                      device=test_device)
            attention_mask = torch.ones((batch_size, seq_length),
                                        dtype=torch.float32,
                                        device=test_device)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = (1.0 - attention_mask) * -10000.0
            return torch_attention, turbo_attention, input_tensor, attention_mask
Example #5
    def __init__(self, cfg):
        super(DSB_ALBERTModel, self).__init__()
        self.cfg = cfg
        cate_col_size = len(cfg.cate_cols)
        cont_col_size = len(cfg.cont_cols)
        self.cate_emb = nn.Embedding(cfg.total_cate_size,
                                     cfg.emb_size,
                                     padding_idx=0)

        def get_cont_emb():
            return nn.Sequential(nn.Linear(cont_col_size, cfg.hidden_size),
                                 nn.LayerNorm(cfg.hidden_size), nn.ReLU(),
                                 nn.Linear(cfg.hidden_size, cfg.hidden_size))

        self.cont_emb = get_cont_emb()
        self.config = AlbertConfig(
            3,  # vocab_size (not used)
            embedding_size=cfg.emb_size * cate_col_size + cfg.hidden_size,
            hidden_size=cfg.emb_size * cate_col_size + cfg.hidden_size,
            num_hidden_layers=cfg.nlayers,
            #num_hidden_groups=1,
            num_attention_heads=cfg.nheads,
            intermediate_size=cfg.hidden_size,
            hidden_dropout_prob=cfg.dropout,
            attention_probs_dropout_prob=cfg.dropout,
            max_position_embeddings=cfg.seq_len,
            type_vocab_size=1,
            #initializer_range=0.02,
            #layer_norm_eps=1e-12,
        )

        self.encoder = AlbertModel(self.config)

        def get_reg():
            return nn.Sequential(
                nn.Linear(cfg.emb_size * cate_col_size + cfg.hidden_size,
                          cfg.hidden_size),
                nn.LayerNorm(cfg.hidden_size),
                nn.Dropout(cfg.dropout),
                nn.ReLU(),
                nn.Linear(cfg.hidden_size, cfg.hidden_size),
                nn.LayerNorm(cfg.hidden_size),
                nn.Dropout(cfg.dropout),
                nn.ReLU(),
                nn.Linear(cfg.hidden_size, cfg.target_size),
            )

        self.reg_layer = get_reg()
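
The snippet stops at __init__; below is one plausible forward pass, reconstructed from the layer shapes above. The concatenation order and the pooling over the last position are assumptions, not the original author's code.

    def forward(self, cate_x, cont_x, mask):
        # Hypothetical forward, inferred from the shapes above: flatten the
        # categorical embeddings, project the continuous features, concatenate,
        # and feed the result to ALBERT as pre-computed embeddings.
        batch_size, seq_len, _ = cate_x.size()
        cate_emb = self.cate_emb(cate_x).view(batch_size, seq_len, -1)
        cont_emb = self.cont_emb(cont_x)
        seq_emb = torch.cat([cate_emb, cont_emb], dim=2)  # == config.embedding_size

        encoded = self.encoder(inputs_embeds=seq_emb, attention_mask=mask)[0]
        # Pooling the last position is an assumption; the original model may
        # aggregate differently.
        return self.reg_layer(encoded[:, -1])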
Example #6
    @classmethod
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a pretrained model by supplying

        * the name of a remote model on s3 ("distilbert-base-german-cased" ...)
        * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
        * OR a local path of a model trained via FARM ("some_dir/farm_model")

        :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
        :type pretrained_model_name_or_path: str

        """

        distilbert = cls()
        if "farm_lm_name" in kwargs:
            distilbert.name = kwargs["farm_lm_name"]
        else:
            distilbert.name = pretrained_model_name_or_path
        # We need to differentiate between loading a model in FARM format and in Transformers format
        farm_lm_config = Path(
            pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            config = DistilBertConfig.from_pretrained(farm_lm_config)  # DistilBERT weights need a DistilBertConfig
            farm_lm_model = Path(
                pretrained_model_name_or_path) / "language_model.bin"
            distilbert.model = DistilBertModel.from_pretrained(farm_lm_model,
                                                               config=config,
                                                               **kwargs)
            distilbert.language = distilbert.model.config.language
        else:
            # Hugging Face Transformers style
            distilbert.model = DistilBertModel.from_pretrained(
                str(pretrained_model_name_or_path), **kwargs)
            distilbert.language = cls._get_or_infer_language_from_name(
                language, pretrained_model_name_or_path)
        config = distilbert.model.config

        # DistilBERT does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler.
        # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim).
        # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we
        # feed everything to the prediction head
        config.summary_last_dropout = 0
        config.summary_type = 'first'
        config.summary_activation = 'tanh'
        distilbert.pooler = SequenceSummary(config)
        distilbert.pooler.apply(distilbert.model._init_weights)
        return distilbert
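
A minimal usage sketch for this loader, assuming the enclosing class is FARM's DistilBert wrapper; it shows where the extra pooler comes in. The model name and shapes are illustrative.

# Illustrative only: `DistilBert` stands for the class that defines `load` above.
import torch

distilbert = DistilBert.load("distilbert-base-german-cased")
input_ids = torch.randint(0, distilbert.model.config.vocab_size, (2, 16))

sequence_output = distilbert.model(input_ids)[0]    # (batch, seq_len, hidden_dim)
pooled_output = distilbert.pooler(sequence_output)  # "first" token + tanh summary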
Example #7
    def init_data(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(4)
            turbo_transformers.set_num_threads(4)

        torch.set_grad_enabled(False)
        self.cfg = AlbertConfig(hidden_size=768,
                                num_attention_heads=12,
                                intermediate_size=3072)
        self.torch_model = AlbertModel(self.cfg)

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)
        self.torch_model.eval()
        self.hidden_size = self.cfg.hidden_size

        self.turbo_model = turbo_transformers.AlbertModel.from_torch(
            self.torch_model)
Example #8
        def init_data(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = AlbertConfig()

            self.torch_model = AlbertModel(self.cfg)
            if torch.cuda.is_available():
                self.torch_model.to(self.test_device)
            self.torch_model.eval()
            self.hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.randint(low=0,
                                              high=self.cfg.vocab_size - 1,
                                              size=(batch_size, seq_length),
                                              device=self.test_device)

            self.turbo_model = turbo_transformers.AlbertModel.from_torch(
                self.torch_model)
Example #9
# (Fragment: the project-specific imports for LCQMCDataLoader, AlbertConfig and
# AlbertForSequenceClassification are not shown in this snippet.)
import torch

from utils.evaluate import evaluate
from torch.optim import lr_scheduler
from torch import nn
if __name__ == "__main__":
    train_data_path = "/home/longred/BertForSentenceSimilarity/dataset/LCQMC/train.txt"
    dev_data_path = "/home/longred/BertForSentenceSimilarity/dataset/LCQMC/dev.txt"
    test_data_path = "/home/longred/BertForSentenceSimilarity/dataset/LCQMC/test.txt"
    vocab_path = "/home/longred/BertForSentenceSimilarity/prev_trained_model/albert_tiny_zh/vocab.txt"

    train_data_loader = LCQMCDataLoader(
        train_data_path, vocab_path, batch_size=1024, is_pair=True, length=80)
    dev_data_loader = LCQMCDataLoader(
        dev_data_path, vocab_path, batch_size=1024, is_pair=True, length=80)
    test_data_loader = LCQMCDataLoader(
        test_data_path, vocab_path, batch_size=1024, is_pair=True, length=80)
    config = AlbertConfig.from_pretrained(
        "/home/longred/BertForSentenceSimilarity/prev_trained_model/albert_tiny_zh/config.json")

    config.num_labels = 1
    # config.hidden_size = 128
    config.dropout = 0.5
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    net = AlbertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path="/home/longred/BertForSentenceSimilarity/prev_trained_model/albert_tiny_zh/pytorch_model.bin",
        config=config).to(device)
    # %%
    learning_rate = 5e-4
    no_decay = ["bias", "LayerNorm.weight"]
    bert_param_optimizer = list(net.bert.named_parameters())
    linear_param_optimizer = list(net.classifier.named_parameters())
    optimizer_grouped_parameters = [