Example #3
    @classmethod
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a pretrained model by supplying

        * the name of a remote model on s3 ("distilbert-base-german-cased" ...)
        * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
        * OR a local path of a model trained via FARM ("some_dir/farm_model")

        :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
        :type pretrained_model_name_or_path: str

        """

        distilbert = cls()
        if "farm_lm_name" in kwargs:
            distilbert.name = kwargs["farm_lm_name"]
        else:
            distilbert.name = pretrained_model_name_or_path
        # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
        farm_lm_config = os.path.join(pretrained_model_name_or_path,
                                      "language_model_config.json")
        if os.path.exists(farm_lm_config):
            # FARM style
            distilbert_config = DistilBertConfig.from_pretrained(
                farm_lm_config)
            farm_lm_model = os.path.join(pretrained_model_name_or_path,
                                         "language_model.bin")
            distilbert.model = DistilBertModel.from_pretrained(
                farm_lm_model, config=distilbert_config, **kwargs)
            distilbert.language = distilbert.model.config.language
        else:
            # Pytorch-transformer Style
            distilbert.model = DistilBertModel.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
            distilbert.language = cls._infer_language_from_name(
                pretrained_model_name_or_path)
        config = distilbert.model.config

        # DistilBERT does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler.
        # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim).
        # We don't want dropout at the end of the pooler, since we already apply it in the adaptive model before
        # feeding everything to the prediction head.
        config.summary_last_dropout = 0
        config.summary_type = 'first'
        config.summary_activation = 'tanh'
        distilbert.pooler = SequenceSummary(config)
        distilbert.pooler.apply(distilbert.model._init_weights)
        return distilbert
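A hedged usage sketch for the loader above: it assumes the enclosing class is a FARM-style DistilBert language-model wrapper (the `cls` this classmethod belongs to) and that a matching tokenizer is available; the model name and sentence are only illustrative.

import torch
from transformers import DistilBertTokenizer

lm = DistilBert.load("distilbert-base-german-cased")  # assumed wrapper class owning `load`
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-german-cased")
batch = tokenizer(["Ein kurzer Beispielsatz."], return_tensors="pt")

with torch.no_grad():
    sequence_output = lm.model(**batch)[0]      # (batch, seq_len, hidden_dim)
    pooled_output = lm.pooler(sequence_output)  # (batch, hidden_dim); summary of the first token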
Example #4
 def __init__(self, config):
     super(DistilBertCrfForNer, self).__init__(config)
     self.distilbert = DistilBertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
     self.crf = CRF(num_tags=config.num_labels, batch_first=True)
     self.init_weights()
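The __init__ above only wires up the modules; below is a minimal sketch of the companion forward pass, assuming `CRF` follows the pytorch-crf interface (an illustration, not code quoted from the source repository).

def forward(self, input_ids, attention_mask=None, labels=None):
    # token-level hidden states: (batch, seq_len, hidden_size)
    sequence_output = self.distilbert(input_ids, attention_mask=attention_mask)[0]
    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output)  # per-token emission scores

    outputs = (logits,)
    if labels is not None:
        # pytorch-crf returns a log-likelihood; negate it to obtain a loss
        mask = attention_mask.bool() if attention_mask is not None else None
        loss = -1 * self.crf(emissions=logits, tags=labels, mask=mask)
        outputs = (loss,) + outputs
    return outputs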
Example #5
    def __init__(self,
                 config,
                 hdt_file='wikidata2018_09_11.hdt',
                 topk_entities=20,
                 topk_predicates=50,
                 bottleneck_dim=32,
                 seq_classif_dropout=0.9):
        super(MessagePassingHDTBert, self).__init__(config)

        # entity matching Transformer
        self.bert = DistilBertModel(config)
        #         self.pre_classifier = nn.Linear(config.hidden_size, bottleneck_dim)

        # initialise weights for the linear layer to select a few
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.dropout = nn.Dropout(seq_classif_dropout)

        # sampling layer with subgraph retrieval
        self.subgraph_sampling = SamplingLayer(hdt_path + hdt_file,
                                               topk_entities, topk_predicates)

        # predicted scores are propagated via the MP layer into the entity subset distribution defined by the subgraph
        self.mp = MPLayer()

        self.init_weights()
    def __init__(self, config):
        super(DistilImageBertForMultipleChoice, self).__init__(config)
        self.loss_type = config.loss_type
        if config.img_feature_dim > 0:
            self.bert = DistilBertImgModel(config)
        else:
            self.bert = DistilBertModel(config)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if hasattr(config, 'classifier'):
            if not hasattr(config, 'cls_hidden_scale'):
                config.cls_hidden_scale = 2
            if config.classifier == 'linear':
                self.classifier = nn.Linear(
                    config.num_choice * config.hidden_size,
                    self.config.num_labels)
            elif config.classifier == 'mlp':
                self.classifier = nn.Sequential(
                    nn.Linear(config.num_choice * config.hidden_size,
                              config.hidden_size * config.cls_hidden_scale),
                    nn.ReLU(),
                    nn.Linear(config.hidden_size * config.cls_hidden_scale,
                              self.config.num_labels))
        else:
            self.classifier = nn.Linear(config.num_choice * config.hidden_size,
                                        self.config.num_labels)  # original

        self.apply(self.init_weights)
Example #7
 def __init__(self, config):
     super(DistilBertSoftmaxForNer, self).__init__(config)
     self.num_labels = config.num_labels
     self.distilbert = DistilBertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
     self.loss_type = config.loss_type
     self.init_weights()
Example #8
    def __init__(self, config, num_classes=None):
        super().__init__(config)

        self.bert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, num_classes)

        self.init_weights()
Example #9
 def __init__(self, config):
     super(MessagePassingBert, self).__init__(config)
     self.bert = DistilBertModel(config)
     #         self.dropout = nn.Dropout(0.1)
     self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
     # the predicted score is then propagated via a message-passing layer
     self.mp = MPLayer()
     self.init_weights()
Example #10
    def __init__(self, config, weight=None):
        super(DistilBertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.weight = weight

        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        self.init_weights()
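For reference, the __init__ above mirrors Hugging Face's DistilBertForSequenceClassification head, plus an extra `weight` tensor for class weighting. The following is a sketch of a forward pass built on these modules (an assumption about the companion code, shown only to make the wiring concrete).

import torch.nn as nn

def forward(self, input_ids=None, attention_mask=None, labels=None):
    hidden_state = self.distilbert(input_ids=input_ids,
                                   attention_mask=attention_mask)[0]  # (batch, seq_len, dim)
    pooled_output = hidden_state[:, 0]                   # hidden state at the [CLS] position
    pooled_output = self.pre_classifier(pooled_output)   # (batch, dim)
    pooled_output = nn.ReLU()(pooled_output)
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)              # (batch, num_labels)

    if labels is None:
        return logits
    # the optional class-weight tensor is what `weight` in __init__ is for
    loss_fct = nn.CrossEntropyLoss(weight=self.weight)
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    return loss, logits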
    def __init__(self, config, args, intent_label_lst, slot_label_lst):
        super(JointDistilBERT, self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)
        self.distilbert = DistilBertModel(config=config)  # Load pretrained bert

        self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args.dropout_rate)
        self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)

        if args.use_crf:
            self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.distilbert = DistilBertModel(config)
        self.pre_classifier_t1 = nn.Linear(config.dim, config.dim)
        self.pre_classifier_t2 = nn.Linear(config.dim, config.dim)
        self.classifier_t1 = nn.Linear(config.dim, config.num_labels)
        self.classifier_t2 = nn.Linear(config.dim, config.num_labels)
        self.dropout_t1 = nn.Dropout(config.seq_classif_dropout)
        self.dropout_t2 = nn.Dropout(config.seq_classif_dropout)

        self.init_weights()
Example #13
 def __init__(
     self,
     config,
 ):
     super(DistilBertSpanForNer, self).__init__(config)
     self.soft_label = config.soft_label
     self.num_labels = config.num_labels
     self.loss_type = config.loss_type
     self.distilbert = DistilBertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels)
     if self.soft_label:
         self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels,
                                       self.num_labels)
     else:
         self.end_fc = PoolerEndLogits(config.hidden_size + 1,
                                       self.num_labels)
     self.init_weights()
Example #14
def read_parse_write(bert: DistilBertModel,
                     bert_path: str,
                     infile: str,
                     outfile: str,
                     mode: str = "average",
                     batch_size=0) -> None:
    """
    Read the input files and write the vectors to the output files
    :param bert: Bert embedder
    :param infile: input files for the sentences
    :param outfile: output vector files
    :param mode: the mode of elmo vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)

    dataset = CustomDataset(all_sents, bert_path)

    batch_size = max(1, batch_size)  # make sure batch_size is gt 0
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=4)
    for _, (batch, n_pads) in tqdm(enumerate(dataloader)):
        with torch.no_grad():
            batch = batch.cuda() if CUDA else batch
            bert = bert.cuda() if CUDA else bert

            bert_batch_vecs = bert(batch)[0].cpu().numpy()
            vectors = parse_sentence(bert_batch_vecs, mode=mode)
            for j in range(vectors.shape[0]):
                all_vecs.append(vectors[j, :-n_pads[j], :])

    print("Finishing embedding Bert sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
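A hedged example of calling read_parse_write: the checkpoint name and file paths are placeholders, and the project-level helpers (Reader, CustomDataset, parse_sentence, CUDA) are assumed to be importable from the same module.

from transformers import DistilBertModel

bert_path = "distilbert-base-uncased"           # placeholder checkpoint / tokenizer name
bert = DistilBertModel.from_pretrained(bert_path)
bert.eval()

read_parse_write(bert,
                 bert_path,
                 infile="data/train.txt",       # placeholder input file
                 outfile="data/train.vec.pkl",  # placeholder output pickle
                 mode="average",
                 batch_size=32)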
    def __init__(self,
                 config,
                 hdt_file='wikidata2018_09_11.hdt',
                 topk_entities=10,
                 bottleneck_dim=32):
        super(MessagePassingHDTBert, self).__init__(config)

        # entity matching Transformer
        self.bert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.pre_classifier = nn.Linear(config.hidden_size, bottleneck_dim)
        self.classifier = nn.Linear(bottleneck_dim, self.config.num_labels)

        # initialise connection to the Wikidata KG through the HDT API
        kg = HDTDocument(hdt_path + hdt_file)
        # sampling layer with subgraph retrieval
        self.subgraph_sampling = SamplingLayer(kg, topk_entities)

        # predicted scores are propagated via the MP layer into the entity subset distribution defined by the subgraph
        self.mp = MPLayer()

        self.init_weights()
    class TestDistillBertModel(unittest.TestCase):
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.cfg = DistilBertConfig(attention_dropout=0.0, dropout=0.0)

            torch.set_grad_enabled(False)
            self.torch_model = DistilBertModel(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.turbo_transformer = turbo_transformers.DistilBertModel.from_torch(
                self.torch_model)
            # (batch_size, input_len, model_dim)
            self.inputs = torch.randint(low=0,
                                        high=self.cfg.vocab_size - 1,
                                        size=(batch_size, input_len),
                                        dtype=torch.long,
                                        device=self.test_device)
            self.attention_mask = torch.ones((batch_size, input_len),
                                             dtype=torch.long,
                                             device=self.test_device)
            self.head_mask = [None] * self.cfg.num_hidden_layers

        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"

            torch_model = lambda: self.torch_model(self.inputs,
                                                   self.attention_mask)
            torch_res, torch_qps, torch_time_consume = \
                test_helper.run_model(torch_model, use_cuda, num_iter)

            print(
                f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
                f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model = lambda: self.turbo_transformer(
                self.inputs, self.attention_mask, head_mask=self.head_mask)
            with turbo_transformers.pref_guard("gpref_test") as perf:
                turbo_res, turbo_qps, turbo_time_consume = \
                    test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
                f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}")

            self.assertTrue(
                torch.max(torch.abs(torch_res[0] - turbo_res[0])) <
                (1e-2 if use_cuda else 1e-3))

            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n"
                )

        def test_distrill_bert_model(self):
            self.check_torch_and_turbo(use_cuda=False)
            if torch.cuda.is_available() and \
                    turbo_transformers.config.is_compiled_with_cuda():
                self.check_torch_and_turbo(use_cuda=True)
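The 4-space indentation of `class TestDistillBertModel` and the free variables `batch_size`, `input_len`, and `fname` suggest the class is generated inside an enclosing factory function. A hypothetical sketch of such a wrapper and entry point (the names and CSV path are assumptions, not quoted from the source):

import unittest

def create_test(batch_size, input_len):
    fname = "distilbert_benchmark.csv"   # assumed output path for the QPS numbers

    class TestDistillBertModel(unittest.TestCase):
        ...  # class body as shown above, closing over batch_size, input_len, fname

    return TestDistillBertModel

# register one generated TestCase per (batch_size, input_len) combination
for batch_size in [1, 2]:
    for input_len in [16, 128]:
        globals()[f"TestDistillBertModel_{batch_size}_{input_len}"] = \
            create_test(batch_size, input_len)

if __name__ == "__main__":
    unittest.main()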
Example #17
 def __init__(self, distilbert_config, out_dim, dropout=0.1):
     super().__init__(distilbert_config)
     self.distilbert = DistilBertModel(distilbert_config)
     self.classifier = nn.Linear(768, out_dim)
     self.dropout = nn.Dropout(dropout)
     self.init_weights()
Example #18
 def __init__(self, distilbert_config, dropout=0.1):
     super(DistilBERTForMultipleChoice, self).__init__(distilbert_config)
     self.distilbert = DistilBertModel(distilbert_config)
     self.dropout = nn.Dropout(dropout)
     self.classifier = nn.Linear(768, 1)
     self.init_weights()
class DocumentDistilBertLSTM(DistilBertPreTrainedModel):
    """
    DistilBERT output over document in LSTM
    """
    def __init__(self, bert_model_config: DistilBertConfig):
        super(DocumentDistilBertLSTM, self).__init__(bert_model_config)
        self.distilbert = DistilBertModel(bert_model_config)
        self.pooler = DistilBertPooler(bert_model_config)
        self.bert_batch_size = self.distilbert.config.bert_batch_size
        self.dropout = nn.Dropout(p=bert_model_config.dropout)
        self.lstm = LSTM(
            bert_model_config.hidden_size,
            bert_model_config.hidden_size,
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=bert_model_config.dropout),
            nn.Linear(bert_model_config.hidden_size,
                      bert_model_config.num_labels), nn.Tanh())
        self.init_weights()

    #input_ids, token_type_ids, attention_masks
    def forward(self,
                document_batch: torch.Tensor,
                document_sequence_lengths: list,
                device='cuda'):

        # Holds the pooled DistilBERT output for every sequence in the batch:
        # (batch_size (i.e. number of documents), num_sequences, bert_hidden_size)
        distilbert_output = torch.zeros(
            size=(document_batch.shape[0],
                  min(document_batch.shape[1], self.bert_batch_size),
                  self.distilbert.config.hidden_size),
            dtype=torch.float,
            device=device)

        # Only pass the first bert_batch_size sequences of each document through DistilBERT.
        # This means the last part of long documents may be cut off.

        for doc_id in range(document_batch.shape[0]):

            hidden_states = self.distilbert(
                input_ids=document_batch[doc_id][:self.bert_batch_size, 0],
                attention_mask=document_batch[doc_id][:self.bert_batch_size,
                                                      2])[0]
            #Output of distilbert is a tuple of length 1. First element (hidden_states) is of shape:
            #( num_sequences(i.e. nr of sequences per document), nr_of_tokens(512) (i.e. nr of tokens per sequence), bert_hidden_size )

            pooled_output = self.pooler(
                hidden_states
            )  # (num_sequences (i.e. nr of sequences per document), bert_hidden_size)

            distilbert_output[doc_id][:self.bert_batch_size] = self.dropout(
                pooled_output
            )  #( #batch_size(i.e. number of documents) ,num_sequences (i.e. nr of sequences per document), bert_hidden_size)

        #lstm expects a ( num_sequences, batch_size (i.e. number of documents) , bert_hidden_size )
        self.lstm.flatten_parameters()
        output, (_, _) = self.lstm(distilbert_output.permute(1, 0, 2))

        last_layer = output[-1]

        prediction = self.classifier(last_layer)
        assert prediction.shape[0] == document_batch.shape[0]
        return prediction

    def freeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = True

    def unfreeze_bert_encoder_last_layers(self):
        for name, param in self.distilbert.named_parameters():
            if "layer.5" in name or "pooler" in name:
                param.requires_grad = True

    def unfreeze_bert_encoder_pooler_layer(self):
        for name, param in self.distilbert.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
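To make the expected input layout of DocumentDistilBertLSTM.forward concrete, here is a small sketch of a dummy document_batch. The shape is inferred from the slicing above (dim 2 indexes input_ids / token_type_ids / attention_mask), and the commented call assumes a config that also carries the extra bert_batch_size, dropout and num_labels attributes used in __init__.

import torch

num_documents, num_sequences, max_seq_len = 2, 4, 512

# (num_documents, num_sequences, 3, max_seq_len); along dim 2:
#   index 0 -> input_ids, index 1 -> token_type_ids (unused by DistilBERT), index 2 -> attention_mask
document_batch = torch.zeros(num_documents, num_sequences, 3, max_seq_len, dtype=torch.long)
document_sequence_lengths = [num_sequences] * num_documents

# model = DocumentDistilBertLSTM(config)  # config must also define bert_batch_size, dropout, num_labels
# predictions = model(document_batch, document_sequence_lengths, device='cpu')
# predictions.shape -> (num_documents, num_labels)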