def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a pretrained model by supplying

    * the name of a remote model on s3 ("distilbert-base-german-cased" ...)
    * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
    * OR a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
    :type pretrained_model_name_or_path: str
    """
    distilbert = cls()
    if "farm_lm_name" in kwargs:
        distilbert.name = kwargs["farm_lm_name"]
    else:
        distilbert.name = pretrained_model_name_or_path
    # We need to differentiate between loading a model in FARM format and in Pytorch-Transformers format
    farm_lm_config = os.path.join(pretrained_model_name_or_path,
                                  "language_model_config.json")
    if os.path.exists(farm_lm_config):
        # FARM style
        distilbert_config = DistilBertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = os.path.join(pretrained_model_name_or_path,
                                     "language_model.bin")
        distilbert.model = DistilBertModel.from_pretrained(
            farm_lm_model, config=distilbert_config, **kwargs)
        distilbert.language = distilbert.model.config.language
    else:
        # Pytorch-Transformers style
        distilbert.model = DistilBertModel.from_pretrained(
            pretrained_model_name_or_path, **kwargs)
        distilbert.language = cls._infer_language_from_name(
            pretrained_model_name_or_path)
    config = distilbert.model.config

    # DistilBERT does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler.
    # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim).
    # We don't want a dropout at the end of the pooler, since we do that already in the adaptive model before we
    # feed everything to the prediction head.
    config.summary_last_dropout = 0
    config.summary_type = 'first'
    config.summary_activation = 'tanh'
    distilbert.pooler = SequenceSummary(config)
    distilbert.pooler.apply(distilbert.model._init_weights)
    return distilbert
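# Usage sketch (not from the source): the `load` classmethod above lives on a
# FARM-style wrapper class; `DistilBertLM` is a hypothetical stand-in for that
# class, passed in as an argument. The sketch shows the remote-model path and
# the first-token pooler that `load` attaches, using only torch/transformers
# calls plus attributes set by `load` itself.
import torch

def _example_load_and_pool(DistilBertLM):
    distilbert = DistilBertLM.load("distilbert-base-german-cased")
    input_ids = torch.tensor([[101, 2023, 2003, 102]])   # toy token ids
    hidden_states = distilbert.model(input_ids)[0]        # (batch, seq_len, hidden)
    pooled = distilbert.pooler(hidden_states)             # (batch, hidden): tanh of the first token
    return pooled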
def __init__(self, config):
    super(DistilBertCrfForNer, self).__init__(config)
    self.distilbert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.crf = CRF(num_tags=config.num_labels, batch_first=True)
    self.init_weights()
def __init__(self, config, hdt_file='wikidata2018_09_11.hdt', topk_entities=20,
             topk_predicates=50, bottleneck_dim=32, seq_classif_dropout=0.9):
    super(MessagePassingHDTBert, self).__init__(config)

    # entity matching Transformer
    self.bert = DistilBertModel(config)
    # self.pre_classifier = nn.Linear(config.hidden_size, bottleneck_dim)
    # initialise weights for the linear layer to select a few
    self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
    self.dropout = nn.Dropout(seq_classif_dropout)

    # sampling layer with subgraph retrieval (hdt_path is expected to be defined at module scope)
    self.subgraph_sampling = SamplingLayer(hdt_path + hdt_file, topk_entities,
                                           topk_predicates)

    # predicted scores are propagated via the MP layer into the entity subset
    # distribution defined by the subgraph
    self.mp = MPLayer()

    self.init_weights()
def __init__(self, config):
    super(DistilImageBertForMultipleChoice, self).__init__(config)
    self.loss_type = config.loss_type
    if config.img_feature_dim > 0:
        self.bert = DistilBertImgModel(config)
    else:
        self.bert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

    if hasattr(config, 'classifier'):
        if not hasattr(config, 'cls_hidden_scale'):
            config.cls_hidden_scale = 2
        if config.classifier == 'linear':
            self.classifier = nn.Linear(config.num_choice * config.hidden_size,
                                        self.config.num_labels)
        elif config.classifier == 'mlp':
            self.classifier = nn.Sequential(
                nn.Linear(config.num_choice * config.hidden_size,
                          config.hidden_size * config.cls_hidden_scale),
                nn.ReLU(),
                nn.Linear(config.hidden_size * config.cls_hidden_scale,
                          self.config.num_labels))
    else:
        self.classifier = nn.Linear(config.num_choice * config.hidden_size,
                                    self.config.num_labels)  # original
    self.apply(self.init_weights)
def __init__(self, config):
    super(DistilBertSoftmaxForNer, self).__init__(config)
    self.num_labels = config.num_labels
    self.distilbert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.loss_type = config.loss_type
    self.init_weights()
def __init__(self, config, num_classes=None):
    super().__init__(config)
    self.bert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.dropout)
    self.classifier = nn.Linear(config.hidden_size, num_classes)
    self.init_weights()
def __init__(self, config):
    super(MessagePassingBert, self).__init__(config)
    self.bert = DistilBertModel(config)
    # self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
    # the predicted score is then propagated via a message-passing layer
    self.mp = MPLayer()
    self.init_weights()
def __init__(self, config, weight=None):
    super(DistilBertForSequenceClassification, self).__init__(config)
    self.num_labels = config.num_labels
    self.weight = weight

    self.distilbert = DistilBertModel(config)
    self.pre_classifier = nn.Linear(config.dim, config.dim)
    self.classifier = nn.Linear(config.dim, config.num_labels)
    self.dropout = nn.Dropout(config.seq_classif_dropout)

    self.init_weights()
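# A plausible forward pass for the classifier defined above (a sketch following
# the standard DistilBERT classification head, not the author's exact code):
# first-token pooling, pre_classifier + ReLU, dropout, classifier, and an
# optional class-weighted cross-entropy using `self.weight`.
def forward(self, input_ids=None, attention_mask=None, labels=None):
    hidden_state = self.distilbert(input_ids, attention_mask=attention_mask)[0]  # (bs, seq_len, dim)
    pooled_output = hidden_state[:, 0]                       # (bs, dim): [CLS]-position token
    pooled_output = nn.ReLU()(self.pre_classifier(pooled_output))
    logits = self.classifier(self.dropout(pooled_output))    # (bs, num_labels)

    loss = None
    if labels is not None:
        loss_fct = nn.CrossEntropyLoss(weight=self.weight)
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    return (loss, logits) if loss is not None else (logits,)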
def __init__(self, config, args, intent_label_lst, slot_label_lst):
    super(JointDistilBERT, self).__init__(config)
    self.args = args
    self.num_intent_labels = len(intent_label_lst)
    self.num_slot_labels = len(slot_label_lst)
    self.distilbert = DistilBertModel(config=config)  # Load pretrained bert

    self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args.dropout_rate)
    self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)

    if args.use_crf:
        self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.distilbert = DistilBertModel(config)
    self.pre_classifier_t1 = nn.Linear(config.dim, config.dim)
    self.pre_classifier_t2 = nn.Linear(config.dim, config.dim)
    self.classifier_t1 = nn.Linear(config.dim, config.num_labels)
    self.classifier_t2 = nn.Linear(config.dim, config.num_labels)
    self.dropout_t1 = nn.Dropout(config.seq_classif_dropout)
    self.dropout_t2 = nn.Dropout(config.seq_classif_dropout)

    self.init_weights()
def __init__(self, config):
    super(DistilBertSpanForNer, self).__init__(config)
    self.soft_label = config.soft_label
    self.num_labels = config.num_labels
    self.loss_type = config.loss_type
    self.distilbert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels)
    if self.soft_label:
        self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels,
                                      self.num_labels)
    else:
        self.end_fc = PoolerEndLogits(config.hidden_size + 1, self.num_labels)
    self.init_weights()
def read_parse_write(bert: DistilBertModel, bert_path: str, infile: str, outfile: str,
                     mode: str = "average", batch_size=0) -> None:
    """
    Read the input file and write the BERT vectors to the output file.
    :param bert: BERT embedder
    :param bert_path: path or name of the pretrained BERT model (used by the dataset/tokenizer)
    :param infile: input file with the sentences
    :param outfile: output vector file
    :param mode: the pooling mode for the BERT vectors
    :param batch_size: batch size for the dataloader (values < 1 fall back to 1)
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    dataset = CustomDataset(all_sents, bert_path)
    batch_size = max(1, batch_size)  # make sure batch_size is > 0
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    for _, (batch, n_pads) in tqdm(enumerate(dataloader)):
        with torch.no_grad():
            batch = batch.cuda() if CUDA else batch
            bert = bert.cuda() if CUDA else bert
            bert_batch_vecs = bert(batch)[0].cpu().numpy()
            vectors = parse_sentence(bert_batch_vecs, mode=mode)
            for j in range(vectors.shape[0]):
                all_vecs.append(vectors[j, :-n_pads[j], :])
    print("Finished embedding the BERT sequences, saving the vector file.")
    pickle.dump(all_vecs, f)
    f.close()
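# A minimal caller sketch for read_parse_write; the file names below are
# placeholders, and CUDA / Reader / CustomDataset / parse_sentence are helpers
# from the original module. DistilBertModel.from_pretrained is the standard
# transformers loader.
bert_path = "distilbert-base-uncased"
bert = DistilBertModel.from_pretrained(bert_path)
bert.eval()
read_parse_write(bert, bert_path, infile="data/sentences.txt",
                 outfile="data/sentences.vec.pkl", mode="average", batch_size=16)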
def __init__(self, config, hdt_file='wikidata2018_09_11.hdt', topk_entities=10, bottleneck_dim=32):
    super(MessagePassingHDTBert, self).__init__(config)

    # entity matching Transformer
    self.bert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.dropout)
    self.pre_classifier = nn.Linear(config.hidden_size, bottleneck_dim)
    self.classifier = nn.Linear(bottleneck_dim, self.config.num_labels)

    # initialise connection to the Wikidata KG through the HDT API (hdt_path is expected at module scope)
    kg = HDTDocument(hdt_path + hdt_file)
    # sampling layer with subgraph retrieval
    self.subgraph_sampling = SamplingLayer(kg, topk_entities)
    # predicted scores are propagated via the MP layer into the entity subset
    # distribution defined by the subgraph
    self.mp = MPLayer()

    self.init_weights()
class TestDistillBertModel(unittest.TestCase):
    def init_data(self, use_cuda):
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(4)
            turbo_transformers.set_num_threads(4)

        self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                    hidden_dropout_prob=0.0)

        torch.set_grad_enabled(False)
        self.torch_model = DistilBertModel(self.cfg)
        self.torch_model.eval()
        if use_cuda:
            self.torch_model.to(self.test_device)

        self.turbo_transformer = turbo_transformers.DistilBertModel.from_torch(
            self.torch_model)

        # (batch_size, input_len, model_dim)
        # batch_size, input_len and fname are module-level globals in the original test file
        self.inputs = torch.randint(low=0,
                                    high=self.cfg.vocab_size - 1,
                                    size=(batch_size, input_len),
                                    dtype=torch.long,
                                    device=self.test_device)
        self.attention_mask = torch.ones((batch_size, input_len),
                                         dtype=torch.long,
                                         device=self.test_device)
        self.head_mask = [None] * self.cfg.num_hidden_layers

    def check_torch_and_turbo(self, use_cuda, num_iter=1):
        self.init_data(use_cuda)
        device = "GPU" if use_cuda else "CPU"
        torch_model = lambda: self.torch_model(self.inputs,
                                               self.attention_mask)
        torch_res, torch_qps, torch_time_consume = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(
            f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
            f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

        turbo_model = lambda: self.turbo_transformer(
            self.inputs, self.attention_mask, head_mask=self.head_mask)
        with turbo_transformers.pref_guard("gpref_test") as perf:
            turbo_res, turbo_qps, turbo_time_consume = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(
            f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
            f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}")

        # allow a looser tolerance on GPU than on CPU
        self.assertTrue(
            torch.max(torch.abs(torch_res[0] - turbo_res[0])) < (
                1e-2 if use_cuda else 1e-3))

        with open(fname, "a") as fh:
            fh.write(
                f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n"
            )

    def test_distrill_bert_model(self):
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
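# The test above reads batch_size, input_len and fname from module scope; a
# minimal harness with placeholder values (not the original benchmark settings)
# could look like this.
if __name__ == "__main__":
    batch_size = 1
    input_len = 32
    fname = "distilbert_benchmark.csv"
    unittest.main()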
def __init__(self, distilbert_config, out_dim, dropout=0.1):
    super().__init__(distilbert_config)
    self.distilbert = DistilBertModel(distilbert_config)
    self.classifier = nn.Linear(768, out_dim)
    self.dropout = nn.Dropout(dropout)
    self.init_weights()
def __init__(self, distilbert_config, dropout=0.1):
    super(DistilBERTForMultipleChoice, self).__init__(distilbert_config)
    self.distilbert = DistilBertModel(distilbert_config)
    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Linear(768, 1)
    self.init_weights()
class DocumentDistilBertLSTM(DistilBertPreTrainedModel):
    """
    DistilBERT output over a document, fed into an LSTM
    """

    def __init__(self, bert_model_config: DistilBertConfig):
        super(DocumentDistilBertLSTM, self).__init__(bert_model_config)
        self.distilbert = DistilBertModel(bert_model_config)
        self.pooler = DistilBertPooler(bert_model_config)
        self.bert_batch_size = self.distilbert.config.bert_batch_size
        self.dropout = nn.Dropout(p=bert_model_config.dropout)
        self.lstm = LSTM(
            bert_model_config.hidden_size,
            bert_model_config.hidden_size,
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=bert_model_config.dropout),
            nn.Linear(bert_model_config.hidden_size, bert_model_config.num_labels),
            nn.Tanh())
        self.init_weights()

    # input_ids, token_type_ids, attention_masks
    def forward(self, document_batch: torch.Tensor, document_sequence_lengths: list, device='cuda'):
        # document_batch contains all BERT sequences of each document; the pooled DistilBERT
        # outputs are collected into a tensor of shape
        # (batch_size (i.e. number of documents), num_sequences, bert_hidden_size)
        distilbert_output = torch.zeros(
            size=(document_batch.shape[0],
                  min(document_batch.shape[1], self.bert_batch_size),
                  self.distilbert.config.hidden_size),
            dtype=torch.float,
            device=device)

        # only pass the first bert_batch_size sequences of each document through DistilBERT;
        # this means that we are possibly cutting off the last part of documents.
        for doc_id in range(document_batch.shape[0]):
            hidden_states = self.distilbert(
                input_ids=document_batch[doc_id][:self.bert_batch_size, 0],
                attention_mask=document_batch[doc_id][:self.bert_batch_size, 2])[0]
            # The output of DistilBERT is a tuple of length 1. Its first element (hidden_states) has shape:
            # (num_sequences per document, nr_of_tokens (512 per sequence), bert_hidden_size)
            pooled_output = self.pooler(
                hidden_states)  # (num_sequences per document, bert_hidden_size)
            distilbert_output[doc_id][:self.bert_batch_size] = self.dropout(
                pooled_output)

        # the LSTM expects (num_sequences, batch_size (i.e. number of documents), bert_hidden_size)
        self.lstm.flatten_parameters()
        output, (_, _) = self.lstm(distilbert_output.permute(1, 0, 2))
        last_layer = output[-1]

        prediction = self.classifier(last_layer)
        assert prediction.shape[0] == document_batch.shape[0]
        return prediction

    def freeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = True

    def unfreeze_bert_encoder_last_layers(self):
        for name, param in self.distilbert.named_parameters():
            if "layer.5" in name or "pooler" in name:
                param.requires_grad = True

    def unfreeze_bert_encoder_pooler_layer(self):
        for name, param in self.distilbert.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
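# A minimal construction sketch for DocumentDistilBertLSTM (hypothetical values,
# not from the source). The stock DistilBertConfig has no bert_batch_size field,
# so it is attached here as the extra attribute the class expects; DistilBertPooler
# and LSTM come from the original module, and the DistilBERT weights here are
# randomly initialised rather than loaded from a checkpoint.
from transformers import DistilBertConfig

config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=4)
config.bert_batch_size = 7          # max sequences per document passed through DistilBERT
model = DocumentDistilBertLSTM(config)
model.freeze_bert_encoder()         # train only the pooler, LSTM and classifier at first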