Code example #1
import transformers
from torch import nn


# Per the docstring, MultitaskModel subclasses transformers.PreTrainedModel
# so it can reuse the Trainer machinery.
class MultitaskModel(transformers.PreTrainedModel):
    def __init__(self, taskmodels_dict, encoder=None):
        """
        Setting MultitaskModel up as a PretrainedModel allows us
        to take better advantage of Trainer features
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
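
A minimal usage sketch for the constructor above, assuming MultitaskModel subclasses transformers.PreTrainedModel as the docstring suggests; the checkpoint, task names, head classes, and label counts below are illustrative assumptions, not part of the original example.

# Hypothetical construction of the arguments for MultitaskModel.__init__:
# two task heads built from the same checkpoint, sharing one BERT encoder.
import transformers

model_name = "bert-base-uncased"  # illustrative checkpoint
cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2)
tok_model = transformers.AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=9)

shared_encoder = cls_model.bert  # reuse the first head's encoder...
tok_model.bert = shared_encoder  # ...and point the second head at it

multitask_model = MultitaskModel(
    taskmodels_dict={"classification": cls_model, "ner": tok_model},
    encoder=shared_encoder,
)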
Code example #2
import transformers
from torch import nn


# Variant that keeps a named backbone instead of a generic encoder; again
# subclassing transformers.PreTrainedModel per the docstring.
class MultitaskModel(transformers.PreTrainedModel):
    def __init__(self, backbone, taskmodels_dict, backbone_name):
        """
        Setting MultitaskModel up as a PretrainedModel allows us
        to take better advantage of Trainer features
        """
        super().__init__(transformers.PretrainedConfig())
        self.backbone_name = backbone_name
        self.backbone = backbone
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
Code example #3
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        # Preprocessing the datasets
        if self.hparams.finetuning_task is not None:
            sentence1_key, sentence2_key = task_to_keys[
                self.hparams.finetuning_task]
        else:
            # We try to have some nice defaults but don't hesitate to tweak to your use case.
            non_label_column_names = [
                name for name in self.raw_datasets["train"].column_names
                if name != "label"
            ]
            if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
                sentence1_key, sentence2_key = "sentence1", "sentence2"
            else:
                if len(non_label_column_names) >= 2:
                    sentence1_key, sentence2_key = non_label_column_names[:2]
                else:
                    sentence1_key, sentence2_key = non_label_column_names[
                        0], None

        # Padding strategy
        if self.data_config.pad_to_max_length:
            padding = "max_length"
        else:
            # We will pad later, dynamically at batch creation to the max_seq_length in each batch.
            padding = False

        # Some models have set the order of the labels to use, so let's make sure we do use it.
        label_to_id = None
        if (self.model.config.label2id != transformers.PretrainedConfig(
                num_labels=self.hparams.num_labels).label2id
                and self.hparams.finetuning_task is not None
                and not self.is_regression):
            # Some have all caps in their config, some don't.
            label_name_to_id = {
                k.lower(): v
                for k, v in self.model.config.label2id.items()
            }
            if sorted(label_name_to_id.keys()) == sorted(self.label_list):
                label_to_id = {
                    i: label_name_to_id[self.label_list[i]]
                    for i in range(self.hparams.num_labels)
                }
            else:
                self.logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the "
                    f"dataset: model labels: {sorted(label_name_to_id.keys())}, "
                    f"dataset labels: {sorted(self.label_list)}."
                    "\nIgnoring the model labels as a result.", )
        elif self.hparams.finetuning_task is None and not self.is_regression:
            label_to_id = {v: i for i, v in enumerate(self.label_list)}

        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger than "
                f"the maximum length for the model ({self.tokenizer.model_max_length}). Using "
                f"max_seq_length={self.tokenizer.model_max_length}.")
        max_seq_length = min(self.data_config.max_seq_length,
                             self.tokenizer.model_max_length)

        # We cannot use self.tokenizer as a non-local variable in the preprocess_function if we
        # want map to be able to cache the output of the tokenizer.  Hence, the preprocess_function
        # takes a tokenizer explicitly as an input and we create a closure using functools.partial.
        def preprocess_function(tokenizer, padding, max_seq_length, examples):
            # Tokenize the texts
            args = ((examples[sentence1_key], ) if sentence2_key is None else
                    (examples[sentence1_key], examples[sentence2_key]))
            result = tokenizer(*args,
                               padding=padding,
                               max_length=max_seq_length,
                               truncation=True)

            # Map labels to IDs (not necessary for GLUE tasks)
            if label_to_id is not None and "label" in examples:
                result["label"] = [
                    label_to_id[label] for label in examples["label"]
                ]
            return result

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(preprocess_function, self.tokenizer, padding,
                              max_seq_length),
            batched=True,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
        for _, data in tokenized_datasets.items():
            hf.remove_unused_columns(self.model, data)

        # Data collator will default to DataCollatorWithPadding, so we change it if we already
        # did the padding.
        if self.data_config.pad_to_max_length:
            self.collator = transformers.default_data_collator
        elif self.hparams.use_apex_amp:
            collator = transformers.DataCollatorWithPadding(
                self.tokenizer, pad_to_multiple_of=8)
            self.collator = lambda x: collator(x).data
        else:
            self.collator = None
        return tokenized_datasets
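
The comment inside preprocess_function above notes that the tokenizer has to be passed in explicitly (via functools.partial) rather than captured through self, so that datasets.map can hash the mapped function and reuse its cache. A stripped-down sketch of that pattern follows; the dataset and column name are illustrative assumptions.

import functools

import datasets
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
raw = datasets.load_dataset("glue", "sst2")  # illustrative dataset


def preprocess(tokenizer, max_seq_length, examples):
    # Tokenizer and max length arrive as explicit arguments, so the mapped
    # function holds no reference to a large enclosing object.
    return tokenizer(examples["sentence"],
                     truncation=True,
                     max_length=max_seq_length)


tokenized = raw.map(
    functools.partial(preprocess, tokenizer, 128),
    batched=True,
    load_from_cache_file=True,  # cache hits require a stable function hash
)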
Code example #4
from torch.nn import functional as F
from transformers import BertModel
from torch import nn
from torchcrf import CRF
import transformers
nltconfig = transformers.PretrainedConfig(name_or_path='bert-base-chinese')


class nluModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = BertModel.from_pretrained(self.config.pretrain_model_path)
        self.drop = nn.Dropout(p=0.2)
        self.num_intent_label = len(self.config.intent_vocab)
        self.num_slot_label = len(self.config.slot_vocab)
        self.hid_size = self.bert.config.hidden_size
        self.fc_intent = nn.Linear(self.hid_size, self.num_intent_label)
        self.fc_slot = nn.Linear(self.hid_size, self.num_slot_label)
        self.crf = CRF(self.config.num_slot, batch_first=True).cuda()

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        out = self.bert(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        seq_encoding, pooled_output = out[0], out[1]

        seq_encoding = self.drop(seq_encoding)  # bs,seq_len,hid_size
        pooled_output = self.drop(pooled_output)  # bs,hid_size

        intent_logits = self.fc_intent(pooled_output)  # bs,num_intent_label