Example No. 1
 def from_torch(model: TorchBertModel,
                device: Optional[torch.device] = None):
     if device is not None and 'cuda' in device.type and torch.cuda.is_available():
         model.to(device)
     embeddings = BertEmbeddings.from_torch(model.embeddings)
     encoder = BertEncoder.from_torch(model.encoder)
     return BertModelNoPooler(embeddings, encoder)
Example No. 2
 def from_torch(
         model: TorchBertModel,
         device: Optional[torch.device] = None  # from_torch function implementation
 ):
     if device is not None and "cuda" in device.type and torch.cuda.is_available():
         model.to(device)
     bertmodel = turbo_transformers.BertModel.from_torch(model.bert)
     # The following code can be copied without changes.
     # Note: classifier is a class member of BertForSequenceClassification. If the user defines
     # other class members, this code needs to be modified accordingly.
     return BertForSequenceClassification(bertmodel, model.classifier)
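
A minimal usage sketch for the converter above, assuming the from_torch wrapper is importable from the surrounding module; the checkpoint name and device choice are illustrative:

import torch
from transformers import BertForSequenceClassification as TorchBertForSequenceClassification

# Load a fine-tuned PyTorch checkpoint (the model name here is a placeholder).
torch_model = TorchBertForSequenceClassification.from_pretrained("bert-base-uncased")
torch_model.eval()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
turbo_model = from_torch(torch_model, device)  # wrapper defined in the example above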
Example No. 3
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.pos_weight = torch.ones(self.config.num_labels,
                                     requires_grad=False)

        self.init_weights()
Example No. 4
    def __init__(self, tagset_size):
        super(BertForSequenceClassification, self).__init__()
        self.tagset_size = tagset_size

        self.BertModel_single = BertModel.from_pretrained(pretrain_model_dir)
        self.single_hidden2tag = BertClassificationHead(
            bert_hidden_dim, tagset_size)
Example No. 5
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")
Example No. 6
 def from_pretrained(model_id_or_path: str,
                     device: Optional[torch.device] = None):
     torch_model = TorchBertModel.from_pretrained(model_id_or_path)
     model = BertModelNoPooler.from_torch(torch_model, device)
     model.config = torch_model.config
     model._torch_model = torch_model  # keep a reference so the torch model is not destroyed
     return model
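
A hedged usage sketch for the loader above; the model identifier and device are illustrative:

import torch

model = from_pretrained("bert-base-uncased", torch.device("cpu"))
print(model.config.hidden_size)  # config is copied over from the underlying torch model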
Example No. 7
    def __init__(self,
                 args,
                 tokenizer: BertTokenizer,
                 object_features_variant=False,
                 positional_embed_variant=False,
                 latent_transformer=False):
        super().__init__()

        self.args = args
        self.tokenizer = tokenizer

        self.image_projection = nn.Sequential(
            nn.Linear(512, 768), nn.BatchNorm1d(768, momentum=0.01))

        config = BertConfig.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer
        self.embeddings = BertEmbeddings(config)

        self.text_encoder = BertModel.from_pretrained("bert-base-uncased",
                                                      return_dict=True)
        self.decoder = BertLMHeadModel.from_pretrained(
            'bert-base-uncased',
            is_decoder=True,
            use_cache=True,
            add_cross_attention=True)

        if object_features_variant:
            self.image_transformer = ImageTransformerEncoder(args)

        self.positional_embed = bool(positional_embed_variant)

        self.latent_transformer = latent_transformer
Example No. 8
 def build_model(self):
     '''
     Builds BERT NER model layers
         Arguments:
             None
         Returns:
             None
     '''
     # set seeds if a seed was provided
     if self.seed:
         torch.manual_seed(self.seed)
         torch.cuda.manual_seed(self.seed)
         np.random.seed(self.seed)
     # initialize BERT model from file
     self.bert = BertModel(self.config).from_pretrained(self.model_file)
     # dropout layer for bert output
     self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
     # dense classification layer
     self.classifier = nn.Linear(self.config.hidden_size, len(self.classes))
     # CRF output layer
     self.crf = CRF(classes=self.classes,
                    scheme=self.scheme,
                    batch_first=True)
     # initialize CRF with seed
     self.crf.initialize(self.seed)
Example No. 9
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: Union[str, BertModel],
        embedding_dropout: float = 0.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        label_smoothing: float = None,
        ignore_span_metric: bool = False,
        srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        if isinstance(bert_model, str):
            self.bert_model = BertModel.from_pretrained(bert_model)
        else:
            self.bert_model = bert_model

        self.num_classes = self.vocab.get_vocab_size("labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path,
                                             ignore_classes=["V"])
        else:
            self.span_metric = None
        self.tag_projection_layer = Linear(self.bert_model.config.hidden_size,
                                           self.num_classes)

        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        initializer(self)
Example No. 10
 def __init__(self, config):
     super(Bert_CRF, self).__init__(config)
     
     self.bert = BertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
     self.crf = CRF(num_tags=config.num_labels, batch_first=True)
     
     self.init_weights()
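
A hedged sketch of a forward pass for the layers above, assuming the CRF follows the torchcrf interface (calling it returns a log-likelihood and decode returns tag sequences); the forward method is not part of the original example:

 def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
     outputs = self.bert(input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids)
     emissions = self.classifier(self.dropout(outputs[0]))  # per-token logits
     mask = attention_mask.bool() if attention_mask is not None else None
     if labels is not None:
         # Negative log-likelihood of the gold tag sequence under the CRF.
         return -self.crf(emissions, labels, mask=mask)
     # Viterbi-decoded tag sequences (list of lists of tag ids).
     return self.crf.decode(emissions, mask=mask)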
Example No. 11
    def __init__(self, config, *model_args, **model_kargs):
        super().__init__(config)
        self.model_args = model_kargs["model_args"]
        self.bert = BertModel(config)

        if self.model_args.do_mlm:
            self.lm_head = BertLMPredictionHead(config)

        cl_init(self, config)
Example No. 12
 def __init__(self, config):
     super(BertForMultiLable, self).__init__(config)
     self.bert = BertModel(config)
     self.dropout = nn.Dropout(config.hidden_dropout_prob)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
     self.init_weights()
     self.decoder = seq_model.rnn_decoder(config)
     self.criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD, reduction='none')
     self.softmax = nn.Softmax(dim=2)
Example No. 13
    def __init__(self, config, weight=None):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.weight = weight

        self.init_weights()
Example No. 14
    def __init__(self, config, head2size):
        super(MultiHeadModel, self).__init__(config, head2size)
        config.num_labels = 1
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        module_dict = {}
        for head_name, num_labels in head2size.items():
            module_dict[head_name] = nn.Linear(config.hidden_size, num_labels)
        self.heads = nn.ModuleDict(module_dict)

        self.init_weights()
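
A hedged sketch of how the per-head classifiers above might be applied; the forward method is not part of the original example, and feeding the pooled [CLS] output through the shared dropout is an assumption:

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        pooled = self.dropout(outputs[1])  # pooled [CLS] representation
        # One logit tensor per named head.
        return {name: head(pooled) for name, head in self.heads.items()}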
Example No. 15
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: Union[str, Dict[str, Any], BertModel],
        embedding_dropout: float = 0.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        label_smoothing: float = None,
        ignore_span_metric: bool = False,
        srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        if isinstance(bert_model, str):
            self.bert_model = BertModel.from_pretrained(bert_model)
        elif isinstance(bert_model, dict):
            warnings.warn(
                "Initializing BertModel without pretrained weights. This is fine if you're loading "
                "from an AllenNLP archive, but not if you're training.",
                UserWarning,
            )
            bert_config = BertConfig.from_dict(bert_model)
            self.bert_model = BertModel(bert_config)
        else:
            self.bert_model = bert_model

        self.num_classes = self.vocab.get_vocab_size("labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path,
                                             ignore_classes=["V"])
        else:
            self.span_metric = None
        self.tag_projection_layer = Linear(self.bert_model.config.hidden_size,
                                           self.num_classes)

        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        initializer(self)
Example No. 16
def main():
    if len(sys.argv) != 3:
        print(
            "Usage: \n"
            "    convert_huggingface_bert_to_npz model_name (bert-base-uncased) output_file"
        )
        exit(0)
    torch.set_grad_enabled(False)

    model_name = sys.argv[1]
    model = BertModel.from_pretrained(model_name)
    arrays = {k: v.detach() for k, v in model.named_parameters()}

    q_weight_key = 'self.query.weight'
    k_weight_key = 'self.key.weight'
    v_weight_key = 'self.value.weight'

    q_bias_key = 'self.query.bias'
    k_bias_key = 'self.key.bias'
    v_bias_key = 'self.value.bias'

    numpy_dict = {}
    for k in arrays.keys():
        if k.endswith(q_weight_key):
            v = torch.clone(
                torch.t(
                    torch.cat([
                        arrays[k],
                        arrays[k[:-len(q_weight_key)] + k_weight_key],
                        arrays[k[:-len(q_weight_key)] + v_weight_key]
                    ], 0).contiguous()).contiguous())
            numpy_dict[k[:-len(q_weight_key)] + "qkv.weight"] = v.numpy()
        elif k.endswith(q_bias_key):
            v = torch.cat([
                arrays[k], arrays[k[:-len(q_bias_key)] + k_bias_key],
                arrays[k[:-len(q_bias_key)] + v_bias_key]
            ], 0).numpy()
            numpy_dict[k[:-len(q_bias_key)] + 'qkv.bias'] = v
        elif any((k.endswith(suffix) for suffix in (k_weight_key, v_weight_key,
                                                    k_bias_key, v_bias_key))):
            continue
        elif (k.endswith("attention.output.dense.weight")
              or k.endswith("pooler.dense.weight")
              or (k.endswith("output.dense.weight")
                  or k.endswith("intermediate.dense.weight"))):
            numpy_dict[k] = torch.clone(torch.t(
                arrays[k]).contiguous()).numpy()
        else:
            numpy_dict[k] = arrays[k].numpy()
    del arrays
    del model
    numpy.savez_compressed(sys.argv[2], **numpy_dict)
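
A hedged command-line usage example; the script file name below is an assumption, and the output path is illustrative:

python convert_huggingface_bert_to_npz.py bert-base-uncased bert_base_uncased.npz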
Example No. 17
    def __init__(self,
                 bert_model_name,
                 trainable=False,
                 output_size=0,
                 activation=gelu,
                 dropout=0.0):
        """Initializes pertrained `BERT` model

        Arguments:
            bert_model_name {str} -- bert model name

        Keyword Arguments:
            output_size {float} -- output size (default: {None})
            activation {nn.Module} -- activation function (default: {gelu})
            dropout {float} -- dropout rate (default: {0.0})
        """

        super().__init__()
        self.bert_model = BertModel.from_pretrained(bert_model_name,
                                                    output_attentions=True,
                                                    output_hidden_states=True)
        logger.info("Load bert model {} successfully.".format(bert_model_name))

        self.output_size = output_size

        if trainable:
            logger.info(
                "Start fine-tuning bert model {}.".format(bert_model_name))
        else:
            logger.info("Keep fixed bert model {}.".format(bert_model_name))

        for param in self.bert_model.parameters():
            param.requires_grad = trainable

        if self.output_size > 0:
            self.mlp = BertLinear(
                input_size=self.bert_model.config.hidden_size,
                output_size=self.output_size,
                activation=activation)
        else:
            self.output_size = self.bert_model.config.hidden_size
            self.mlp = lambda x: x

        if dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x
Example No. 18
    def __init__(self, config):
        super(Bert_BiLSTM_CRF, self).__init__(config)
        
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear = nn.Linear(config.hidden_size, config.num_labels)
        self.bilstm = nn.LSTM(
            config.hidden_size, 
            (config.hidden_size) // 2, 
            dropout=config.hidden_dropout_prob, 
            batch_first=True, 
            bidirectional=True,
            num_layers=2,
        )
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)

        self.init_weights()
Example No. 19
    def setup_method(self):

        self.monkeypatch = MonkeyPatch()
        # monkeypatch the PretrainedBertModel to return the tiny test fixture model
        config_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "config.json"
        vocab_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "vocab.txt"
        config = BertConfig.from_json_file(config_path)
        self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
        self.monkeypatch.setattr(
            BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
        )

        super().setup_method()
        self.set_up_model(
            FIXTURES_ROOT / "structured_prediction" / "srl" / "bert_srl.jsonnet",
            FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012",
        )
Example No. 20
        def init_bert_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_model = BertModel(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
                self.torch_model)
Example No. 21
    def __init__(
        self,
        vocab: Vocabulary,
        embedding_dim: int,
        feedforward_dim: int,
        num_layers: int,
        num_attention_heads: int,
        position_embedding_dim: int,
        tokenizer_path: str,
        position_embedding_type: str = "absolute",
        activation: str = "gelu",
        hidden_dropout: float = 0.1,
    ) -> None:
        super().__init__()
        # TODO:
        # - Need to apply corrections in pretrained_transformer_mismatched_embedder

        tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        vocab.add_transformer_vocab(tokenizer, "tokens")
        # "tokens" is padded by default--undo that
        del vocab._token_to_index["tokens"]["@@PADDING@@"]
        del vocab._token_to_index["tokens"]["@@UNKNOWN@@"]
        assert len(vocab._token_to_index["tokens"]) == len(vocab._index_to_token["tokens"])

        cfg = BertConfig(
            vocab_size=vocab.get_vocab_size("tokens"),
            hidden_size=embedding_dim,
            num_hidden_layers=num_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=feedforward_dim,
            hidden_act=activation,
            hidden_dropout_prob=hidden_dropout,
            max_position_embeddings=position_embedding_dim,
            position_embedding_type=position_embedding_type,
            use_cache=True,
        )
        self.cfg = cfg
        self._vocab = vocab
        self._namespace = "tokens"
        self.bert = BertModel(cfg)
        self.masking_collator = DataCollatorForWholeWordMask(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        )
Example No. 22
    def __init__(self, config, args, intent_label_lst, slot_label_lst):
        super(JointMBERT, self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)
        self.bert = BertModel(config=config)  # Load pretrained bert

        self.intent_classifier = IntentClassifier(config.hidden_size,
                                                  self.num_intent_labels,
                                                  args.dropout_rate)
        self.slot_classifier = SlotClassifier(
            config.hidden_size, self.num_intent_labels, self.num_slot_labels,
            self.args.use_intent_context_concat,
            self.args.use_intent_context_attention, self.args.max_seq_len,
            self.args.intent_embedding_size,
            self.args.attention_embedding_size, self.args.attention_type,
            args.dropout_rate)

        if args.use_crf:
            self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)
Example No. 23
    def __init__(self, config):
        """
        Init function that initializes the model.
        Inputs:
            config - Configuration of the model
        """

        super().__init__(config)
        self.num_labels = config.num_labels
        self.num_labels_list = [config.num_labels]
        self.hidden_size = config.hidden_size

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # create a list of classifiers
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.classifiers = [self.classifier]

        self.init_weights()
Example No. 24
    def __init__(self,
                 name_or_path_or_model: Union[str, BertModel],
                 adapter_size: int = 64,
                 adapter_num: int = 12,
                 external_param: Union[bool, List[bool]] = False,
                 **kwargs):
        super().__init__()
        if isinstance(name_or_path_or_model, str):
            self.bert = BertModel.from_pretrained(name_or_path_or_model)
        else:
            self.bert = name_or_path_or_model

        set_requires_grad(self.bert, False)

        if isinstance(external_param, bool):
            param_place = [external_param for _ in range(adapter_num)]
        elif isinstance(external_param, list):
            param_place = [False for _ in range(adapter_num)]
            for i, e in enumerate(external_param, 1):
                param_place[-i] = e
        else:
            raise ValueError("wrong type of external_param!")

        self.adapters = nn.ModuleList([
            nn.ModuleList([
                Adapter(self.bert.config.hidden_size, adapter_size,
                        param_place[i]),
                Adapter(self.bert.config.hidden_size, adapter_size,
                        param_place[i])
            ]) for i in range(adapter_num)
        ])

        for i, adapters in enumerate(self.adapters, 1):
            layer = self.bert.encoder.layer[-i]
            layer.output = AdapterBertOutput(layer.output, adapters[0].forward)
            set_requires_grad(layer.output.base.LayerNorm, True)
            layer.attention.output = AdapterBertOutput(layer.attention.output,
                                                       adapters[1].forward)
            set_requires_grad(layer.attention.output.base.LayerNorm, True)

        self.output_dim = self.bert.config.hidden_size
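
A minimal instantiation sketch for the adapter wrapper above; the class name AdapterBertModel is hypothetical (the original snippet does not show it), and the argument values are illustrative:

# AdapterBertModel is a hypothetical name for the class whose __init__ is shown above.
encoder = AdapterBertModel("bert-base-uncased",
                           adapter_size=64,
                           adapter_num=12,
                           external_param=[True, True, True])
print(encoder.output_dim)  # hidden size of the underlying BERT model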
Example No. 25
    def test_from_pytorch(self):
        with torch.no_grad():
            with self.subTest("bert-base-cased"):
                tokenizer = BertTokenizerFast.from_pretrained(
                    "bert-base-cased")
                fx_model = FlaxBertModel.from_pretrained("bert-base-cased")
                pt_model = BertModel.from_pretrained("bert-base-cased")

                # Check for simple input
                pt_inputs = tokenizer.encode_plus(
                    "This is a simple input",
                    return_tensors=TensorType.PYTORCH)
                fx_inputs = tokenizer.encode_plus(
                    "This is a simple input", return_tensors=TensorType.JAX)
                pt_outputs = pt_model(**pt_inputs).to_tuple()
                fx_outputs = fx_model(**fx_inputs)

                self.assertEqual(
                    len(fx_outputs), len(pt_outputs),
                    "Output lengths differ between Flax and PyTorch")

                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
                    self.assert_almost_equals(fx_output, pt_output.numpy(),
                                              5e-3)
Example No. 26
def test_fused_softmax():
    from megatron.model.fused_softmax import FusedScaleMaskSoftmax, SoftmaxFusionTypes
    from megatron.model.gpt2_model import (
        gpt2_attention_mask_func as attention_mask_func, )

    bert = BertModel.from_pretrained("bert-base-cased").cuda().half()
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    test_text = (
        "Hello. How are you? I am fine thank you and you? yes Good. "
        "hi hi hi hi hi hi hi hi hi hi hi hi hi"  # 32
    )

    tokens = tokenizer(
        [test_text] * 4,
        return_tensors="pt",
    )

    embedding_output = bert.embeddings(
        input_ids=tokens["input_ids"].cuda(),
        position_ids=None,
        token_type_ids=tokens["token_type_ids"].cuda(),
        inputs_embeds=None,
        past_key_values_length=0,
    )

    # (bsz, 1, 1, seq_len)
    mask = bert.get_extended_attention_mask(
        attention_mask=tokens["attention_mask"].cuda(),
        input_shape=tokens["input_ids"].shape,
        device=bert.device,
    )
    # (bsz, 1, seq_len, seq_len)
    mask = mask.repeat(1, 1, mask.size()[-1], 1)

    attention = bert.encoder.layer[0].attention.self
    key_layer = attention.transpose_for_scores(attention.key(embedding_output))
    query_layer = attention.transpose_for_scores(
        attention.query(embedding_output))

    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    attention_scores /= math.sqrt(key_layer.size()[-1])

    fused_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        fusion_type=SoftmaxFusionTypes.general,
        mask_func=attention_mask_func,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())

    fused_softmax_output = fused_softmax(
        attention_scores,
        (mask != 0),
    )

    torch_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        mask_func=attention_mask_func,
        fusion_type=SoftmaxFusionTypes.none,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())

    torch_softmax_output = torch_softmax(
        attention_scores,
        (mask != 0),
    )

    test_result = (fused_softmax_output - torch_softmax_output).abs()

    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)

    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(
            f"\n[Success] test_fused_softmax"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test_fused_softmax"
            f"\n > mean_difference={diff}, "
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
Example No. 27
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
Example No. 28
####################
# Helper functions #
####################
def process(t):
    return torch.stack(t).squeeze().detach().numpy()


# BERT
if __name__ == "__main__":
    excluded_neurons = {0: (0, ), 1: (0, 1), 2: (0, 1, 2)}
    model = BertModel.from_pretrained("bert-base-cased",
                                      output_attentions=True,
                                      output_values=True,
                                      output_dense=True,
                                      output_mlp_activations=True,
                                      output_q_activations=True,
                                      output_k_activations=True,
                                      output_v_activations=True,
                                      excluded_neurons=excluded_neurons)
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    inputs = tokenizer("Hello", return_tensors="pt")
    print("### inputs ###")
    print(inputs.items())
    outputs = model(**inputs)
    print("### values ###")
    print(len(outputs["values"]))
    print(outputs["values"][0].shape)
    values = outputs["values"]
    values = torch.stack(values).squeeze()
    values = values.detach().numpy()
Example No. 29
def main(config):
    args = config

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ATEPCProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    datasets = {
        'camera': "atepc_datasets/camera",
        'car': "atepc_datasets/car",
        'phone': "atepc_datasets/phone",
        'notebook': "atepc_datasets/notebook",
        'laptop': "atepc_datasets/laptop",
        'restaurant': "atepc_datasets/restaurant",
        'twitter': "atepc_datasets/twitter",
        'mixed': "atepc_datasets/mixed",
    }
    pretrained_bert_models = {
        'camera': "bert-base-chinese",
        'car': "bert-base-chinese",
        'phone': "bert-base-chinese",
        'notebook': "bert-base-chinese",
        'laptop': "bert-base-uncased",
        'restaurant': "bert-base-uncased",
        # for loading domain-adapted BERT
        # 'restaurant': "../bert_pretrained_restaurant",
        'twitter': "bert-base-uncased",
        'mixed': "bert-base-multilingual-uncased",
    }

    args.bert_model = pretrained_bert_models[args.dataset]
    args.data_dir = datasets[args.dataset]

    def convert_polarity(examples):
        for i in range(len(examples)):
            polarities = []
            for polarity in examples[i].polarity:
                if polarity == 2:
                    polarities.append(1)
                else:
                    polarities.append(polarity)
            examples[i].polarity = polarities

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=True)
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_test_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    bert_base_model = BertModel.from_pretrained(args.bert_model)
    bert_base_model.config.num_labels = num_labels

    if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
        convert_polarity(train_examples)
        convert_polarity(eval_examples)
    model = LCF_ATEPC(bert_base_model, args=args)

    for arg in vars(args):
        logger.info('>>> {0}: {1}'.format(arg, getattr(args, arg)))

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.00001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.00001
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      weight_decay=0.00001)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    all_spc_input_ids = torch.tensor([f.input_ids_spc for f in eval_features],
                                     dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    all_polarities = torch.tensor([f.polarities for f in eval_features],
                                  dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                 dtype=torch.long)
    all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_spc_input_ids, all_input_mask,
                              all_segment_ids, all_label_ids, all_polarities,
                              all_valid_ids, all_lmask_ids)
    # Run prediction for full data
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    def evaluate(eval_ATE=True, eval_APC=True):
        # evaluate
        apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0}
        ate_result = 0
        y_true = []
        y_pred = []
        n_test_correct, n_test_total = 0, 0
        test_apc_logits_all, test_polarities_all = None, None
        model.eval()
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader:
            input_ids_spc = input_ids_spc.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            polarities = polarities.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                ate_logits, apc_logits = model(input_ids_spc,
                                               segment_ids,
                                               input_mask,
                                               valid_ids=valid_ids,
                                               polarities=polarities,
                                               attention_mask_label=l_mask)
            if eval_APC:
                polarities = model.get_batch_polarities(polarities)
                n_test_correct += (torch.argmax(
                    apc_logits, -1) == polarities).sum().item()
                n_test_total += len(polarities)

                if test_polarities_all is None:
                    test_polarities_all = polarities
                    test_apc_logits_all = apc_logits
                else:
                    test_polarities_all = torch.cat(
                        (test_polarities_all, polarities), dim=0)
                    test_apc_logits_all = torch.cat(
                        (test_apc_logits_all, apc_logits), dim=0)

            if eval_ATE:
                if not args.use_bert_spc:
                    label_ids = model.get_batch_token_labels_bert_base_indices(
                        label_ids)
                ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2),
                                          dim=2)
                ate_logits = ate_logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j] == len(label_list):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map.get(label_ids[i][j], 'O'))
                            temp_2.append(label_map.get(ate_logits[i][j], 'O'))
        if eval_APC:
            test_acc = n_test_correct / n_test_total
            if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1],
                                   average='macro')
            else:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1, 2],
                                   average='macro')
            test_acc = round(test_acc * 100, 2)
            test_f1 = round(test_f1 * 100, 2)
            apc_result = {
                'max_apc_test_acc': test_acc,
                'max_apc_test_f1': test_f1
            }

        if eval_ATE:
            report = classification_report(y_true, y_pred, digits=4)
            tmps = report.split()
            ate_result = round(float(tmps[7]) * 100, 2)
        return apc_result, ate_result

    def save_model(path):
        # Save the trained model and the associated configuration;
        # take care of the storage!
        os.makedirs(path, exist_ok=True)
        model_to_save = model.module if hasattr(
            model, 'module') else model  # only save the model itself
        model_to_save.save_pretrained(path)
        tokenizer.save_pretrained(path)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": True,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(path, "config.json"), "w"))
        logger.info('save model to: {}'.format(path))

    def train():
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_spc_input_ids = torch.tensor(
            [f.input_ids_spc for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        all_polarities = torch.tensor([f.polarities for f in train_features],
                                      dtype=torch.long)
        train_data = TensorDataset(all_spc_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_polarities, all_valid_ids,
                                   all_lmask_ids)

        train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        max_apc_test_acc = 0
        max_apc_test_f1 = 0
        max_ate_test_f1 = 0

        global_step = 0
        for epoch in range(int(args.num_train_epochs)):
            logger.info('#' * 80)
            logger.info('Train {} Epoch{} on {}'.format(args.seed, epoch + 1,
                                                        args.data_dir))
            logger.info('#' * 80)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask = batch
                loss_ate, loss_apc = model(input_ids_spc, segment_ids,
                                           input_mask, label_ids, polarities,
                                           valid_ids, l_mask)
                loss = loss_ate + loss_apc
                loss.backward()
                nb_tr_examples += input_ids_spc.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.eval_steps == 0:
                    if epoch >= args.num_train_epochs - 2 or args.num_train_epochs <= 2:
                        # evaluate in last 2 epochs
                        apc_result, ate_result = evaluate(
                            eval_ATE=not args.use_bert_spc)

                        # apc_result, ate_result = evaluate()
                        # path = '{0}/{1}_{2}_apcacc_{3}_apcf1_{4}_atef1_{5}'.format(
                        #     args.output_dir,
                        #     args.dataset,
                        #     args.local_context_focus,
                        #     round(apc_result['max_apc_test_acc'], 2),
                        #     round(apc_result['max_apc_test_f1'], 2),
                        #     round(ate_result, 2)
                        # )
                        # if apc_result['max_apc_test_acc'] > max_apc_test_acc or \
                        #     apc_result['max_apc_test_f1'] > max_apc_test_f1 or \
                        #     ate_result > max_ate_test_f1:
                        #     save_model(path)

                        if apc_result['max_apc_test_acc'] > max_apc_test_acc:
                            max_apc_test_acc = apc_result['max_apc_test_acc']
                        if apc_result['max_apc_test_f1'] > max_apc_test_f1:
                            max_apc_test_f1 = apc_result['max_apc_test_f1']
                        if ate_result > max_ate_test_f1:
                            max_ate_test_f1 = ate_result

                        current_apc_test_acc = apc_result['max_apc_test_acc']
                        current_apc_test_f1 = apc_result['max_apc_test_f1']
                        current_ate_test_f1 = round(ate_result, 2)

                        logger.info('*' * 80)
                        logger.info('Train {} Epoch{}, Evaluate for {}'.format(
                            args.seed, epoch + 1, args.data_dir))
                        logger.info(
                            f'APC_test_acc: {current_apc_test_acc}(max: {max_apc_test_acc})  '
                            f'APC_test_f1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                        )
                        if args.use_bert_spc:
                            logger.info(
                                f'ATE_test_F1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                                f' (Unreliable since `use_bert_spc` is "True".)'
                            )
                        else:
                            logger.info(
                                f'ATE_test_f1: {current_ate_test_f1}(max:{max_ate_test_f1})'
                            )
                        logger.info('*' * 80)

        return [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1]

    return train()
Example No. 30
 def __init__(self, config, weight=None):
     super(BertForTextRepresentation, self).__init__(config)
     self.bert = BertModel(config)
     self.weight = weight
     self.init_weights()