Code example #1
        def init_data(self, use_cuda: bool) -> None:
            test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_bert_layer = BertLayer(self.cfg)
            self.torch_bert_layer.eval()
            if use_cuda:
                self.torch_bert_layer.to(test_device)

            self.hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                                 self.hidden_size),
                                           dtype=torch.float32,
                                           device=test_device)

            self.attention_mask = torch.ones((batch_size, seq_length),
                                             dtype=torch.float32,
                                             device=test_device)
            self.attention_mask = self.attention_mask[:, None, None, :]
            self.attention_mask = (1.0 - self.attention_mask) * -10000.0

            self.turbo_bert_layer = turbo_transformers.BertLayer.from_torch(
                self.torch_bert_layer)
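For orientation, a minimal standalone sketch of the same setup and a single forward call, assuming a recent Hugging Face transformers layout; the batch and sequence sizes below are made up (the original test takes them from module-level globals, as Code example #12 shows):

import torch
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer

cfg = BertConfig(attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0)
layer = BertLayer(cfg).eval()

hidden_states = torch.rand(2, 16, cfg.hidden_size)         # (batch, seq, hidden)
mask = torch.ones(2, 16)
extended_mask = (1.0 - mask[:, None, None, :]) * -10000.0  # additive attention mask

with torch.no_grad():
    out = layer(hidden_states, extended_mask)               # returns a tuple
print(out[0].shape)                                         # torch.Size([2, 16, 768])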
Code example #2
    def __init__(self, config, scc_n_layer=6):
        super(BertEncoder, self).__init__()
        self.prd_n_layer = config.num_hidden_layers
        self.scc_n_layer = scc_n_layer
        assert self.prd_n_layer % self.scc_n_layer == 0
        self.compress_ratio = self.prd_n_layer // self.scc_n_layer
        self.bernoulli = None
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(self.prd_n_layer)])
        self.scc_layer = nn.ModuleList([BertLayer(config) for _ in range(self.scc_n_layer)])
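The bernoulli field above is only initialised to None. As a hedged sketch, not taken from the snippet, a replacing rate could be installed as follows, with the training-time forward then swapping each group of compress_ratio predecessor layers for one successor layer with that probability (set_replacing_rate is an illustrative name):

import torch
from torch.distributions.bernoulli import Bernoulli

def set_replacing_rate(encoder, replacing_rate):
    # Illustrative helper: store a Bernoulli(replacing_rate) distribution on the encoder.
    if not 0.0 < replacing_rate <= 1.0:
        raise ValueError("replacing_rate must be in (0, 1]")
    encoder.bernoulli = Bernoulli(torch.tensor([replacing_rate]))

# Sketch of the per-successor-layer decision inside a training forward pass:
#   if self.bernoulli.sample() == 1.0:
#       hidden_states = self.scc_layer[i](hidden_states, attention_mask)[0]
#   else:
#       for k in range(self.compress_ratio):
#           hidden_states = self.layer[i * self.compress_ratio + k](hidden_states, attention_mask)[0]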
Code example #3
    def __init__(self, aggregation_method='transformer'):
        super().__init__()

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased").cuda()

        self.transformer_layer_1 = BertLayer(self.bert.config).cuda()
        self.transformer_layer_2 = BertLayer(self.bert.config).cuda()
        self.num_passages = 4
        self.maxseqlen = 0
        self.linear = nn.Linear(self.bert.config.hidden_size, 1).cuda()

        if aggregation_method == "maxp":
            self.aggregation = self.aggregate_using_maxp
        elif aggregation_method == "transformer":
            self.aggregation = self.aggregate_using_transformer
            input_embeddings = self.bert.get_input_embeddings()
            cls_token_id = torch.tensor([[101]]).cuda()
            self.initial_cls_embedding = input_embeddings(cls_token_id).view(
                1, self.bert.config.hidden_size)

            #self.full_position_embeddings = torch.zeros(
            #    (1, self.num_passages + 1, self.bert.config.hidden_size), requires_grad=True, dtype=torch.float
            #).cuda()
            #torch.nn.init.normal_(self.full_position_embeddings, mean=0.0, std=0.02)

            # AIAYN embeddings
            def get_position_angle_vec(position, d_hid):
                return [
                    position / np.power(10000, 2 * (hid_j // 2) / d_hid)
                    for hid_j in range(d_hid)
                ]

            sinusoid_table = np.array([
                get_position_angle_vec(pos_i, self.bert.config.hidden_size)
                for pos_i in range(100)
            ])
            sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
            sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:,
                                                            1::2])  # dim 2i+1

            self.initial_cls_embedding = nn.Parameter(
                self.initial_cls_embedding, requires_grad=True)
            self.full_position_embeddings = nn.Parameter(
                torch.FloatTensor(sinusoid_table).unsqueeze(0))
        elif aggregation_method == 'average':
            self.aggregation = self.aggregate_using_avg
        else:
            raise NotImplementedError()
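The aggregation methods themselves are outside the snippet; the following is a hypothetical sketch of how the transformer path might combine the learned [CLS] embedding, the per-passage [CLS] vectors, and the sinusoidal position embeddings before the two extra BertLayer blocks:

    def aggregate_using_transformer(self, passage_cls_vectors):
        # passage_cls_vectors: (batch, num_passages, hidden_size), one [CLS] vector per passage.
        batch = passage_cls_vectors.shape[0]
        cls = self.initial_cls_embedding.expand(batch, 1, -1)
        seq = torch.cat([cls, passage_cls_vectors], dim=1)
        seq = seq + self.full_position_embeddings[:, : seq.shape[1], :]
        seq = self.transformer_layer_1(seq)[0]
        seq = self.transformer_layer_2(seq)[0]
        return seq[:, 0, :]  # aggregated representation at the prepended [CLS] slot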
Code example #4
    def __init__(self, config):
        super(RobertaLMHead, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # BertLayerNorm aliases nn.LayerNorm in older transformers

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
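For reference, a hedged sketch of the forward pass such an LM head typically performs (dense projection, GELU, layer norm, then the vocabulary projection plus the separate bias); this body is not part of the snippet:

    def forward(self, features, **kwargs):
        x = self.dense(features)           # (batch, seq, hidden)
        x = torch.nn.functional.gelu(x)
        x = self.layer_norm(x)
        x = self.decoder(x) + self.bias    # (batch, seq, vocab_size)
        return x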
Code example #5
    def __init__(self, config):
        super().__init__()
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList(
            [BertLayer(config) for _ in range(config.num_hidden_layers)])
        self.highway = nn.ModuleList(
            [BertHighway(config) for _ in range(config.num_hidden_layers)])

        self.early_exit_entropy = [-1 for _ in range(config.num_hidden_layers)]
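A hedged sketch of how the early_exit_entropy thresholds are typically consulted at inference time: each layer's highway head produces logits, and the encoder stops once the prediction entropy falls below that layer's threshold (prediction_entropy is an illustrative helper):

import torch

def prediction_entropy(logits):
    # Shannon entropy of the softmax distribution, averaged over the batch.
    probs = torch.softmax(logits, dim=-1)
    return -(probs * torch.log(probs + 1e-12)).sum(dim=-1).mean()

# Sketch of the exit check inside the encoder forward (inference only):
#   for i, layer_module in enumerate(self.layer):
#       hidden_states = layer_module(hidden_states, attention_mask)[0]
#       highway_logits = self.highway[i](hidden_states)
#       if not self.training and prediction_entropy(highway_logits) < self.early_exit_entropy[i]:
#           return highway_logits   # exit early, skipping the remaining layers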
Code example #6
    def __init__(self, bert: BertModel, model_args: ModelArguments,
                 data_args: DataTrainingArguments,
                 train_args: TrainingArguments):
        super(CondenserForPretraining, self).__init__()
        self.lm = bert
        self.c_head = nn.ModuleList(
            [BertLayer(bert.config) for _ in range(model_args.n_head_layers)])
        self.c_head.apply(self.lm._init_weights)
        self.cross_entropy = nn.CrossEntropyLoss()

        self.model_args = model_args
        self.train_args = train_args
        self.data_args = data_args
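A hedged sketch of how c_head layers of this kind are usually applied during pretraining: the final layer's [CLS] state is concatenated with token states from an earlier layer and run through the extra head before computing the MLM loss (skip_from and the mask argument are assumptions, not taken from the snippet):

import torch

def apply_condenser_head(c_head, hidden_states, extended_attention_mask, skip_from=6):
    # hidden_states: tuple of per-layer outputs from the backbone, as returned
    # when it is run with output_hidden_states=True.
    cls_hiddens = hidden_states[-1][:, :1]        # [CLS] state from the final layer
    skip_hiddens = hidden_states[skip_from]       # token states from an early layer
    hiddens = torch.cat([cls_hiddens, skip_hiddens[:, 1:]], dim=1)
    for layer in c_head:
        hiddens = layer(hiddens, extended_attention_mask)[0]
    return hiddens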
Code example #7
def get_layer_modules():
    params = copy.deepcopy(LAYER_PARAMS_DICT)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")
    params["hidden_act"] = params.pop("activation")

    torch.manual_seed(1234)
    yield "bert", BertLayer(BertConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "roberta", RobertaLayer(RobertaConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "electra", ElectraLayer(ElectraConfig(**params)).eval()
Code example #8
    def __init__(self, config: BertConfig, num_hidden_layers=None):
        super().__init__()
        self.logger = get_logger(__name__)
        config.output_hidden_states = True
        self.embeddings = BertEmbeddings(config)
        num_hidden_layers = config.num_hidden_layers if num_hidden_layers is None else num_hidden_layers
        assert num_hidden_layers > 0, 'bert_layers must > 0'

        # Note: the outputs differ from those of the original transformers BERT_Encoder
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        layer = BertLayer(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])
        self.config = config
        self.num_hidden_layers = num_hidden_layers
        self.apply(self.init_bert_weights)
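Since output_hidden_states is forced to True above, the matching forward typically keeps every intermediate state rather than only the last one; a minimal sketch of that loop, assuming a transformers BertLayer that returns a tuple (this body is not part of the snippet):

    def forward(self, input_ids, token_type_ids=None, extended_attention_mask=None):
        # extended_attention_mask is assumed to already be in the additive
        # (batch, 1, 1, seq_len) form that BertLayer expects.
        hidden_states = self.embeddings(input_ids, token_type_ids)
        all_hidden_states = (hidden_states,)
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, extended_attention_mask)[0]
            all_hidden_states = all_hidden_states + (hidden_states,)
        # As the comment above notes, this differs from the stock BERT encoder's
        # output: the states of every layer (embedding output included) are returned.
        return all_hidden_states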
Code example #9
        def init_bertlayer_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_model = BertLayer(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            self.turbo_model = turbo_transformers.BertLayerSmartBatch.from_torch(
                self.torch_model)
Code example #10
def get_layer_modules(params_dict):
    modules = {}
    params = copy.deepcopy(params_dict)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")

    # bert, roberta, electra, layoutlm self attentions have the same code.

    torch.manual_seed(1234)
    hf_module = BertLayer(BertConfig(**params))
    modules["bert"] = hf_module

    torch.manual_seed(1234)
    hf_module = RobertaLayer(RobertaConfig(**params))
    modules["roberta"] = hf_module

    torch.manual_seed(1234)
    hf_module = ElectraLayer(ElectraConfig(**params))
    modules["electra"] = hf_module

    return modules
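A usage sketch for the dict variant: because every module is built right after the same manual seed and, as the comment notes, the layer code is shared, the BERT and RoBERTa layers should produce matching outputs on identical input; all params_dict values below are made-up examples:

import torch

params_dict = {
    "hidden_size": 64,
    "num_attention_heads": 4,
    "intermediate_size": 128,
    "attention_dropout": 0.0,
    "hidden_dropout": 0.0,
}

modules = get_layer_modules(params_dict)
hidden_states = torch.rand(1, 8, params_dict["hidden_size"])

with torch.no_grad():
    bert_out = modules["bert"].eval()(hidden_states)[0]
    roberta_out = modules["roberta"].eval()(hidden_states)[0]

print(torch.allclose(bert_out, roberta_out, atol=1e-6))   # True if the shared seed and layer code line up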
Code example #11
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
Code example #12
    class TestBertLayer(unittest.TestCase):
        def init_data(self, use_cuda: bool) -> None:
            test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_bert_layer = BertLayer(self.cfg)
            self.torch_bert_layer.eval()
            if use_cuda:
                self.torch_bert_layer.to(test_device)

            self.hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                                 self.hidden_size),
                                           dtype=torch.float32,
                                           device=test_device)

            self.attention_mask = torch.ones((batch_size, seq_length),
                                             dtype=torch.float32,
                                             device=test_device)
            self.attention_mask = self.attention_mask[:, None, None, :]
            self.attention_mask = (1.0 - self.attention_mask) * -10000.0

            self.turbo_bert_layer = turbo_transformers.BertLayer.from_torch(
                self.torch_bert_layer)

        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda)
            num_iter = 2
            device = "GPU" if use_cuda else "CPU"
            torch_model = lambda: self.torch_bert_layer(
                self.input_tensor, self.attention_mask, output_attentions=True)
            torch_bert_layer_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(f"BertLayer \"({batch_size},{seq_length:03})\" ",
                  f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

            turbo_model = lambda: self.turbo_bert_layer(
                self.input_tensor, self.attention_mask, output_attentions=True)
            turbo_bert_layer_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
            print(
                f"BertLayer \"({batch_size},{seq_length:03})\"  ",
                f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}"
            )

            # Tensor core will introduce more errors
            tolerate_error = 1e-2 if use_cuda else 1e-3
            self.assertTrue(
                torch.max(
                    torch.abs(torch_bert_layer_result[0] -
                              turbo_bert_layer_result[0])) < tolerate_error)

            # self.assertTrue(
            #     torch.max(
            #         torch.abs(torch_bert_layer_result[1] -
            #                   turbo_bert_layer_result[1])) < tolerate_error)

            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
                )

        def test_bert_layer(self):
            self.check_torch_and_turbo(use_cuda=False)
            if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
                self.check_torch_and_turbo(use_cuda=True)
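The test above relies on names defined elsewhere in the original file (batch_size, seq_length, fname, test_helper); a hedged sketch of the module-level scaffolding it assumes. In the original suite the class is generated for several batch_size/seq_length combinations, and test_helper.run_model ships with the turbo_transformers tests:

# Hypothetical module-level setup for running the test standalone.
import unittest
import torch
import turbo_transformers
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer
import test_helper   # provided by the turbo_transformers test suite

batch_size = 2
seq_length = 40
fname = "bert_layer_qps.txt"   # benchmark results are appended here

if __name__ == "__main__":
    unittest.main()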