def __init__(self, config, scc_n_layer=6):
    super(BertEncoder, self).__init__()
    self.prd_n_layer = config.num_hidden_layers
    self.scc_n_layer = scc_n_layer
    assert self.prd_n_layer % self.scc_n_layer == 0
    self.compress_ratio = self.prd_n_layer // self.scc_n_layer
    self.bernoulli = None
    self.output_attentions = config.output_attentions
    self.output_hidden_states = config.output_hidden_states
    self.layer = nn.ModuleList([BertLayer(config) for _ in range(self.prd_n_layer)])
    self.scc_layer = nn.ModuleList([BertLayer(config) for _ in range(self.scc_n_layer)])
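# A minimal sketch (not this repository's actual forward) of how the successor
# (scc) layers can stand in for groups of `compress_ratio` predecessor layers
# during training, following the BERT-of-Theseus replacement idea; the
# `replace_prob` argument and the function name are illustrative assumptions.
import torch

def theseus_forward_sketch(encoder, hidden_states, replace_prob=0.5):
    for scc_idx in range(encoder.scc_n_layer):
        replace = bool(torch.bernoulli(torch.tensor(replace_prob)))
        if encoder.training and replace:
            # one successor layer replaces the whole predecessor group
            hidden_states = encoder.scc_layer[scc_idx](hidden_states)[0]
        else:
            # run the original group of predecessor layers
            start = scc_idx * encoder.compress_ratio
            for prd_idx in range(start, start + encoder.compress_ratio):
                hidden_states = encoder.layer[prd_idx](hidden_states)[0]
    return hidden_states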
def __init__(self, aggregation_method='transformer'):
    super().__init__()
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.bert = BertModel.from_pretrained("bert-base-uncased").cuda()
    self.transformer_layer_1 = BertLayer(self.bert.config).cuda()
    self.transformer_layer_2 = BertLayer(self.bert.config).cuda()
    self.num_passages = 4
    self.maxseqlen = 0
    self.linear = nn.Linear(self.bert.config.hidden_size, 1).cuda()

    if aggregation_method == "maxp":
        self.aggregation = self.aggregate_using_maxp
    elif aggregation_method == "transformer":
        self.aggregation = self.aggregate_using_transformer
        input_embeddings = self.bert.get_input_embeddings()
        cls_token_id = torch.tensor([[101]]).cuda()
        self.initial_cls_embedding = input_embeddings(cls_token_id).view(
            1, self.bert.config.hidden_size)
        # self.full_position_embeddings = torch.zeros(
        #     (1, self.num_passages + 1, self.bert.config.hidden_size), requires_grad=True, dtype=torch.float
        # ).cuda()
        # torch.nn.init.normal_(self.full_position_embeddings, mean=0.0, std=0.02)

        # AIAYN (sinusoidal) position embeddings
        def get_position_angle_vec(position, d_hid):
            return [
                position / np.power(10000, 2 * (hid_j // 2) / d_hid)
                for hid_j in range(d_hid)
            ]

        sinusoid_table = np.array([
            get_position_angle_vec(pos_i, self.bert.config.hidden_size)
            for pos_i in range(100)
        ])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        self.initial_cls_embedding = nn.Parameter(
            self.initial_cls_embedding, requires_grad=True)
        self.full_position_embeddings = nn.Parameter(
            torch.FloatTensor(sinusoid_table).unsqueeze(0))
    elif aggregation_method == 'average':
        self.aggregation = self.aggregate_using_avg
    else:
        raise NotImplementedError()
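# Standalone check of the sinusoidal table built in the "transformer" branch
# above (hypothetical sizes; the hidden size of bert-base-uncased is assumed
# to be 768): dimension 2i uses sin(pos / 10000^(2i/d)), dimension 2i+1 cos.
import numpy as np

d_hid = 768  # assumed hidden size
sinusoid_table = np.array([
    [pos / np.power(10000, 2 * (j // 2) / d_hid) for j in range(d_hid)]
    for pos in range(100)
])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
assert sinusoid_table.shape == (100, d_hid)  # one row per aggregated position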
def __init__(self, config):
    super(RobertaLMHead, self).__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # this must be a layer norm, not a full transformer layer
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))
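# Sketch of the usual forward pass for such an LM head (assuming a GELU
# activation, as in the reference RoBERTa code); `lm_head_forward_sketch` is
# an illustrative name, not part of the module above.
import torch

def lm_head_forward_sketch(head, features):
    x = head.dense(features)
    x = torch.nn.functional.gelu(x)
    x = head.layer_norm(x)
    # project back to vocabulary size with the shared bias
    return head.decoder(x) + head.bias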
def __init__(self, config):
    super().__init__()
    self.output_attentions = config.output_attentions
    self.output_hidden_states = config.output_hidden_states
    self.layer = nn.ModuleList(
        [BertLayer(config) for _ in range(config.num_hidden_layers)])
    self.highway = nn.ModuleList(
        [BertHighway(config) for _ in range(config.num_hidden_layers)])
    self.early_exit_entropy = [-1 for _ in range(config.num_hidden_layers)]
def __init__(self, bert: BertModel, model_args: ModelArguments,
             data_args: DataTrainingArguments, train_args: TrainingArguments):
    super(CondenserForPretraining, self).__init__()
    self.lm = bert
    self.c_head = nn.ModuleList(
        [BertLayer(bert.config) for _ in range(model_args.n_head_layers)])
    self.c_head.apply(self.lm._init_weights)
    self.cross_entropy = nn.CrossEntropyLoss()
    self.model_args = model_args
    self.train_args = train_args
    self.data_args = data_args
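# Hypothetical sketch of how the extra `c_head` layers could be applied on top
# of the backbone outputs (the actual Condenser forward lives elsewhere in the
# repository): the late [CLS] state is concatenated with early token states
# and run through the head; `attention_mask` is assumed to already be the
# extended additive mask of shape [batch, 1, 1, seq_len].
import torch

def condenser_head_sketch(model, hidden_states_early, hidden_states_late, attention_mask):
    cls_late = hidden_states_late[:, :1]                               # [B, 1, H]
    hiddens = torch.cat([cls_late, hidden_states_early[:, 1:]], dim=1)
    for layer in model.c_head:
        hiddens = layer(hiddens, attention_mask)[0]
    return hiddens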
def get_layer_modules():
    params = copy.deepcopy(LAYER_PARAMS_DICT)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")
    params["hidden_act"] = params.pop("activation")

    torch.manual_seed(1234)
    yield "bert", BertLayer(BertConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "roberta", RobertaLayer(RobertaConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "electra", ElectraLayer(ElectraConfig(**params)).eval()
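# Minimal usage sketch: each yielded module is a single transformer layer that
# returns a tuple whose first element is the new hidden states. The hidden
# size below is an assumption and must match LAYER_PARAMS_DICT.
import torch

hidden_size = 768  # assumed to equal the hidden size in LAYER_PARAMS_DICT
hidden = torch.rand(2, 8, hidden_size)
for name, layer in get_layer_modules():
    with torch.no_grad():
        out = layer(hidden)[0]
    print(name, tuple(out.shape))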
def __init__(self, config: BertConfig, num_hidden_layers=None):
    super().__init__()
    self.logger = get_logger(__name__)
    config.output_hidden_states = True
    self.embeddings = BertEmbeddings(config)
    num_hidden_layers = config.num_hidden_layers if num_hidden_layers is None else num_hidden_layers
    assert num_hidden_layers > 0, 'bert_layers must be > 0'
    # Note that the outputs differ from the original transformers BertEncoder.
    self.output_attentions = config.output_attentions
    self.output_hidden_states = config.output_hidden_states
    layer = BertLayer(config)
    self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])
    self.config = config
    self.num_hidden_layers = num_hidden_layers
    self.apply(self.init_bert_weights)
def init_bertlayer_models(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)

    self.torch_model = BertLayer(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)
    self.hidden_size = self.cfg.hidden_size

    self.turbo_model = turbo_transformers.BertLayerSmartBatch.from_torch(
        self.torch_model)
def get_layer_modules(params_dict):
    modules = {}
    params = copy.deepcopy(params_dict)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")

    # bert, roberta, electra, layoutlm self attentions have the same code.
    torch.manual_seed(1234)
    hf_module = BertLayer(BertConfig(**params))
    modules["bert"] = hf_module

    torch.manual_seed(1234)
    hf_module = RobertaLayer(RobertaConfig(**params))
    modules["roberta"] = hf_module

    torch.manual_seed(1234)
    hf_module = ElectraLayer(ElectraConfig(**params))
    modules["electra"] = hf_module

    return modules
def __init__(self, config):
    super().__init__()
    self.config = config
    self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
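# Hypothetical forward over the ModuleList above (the snippet omits one): each
# HF BertLayer returns a tuple whose first element is the updated hidden
# states; the extended (additive) attention mask is optional.
def encoder_forward_sketch(encoder, hidden_states, extended_attention_mask=None):
    for layer in encoder.layer:
        hidden_states = layer(hidden_states, extended_attention_mask)[0]
    return hidden_states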
class TestBertLayer(unittest.TestCase):
    def init_data(self, use_cuda: bool) -> None:
        test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(1)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                              hidden_dropout_prob=0.0)

        self.torch_bert_layer = BertLayer(self.cfg)
        self.torch_bert_layer.eval()
        if use_cuda:
            self.torch_bert_layer.to(test_device)
        self.hidden_size = self.cfg.hidden_size
        self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                             self.hidden_size),
                                       dtype=torch.float32,
                                       device=test_device)

        self.attention_mask = torch.ones((batch_size, seq_length),
                                         dtype=torch.float32,
                                         device=test_device)
        self.attention_mask = self.attention_mask[:, None, None, :]
        self.attention_mask = (1.0 - self.attention_mask) * -10000.0

        self.turbo_bert_layer = turbo_transformers.BertLayer.from_torch(
            self.torch_bert_layer)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 2
        device = "GPU" if use_cuda else "CPU"
        torch_model = lambda: self.torch_bert_layer(self.input_tensor,
                                                    self.attention_mask,
                                                    output_attentions=True)
        torch_bert_layer_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f"BertLayer \"({batch_size},{seq_length:03})\" ",
              f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

        turbo_model = lambda: self.turbo_bert_layer(self.input_tensor,
                                                    self.attention_mask,
                                                    output_attentions=True)
        turbo_bert_layer_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f"BertLayer \"({batch_size},{seq_length:03})\" ",
              f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}")

        # Tensor cores will introduce more error, so loosen the tolerance on GPU.
        tolerate_error = 1e-2 if use_cuda else 1e-3
        self.assertTrue(
            torch.max(
                torch.abs(torch_bert_layer_result[0] -
                          turbo_bert_layer_result[0])) < tolerate_error)
        # self.assertTrue(
        #     torch.max(
        #         torch.abs(torch_bert_layer_result[1] -
        #                   turbo_bert_layer_result[1])) < tolerate_error)

        with open(fname, "a") as fh:
            fh.write(
                f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
            )

    def test_bert_layer(self):
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
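# One common way to run the comparison above; `fname`, `batch_size`, and
# `seq_length` are module-level values defined outside this excerpt, so this
# entry point is only a sketch.
if __name__ == "__main__":
    unittest.main()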