def init_data(self, use_cuda: bool):
    """Build the torch and turbo embedding modules plus random input tensors.

    Returns (input_ids, position_ids, token_type_ids) on the chosen device.
    Relies on module-level `batch_size` and `seq_length`.
    """
    if use_cuda:
        target_device = torch.device('cuda:0')
    else:
        target_device = torch.device('cpu:0')
    torch.set_grad_enabled(False)

    cfg = BertConfig()
    # Reference PyTorch embedding in eval mode (dropout disabled).
    self.torch_embedding = BertEmbeddings(cfg)
    self.torch_embedding.eval()
    if use_cuda:
        self.torch_embedding.to(target_device)

    # Turbo implementation initialized from the torch weights.
    self.turbo_embedding = turbo_transformers.BertEmbeddings.from_torch(
        self.torch_embedding)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_length),
                              dtype=torch.long,
                              device=target_device)
    position_ids = torch.arange(seq_length,
                                dtype=torch.long,
                                device=input_ids.device).repeat(batch_size, 1)
    token_type_ids = torch.zeros_like(input_ids, dtype=torch.long)
    return input_ids, position_ids, token_type_ids
class TestBertEmbedding(unittest.TestCase):
    """Checks turbo_transformers.BertEmbeddings against the PyTorch layer."""

    def init_data(self, use_cuda: bool):
        """Build both embedding modules and random inputs on the target device."""
        if use_cuda:
            target_device = torch.device('cuda:0')
        else:
            target_device = torch.device('cpu:0')
        torch.set_grad_enabled(False)

        cfg = BertConfig()
        # Reference implementation in eval mode (no dropout).
        self.torch_embedding = BertEmbeddings(cfg)
        self.torch_embedding.eval()
        if use_cuda:
            self.torch_embedding.to(target_device)

        # Turbo module shares the torch weights.
        self.turbo_embedding = turbo_transformers.BertEmbeddings.from_torch(
            self.torch_embedding)

        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_length),
                                  dtype=torch.long,
                                  device=target_device)
        position_ids = torch.arange(seq_length,
                                    dtype=torch.long,
                                    device=input_ids.device).repeat(
                                        batch_size, 1)
        token_type_ids = torch.zeros_like(input_ids, dtype=torch.long)
        return input_ids, position_ids, token_type_ids

    def check_torch_and_turbo(self, use_cuda):
        """Benchmark both modules and assert their outputs agree elementwise."""
        input_ids, position_ids, token_type_ids = self.init_data(use_cuda)
        device = "GPU" if use_cuda else "CPU"
        num_iter = 100

        def torch_model():
            # Note the torch layer's argument order: (ids, token_types, positions).
            return self.torch_embedding(input_ids, token_type_ids,
                                        position_ids)

        torch_result, torch_qps, torch_time = test_helper.run_model(
            torch_model, use_cuda, num_iter)
        print(f"BertEmbeddings \"({batch_size},{seq_length:03})\" ",
              f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

        def turbo_model():
            return self.turbo_embedding(input_ids, position_ids,
                                        token_type_ids)

        turbo_result, turbo_qps, turbo_time = test_helper.run_model(
            turbo_model, use_cuda, num_iter)
        print(f"BertEmbeddings \"({batch_size},{seq_length:03})\" ",
              f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time}")

        # Max absolute deviation must stay within float tolerance.
        self.assertTrue(
            torch.max(torch.abs(torch_result - turbo_result)) < 1e-5)

    def test_embedding(self):
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
def __init__(self, config):
    """Assemble the mixup-aware BERT: embeddings, Mix encoder, pooler."""
    super(BertModel4Mix, self).__init__(config)
    self.embeddings = BertEmbeddings(config)   # token/segment/position embeddings
    self.encoder = BertEncoder4Mix(config)     # encoder variant supporting mixup
    self.pooler = BertPooler(config)           # [CLS] pooling head
    self.init_weights()                        # standard weight initialization
def __init__(self, config):
    """TTA model: BERT embeddings plus an extra position-embedding module."""
    super(TtaModel, self).__init__(config)
    self.embeddings = BertEmbeddings(config)              # standard BERT embeddings
    self.position_embedding = PositionEmbeddings(config)  # additional positional encoding
    self.encoder = BertEncoder(config)                    # transformer stack
    self.pooler = BertPooler(config)                      # [CLS] pooling head
    self.init_weights()
def __init__(self, config):
    """Minimal BERT trunk: embeddings and encoder only (no pooler)."""
    super().__init__(config)
    self.embeddings = BertEmbeddings(config)  # token/position/segment embeddings
    self.encoder = BertEncoder(config)        # transformer layers
    # Older pytorch_transformers versions used self.apply(self.init_weights).
    self.init_weights()
def __init__(self, config):
    """CBOW-style model: BERT embeddings, one self-attention layer, two linear maps."""
    super().__init__()
    self.embeddings = BertEmbeddings(config)    # input token embeddings
    self.attention = BertSelfAttention(config)  # single self-attention block
    self.act_fn = nn.ReLU()
    width = config.hidden_size
    self.linear_1 = nn.Linear(width, width)
    self.linear_2 = nn.Linear(width, width)
def __init__(self, config):
    """Truncated BERT: standard embeddings/pooler around a truncated encoder."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = TruncBertEncoder(config)  # depth-truncated encoder variant
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config): super().__init__(config) # self.config中包含了拼写错误纠正网络Correction_Network中的Bert模型的各种配置超参数. self.config = config '''一、构建错误探查网络Detection_Network中所需的网络层''' # Bi-GRU网络作为错误探查网络Detection_Network的编码器 # 此处由于BertModel中的embeddings层中所有子嵌入模块的嵌入维度都为768, 所以此处Bi-GRU网络的input_size也为768, # 而将Bi-GRU网络的hidden_size设为256,是为了保证Bi-GRU网络双向编码后双向隐藏层拼接到一块后隐藏层维度能保持在512. # 此时enc_hid_size为512. self.enc_bi_gru = torch.nn.GRU(input_size=768, hidden_size=256, dropout=0.2, bidirectional=True) # 双向GRU编码层对于输入错误探查网络Detection_Network中的input_embeddings进行双向编码, # 此时双向GRU编码层的输出为(seq_len, batch_size, enc_hid_size * 2),将其交换维度变形为(batch_size, seq_len, enc_hid_size * 2), # 再将双向GRU编码层的变形后的输出输入self.detection_network_dense_out层中,映射为形状(batch_size, seq_len, 2)的张量, # 这样方便后面进行判断句子序列中每一个字符是否为拼写错误字符的二分类任务的交叉熵损失值计算. self.detection_network_dense_out = torch.nn.Linear(512, 2) # 同时,将双向GRU编码层输出后经过变形的形状为(batch_size, seq_len, enc_hid_size * 2),的张量输入进soft_masking_coef_mapping层中, # 将其形状映射为(batch_size, seq_len, 1)的张量,此张量再在后面输入进Sigmoid()激活函数中, 将此张量的值映射至(0,1)之间, # 这样这个张量即变为了后面计算soft-masked embeddings时和mask_embeddings相乘的系数p (结果pi即可表示为文本序列中第i处的字符拼写错误的似然概率(likelihood)). self.soft_masking_coef_mapping = torch.nn.Linear(512, 1) '''二、构建的拼写错误纠正网络Correction_Network中BertModel中所用的个三种网络层''' ''' (1): 嵌入层BertEmbeddings(),其中包含了每个character的word embedding、segment embeddings、position embedding三种嵌入函数. (2): Bert模型的核心,多层(12层)多头自注意力(multi-head self attention)编码层BertEncoder. (3): Bert模型最后的池化层BertPooler. ''' # 嵌入层BertEmbeddings(). self.embeddings = BertEmbeddings(config) # 多层(12层)多头自注意力(multi-head self attention)编码层BertEncoder. self.encoder = BertEncoder(config) # 池化层BertPooler。 self.pooler = BertPooler(config) # 初始化权重矩阵,偏置等. 
self.init_weights() '''获取遮罩特殊符[MASK]在Bert模型的嵌入层BertEmbeddings()中的词嵌入层word_embeddings层中特殊符[MASK]所对应索引的嵌入向量(embeddins vector)''' # 在Bert模型的tokenizer类BertTokenizer()的词表中,遮罩特殊符[MASK]会被编码为索引103(只要是BertTokenizer()类,无论其from_pretrained哪种 # 预训练的Bert模型词表,遮罩特殊符[MASK]在词表中的索引都为103; 除非换预训练模型如换成Albert模型,遮罩特殊符[MASK]在词表中的索引才会变, 否则 # 遮罩特殊符[MASK]在同一类预训练Bert模型的词表下索引不变). # 在之后, 遮罩特殊符[MASK]的张量self.mask_embedding的形状要变为和Bert模型嵌入层BertEmbeddings()的输出input_embeddings张量的形状一样, # 此时self.mask_embeddings张量的形状要为(batch_size, seq_len, embed_size)->(batch_size, seq_len, 768). self.mask_embeddings = self.embeddings.word_embeddings.weight[ 103] # 此时,mask_embedding张量的形状为(768,) # 注意!: 在soft_masked_embeddings输入拼写错误纠正网络correction network中的Bert模型后,其计算结果输入进最终的输出层与Softmax层之前, # 拼写错误纠正网络correction network的结果需通过残差连接residual connection与输入模型一开始的input embeddings相加, # 相加的结果才输入最终的输出层与Softmax层中做最终的正确字符预测。 '''self.soft_masked_bert_dense_out即为拼写错误纠正网络correction network之后的输出层, 其会将经过残差连接模块residual connection之后 的输出的维度由768投影到纠错词表的索引空间. (此处输出层self.soft_masked_bert_dense_out的输出即可被视为Soft_Masked_BERT模型的最终输出)''' self.soft_masked_bert_dense_out = torch.nn.Linear( self.config.hidden_size, self.embeddings.word_embeddings.weight.shape[0]) '''此处可不写最后的Softmax()函数, 因为若之后在训练模型时使用CrossEntropyLoss()交叉熵函数来计算损失值的话, CrossEntropyLoss()函数
def __init__(self, config):
    """First pipeline stage: embeddings + the first num_hidden_layers//24 BertLayers."""
    super(Stage0, self).__init__()
    self.embedding_layer = BertEmbeddings(config)
    # This stage owns only a slice of the full transformer stack.
    stage_depth = config.num_hidden_layers // 24
    self.layers = torch.nn.ModuleList(
        BertLayer(config) for _ in range(stage_depth))
    self.config = config
    self.apply(self.init_bert_weights)
def __init__(self, config):
    """DeeBERT backbone: embeddings + early-exit encoder + pooler."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = DeeBertEncoder(config)  # encoder with early-exit ramps
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(
    self,
    random_init: bool = False,
    bert_model_name: str = "bert-base-uncased",
    img_dim: int = 2048,
    hidden_size: int = 768,
    hidden_dropout_prob: float = 0,
    text_embeddings: DictConfig = EMPTY_CONFIG,
    encoder: DictConfig = EMPTY_CONFIG,
):
    """UNITER-style image+text embedder over a BERT encoder.

    Args:
        random_init: if True, encoder/pooler weights are randomly initialized
            instead of loaded from the pretrained checkpoint.
        bert_model_name: HF model name used for configs (and weights).
        img_dim: dimensionality of incoming image features.
        hidden_size: hidden size used by the image embedder.
        hidden_dropout_prob: dropout prob inside the image embedder.
        text_embeddings: config overrides for the text-embedding BertConfig.
        encoder: config overrides for the encoder BertConfig.
    """
    super().__init__()
    bert_config = retry_n(
        NUM_RETRIES,
        BertConfig.from_pretrained,
        bert_model_name,
        **OmegaConf.to_container(text_embeddings),
    )
    self.text_embeddings = BertEmbeddings(bert_config)
    self.img_embeddings = UNITERImageEmbeddings(
        img_dim=img_dim,
        hidden_size=hidden_size,
        hidden_dropout_prob=hidden_dropout_prob,
    )
    # Fix: removed the no-op self-assignment `bert_model_name = bert_model_name`
    # that the original contained — it had no effect.
    hf_config = retry_n(
        NUM_RETRIES,
        BertConfig.from_pretrained,
        bert_model_name,
        **OmegaConf.to_container(encoder),
    )
    if random_init:
        bert_model = BertModel(hf_config)
    else:
        bert_model = retry_n(
            NUM_RETRIES,
            BertModel.from_pretrained,
            bert_model_name,
            config=hf_config,
        )
    # Reuse the (possibly pretrained) encoder and pooler from the HF model.
    self.encoder = bert_model.encoder
    self.pooler = bert_model.pooler
def __init__(self, config, bitW=1):
    """BERT with a weight-quantized encoder (bitW-bit weights)."""
    super().__init__(config)
    self.config = config
    self.bitW = bitW  # weight bit-width used by the quantized encoder
    self.embeddings = BertEmbeddings(config)
    self.encoder = QuantBertEncoder(config, self.bitW)
    self.pooler = BertPooler(config)
    self.init_weights()
def _build_word_embedding(self):
    """Create word embeddings and pooler, optionally from a pretrained BERT."""
    self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
    if not self.config.pretrained_bert:
        # Fresh, randomly initialized modules from the config alone.
        self.pooler = BertPooler(self.bert_config)
        self.word_embedding = BertEmbeddings(self.bert_config)
        return
    # Reuse embeddings/pooler from the pretrained checkpoint.
    bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
    self.word_embedding = bert_model.bert.embeddings
    self.pooler = bert_model.bert.pooler
    self.pooler.apply(self.init_weights)
def __init__(self, config):
    """Multimodal BERT with a joint audio/visual embedding (MOSEI setup)."""
    super().__init__(config)
    self.config = config
    # Joint multimodal embedding: hidden size, dropout 0.5, 'mosei' dataset mode.
    self.jointEmbeddings = JointEmbeddings(config.hidden_size, 0.5, 'mosei')
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config):
    """BERT variant whose encoder also exposes attention information."""
    super().__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder_attention(config)  # attention-exposing encoder
    self.pooler = BertPooler(config)              # standard tanh pooler
    self.init_weights()
def __init__(self, config):
    """BERT trunk without a pooler."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    # The pooler is deliberately omitted to save compute and GPU memory.
    self.init_weights()
def __init__(self):
    """Binary classifier over bert-base-chinese.

    Bug fix: the original referenced a bare ``config`` name that is never
    defined in this scope (NameError at construction time unless a module
    global happened to exist). The pretrained model is now loaded first and
    its config is used for the standalone embedding/encoder layers.
    """
    super(BertClassificationModel, self).__init__()
    model_class, tokenizer_class, pretrained_weights = (tfs.BertModel,
                                                        tfs.BertTokenizer,
                                                        'bert-base-chinese')
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    self.bert = model_class.from_pretrained(pretrained_weights)
    # Derive the configuration from the loaded model instead of an
    # undefined global.
    config = self.bert.config
    # Embedding layer BertEmbeddings().
    self.embeddings = BertEmbeddings(config)
    # Multi-layer (12) multi-head self-attention encoder BertEncoder.
    self.encoder = BertEncoder(config)
    self.dense = nn.Linear(768, 2)  # BERT hidden size 768 -> 2 classes
    self.dropout = nn.Dropout(p=0.5)  # dropout during training
def __init__(self, config, add_pooling_layer=True):
    """BERT backbone with an optional pooler and a length configuration slot."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    # Pooler is optional; callers that never use pooled output can skip it.
    self.pooler = BertPooler(config) if add_pooling_layer else None
    self.init_weights()
    # Length configuration slot; None means no sequence-length reduction set.
    self.length_config = None
def __init__(self, config: LukeConfig):
    """LUKE document model: BERT/RoBERTa word embeddings + entity embeddings.

    Bug fix: the original set ``requires_grad = False`` on the
    ``token_type_embeddings`` *module*. On an ``nn.Module`` that is a plain
    attribute assignment and does not stop gradients; the freeze must be
    applied to the underlying weight parameter.
    """
    super(LukeModelDoc, self).__init__()
    self.config = config

    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)

    if self.config.bert_model_name and "roberta" in self.config.bert_model_name:
        self.embeddings = RobertaEmbeddings(config)
        # Freeze the parameter itself so autograd actually skips it.
        self.embeddings.token_type_embeddings.weight.requires_grad = False
    else:
        self.embeddings = BertEmbeddings(config)
    self.entity_embeddings = EntityEmbeddings(config)
def __init__(self, config):
    """BERT classifier with a dot-attention injection head."""
    super().__init__(config)
    self.config = config
    self.num_labels = config.num_labels
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.inject = DotAttention(config)  # fuses external features via attention
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
def __init__(self, config):
    """BERT with a Multimodal Adaptation Gate (MAG) fusion module."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)  # transformer blocks * N
    self.pooler = BertPooler(config)
    # NOTE(review): `hidden_size`, `device` and `_init_weights` are resolved
    # from the enclosing module scope — confirm they are defined there.
    self.MAG = MAG(beta=1.0,
                   hidden_size=hidden_size,
                   dropout=0.5,
                   device=device)
    self.MAG.apply(_init_weights)
    self.init_weights()
def __init__(self, num_labels, pretrained_model_name_or_path=None, cat_num=0, token_size=None, MAX_SEQUENCE_LENGTH=512):
    """Multi-label binary classifier on top of a pretrained BertModel.

    Args:
        num_labels: number of independent binary labels to predict.
        pretrained_model_name_or_path: HF checkpoint for the backbone;
            required (raises NotImplementedError when falsy).
        cat_num: number of categorical ids; when > 0, category embeddings are
            concatenated to the pooled output before classification.
        token_size: if set, resizes the backbone's token embeddings and sizes
            the auxiliary input embedding vocabulary.
        MAX_SEQUENCE_LENGTH: max positions for the auxiliary input config.
    """
    super(BertModelForBinaryMultiLabelClassifier, self).__init__()
    if pretrained_model_name_or_path:
        self.model = BertModel.from_pretrained(
            pretrained_model_name_or_path)
    else:
        # A pretrained backbone is mandatory; no from-scratch path exists.
        raise NotImplementedError
    self.num_labels = num_labels
    if cat_num > 0:
        # Category pathway: embed ids to 768-d, plus a compressed
        # (cat_num // 2 + 1)-d output embedding concatenated to the pooled vector.
        self.catembedding = nn.Embedding(cat_num, 768)
        self.catdropout = nn.Dropout(0.2)
        self.catactivate = nn.ReLU()
        self.catembeddingOut = nn.Embedding(cat_num, cat_num // 2 + 1)
        self.catactivateOut = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        # Classifier input: pooled hidden size + compressed category width.
        self.classifier = nn.Linear(
            self.model.pooler.dense.out_features + cat_num // 2 + 1,
            num_labels)
    else:
        # No categorical features: keep the attributes, set to None so the
        # forward pass can test for their presence.
        self.catembedding = None
        self.catdropout = None
        self.catactivate = None
        self.catembeddingOut = None
        self.catactivateOut = None
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.model.pooler.dense.out_features,
                                    num_labels)
    # resize
    if token_size:
        self.model.resize_token_embeddings(token_size)
    # define input embedding and transformers
    # NOTE(review): when token_size is None this BertConfig gets
    # vocab_size=None — presumably callers always pass token_size; verify.
    input_model_config = BertConfig(
        vocab_size=token_size,
        max_position_embeddings=MAX_SEQUENCE_LENGTH)
    self.input_embeddings = BertEmbeddings(input_model_config)
    self.input_bert_layer = BertLayer(input_model_config)
    # use bertmodel as decoder
    # self.model.config.is_decoder = True
    # add modules
    self.add_module('my_input_embeddings', self.input_embeddings)
    self.add_module('my_input_bert_layer', self.input_bert_layer)
    self.add_module('fc_output', self.classifier)
def __init__(self, config, multimodal_config):
    """BERT whose forward pass fuses nonverbal features through MAG."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    # MAG hyper-parameters come from the separate multimodal config.
    self.MAG = MAG(
        config.hidden_size,
        multimodal_config.beta_shift,
        multimodal_config.dropout_prob,
    )
    self.init_weights()
def __init__(self, config: dict):
    """BertModel layout with the encoder swapped for KnowBertEncoder.

    Deliberately bypasses BertModel's own constructor (calling its superclass
    instead) so the standard layout can be rebuilt around our encoder variant.
    """
    super(BertModel, self).__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = KnowBertEncoder(config)  # knowledge-enhanced encoder
    self.pooler = BertPooler(config)
    self.init_weights()
    # Wire up the helper that manages the KnowBert encoder state.
    KnowBertHelper.__init__(self, self.encoder)
def __init__(self, config):
    """BERT backbone extended with image-feature inputs.

    Depending on ``config.img_feature_type`` the image features are either
    discrete codes run through an embedding table or dense vectors projected
    linearly to the hidden size.
    """
    super(BertImgModel, self).__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = CaptionBertEncoder(config)
    self.pooler = BertPooler(config)

    self.img_dim = config.img_feature_dim
    logger.info('BertImgModel Image Dimension: {}'.format(self.img_dim))
    self.img_feature_type = config.img_feature_type
    # Optional layer norm over image embeddings; None (falsy) when the config
    # does not define it.
    if hasattr(config, 'use_img_layernorm'):
        self.use_img_layernorm = config.use_img_layernorm
    else:
        self.use_img_layernorm = None

    if config.img_feature_type == 'dis_code':
        # Discrete codes: embed, then project code_dim -> hidden_size.
        self.code_embeddings = nn.Embedding(config.code_voc,
                                            config.code_dim,
                                            padding_idx=0)
        self.img_embedding = nn.Linear(config.code_dim,
                                       self.config.hidden_size,
                                       bias=True)
    elif config.img_feature_type == 'dis_code_t':  # transpose
        self.code_embeddings = nn.Embedding(config.code_voc,
                                            config.code_dim,
                                            padding_idx=0)
        # Transposed variant projects from code_size instead of code_dim.
        self.img_embedding = nn.Linear(config.code_size,
                                       self.config.hidden_size,
                                       bias=True)
    elif config.img_feature_type == 'dis_code_scale':  # scaled
        # Extra linear rescaling of the codes before embedding/projection.
        self.input_embeddings = nn.Linear(config.code_dim,
                                          config.code_size,
                                          bias=True)
        self.code_embeddings = nn.Embedding(config.code_voc,
                                            config.code_dim,
                                            padding_idx=0)
        self.img_embedding = nn.Linear(config.code_dim,
                                       self.config.hidden_size,
                                       bias=True)
    else:
        # Dense features: single projection, dropout, optional layer norm.
        self.img_embedding = nn.Linear(self.img_dim,
                                       self.config.hidden_size,
                                       bias=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if self.use_img_layernorm:
            self.LayerNorm = nn.LayerNorm(config.hidden_size,
                                          eps=config.img_layer_norm_eps)

    self.init_weights()
def __init__(self, config, model_size, task=None, n_classes=None):
    """Bare BERT transformer with an optional MTB task head.

    The model can behave as an encoder (self-attention only) or as a decoder
    with cross-attention, following `Attention is all you need`
    (https://arxiv.org/abs/1706.03762).

    Args:
        config (:class:`~transformers.BertConfig`): model configuration; a
            config alone does not load pretrained weights (use
            :meth:`~transformers.PreTrainedModel.from_pretrained` for that).
        model_size: size identifier, e.g. "bert-base-uncased" or
            "bert-large-uncased"; selects the classification head width.
        task: MTB task; None adds an MLM head, "classification" a classifier.
        n_classes: number of classes for the classification task.
    """
    super(BertModel, self).__init__(config)
    self.config = config
    self.task = task
    self.model_size = model_size

    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

    self.init_weights()
    # Bug fix: the original `logger.info("Model config: ", self.config)`
    # passed an argument with no %-placeholder, which breaks logging's
    # lazy formatting; use an explicit %s placeholder.
    logger.info("Model config: %s", self.config)

    if self.task is None:
        self.lm_head = BertOnlyMLMHead(config)
    elif self.task == "classification":
        self.n_classes = n_classes
        # Head widths: 2 * hidden size (two concatenated entity markers).
        if self.model_size == "bert-base-uncased":
            self.classification_layer = nn.Linear(1536, n_classes)
        elif self.model_size == "bert-large-uncased":
            self.classification_layer = nn.Linear(2048, n_classes)
def __init__(self, config, data_args=None, **kwargs):
    """Multi-task BERT with task-type embeddings and FiLM conditioning."""
    super().__init__(config)
    tasks = data_args.tasks
    # Identity mapping from task id to task index.
    self.task_id_2_task_idx = {i: i for i, t in enumerate(tasks)}
    self.config = config
    self.config.num_tasks = len(tasks)
    config.max_seq_length = data_args.max_seq_length
    # One learned embedding per task.
    self.task_type_embeddings = nn.Embedding(len(tasks), config.hidden_size)
    self.conditional_alignment = FiLM(config.hidden_size,
                                      config.hidden_size)  # FiLM5
    self.embeddings = BertEmbeddings(config)
    self.encoder = MyBertEncoder10(config, tasks)  # task-aware encoder
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config, visual_feat_size, visual_start_layer,
             num_visual_positions, use_pos_embedding=False,
             no_encoder_inputs=False, append_to_encoder_states=False):
    """BERT that injects projected visual features from a given encoder layer."""
    super().__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.use_pos_embedding = use_pos_embedding
    # Visual positions: either a learned table or a linear map of 4-d boxes.
    if use_pos_embedding:
        self.visual_pos_embeddings = torch.nn.Embedding(
            num_visual_positions, config.hidden_size)
    else:
        self.visual_pos_embeddings = torch.nn.Linear(4, config.hidden_size)
    # Scalar configuration recorded for the forward pass.
    self.visual_feat_size = visual_feat_size
    self.visual_start_layer = visual_start_layer
    self.num_visual_positions = num_visual_positions
    # Project raw visual features to the transformer hidden size.
    self.visual_feat_projection = torch.nn.Linear(visual_feat_size,
                                                  config.hidden_size)
    self.encoder = BertEncoder(config, visual_start_layer)
    self.pooler = BertPooler(config)
    self.dropout_layer = torch.nn.Dropout(p=self.config.hidden_dropout_prob)
    self.apply(self.init_bert_weights)
    self.no_encoder_inputs = no_encoder_inputs
    self.append_to_encoder_states = append_to_encoder_states
def __init__(self, config: BertConfig):
    """BERT trunk plus an image-feature projection used as encoder input."""
    super().__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)

    self.img_dim = config.img_feature_dim
    self.use_img_layernorm = getattr(config, "use_img_layernorm", False)

    # Project image features to hidden_size, optionally layer-norm, then dropout.
    stages = [nn.Linear(self.img_dim, self.config.hidden_size, bias=True)]
    if self.use_img_layernorm:
        stages.append(
            nn.LayerNorm(config.hidden_size, eps=config.img_layer_norm_eps))
    stages.append(nn.Dropout(config.hidden_dropout_prob))
    # is an image encoding used as input to the transformer trunk
    self.img_embedding = nn.Sequential(*stages)
def __init__(self, bert_model: str, max_layer=None, pool=True,
             freeze_embeddings=False):
    """Configurable BERT feature extractor (optionally truncated, optionally pooled)."""
    super().__init__()
    self.freeze_embeddings = freeze_embeddings
    config = BertConfig.from_pretrained(bert_model,
                                        cache_dir=TRANSFORMER_CACHE_DIR)
    # When not pooling, the stack can be truncated to max_layer layers.
    if max_layer is not None and not pool:
        config.num_hidden_layers = max_layer
    self.pool = pool
    self.max_layer = max_layer
    self.embeddings = BertEmbeddings(config)
    if config.num_hidden_layers > 0:
        self.encoder = BertEncoder(config)
        self.encoder.output_hidden_states = True  # expose all layer outputs
    else:
        self.encoder = None  # embeddings-only mode
    self.pooler = BertPooler(config) if pool else None
    self.config = config
    self.bert_model = bert_model