def __init__(self, layer_num, head_num, head_size, weights=None):
    """Build a stock HuggingFace BertEncoder and load weights into it.

    Args:
        layer_num: number of transformer layers.
        head_num: number of attention heads.
        head_size: per-head dimension; hidden size is head_num * head_size.
        weights: either a state-dict-like mapping with 'bert.encoder.*' keys,
            or an object exposing per-layer weight lists as ``weights.w[i]``
            with linear weights stored transposed (FasterTransformer-style).
    """
    super().__init__()
    hidden_dim = head_num * head_size
    conf = BertConfig(hidden_size=hidden_dim,
                      intermediate_size=4 * hidden_dim,
                      num_attention_heads=head_num,
                      num_hidden_layers=layer_num)
    self.encoder = BertEncoder(conf)
    if isinstance(weights, dict):
        # Keep only encoder entries and strip the 'bert.encoder.' prefix
        # (13 chars) so keys match BertEncoder's own parameter names.
        # Uses the loop value directly instead of re-indexing the dict.
        w = {k[13:]: v for k, v in weights.items()
             if k.startswith('bert.encoder')}
        self.encoder.load_state_dict(w)
    else:
        for i in range(layer_num):
            layer = self.encoder.layer[i]
            ws = weights.w[i]
            attn = layer.attention
            # Linear weights arrive transposed; transpose back and make
            # contiguous before assigning to nn.Linear.weight.
            attn.self.query.weight.data = ws[0].transpose(-1, -2).contiguous()
            attn.self.query.bias.data = ws[1]
            attn.self.key.weight.data = ws[2].transpose(-1, -2).contiguous()
            attn.self.key.bias.data = ws[3]
            attn.self.value.weight.data = ws[4].transpose(-1, -2).contiguous()
            attn.self.value.bias.data = ws[5]
            attn.output.dense.weight.data = ws[6].transpose(-1, -2).contiguous()
            attn.output.dense.bias.data = ws[7]
            attn.output.LayerNorm.weight.data = ws[8]
            attn.output.LayerNorm.bias.data = ws[9]
            layer.intermediate.dense.weight.data = ws[10].transpose(-1, -2).contiguous()
            layer.intermediate.dense.bias.data = ws[11]
            layer.output.dense.weight.data = ws[12].transpose(-1, -2).contiguous()
            layer.output.dense.bias.data = ws[13]
            layer.output.LayerNorm.weight.data = ws[14]
            layer.output.LayerNorm.bias.data = ws[15]
    # One (disabled) head mask slot per layer, passed to the encoder later.
    self.head_mask = [None] * layer_num
def init_data(self, use_cuda) -> None:
    """Build the reference torch encoder, test tensors, and the turbo encoder."""
    dev = torch.device('cuda:0' if use_cuda else 'cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)
    torch.set_grad_enabled(False)

    self.cfg = BertConfig()
    self.torch_encoder_layer = BertEncoder(self.cfg)
    self.torch_encoder_layer.eval()
    if use_cuda:
        self.torch_encoder_layer.to(dev)

    self.batch_size = 1
    self.seq_length = 40
    self.hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.rand(
        size=(self.batch_size, self.seq_length, self.hidden_size),
        dtype=torch.float32,
        device=dev)

    # Additive attention mask: 0.0 for visible positions, -10000.0 for
    # masked ones, broadcastable over (batch, heads, query, key).
    ones = torch.ones((self.batch_size, self.seq_length),
                      dtype=torch.float32,
                      device=dev)
    self.attention_mask = (1.0 - ones[:, None, None, :]) * -10000.0

    self.turbo_bert_encoder = turbo_transformers.BertEncoder.from_torch(
        self.torch_encoder_layer)
def __init__(self, layer_num, head_num, head_size, weights=None):
    """Build a stock HuggingFace BertEncoder and load quantized-checkpoint weights.

    Args:
        layer_num: number of transformer layers.
        head_num: number of attention heads.
        head_size: per-head dimension; hidden size is head_num * head_size.
        weights: object exposing a state dict as ``weights.weights``.
    """
    super().__init__()
    hidden_dim = head_num * head_size
    conf = BertConfig(hidden_size=hidden_dim,
                      intermediate_size=4 * hidden_dim,
                      num_attention_heads=head_num,
                      num_hidden_layers=layer_num)
    self.encoder = BertEncoder(conf)
    # Keep encoder weights only, dropping quantization calibration ('_amax')
    # tensors; strip the 'bert.encoder.' prefix (13 chars). Uses the loop
    # value directly instead of re-indexing the dict on every hit.
    w = {k[13:]: v for k, v in weights.weights.items()
         if k.startswith('bert.encoder') and not k.endswith('_amax')}
    self.encoder.load_state_dict(w)
    # One (disabled) head mask slot per layer.
    self.head_mask = [None] * layer_num
def build(self):
    """Assemble the text/image embedders, BERT transformer stack and classifier head."""
    # Text side: processor and vocab come from the registry entry registered
    # for the first dataset.
    self.text_processor = registry.get(self._datasets[0] + "_text_processor")
    self.vocab = self.text_processor.vocab
    self.word_embedding = self.vocab.get_embedding(
        torch.nn.Embedding,
        freeze=False,
        embedding_dim=self.config.text_embedding.embedding_dim)
    self.segment_embeddings = nn.Embedding(self.config.num_segment_type,
                                           self.config.hidden_size)
    # Projects the [CLS] text embedding into the transformer's hidden width.
    self.cls_project = nn.Linear(self.config.text_embedding.embedding_dim,
                                 self.config.hidden_size)
    self.lstm = nn.LSTM(**self.config.lstm)
    # Input width is hidden_size * 2 — assumes the LSTM is bidirectional;
    # TODO confirm against config.lstm.
    self.lstm_proj = nn.Linear(self.config.hidden_size * 2,
                               self.config.hidden_size)
    # Image side: CLEVR image encoder plus a projection of 2-D positions
    # into the hidden width.
    self.img_encoder = ImageClevrEncoder(self.config)
    self.img_pos_emb = nn.Linear(2, self.config.hidden_size)
    self.LayerNorm = nn.LayerNorm(self.config.hidden_size,
                                  eps=self.config.layer_norm_eps)
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    # Transformer stack configured straight from the resolved OmegaConf config.
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True))
    self.transformer = BertEncoder(self.bert_config)
    self.pooler = BertPooler(self.bert_config)
    self.classifier = nn.Sequential(
        BertPredictionHeadTransform(self.config),
        nn.Linear(self.config.hidden_size, self.config.num_labels),
    )
    # One (disabled) head mask slot per transformer layer.
    self.head_mask = [None for _ in range(self.config.num_hidden_layers)]
def __init__(self, config, num_tune_layer):
    """Hierarchical-attention BERT classifier with a partially frozen backbone.

    Args:
        config: model configuration (provides num_labels, hidden_size, dropout).
        num_tune_layer: index of the first BERT layer to fine-tune; layers
            below it are frozen.
    """
    super().__init__(config)
    self.num_labels = config.num_labels
    config.num_tune_layer = num_tune_layer  # Layer number to start fine-tuning from
    self.bert = BertModel(config)
    # Freeze all backbone layers below num_tune_layer.
    self.freeze_bert_layers(self.bert, num_tune_layer)
    self.dropout = nn.Dropout(config.classifier_dropout_prob)
    # Word-level attention: linear scorer plus a learned context vector.
    self.word_attn_linear = nn.Linear(config.hidden_size, config.hidden_size)
    self.word_attn_vector = nn.Parameter(norm_weight(config.hidden_size, None))
    # Sentence-level attention, context-free and context-aware variants.
    self.sent_attn_linear_nocontext = nn.Linear(config.hidden_size, config.hidden_size)
    self.sent_attn_vector_nocontext = nn.Parameter(norm_weight(config.hidden_size, None))
    self.sent_attn_linear_context = nn.Linear(config.hidden_size, config.hidden_size)
    self.sent_attn_vector_context = nn.Parameter(norm_weight(config.hidden_size, None))
    # Two classification heads (per-sentence and aggregated).
    self.classifier_esm = nn.Linear(config.hidden_size, self.config.num_labels)
    self.classifier_agg = nn.Linear(config.hidden_size, self.config.num_labels)
    # Small 2-layer BertEncoder stacked on top; built from a deep-copied
    # config so the backbone's config is not mutated.
    self.config_custom = copy.deepcopy(config)
    self.config_custom.num_hidden_layers = 2
    self.bert_encoder_custom = BertEncoder(self.config_custom)
    # Learnable mixing coefficients between the two heads' signals.
    self.beta1 = nn.Parameter(torch.Tensor([0.5]))
    self.beta2 = nn.Parameter(torch.Tensor([0.5]))
    self.init_weights()
def __init__(self, config): super(BertModelDialog, self).__init__(config) self.embeddings = BertEmbeddingsDialog(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) self.init_weights()
def __init__(self, config): super(BertModel, self).__init__(config) self.embeddings = SuperPositionalBertEmbeddings(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) self.init_weights()
def __init__(
    self,
    config,
    visual_embedding_dim=512,
    embedding_strategy="plain",
    bypass_transformer=False,
    output_attentions=False,
    output_hidden_states=False,
):
    """VisualBERT-style model: visio-linguistic embeddings + BertEncoder + pooler.

    Constructor options are copied onto `config` so downstream modules read
    them from one place.
    """
    super().__init__(config)
    self.config = config
    # Mirror constructor options onto the config object (note: this mutates
    # the caller's config).
    config.visual_embedding_dim = visual_embedding_dim
    config.embedding_strategy = embedding_strategy
    config.bypass_transformer = bypass_transformer
    config.output_attentions = output_attentions
    config.output_hidden_states = output_hidden_states
    self.embeddings = BertVisioLinguisticEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.bypass_transformer = config.bypass_transformer
    # Extra single layer only used when the main transformer is bypassed.
    if self.bypass_transformer:
        self.additional_layer = BertLayer(config)
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states
    # One (disabled) head mask per encoder layer.
    self.fixed_head_masks = [None for _ in range(len(self.encoder.layer))]
    self.init_weights()
def __init__(self, config):
    """BERT backbone without a pooler: embeddings + encoder only."""
    super().__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    # self.apply(self.init_weights)  # old versions of pytorch_transformers
    self.init_weights()
def __init__(self, cfg):
    """Transformer regressor over continuous features.

    Continuous columns are projected to `hidden_size`, combined with learned
    positional embeddings, passed through a BertEncoder, and regressed to
    `target_size` outputs.

    Args:
        cfg: config with cont_cols, seq_len, hidden_size, nlayers, nheads,
            dropout, target_size.
    """
    # Py3 zero-argument super(); equivalent to super(TransfomerModel, self).
    super().__init__()
    self.cfg = cfg
    cont_col_size = len(cfg.cont_cols)
    # Project continuous inputs into the model dimension.
    self.cont_emb = nn.Sequential(
        nn.Linear(cont_col_size, cfg.hidden_size),
        nn.LayerNorm(cfg.hidden_size),
    )
    self.position_emb = nn.Embedding(num_embeddings=self.cfg.seq_len,
                                     embedding_dim=cfg.hidden_size)
    self.ln = nn.LayerNorm(cfg.hidden_size)
    self.config = BertConfig(
        3,  # vocab_size — unused, inputs are continuous embeddings
        hidden_size=cfg.hidden_size,
        num_hidden_layers=cfg.nlayers,
        num_attention_heads=cfg.nheads,
        intermediate_size=cfg.hidden_size,
        hidden_dropout_prob=cfg.dropout,
        attention_probs_dropout_prob=cfg.dropout,
    )
    self.encoder = BertEncoder(self.config)

    def get_reg():
        # Two hidden blocks followed by a linear head to the targets.
        return nn.Sequential(
            nn.Linear(cfg.hidden_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(cfg.hidden_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(cfg.hidden_size, cfg.target_size),
        )

    self.reg_layer = get_reg()
def __init__(self, cfg, args, tok):
    """Video transformer: video-aware embeddings over a BertEncoder."""
    super().__init__(cfg)
    self.embeddings = VideoTransformerEmbedder(cfg, args, tok)
    self.encoder = BertEncoder(cfg)
    self.init_weights()
    # Keep constructor inputs for later use by other methods.
    self.args = args
    self.tok = tok
    self.cfg = cfg
def __init__(self, config):
    """ELECTRA backbone whose embeddings are always projected to the hidden width."""
    super().__init__(config)
    self.embeddings = ElectraEmbeddings(config)
    # Unconditional projection from embedding width to hidden width
    # (ELECTRA allows embedding_size != hidden_size).
    self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
    self.encoder = BertEncoder(config)
    self.config = config
    self.init_weights()
def __init__(self, config):
    """LayoutLM backbone: layout-aware embeddings plus standard encoder/pooler.

    Args:
        config: LayoutLM/BERT configuration shared by all sub-modules.
    """
    # Py3 zero-argument super(); equivalent to super(LayoutLMModel, self).
    super().__init__(config)
    self.embeddings = LayoutLMEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, **kwargs):
    """Decoder-side transformer built from BertConfig keyword arguments."""
    config = BertConfig(**kwargs)
    super().__init__(config)
    # Embeds previously predicted tokens for autoregressive decoding.
    self.prev_pred_embeddings = PrevPredEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.init_weights()
def __init__(self, config): super().__init__(config) # self.config中包含了拼写错误纠正网络Correction_Network中的Bert模型的各种配置超参数. self.config = config '''一、构建错误探查网络Detection_Network中所需的网络层''' # Bi-GRU网络作为错误探查网络Detection_Network的编码器 # 此处由于BertModel中的embeddings层中所有子嵌入模块的嵌入维度都为768, 所以此处Bi-GRU网络的input_size也为768, # 而将Bi-GRU网络的hidden_size设为256,是为了保证Bi-GRU网络双向编码后双向隐藏层拼接到一块后隐藏层维度能保持在512. # 此时enc_hid_size为512. self.enc_bi_gru = torch.nn.GRU(input_size=768, hidden_size=256, dropout=0.2, bidirectional=True) # 双向GRU编码层对于输入错误探查网络Detection_Network中的input_embeddings进行双向编码, # 此时双向GRU编码层的输出为(seq_len, batch_size, enc_hid_size * 2),将其交换维度变形为(batch_size, seq_len, enc_hid_size * 2), # 再将双向GRU编码层的变形后的输出输入self.detection_network_dense_out层中,映射为形状(batch_size, seq_len, 2)的张量, # 这样方便后面进行判断句子序列中每一个字符是否为拼写错误字符的二分类任务的交叉熵损失值计算. self.detection_network_dense_out = torch.nn.Linear(512, 2) # 同时,将双向GRU编码层输出后经过变形的形状为(batch_size, seq_len, enc_hid_size * 2),的张量输入进soft_masking_coef_mapping层中, # 将其形状映射为(batch_size, seq_len, 1)的张量,此张量再在后面输入进Sigmoid()激活函数中, 将此张量的值映射至(0,1)之间, # 这样这个张量即变为了后面计算soft-masked embeddings时和mask_embeddings相乘的系数p (结果pi即可表示为文本序列中第i处的字符拼写错误的似然概率(likelihood)). self.soft_masking_coef_mapping = torch.nn.Linear(512, 1) '''二、构建的拼写错误纠正网络Correction_Network中BertModel中所用的个三种网络层''' ''' (1): 嵌入层BertEmbeddings(),其中包含了每个character的word embedding、segment embeddings、position embedding三种嵌入函数. (2): Bert模型的核心,多层(12层)多头自注意力(multi-head self attention)编码层BertEncoder. (3): Bert模型最后的池化层BertPooler. ''' # 嵌入层BertEmbeddings(). self.embeddings = BertEmbeddings(config) # 多层(12层)多头自注意力(multi-head self attention)编码层BertEncoder. self.encoder = BertEncoder(config) # 池化层BertPooler。 self.pooler = BertPooler(config) # 初始化权重矩阵,偏置等. 
self.init_weights() '''获取遮罩特殊符[MASK]在Bert模型的嵌入层BertEmbeddings()中的词嵌入层word_embeddings层中特殊符[MASK]所对应索引的嵌入向量(embeddins vector)''' # 在Bert模型的tokenizer类BertTokenizer()的词表中,遮罩特殊符[MASK]会被编码为索引103(只要是BertTokenizer()类,无论其from_pretrained哪种 # 预训练的Bert模型词表,遮罩特殊符[MASK]在词表中的索引都为103; 除非换预训练模型如换成Albert模型,遮罩特殊符[MASK]在词表中的索引才会变, 否则 # 遮罩特殊符[MASK]在同一类预训练Bert模型的词表下索引不变). # 在之后, 遮罩特殊符[MASK]的张量self.mask_embedding的形状要变为和Bert模型嵌入层BertEmbeddings()的输出input_embeddings张量的形状一样, # 此时self.mask_embeddings张量的形状要为(batch_size, seq_len, embed_size)->(batch_size, seq_len, 768). self.mask_embeddings = self.embeddings.word_embeddings.weight[ 103] # 此时,mask_embedding张量的形状为(768,) # 注意!: 在soft_masked_embeddings输入拼写错误纠正网络correction network中的Bert模型后,其计算结果输入进最终的输出层与Softmax层之前, # 拼写错误纠正网络correction network的结果需通过残差连接residual connection与输入模型一开始的input embeddings相加, # 相加的结果才输入最终的输出层与Softmax层中做最终的正确字符预测。 '''self.soft_masked_bert_dense_out即为拼写错误纠正网络correction network之后的输出层, 其会将经过残差连接模块residual connection之后 的输出的维度由768投影到纠错词表的索引空间. (此处输出层self.soft_masked_bert_dense_out的输出即可被视为Soft_Masked_BERT模型的最终输出)''' self.soft_masked_bert_dense_out = torch.nn.Linear( self.config.hidden_size, self.embeddings.word_embeddings.weight.shape[0]) '''此处可不写最后的Softmax()函数, 因为若之后在训练模型时使用CrossEntropyLoss()交叉熵函数来计算损失值的话, CrossEntropyLoss()函数
def _create_span_encoder(self, kb, span_encoder_config): # check if encoder should be used if span_encoder_config is None: # return identity function as encoder return lambda t, m, h: t # update values to match dimensions span_encoder_config.hidden_size = kb.embedd_dim # create config and encoder return BertEncoder(span_encoder_config)
def __init__(self, config):
    """BERT backbone whose embeddings support word-level masking."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddingsWithWordMasking(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config):
    """Base BERT model: custom base embeddings plus standard encoder/pooler.

    Args:
        config: BERT configuration shared by all sub-modules.
    """
    # Py3 zero-argument super(); equivalent to super(BertBaseModel, self).
    super().__init__(config)
    self.config = config
    self.embeddings = BertBaseEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config):
    """BERT backbone with character-level embeddings."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertCharacterEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config):
    """Standard BERT backbone; external-encoder mode starts disabled."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
    # Flag toggled elsewhere to route through an external encoder.
    self.use_ext_encoder = False
def __init__(self, config, v_dim, l_dim, loc_dim, backbone):
    """Multi-modal transformer head with masked-visual and matching losses.

    Args:
        config: global config; reads MODEL.MMSS_HEAD.TRANSFORMER.
        v_dim: visual feature dimension.
        l_dim: language feature dimension.
        loc_dim: location feature dimension.
        backbone: visual backbone module (kept for later use).
    """
    # Py3 zero-argument super(); equivalent to super(TransformerHead, self).
    super().__init__()
    self.config = config.MODEL.MMSS_HEAD.TRANSFORMER
    self.v_dim = v_dim
    self.l_dim = l_dim
    self.loc_dim = loc_dim
    self.backbone = backbone
    self.mvm_loss = self.config.MVM_LOSS  # masked-visual-modeling loss type
    self.mmm_loss = self.config.MMM_LOSS  # multi-modal matching loss type
    self.num_negative = self.config.MVM_LOSS_NUM_NEGATIVE
    self.bert_config = BertConfig(**self.config.BERT_CONFIG)
    # Project visual features into the language width.
    self.v2l_projection = nn.Linear(self.v_dim, self.l_dim)
    self.visual_emb = VisualEmbedding(self.bert_config, self.l_dim, self.loc_dim)
    self.encoder = BertEncoder(self.bert_config)
    self.pooler = BertPooler(self.bert_config)
    self.heads = MMPreTrainingHeads(self.bert_config, self.v_dim)
    self.encoder.apply(self._init_weights)
    self.pooler.apply(self._init_weights)
    self.heads.apply(self._init_weights)
    self._tie_weights()
    self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    # Pick the masked-visual-modeling criterion; '' disables it entirely.
    if self.mvm_loss == 'reconstruction_error':
        self.vis_criterion = nn.MSELoss(reduction="none")
    elif self.mvm_loss == 'contrastive_cross_entropy':
        self.vis_criterion = nn.CrossEntropyLoss()
    elif self.mvm_loss == '':
        self.vis_criterion = None
        # No MVM loss: freeze the image-prediction head.
        for p in self.heads.imagePredictions.parameters():
            p.requires_grad = False
    else:
        raise NotImplementedError
    if self.mmm_loss == '':
        # No matching loss: freeze the pooler and sequence-relationship head.
        for p in self.pooler.parameters():
            p.requires_grad = False
        for p in self.heads.bi_seq_relationship.parameters():
            p.requires_grad = False
def __init__(self, config):
    """BERT backbone whose pooler uses a sigmoid activation variant."""
    super().__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    #self.pooler = BertPooler(config)
    # Sigmoid-activated pooler chosen over the standard (tanh) and ReLU variants.
    self.pooler = BertPooler_Sigmoid(config)
    #self.pooler = BertPooler_reLu(config)
    #self.apply(self.init_weights)
    self.init_weights()
def __init__(self, config):
    """BERT backbone without a pooler (embeddings + encoder only)."""
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    # Pooler removed outright to save compute and GPU memory.
    # self.pooler = BertPooler(config) if add_pooling_layer else None
    self.init_weights()
def __init__(self, config):
    """Multimodal BERT: standard text embeddings plus joint audio/visual embeddings."""
    super().__init__(config)
    self.config = config
    # Joint multimodal embedding module; dropout 0.5, 'mosei' dataset mode.
    self.jointEmbeddings = JointEmbeddings(config.hidden_size, 0.5, 'mosei')
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    #self.Linear_v = nn.Linear()
    self.init_weights()
def __init__(self):
    """Binary classifier on top of pretrained Chinese BERT.

    Fix: the original referenced an undefined ``config`` when building the
    standalone BertEmbeddings/BertEncoder, raising NameError at construction.
    They are now built from the pretrained model's own config.
    """
    super().__init__()
    model_class, tokenizer_class, pretrained_weights = (
        tfs.BertModel, tfs.BertTokenizer, 'bert-base-chinese')
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    self.bert = model_class.from_pretrained(pretrained_weights)
    # Standalone embedding layer and 12-layer multi-head self-attention
    # encoder, built from the pretrained model's configuration.
    config = self.bert.config
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    # BERT's default hidden size is 768; output 2 units for binary classification.
    self.dense = nn.Linear(768, 2)
    self.dropout = nn.Dropout(p=0.5)  # dropout for training
def __init__(self, config):
    """ELECTRA backbone with a small 2-way classification head.

    NOTE(review): the head widths (256) are hard-coded; they match
    ELECTRA-small's hidden size but break for other sizes — confirm.
    """
    super().__init__(config)
    self.embeddings = ElectraEmbeddings(config)
    # ELECTRA's embedding width may differ from the hidden width; project if so.
    if config.embedding_size != config.hidden_size:
        self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
    self.encoder = BertEncoder(config)
    self.dense = nn.Linear(256, 256)
    self.dropout = nn.Dropout(0.1)
    self.out_proj = nn.Linear(256, 2)
    self.init_weights()
class HuggingFaceEncoder(torch.nn.Module):
    """Stock HuggingFace BertEncoder wrapped for comparison/benchmarking."""

    def __init__(self, layer_num, head_num, head_size, weights=None):
        """Build the encoder and load weights from ``weights.weights``.

        Args:
            layer_num: number of transformer layers.
            head_num: number of attention heads.
            head_size: per-head dimension; hidden size is head_num * head_size.
            weights: object exposing a state dict as ``weights.weights``.
        """
        super().__init__()
        hidden_dim = head_num * head_size
        conf = BertConfig(hidden_size=hidden_dim,
                          intermediate_size=4 * hidden_dim,
                          num_attention_heads=head_num,
                          num_hidden_layers=layer_num)
        self.encoder = BertEncoder(conf)
        # Keep encoder weights only, dropping quantization calibration
        # ('_amax') tensors; strip the 'bert.encoder.' prefix (13 chars).
        # Uses the loop value directly instead of re-indexing the dict.
        w = {k[13:]: v for k, v in weights.weights.items()
             if k.startswith('bert.encoder') and not k.endswith('_amax')}
        self.encoder.load_state_dict(w)
        # One (disabled) head mask slot per layer.
        self.head_mask = [None] * layer_num

    def forward(self, hidden_states, attention_mask):
        """Run the encoder with an additive attention mask.

        Converts a {0, 1} mask into the additive form BERT expects:
        0 keeps a position, -10000 effectively hides it.
        """
        extended_attention_mask = (1.0 - attention_mask) * -10000.0
        output = self.encoder(hidden_states, extended_attention_mask,
                              self.head_mask)
        return output
def __init__(self, config: LukeConfig):
    """LUKE-style model: word + entity embeddings over a shared BertEncoder.

    Args:
        config: LukeConfig; `bert_model_name` selects RoBERTa vs BERT embeddings.
    """
    # Py3 zero-argument super(); equivalent to super(LukeModelDoc, self).
    super().__init__()
    self.config = config
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    if self.config.bert_model_name and "roberta" in self.config.bert_model_name:
        self.embeddings = RobertaEmbeddings(config)
        # RoBERTa does not use token types; freeze that embedding table.
        # Fix: setting requires_grad on the nn.Embedding *module* is a no-op
        # attribute assignment — it must be set on the weight parameter.
        self.embeddings.token_type_embeddings.weight.requires_grad = False
    else:
        self.embeddings = BertEmbeddings(config)
    self.entity_embeddings = EntityEmbeddings(config)
def __init__(self, config):
    """BERT classifier with an extra dot-attention injection module."""
    super().__init__(config)
    self.config = config
    self.num_labels = config.num_labels
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    # Injects auxiliary information via dot-product attention.
    self.inject = DotAttention(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
def __init__(self, config):
    """BERT backbone augmented with a Multimodal Adaptation Gate (MAG).

    NOTE(review): `hidden_size`, `device` and `_init_weights` are not defined
    in this scope — they must exist at module level or this raises NameError;
    verify against the rest of the file.
    """
    super().__init__(config)
    self.config = config
    self.embeddings = BertEmbeddings(config)
    # transformer blocks * N
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    # Multimodal Adaptation Gate fusing non-text modalities into hidden states.
    self.MAG = MAG(beta=1.0,
                   hidden_size=hidden_size,
                   dropout=0.5,
                   device=device)
    self.MAG.apply(_init_weights)
    self.init_weights()