import os
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import RobertaConfig, RobertaModel

# compute_loss, ClassificationHead and logger are assumed to be defined elsewhere in the project.


class roBerta(nn.Module):
    def __init__(self, config, num=0):
        super(roBerta, self).__init__()
        model_config = RobertaConfig()
        model_config.vocab_size = config.vocab_size
        model_config.hidden_size = config.hidden_size[0]
        model_config.num_attention_heads = 16

        # how the classification loss is computed
        self.loss_method = config.loss_method
        self.multi_drop = config.multi_drop

        self.roberta = RobertaModel(model_config)
        if config.requires_grad:
            for param in self.roberta.parameters():
                param.requires_grad = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.hidden_size = config.hidden_size[num]
        self.num_labels = config.num_labels  # assumed to be provided by config; used by the classifier below
        if self.loss_method in ['binary', 'focal_loss', 'ghmc']:
            self.classifier = nn.Linear(self.hidden_size, 1)
        else:
            self.classifier = nn.Linear(self.hidden_size, self.num_labels)
        self.text_linear = nn.Linear(config.embeding_size, config.hidden_size[0])
        self.vocab_layer = nn.Linear(config.hidden_size[0], config.vocab_size)

        self.classifier.apply(self._init_weights)
        self.roberta.apply(self._init_weights)
        self.text_linear.apply(self._init_weights)
        self.vocab_layer.apply(self._init_weights)

    def _init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, inputs=None, attention_mask=None, output_id=None, labels=None):
        inputs = torch.relu(self.text_linear(inputs))
        bert_outputs = self.roberta(inputs_embeds=inputs, attention_mask=attention_mask)

        # calculate the MLM loss on the masked positions (target id != -100)
        last_hidden_state = bert_outputs[0]
        output_id_tmp = output_id[output_id.ne(-100)]
        output_id_emb = last_hidden_state[output_id.ne(-100)]
        pre_score = self.vocab_layer(output_id_emb)
        loss_cro = CrossEntropyLoss()
        # CrossEntropyLoss expects raw logits, so pre_score is passed in directly
        mlm_loss = loss_cro(pre_score, output_id_tmp)

        labels_bool = labels.ne(-1)
        if labels_bool.sum().item() == 0:
            return mlm_loss, torch.tensor([])

        # calculate the classification loss on the labelled samples (label != -1)
        pooled_output = bert_outputs[1]
        out = self.classifier(pooled_output)
        out = out[labels_bool]
        labels_tmp = labels[labels_bool]
        label_loss = compute_loss(out, labels_tmp)
        out = torch.sigmoid(out).flatten()
        return mlm_loss + label_loss, out
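# Illustrative sketch (not part of the original model) of the masking convention used in
# roBerta.forward above: positions whose target id is -100 are dropped before the MLM
# cross-entropy is computed. All shapes and values here are made up for demonstration only.
def _sketch_mlm_masking():
    hidden = torch.randn(2, 4, 8)                  # [batch, seq_len, hidden]
    vocab_head = nn.Linear(8, 10)                  # hidden states -> vocab logits
    targets = torch.tensor([[3, -100, 5, -100],
                            [-100, 7, -100, 1]])   # -100 marks positions without an MLM target
    keep = targets.ne(-100)                        # boolean mask of supervised positions
    logits = vocab_head(hidden[keep])              # [num_masked, vocab_size]
    return CrossEntropyLoss()(logits, targets[keep])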
class ClassifyModel(nn.Module):
    def __init__(self, args):
        super(ClassifyModel, self).__init__()
        args.out_size = len(args.dense_features)
        self.dropout = nn.Dropout(args.hidden_dropout_prob)
        self.args = args

        # build the BERT model and load the pretrained weights
        config = RobertaConfig.from_pretrained(args.pretrained_model_path)
        config.output_hidden_states = True
        args.hidden_size = config.hidden_size
        args.num_hidden_layers = config.num_hidden_layers
        self.bert_text_layer = RobertaModel.from_pretrained(args.pretrained_model_path, config=config)
        self.text_linear = nn.Linear(in_features=args.text_dim + args.vocab_dim_v1 * len(args.text_features),
                                     out_features=args.hidden_size)
        logger.info("Load linear from %s", os.path.join(args.pretrained_model_path, "linear.bin"))
        self.text_linear.load_state_dict(torch.load(os.path.join(args.pretrained_model_path, "linear.bin")))
        logger.info("Load embeddings from %s", os.path.join(args.pretrained_model_path, "embeddings.bin"))
        self.text_embeddings = nn.Embedding.from_pretrained(
            torch.load(os.path.join(args.pretrained_model_path, "embeddings.bin"))['weight'], freeze=True)
        args.out_size += args.hidden_size * 2

        # build the fusion-layer model with random initialization
        config = RobertaConfig()
        config.num_hidden_layers = 4
        config.intermediate_size = 2048
        config.hidden_size = 512
        config.num_attention_heads = 16
        config.vocab_size = 5
        self.fusion_text_layer = RobertaModel(config=config)
        self.fusion_text_layer.apply(self._init_weights)
        self.text_linear_1 = nn.Linear(args.text_dim_1 + args.hidden_size, 512)
        self.text_linear_1.apply(self._init_weights)
        self.norm = nn.BatchNorm1d(args.text_dim_1 + args.hidden_size)
        args.out_size += 1024

        # build the classification head with random initialization
        self.classifierHead = ClassificationHead(args)
        self.classifierHead.apply(self._init_weights)

    def _init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, dense_features, text_features, text_ids, text_masks,
                fusion_text_features, fusion_text_masks, labels=None):
        outputs = []
        # dense float features go straight into the classifier
        outputs.append(dense_features.float())

        # run BERT and use max pooling and mean pooling of its hidden states as classifier inputs
        text_masks = text_masks.float()
        text_embedding = self.text_embeddings(text_ids).view(text_ids.size(0), text_ids.size(1), -1)  # reshape
        text_features = torch.cat((text_features.float(), text_embedding), -1)  # concat
        text_features = torch.relu(self.text_linear(self.dropout(text_features)))  # relu
        hidden_states = self.bert_text_layer(inputs_embeds=text_features,
                                             attention_mask=text_masks)[0]  # bert_text_layer
        embed_mean = (hidden_states * text_masks.unsqueeze(-1)).sum(1) / text_masks.sum(1).unsqueeze(-1)
        embed_mean = embed_mean.float()
        embed_max = hidden_states + (1 - text_masks).unsqueeze(-1) * (-1e10)
        embed_max = embed_max.max(1)[0].float()
        # mean and max of the BERT embeddings are classifier inputs
        outputs.append(embed_mean)
        outputs.append(embed_max)

        # run the fusion layer and use max pooling and mean pooling of its hidden states as classifier inputs
        fusion_text_masks = fusion_text_masks.float()
        fusion_text_features = torch.cat((fusion_text_features.float(), hidden_states), -1)
        batch, seq_length, embedding_dim = fusion_text_features.size()
        fusion_text_features = self.norm(fusion_text_features.view(-1, embedding_dim))\
            .view(batch, seq_length, embedding_dim)
        fusion_text_features = torch.relu(self.text_linear_1(fusion_text_features))
        hidden_states = self.fusion_text_layer(inputs_embeds=fusion_text_features,
                                               attention_mask=fusion_text_masks)[0]  # transformer fusion
        embed_mean = (hidden_states * fusion_text_masks.unsqueeze(-1)).sum(1) / fusion_text_masks.sum(1).unsqueeze(-1)
        embed_mean = embed_mean.float()
        embed_max = hidden_states + (1 - fusion_text_masks).unsqueeze(-1) * (-1e10)
        embed_max = embed_max.max(1)[0].float()
        outputs.append(embed_mean)
        outputs.append(embed_max)

        # feed the features (BERT max/mean pooling + fusion layer) to the classifier to get 20-class logits:
        # 10 ages x 2 genders = 20 joint classes
        final_hidden_state = torch.cat(outputs, dim=-1)
        logits = self.classifierHead(final_hidden_state)

        # return the loss, or the probabilities
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return loss
        else:
            # prob: [batch, 20], reshaped below to [batch, age, gender]
            prob = torch.softmax(logits, -1)
            # age_probs: [batch, age]; summing the genders within each age gives the probability of that age
            age_probs = prob.view(-1, 10, 2).sum(dim=2, keepdim=False)
            # gender_probs: [batch, gender]
            gender_probs = prob.view(-1, 10, 2).sum(1)
            return age_probs, gender_probs
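# Illustrative sketch (not part of the original code) of the masked mean/max pooling used
# twice in ClassifyModel.forward: padded positions contribute nothing to the mean and are
# pushed down to -1e10 so they can never win the max.
def _sketch_masked_pooling(hidden_states, masks):
    # hidden_states: [batch, seq_len, dim]; masks: [batch, seq_len] with 1 = token, 0 = padding
    masks = masks.float()
    embed_mean = (hidden_states * masks.unsqueeze(-1)).sum(1) / masks.sum(1).unsqueeze(-1)
    embed_max = (hidden_states + (1 - masks).unsqueeze(-1) * (-1e10)).max(1)[0]
    return embed_mean, embed_max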
class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        args.out_size = len(args.dense_features)
        self.dropout = nn.Dropout(args.hidden_dropout_prob)
        self.args = args

        # build the BERT model and load the pretrained weights
        config = RobertaConfig.from_pretrained(args.pretrained_model_path)
        config.output_hidden_states = True
        args.hidden_size = config.hidden_size
        args.num_hidden_layers = config.num_hidden_layers
        self.text_layer = RobertaModel.from_pretrained(args.pretrained_model_path, config=config)
        self.text_linear = nn.Linear(args.text_dim + args.vocab_dim_v1 * len(args.text_features),
                                     args.hidden_size)
        logger.info("Load linear from %s", os.path.join(args.pretrained_model_path, "linear.bin"))
        self.text_linear.load_state_dict(torch.load(os.path.join(args.pretrained_model_path, "linear.bin")))
        logger.info("Load embeddings from %s", os.path.join(args.pretrained_model_path, "embeddings.bin"))
        self.text_embeddings = nn.Embedding.from_pretrained(
            torch.load(os.path.join(args.pretrained_model_path, "embeddings.bin"))['weight'], freeze=True)
        args.out_size += args.hidden_size * 2

        # build the decoder model with random initialization
        config = RobertaConfig()
        config.num_hidden_layers = 4
        config.intermediate_size = 2048
        config.hidden_size = 512
        config.num_attention_heads = 16
        config.vocab_size = 5
        self.text_layer_1 = RobertaModel(config=config)
        self.text_layer_1.apply(self._init_weights)
        self.text_linear_1 = nn.Linear(args.text_dim_1 + args.hidden_size, 512)
        self.text_linear_1.apply(self._init_weights)
        self.norm = nn.BatchNorm1d(args.text_dim_1 + args.hidden_size)
        args.out_size += 1024

        # build the classifier with random initialization
        self.classifier = ClassificationHead(args)
        self.classifier.apply(self._init_weights)

    def _init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, dense_features, text_features, text_ids, text_masks,
                text_features_1, text_masks_1, labels=None):
        outputs = []
        # dense float features go straight into the classifier
        outputs.append(dense_features.float())

        # run BERT and use max pooling and mean pooling of its hidden states as classifier inputs
        text_masks = text_masks.float()
        text_embedding = self.text_embeddings(text_ids).view(text_ids.size(0), text_ids.size(1), -1)
        text_features = torch.cat((text_features.float(), text_embedding), -1)
        text_features = torch.relu(self.text_linear(self.dropout(text_features)))
        hidden_states = self.text_layer(inputs_embeds=text_features, attention_mask=text_masks)[0]
        embed_mean = (hidden_states * text_masks.unsqueeze(-1)).sum(1) / text_masks.sum(1).unsqueeze(-1)
        embed_mean = embed_mean.float()
        embed_max = hidden_states + (1 - text_masks).unsqueeze(-1) * (-1e10)
        embed_max = embed_max.max(1)[0].float()
        outputs.append(embed_mean)
        outputs.append(embed_max)

        # run the decoder and use max pooling and mean pooling of its hidden states as classifier inputs
        text_masks_1 = text_masks_1.float()
        text_features_1 = torch.cat((text_features_1.float(), hidden_states), -1)
        bs, le, dim = text_features_1.size()
        text_features_1 = self.norm(text_features_1.view(-1, dim)).view(bs, le, dim)
        text_features_1 = torch.relu(self.text_linear_1(text_features_1))
        hidden_states = self.text_layer_1(inputs_embeds=text_features_1, attention_mask=text_masks_1)[0]
        embed_mean = (hidden_states * text_masks_1.unsqueeze(-1)).sum(1) / text_masks_1.sum(1).unsqueeze(-1)
        embed_mean = embed_mean.float()
        embed_max = hidden_states + (1 - text_masks_1).unsqueeze(-1) * (-1e10)
        embed_max = embed_max.max(1)[0].float()
        outputs.append(embed_mean)
        outputs.append(embed_max)

        # feed the features to the classifier to get 20-class logits
        final_hidden_state = torch.cat(outputs, -1)
        logits = self.classifier(final_hidden_state)

        # return the loss, or the probabilities
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return loss
        else:
            prob = torch.softmax(logits, -1)
            age_probs = prob.view(-1, 10, 2).sum(2)
            gender_probs = prob.view(-1, 10, 2).sum(1)
            return age_probs, gender_probs
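# Illustrative sketch (not part of the original code): the 20 logits are laid out as
# 10 ages x 2 genders, so the marginal probability for each task is recovered by summing
# the joint softmax over the other axis, exactly as Model.forward does above.
def _sketch_marginal_probs(logits):
    # logits: [batch, 20]
    prob = torch.softmax(logits, -1).view(-1, 10, 2)   # joint distribution over (age, gender)
    age_probs = prob.sum(2)      # [batch, 10]: sum the genders within each age
    gender_probs = prob.sum(1)   # [batch, 2]: sum the ages within each gender
    return age_probs, gender_probs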