def build_model_layers(self):
    ''' builds the layers in the model '''
    self.bert = BertForTokenClassification.from_pretrained("bert-base-cased",
                                                           num_labels=self.num_labels,
                                                           output_attentions=False,
                                                           output_hidden_states=False)
    if self.use_crf:
        self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)
def build_model(self):
    '''
    build the BERT encoder, the linear projection layer and the CRF layer
    '''
    self.hidden2tag = nn.Linear(self.embedding_dim, self.n_tags)
    crf_config = {
        'n_tags': self.config['n_ent_tags'],
        'start_idx': self.config['start_ent_idx'],
        'end_idx': self.config['end_ent_idx'],
        'use_cuda': self.use_cuda
    }
    self.crf = CRF(crf_config)
    self.bert = transformers.BertModel.from_pretrained('bert-base-chinese')
def build_model_layers(self):
    ''' builds the layers in the model '''
    # embedding layer
    self.embedding = nn.Embedding(num_embeddings=self.input_dim,
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.text_pad_idx)
    # dropout for embedding layer
    self.embedding_dropout = nn.Dropout(self.embedding_dropout_ratio)
    # character cnn
    if self.char_embedding_dim:
        self.char_embedding = nn.Embedding(num_embeddings=self.char_input_dim,
                                           embedding_dim=self.char_embedding_dim,
                                           padding_idx=self.char_pad_idx)
        self.char_cnn = nn.Conv1d(in_channels=self.char_embedding_dim,
                                  out_channels=self.char_embedding_dim * self.char_filter,
                                  kernel_size=self.char_kernel,
                                  groups=self.char_embedding_dim)
        self.cnn_dropout = nn.Dropout(self.cnn_dropout_ratio)
        all_embedding_dim = self.embedding_dim + (self.char_embedding_dim * self.char_filter)
    else:
        all_embedding_dim = self.embedding_dim
    # lstm layers with dropout
    self.lstm = nn.LSTM(batch_first=True,
                        input_size=all_embedding_dim,
                        hidden_size=self.hidden_dim,
                        num_layers=self.lstm_layers,
                        bidirectional=True,
                        dropout=self.lstm_dropout_ratio if self.lstm_layers > 1 else 0)
    # use multihead attention if there are attention heads
    if self.attn_heads:
        self.attn = nn.MultiheadAttention(embed_dim=self.hidden_dim * 2,
                                          num_heads=self.attn_heads,
                                          dropout=self.attn_dropout_ratio)
    # dropout for fully connected layer
    self.fc_dropout = nn.Dropout(self.fc_dropout_ratio)
    # fully connected layer
    self.fc = nn.Linear(self.hidden_dim * 2, self.output_dim)
    # use crf layer if it is switched on
    if self.use_crf:
        self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)
def build_model(self):
    '''
    build the embedding layer, the BLSTM layer and the CRF layer
    '''
    self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
    self.lstm = nn.LSTM(self.embedding_dim,
                        self.hidden_dim // 2,
                        batch_first=True,
                        num_layers=self.lstm_layer_num,
                        dropout=self.dropout_prob,
                        bidirectional=True)
    self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)
    crf_config = {
        'n_tags': self.config['n_ent_tags'],
        'start_idx': self.config['start_ent_idx'],
        'end_idx': self.config['end_ent_idx'],
        'use_cuda': self.use_cuda
    }
    self.crf = CRF(crf_config)
def build_model_layers(self):
    ''' builds the layers in the model '''
    # embedding layer
    self.embedding = nn.Embedding(num_embeddings=self.input_dim,
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.text_pad_idx)
    # dropout for embedding layer
    self.embedding_dropout = nn.Dropout(self.embedding_dropout_ratio)
    # character cnn
    if self.char_embedding_dim:
        self.char_embedding = nn.Embedding(num_embeddings=self.char_input_dim,
                                           embedding_dim=self.char_embedding_dim,
                                           padding_idx=self.char_pad_idx)
        self.char_cnn = nn.Conv1d(in_channels=self.char_embedding_dim,
                                  out_channels=self.char_embedding_dim * self.char_filter,
                                  kernel_size=self.char_kernel,
                                  groups=self.char_embedding_dim)
        self.cnn_dropout = nn.Dropout(self.cnn_dropout_ratio)
        all_embedding_dim = self.embedding_dim + (self.char_embedding_dim * self.char_filter)
    else:
        all_embedding_dim = self.embedding_dim
    # transformer encoder layers with attention and dropout
    self.position_encoder = PositionalEncoding(d_model=all_embedding_dim)
    encoder_layers = nn.TransformerEncoderLayer(d_model=all_embedding_dim,
                                                nhead=self.attn_heads,
                                                activation='relu',
                                                dropout=self.trf_dropout_ratio)
    self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layers,
                                         num_layers=self.trf_layers)
    # fully connected layer with gelu activation
    self.fc1 = nn.Linear(in_features=all_embedding_dim, out_features=self.hidden_dim)
    self.fc1_gelu = nn.GELU()
    # layer norm
    self.fc1_norm = nn.LayerNorm(self.hidden_dim)
    # dropout for fully connected layer
    self.fc2_dropout = nn.Dropout(self.fc_dropout_ratio)
    # fully connected layer
    self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)
    # use crf layer if it is switched on
    if self.use_crf:
        self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)
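The PositionalEncoding module referenced above is not defined anywhere in this listing. Below is a minimal sketch of the standard sinusoidal implementation; the constructor signature (d_model, dropout, max_len) and the seq-first tensor layout are assumptions chosen to match the default nn.TransformerEncoder layout, not the project's actual code.

# Sketch of a standard sinusoidal positional encoding (assumes d_model is even).
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)                      # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch_size, d_model), the layout nn.TransformerEncoder expects by default
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)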
class REL_BLSTM_CRF(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            param['embedding_dim']
            param['hidden_dim']
            ***param['n_ent_tags']
            param['n_rel_tags']
            param['n_rels']
            param['n_words']
            param['start_idx']  int, <start> tag index for entity tag seq
            param['end_idx']  int, <end> tag index for entity tag seq
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(REL_BLSTM_CRF, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 128)
        self.hidden_dim = self.config.get('hidden_dim', 64)
        assert self.hidden_dim % 2 == 0, 'hidden_dim for BLSTM must be even'
        self.n_tags = self.config.get('n_rel_tags', 8)
        self.n_rels = self.config.get('n_rels', 9)
        self.n_words = self.config.get('n_words', 10000)
        self.dropout_prob = self.config.get('dropout_prob', 0)
        self.lstm_layer_num = self.config.get('lstm_layer_num', 1)
        self.use_cuda = self.config.get('use_cuda', False)
        self.model_type = 'REL_BLSTM_CRF'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'hidden_dim: {self.hidden_dim}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'lstm_layer_num: {self.lstm_layer_num}', 1)
        log(f'dropout_prob: {self.dropout_prob}', 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        build the embedding layers, the BLSTM layer and the CRF layer
        '''
        self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.rel_embeds = nn.Embedding(self.n_rels, self.embedding_dim)
        self.embed2hidden = nn.Linear(self.embedding_dim * 2, self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim // 2,
                            batch_first=True,
                            num_layers=self.lstm_layer_num,
                            dropout=self.dropout_prob,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)
        crf_config = {
            'n_tags': self.n_tags,
            'start_idx': self.config['start_rel_idx'],
            'end_idx': self.config['end_rel_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)
        self.relu_layer = nn.ReLU()

    def reset_parameters(self):
        I.xavier_normal_(self.word_embeds.weight.data)
        I.xavier_normal_(self.rel_embeds.weight.data)
        self.lstm.reset_parameters()
        # stdv = 1.0 / math.sqrt(self.hidden_dim)
        # for weight in self.lstm.parameters():
        #     I.uniform_(weight, -stdv, stdv)
        I.xavier_normal_(self.embed2hidden.weight.data)
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_lstm_features(self, x, relation_type, use_cuda=None):
        '''
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @relation_type: relation type indices, (batch_size, 1), np.array
        :return
            @lstm_feature: (batch_size, T, n_tags) -- emission scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size, T = x.shape[0], x.shape[1]

        ## embedding layer
        words_tensor = self._to_tensor(x, use_cuda)  # (batch_size, T)
        word_input_embeds = self.word_embeds(words_tensor)  # (batch_size, T, n_embed)
        reltype_tensor = self._to_tensor(relation_type, use_cuda)  # (batch_size, 1)
        reltype_input_embeds = self.rel_embeds(reltype_tensor)  # (batch_size, 1, n_embed)
        reltype_input_embeds = reltype_input_embeds.repeat(1, T, 1)  # (batch_size, T, n_embed)
        input_embeds_all = torch.cat([word_input_embeds, reltype_input_embeds], -1)  # (batch_size, T, n_embed*2)
        embeds = self.embed2hidden(input_embeds_all)  # (batch_size, T, n_embed)

        ## LSTM layer
        if use_cuda:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2).cuda()  # (n_layer*n_dir, N, n_hid)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2).cuda()
        else:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2)
        # c_0 = h_0.clone()
        hidden = (h_0, c_0)
        lstm_out, _hidden = self.lstm(embeds, hidden)  # (batch_size, T, n_dir*n_hid), (h, c)

        ## FC layer
        lstm_feature = self.hidden2tag(lstm_out)  # (batch_size, T, n_tags)
        lstm_feature = torch.tanh(lstm_feature)
        return lstm_feature

    def _loss(self, x, relation_type, y_rel, lens, use_cuda=None):
        '''
        loss function: neg_log_likelihood
        :param
            @x: (batch_size, T), np.array, word indices, each character mapped to its index via the vocabulary
            @relation_type: (batch_size, 1), np.array, relation type
            @y_rel: (batch_size, T), np.array, relation tag sequence indices, character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: scalar, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_lstm_features(x, relation_type, use_cuda)
        log_norm_score = self.crf.log_norm_score(logits, lens)
        path_score = self.crf.path_score(logits, y_rel, lens)
        loss = log_norm_score - path_score
        loss = (loss / self._to_tensor(lens, use_cuda).float()).mean()
        return loss

    def _output(self, x, relation_type, lens, use_cuda=None):
        '''
        return the crf decode paths
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @relation_type: (batch_size, 1), np.array, relation type
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T), torch.tensor, best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_lstm_features(x, relation_type, use_cuda)
        scores, paths = self.crf.viterbi_decode(logits, lens, use_cuda)
        return paths

    def train_model(self, data_loader: KGDataLoader, train_dataset=None, eval_dataset=None,
                    hyper_param={}, use_cuda=None, rebuild=False):
        '''
        :param
            @data_loader: (KGDataLoader),
            @result_dir: (str) path to save the trained model and extracted dictionary
            @hyper_param: (dict)
                @hyper_param['EPOCH']
                @hyper_param['batch_size']
                @hyper_param['learning_rate_upper']
                @hyper_param['learning_rate_bert']
                @hyper_param['bert_finetune']
                @hyper_param['visualize_length']  # num of batches between two check points
                @hyper_param['isshuffle']
                @hyper_param['result_dir']
                @hyper_param['model_name']
        :return
            @loss_record,
            @score_record
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        if use_cuda:
            print('use cuda=========================')
            self.cuda()

        EPOCH = hyper_param.get('EPOCH', 3)
        BATCH_SIZE = hyper_param.get('batch_size', 4)
        LEARNING_RATE_upper = hyper_param.get('learning_rate_upper', 1e-2)
        LEARNING_RATE_bert = hyper_param.get('learning_rate_bert', 5e-5)
        bert_finetune = hyper_param.get('bert_finetune', True)
        visualize_length = hyper_param.get('visualize_length', 10)
        result_dir = hyper_param.get('result_dir', './result/')
        model_name = hyper_param.get('model_name', 'model.p')
        is_shuffle = hyper_param.get('isshuffle', True)
        DATA_TYPE = 'rel'

        train_dataset = data_loader.dataset.train_dataset if train_dataset is None else train_dataset
        if rebuild:
            train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
        ## cache the preprocessed data so it can be reloaded directly when tuning hyper-parameters *WARNING*
        else:
            old_train_dict_path = os.path.join(result_dir, 'train_data_mat_dict.pkl')
            if os.path.exists(old_train_dict_path):
                train_data_mat_dict = data_loader.load_preprocessed_data(old_train_dict_path)
                log('Reload preprocessed data successfully~')
            else:
                # train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
                train_data_mat_dict = data_loader.transform(train_dataset, istest=False,
                                                            data_type=DATA_TYPE, ratio=0)
                data_loader.save_preprocessed_data(old_train_dict_path, train_data_mat_dict)
                ## cache the preprocessed data so it can be reloaded directly when tuning hyper-parameters *WARNING*
        data_generator = Batch_Generator(train_data_mat_dict, batch_size=BATCH_SIZE,
                                         data_type=DATA_TYPE, isshuffle=is_shuffle)
        print('train_data_set_length:', len(train_dataset))
        print('train_data_mat_dict_length:', train_data_mat_dict['cha_matrix'].shape)

        all_param = list(self.named_parameters())
        bert_param = [p for n, p in all_param if 'bert' in n]
        other_param = [p for n, p in all_param if 'bert' not in n]

        if bert_finetune:
            optimizer_grouped_parameters = [
                {'params': other_param, 'lr': LEARNING_RATE_upper},
                {'params': bert_param, 'lr': LEARNING_RATE_bert}
            ]
            optimizer = torch.optim.Adam(optimizer_grouped_parameters)
            log(f'****BERT_finetune, learning_rate_upper: {LEARNING_RATE_upper}, learning_rate_bert: {LEARNING_RATE_bert}', 0)
        else:
            optimizer = torch.optim.Adam(other_param, lr=LEARNING_RATE_upper)
            log(f'****BERT_fix, learning_rate_upper: {LEARNING_RATE_upper}', 0)

        ## TODO:
        scheduler = LambdaLR(optimizer, lr_lambda=my_lr_lambda)
        # scheduler = transformers.optimization.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(EPOCH*0.2), num_training_steps=EPOCH)

        all_cnt = len(train_data_mat_dict['cha_matrix'])
        log(f'{model_name} Training start!', 0)
        loss_record = []
        score_record = []
        max_score = -1
        eval_param = {'batch_size': 100, 'issave': False, 'result_dir': result_dir}

        for epoch in range(EPOCH):
            self.train()
            log(f'EPOCH: {epoch+1}/{EPOCH}', 0)
            loss = 0.0
            for cnt, data_batch in enumerate(data_generator):
                x, pos, reltype, y_rel, y_ent, lens, data_list = data_batch
                loss_avg = self._loss(x, reltype, y_rel, lens)
                optimizer.zero_grad()
                loss_avg.backward()
                optimizer.step()

                loss += loss_avg
                if use_cuda:
                    loss_record.append(loss_avg.cpu().item())
                else:
                    loss_record.append(loss_avg.item())

                if (cnt + 1) % visualize_length == 0:
                    loss_cur = loss / visualize_length
                    log(f'[TRAIN] step: {(cnt+1)*BATCH_SIZE}/{all_cnt} | loss: {loss_cur:.4f}', 1)
                    loss = 0.0
                    # self.eval()
                    # print(data_list[0]['input'])
                    # pre_paths = self._output(x, reltype, lens)
                    # print('predict-path')
                    # print(pre_paths[0])
                    # print('target-path')
                    # print(y_rel[0])
                    # self.train()

            temp_score = self.eval_model(data_loader, data_set=eval_dataset,
                                         hyper_param=eval_param, use_cuda=use_cuda)
            score_record.append(temp_score)
            scheduler.step()

            if temp_score[2] > max_score:
                max_score = temp_score[2]
                save_path = os.path.join(result_dir, model_name)
                self.save_model(save_path)
                print(f'Checkpoint saved successfully, current best score is {max_score}')
        log(f'the best score of the model is {max_score}')
        return loss_record, score_record

    @torch.no_grad()
    def predict(self, data_loader, data_set=None, hyper_param={}, use_cuda=None, rebuild=False):
        '''
        fill the predictions into test_data_mat_dict['y_ent_matrix']; before prediction the matrix is all zeros
        :param
            @data_loader: (KGDataLoader),
            @hyper_param: (dict)
                @hyper_param['batch_size']  ## default 4
                @hyper_param['issave']      ## default False
                @hyper_param['result_dir']  ## default None
        :return
            @result: list, len = number of sentences
                case = result[0]
                case['input']
                case['relation_list']
                    r = case['relation_list'][0]
                    r['relation']: 成立日期
                    r['head']: '百度'
                    r['tail']: '2016年04月08日'
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        if use_cuda:
            print('use cuda=========================')
            self.cuda()

        BATCH_SIZE = hyper_param.get('batch_size', 100)
        ISSAVE = hyper_param.get('issave', False)
        result_dir = hyper_param.get('result_dir', './result/')
        DATA_TYPE = 'rel'

        test_dataset = data_loader.dataset.test_dataset if data_set is None else data_set
        if rebuild:
            test_data_mat_dict = data_loader.transform(test_dataset, istest=True, data_type=DATA_TYPE)
        ## cache the preprocessed data so it can be reloaded directly when tuning hyper-parameters *WARNING*
        else:
            old_test_dict_path = os.path.join(result_dir, 'test_data_mat_dict.pkl')
            if os.path.exists(old_test_dict_path):
                test_data_mat_dict = data_loader.load_preprocessed_data(old_test_dict_path)
                log('Reload preprocessed data successfully~')
            else:
                test_data_mat_dict = data_loader.transform(test_dataset, istest=True,
                                                           data_type=DATA_TYPE, ratio=0)
                data_loader.save_preprocessed_data(old_test_dict_path, test_data_mat_dict)
                ## cache the preprocessed data so it can be reloaded directly when tuning hyper-parameters *WARNING*
        print('test_dataset_length:', len(test_dataset))
        print('test_data_mat_dict_length:', test_data_mat_dict['cha_matrix'].shape)

        data_generator = Batch_Generator(test_data_mat_dict, batch_size=BATCH_SIZE,
                                         data_type=DATA_TYPE, isshuffle=False)

        self.eval()  # disable the dropout and batch-norm layers
        total_output_rel = []
        all_cnt = len(test_data_mat_dict['cha_matrix'])
        log(f'Predict start!', 0)
        for cnt, data_batch in enumerate(data_generator):
            x, pos, reltype, y_rel, y_ent, lens, data_list = data_batch
            pre_paths = self._output(x, reltype, lens)  ## pre_paths, (batch_size, T), torch.tensor
            if use_cuda:
                pre_paths = pre_paths.data.cpu().numpy().astype(np.int64)
            else:
                pre_paths = pre_paths.data.numpy().astype(np.int64)
            total_output_rel.append(pre_paths)
            if (cnt + 1) % 10 == 0:
                log(f'[PREDICT] step {(cnt+1)*BATCH_SIZE}/{all_cnt}', 1)

        ## mask positions beyond the true sentence length
        pred_output = np.vstack(total_output_rel)  ### (N, max_length), numpy.array
        len_list = test_data_mat_dict['sentence_length']  ### (N), list
        pred_output = self._padding_mask(pred_output, len_list[:len(pred_output)])

        ## transform back to the dict form
        test_data_mat_dict['y_rel_matrix'] = pred_output
        result = data_loader.transform_back(test_data_mat_dict, data_type=DATA_TYPE)

        ## save the result
        if ISSAVE and result_dir:
            save_file = os.path.join(result_dir, 'predict.json')
            with open(save_file, 'w') as f:
                for data in result:
                    temps = json.dumps(data, ensure_ascii=False)
                    f.write(temps + '\n')
            log(f'save the predict result in {save_file}')
        print('final predict length:', len(result))
        return result

    @torch.no_grad()
    def eval_model(self, data_loader, data_set=None, hyper_param={}, use_cuda=None, rebuild=False):
        '''
        :param
            @data_loader: (KGDataLoader),
            @hyper_param: (dict)
                @hyper_param['batch_size']  ## default 64
                @hyper_param['issave']      ## default False
                @hyper_param['result_dir']  ## default ./result; WARNING: may fail if the result directory does not exist
        :return
            @precision_s,
            @recall_s,
            @f1_s
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        if use_cuda:
            print('use cuda=========================')
            self.cuda()

        def dict2str(d):
            ## convert a relation dict to a string so predictions and targets can be compared directly
            # res = d['entity']+':'+d['entity_type']+':'+str(d['entity_index']['begin'])+'-'+str(d['entity_index']['end'])
            res = d['relation'] + '-' + d['head'] + '-' + d['tail']
            return res

        def calculate_f1(pred_cnt, tar_cnt, correct_cnt):
            precision_s = round(correct_cnt / (pred_cnt + 1e-8), 3)
            recall_s = round(correct_cnt / (tar_cnt + 1e-8), 3)
            f1_s = round(2 * precision_s * recall_s / (precision_s + recall_s + 1e-8), 3)
            return precision_s, recall_s, f1_s

        eva_data_set = data_loader.dataset.dev_dataset if data_set is None else data_set

        pred_result = self.predict(data_loader, eva_data_set, hyper_param, use_cuda,
                                   rebuild=rebuild)  ### list(dict), predictions, len = n_sentence
        target = eva_data_set  ### list(dict) AutoKGDataset, ground truth

        pred_cnt = 0
        tar_cnt = 0
        correct_cnt = 0
        cnt_all = len(eva_data_set)
        log('Eval start')
        for idx in range(cnt_all):
            sentence = pred_result[idx]['input']
            pred_list = pred_result[idx]['relation_list']
            tar_list = target[idx]['output']['relation_list']

            str_pred_set = set(map(dict2str, pred_list))
            str_tar_set = set(map(dict2str, tar_list))
            common_set = str_pred_set.intersection(str_tar_set)
            # print('target:')
            # print(str_tar_set)
            # print('predict:')
            # print(str_pred_set)

            pred_cnt += len(str_pred_set)
            tar_cnt += len(str_tar_set)
            correct_cnt += len(common_set)

            if (idx + 1) % 1000 == 0:
                precision_s, recall_s, f1_s = calculate_f1(pred_cnt, tar_cnt, correct_cnt)
                log(f'[EVAL] step {idx+1}/{cnt_all} | precision: {precision_s} | recall: {recall_s} | f1 score: {f1_s}', 1)

        precision_s, recall_s, f1_s = calculate_f1(pred_cnt, tar_cnt, correct_cnt)
        print('=' * 100)
        log(f'[FINAL] | precision: {precision_s} | recall: {recall_s} | f1 score: {f1_s}', 0)
        print('=' * 100)
        return (precision_s, recall_s, f1_s)
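For orientation, a minimal driver for REL_BLSTM_CRF might look like the sketch below. It only exercises the class as defined above; the config values, the tag indices, and the KGDataLoader construction are hypothetical and would come from the surrounding project.

# Illustrative sketch only; config values and KGDataLoader usage are assumptions.
config = {
    'embedding_dim': 128,
    'hidden_dim': 64,
    'n_rel_tags': 8,
    'n_rels': 9,
    'n_words': 10000,
    'start_rel_idx': 6,   # hypothetical <start> tag index
    'end_rel_idx': 7,     # hypothetical <end> tag index
    'use_cuda': torch.cuda.is_available(),
}
model = REL_BLSTM_CRF(config, show_param=True)
# data_loader = KGDataLoader(...)   # construction depends on the project's dataset classes
# loss_record, score_record = model.train_model(data_loader,
#                                               hyper_param={'EPOCH': 5, 'batch_size': 16})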
class BERT_CRF(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            param['embedding_dim']
            param['hidden_dim']
            param['n_ent_tags']
            param['n_rel_tags']
            param['n_words']
            param['start_ent_idx']  int, <start> tag index for entity tag seq
            param['end_ent_idx']  int, <end> tag index for entity tag seq
            param['start_rel_idx']
            param['end_rel_idx']
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(BERT_CRF, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 768)
        self.n_tags = self.config['n_ent_tags']
        # self.n_words = self.config['n_words']
        # self.dropout_prob = self.config.get('dropout_prob', 0)
        self.use_cuda = self.config['use_cuda']
        self.model_type = 'BERT_CRF'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'n_ent_tags: {self.n_tags}', 1)
        log(f"crf_start_idx: {self.config['start_ent_idx']}", 1)
        log(f"crf_end_idx: {self.config['end_ent_idx']}", 1)
        # log(f'dropout_prob: {self.dropout_prob}', 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        build the BERT encoder, the linear projection layer and the CRF layer
        '''
        self.hidden2tag = nn.Linear(self.embedding_dim, self.n_tags)
        crf_config = {
            'n_tags': self.config['n_ent_tags'],
            'start_idx': self.config['start_ent_idx'],
            'end_idx': self.config['end_ent_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)
        self.bert = transformers.BertModel.from_pretrained('bert-base-chinese')

    def reset_parameters(self):
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_features(self, x, lens, use_cuda=None):
        '''
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @lens: the actual length of each sentence, (batch_size)
        :return
            @feature: (batch_size, T, n_tags) -- emission scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size, T = x.shape

        ## bert layer
        words_tensor = self._to_tensor(x, use_cuda)  # (batch_size, T)
        lens = self._to_tensor(lens, use_cuda)
        att_mask = self._generate_mask(lens, max_len=T)
        embeds = self.bert(words_tensor, attention_mask=att_mask)[0]  # (batch_size, T, n_embed)

        ## FC layer
        feature = self.hidden2tag(embeds)  # (batch_size, T, n_tags)
        feature = torch.tanh(feature)
        return feature

    def _loss(self, x, y_ent, lens, use_cuda=None):
        '''
        loss function: neg_log_likelihood
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @y_ent: (batch_size, T), np.array, entity tag sequence indices, character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: (1), torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_features(x, lens)
        log_norm_score = self.crf.log_norm_score(logits, lens)
        path_score = self.crf.path_score(logits, y_ent, lens)
        loss = log_norm_score - path_score  ## (batch_size, )
        loss = (loss / self._to_tensor(lens, use_cuda)).mean()
        return loss

    def _output(self, x, lens, use_cuda=None):
        '''
        return the crf decode paths
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T+1), torch.tensor, best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        # self.eval()
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_features(x, lens, use_cuda)
        scores, paths = self.crf.viterbi_decode(logits, lens, use_cuda)
        return paths
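The helpers _to_tensor and _generate_mask come from MODEL_TEMP and are not shown in this listing. A minimal sketch of a padding-mask helper of this kind is given below; it is a hypothetical stand-in, not the project's implementation.

# Hypothetical padding-mask helper: 1.0 for real tokens, 0.0 for padding.
import torch

def generate_mask(lens, max_len):
    # lens: (batch_size,) tensor of true sentence lengths
    positions = torch.arange(max_len, device=lens.device).unsqueeze(0)  # (1, max_len)
    return (positions < lens.unsqueeze(1)).float()                      # (batch_size, max_len)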
class BLSTM_CRF(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            param['embedding_dim']
            param['hidden_dim']
            param['n_tags']
            param['n_words']
            param['start_idx']  int, <start> tag index for entity tag seq
            param['end_idx']  int, <end> tag index for entity tag seq
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(BLSTM_CRF, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 768)  # TODO: 64, 768
        self.hidden_dim = self.config.get('hidden_dim', 64)  # TODO: 128*2, 64
        assert self.hidden_dim % 2 == 0, 'hidden_dim for BLSTM must be even'
        self.n_tags = self.config.get('n_ent_tags', 45)
        self.n_words = self.config.get('n_words', 10000)
        self.dropout_prob = self.config.get('dropout_prob', 0)
        self.lstm_layer_num = self.config.get('lstm_layer_num', 1)
        self.use_cuda = self.config.get('use_cuda', False)
        self.model_type = 'BLSTM_CRF'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'hidden_dim: {self.hidden_dim}', 1)
        log(f'lstm_layer_num: {self.lstm_layer_num}', 1)
        log(f'dropout_prob: {self.dropout_prob}', 1)
        log(f'n_ent_tags: {self.n_tags}', 1)
        log(f"crf_start_idx: {self.config['start_ent_idx']}", 1)
        log(f"crf_end_idx: {self.config['end_ent_idx']}", 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        build the embedding layer, the BLSTM layer and the CRF layer
        '''
        self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_dim // 2,
                            batch_first=True,
                            num_layers=self.lstm_layer_num,
                            dropout=self.dropout_prob,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)
        crf_config = {
            'n_tags': self.config['n_ent_tags'],
            'start_idx': self.config['start_ent_idx'],
            'end_idx': self.config['end_ent_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)

    def reset_parameters(self):
        I.xavier_normal_(self.word_embeds.weight.data)
        self.lstm.reset_parameters()
        # stdv = 1.0 / math.sqrt(self.hidden_dim)
        # for weight in self.lstm.parameters():
        #     I.uniform_(weight, -stdv, stdv)
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_lstm_features(self, x, use_cuda=None):
        '''
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
        :return
            @lstm_feature: (batch_size, T, n_tags) -- emission scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size = x.shape[0]

        ## embedding layer
        words_tensor = self._to_tensor(x, use_cuda)  # (batch_size, T)
        embeds = self.word_embeds(words_tensor)  # (batch_size, T, n_embed)

        ## LSTM layer
        if use_cuda:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2).cuda()  # (n_layer*n_dir, N, n_hid)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2).cuda()
        else:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size, self.hidden_dim // 2)
        # c_0 = h_0.clone()
        hidden = (h_0, c_0)
        lstm_out, _hidden = self.lstm(embeds, hidden)  # (batch_size, T, n_dir*n_hid), (h, c)

        ## FC layer
        lstm_feature = self.hidden2tag(lstm_out)  # (batch_size, T, n_tags)
        lstm_feature = torch.tanh(lstm_feature)
        return lstm_feature

    def _loss(self, x, y_ent, lens, use_cuda=None):
        '''
        loss function: neg_log_likelihood
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @y_ent: (batch_size, T), np.array, entity tag sequence indices, character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: scalar, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_lstm_features(x)
        log_norm_score = self.crf.log_norm_score(logits, lens)
        path_score = self.crf.path_score(logits, y_ent, lens)
        loss = log_norm_score - path_score
        loss = (loss / self._to_tensor(lens, use_cuda)).mean()
        return loss

    def _output(self, x, lens, use_cuda=None):
        '''
        return the crf decode paths
        :param
            @x: word indices, each character mapped to its index via the vocabulary, (batch_size, T), np.array
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T), torch.tensor, best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_lstm_features(x, use_cuda)
        scores, paths = self.crf.viterbi_decode(logits, lens, use_cuda)
        return paths
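A small usage sketch for BLSTM_CRF follows, assuming the project's CRF and MODEL_TEMP classes are importable; the tag indices and dummy data are invented for illustration only.

# Illustrative sketch; config values and dummy inputs are assumptions.
import numpy as np

config = {
    'embedding_dim': 128,
    'hidden_dim': 64,
    'n_ent_tags': 45,
    'n_words': 10000,
    'start_ent_idx': 43,   # hypothetical <start> tag index
    'end_ent_idx': 44,     # hypothetical <end> tag index
    'use_cuda': False,
}
model = BLSTM_CRF(config, show_param=True)
x = np.random.randint(0, 10000, size=(2, 20))    # two sentences, 20 character indices each
y_ent = np.random.randint(0, 43, size=(2, 20))   # entity tag indices per character
lens = [20, 15]                                  # true sentence lengths
loss = model._loss(x, y_ent, lens)               # CRF negative log-likelihood
paths = model._output(x, lens)                   # Viterbi-decoded tag paths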
class BERT(nn.Module):
    def __init__(self, num_labels, use_crf, tag_pad_idx, pad_token, tag_names):
        '''
        bert sequence classifier

        num_labels: number of output classes
        use_crf: switch for using conditional random field (reduces probability of invalid tagging sequences)
        tag_pad_idx: index for tag padding token
        pad_token: pad token
        tag_names: the names of all of the tags in the tag field
        '''
        super().__init__()
        self.num_labels = num_labels
        self.use_crf = use_crf
        self.tag_pad_idx, self.pad_token, self.tag_names = tag_pad_idx, pad_token, tag_names
        self.build_model_layers()
        self.init_weights()

    def build_model_layers(self):
        ''' builds the layers in the model '''
        self.bert = BertForTokenClassification.from_pretrained("bert-base-cased",
                                                               num_labels=self.num_labels,
                                                               output_attentions=False,
                                                               output_hidden_states=False)
        if self.use_crf:
            self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)

    def forward(self, sentence, attention_mask, tags):
        ''' forward operation for network '''
        outputs = self.bert(sentence, token_type_ids=None,
                            attention_mask=attention_mask, labels=tags)
        loss, logits = outputs[0], outputs[1]
        if self.use_crf:
            # remove first token id in each sentence (to make crf mask work)
            # crf_out, crf_loss = self.crf(logits, tags)
            crf_out, crf_loss = self.crf(logits[:, 1:], tags[:, 1:])
            return crf_out, crf_loss
        else:
            return logits, loss

    def init_weights(self):
        ''' initializes model weights '''
        # param_initializer = list(self.bert.classifier.named_parameters())
        # if self.crf:
        #     param_initializer += list(self.crf.named_parameters())
        # for name, param in param_initializer:
        #     nn.init.normal_(param.data, mean=0, std=0.1)
        # only initialize conditional random field weights
        if self.use_crf:
            for name, param in self.crf.named_parameters():
                nn.init.normal_(param.data, mean=0, std=0.1)

    def count_parameters(self):
        ''' counts model parameters '''
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
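A short usage sketch for the BERT wrapper above, run without the custom CRF head (use_crf=False) so it only depends on Hugging Face transformers; the tag names, indices, and dummy labels are hypothetical.

# Illustrative sketch; tag set and dummy labels are assumptions.
import torch
from transformers import BertTokenizer

tag_names = ['O', 'B-ENT', 'I-ENT', '<pad>']
model = BERT(num_labels=len(tag_names), use_crf=False,
             tag_pad_idx=3, pad_token='<pad>', tag_names=tag_names)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
enc = tokenizer(['The conference was held in Berlin .'],
                return_tensors='pt', padding=True)
tags = torch.zeros_like(enc['input_ids'])         # dummy 'O' label for every wordpiece
logits, loss = model(enc['input_ids'], enc['attention_mask'], tags)
print(logits.shape, loss.item())                   # (1, seq_len, num_labels), scalar loss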
files_t = []
for i in range(10):
    files_t.append('test' + str(i + 1) + '.nn.ner')

for i in range(10):
    print('file ' + str(i + 1))
    fw = open(dirs + files_t[i] + '.pipe_crf6', 'w')
    print('load train')
    doc, label, sparse, trans_tr = load_train_data_pipe(dirs + files[i], vocab_w2i, sen_len,
                                                        sparse_len, crf_num, label_num, label_onehot)
    print('load test')
    doc_t, label_t, sparse_t, trans_t = load_train_data_pipe(dirs + files_t[i], vocab_w2i, sen_len,
                                                             sparse_len, crf_num, label_num, label_onehot)
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            crf = CRF(sen_len, label_num, sparse_len, crf_num, learning_rate, label_m, trans_tr)
            sess.run(tf.global_variables_initializer())  # tf.initialize_all_variables() is deprecated

            def train_step(input_, label_):
                feed_dict = {
                    crf.input: input_,
                    crf.label: label_
                }
                _, lss = sess.run([crf.trains, crf.loss], feed_dict)

            def test_step(input_, label_, fw, trans):
                totals_ = 0
                corrects_ = 0
                feed_dict = {crf.input: input_, crf.label: label_}
                unary_score, lens, _ = sess.run([crf.unary_score, crf.lens, crf.trains], feed_dict)
                for unary_, l_, lens_ in zip(unary_score, label_, lens):
                    u = unary_[:lens_]
                    l = l_[:lens_]
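The snippet above is cut off before the decoding step inside test_step. For reference only, a generic, self-contained TF1-style Viterbi decode of unary scores against a transition matrix looks like the sketch below; this is not the author's missing code.

# Generic sketch (TF 1.x): decode one sentence's unary scores with a transition matrix.
import numpy as np
from tensorflow.contrib.crf import viterbi_decode

length, label_num = 6, 5
u = np.random.randn(length, label_num)            # per-token unary (emission) scores
trans = np.random.randn(label_num, label_num)     # tag-to-tag transition scores
viterbi_seq, viterbi_score = viterbi_decode(u, trans)
print(viterbi_seq)                                 # best tag index per token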
# print information about datasets
print('train set: {} sentences'.format(len(corpus.train_set)))
print('valid set: {} sentences'.format(len(corpus.valid_set)))
print('test set: {} sentences'.format(len(corpus.test_set)))
print(m * '-')
# parameters from corpus
text_pad_idx = corpus.text_pad_idx
text_unk_idx = corpus.text_unk_idx
char_pad_idx = corpus.char_pad_idx
tag_pad_idx = corpus.tag_pad_idx
pad_token = corpus.pad_token
pretrained_embeddings = corpus.text_field.vocab.vectors
try:
    CRF(tag_pad_idx, pad_token, tag_names)
    use_crf = True
    print('using crf for models')
except:
    use_crf = False
    print('not using crf for models (incompatible tagging format)')
print(m * '-')
# shared cnn parameters
char_embedding_dim = 37
char_filter = 4
char_kernel = 3
# shared dropouts
embedding_dropout_ratio = 0.5
char_embedding_dropout_ratio = 0.25
cnn_dropout_ratio = 0.25
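The grouped character CNN used in the build_model_layers snippets earlier is easiest to see with these shared parameters plugged in. The sketch below only demonstrates the shape arithmetic; the max-pool over word length at the end is an assumption about how the per-word character features are collapsed, not code from this listing.

# Shape check for the grouped char CNN with char_embedding_dim=37, char_filter=4, char_kernel=3.
# groups=char_embedding_dim gives each of the 37 embedding channels its own 4 filters.
import torch
import torch.nn as nn

char_cnn = nn.Conv1d(in_channels=37, out_channels=37 * 4, kernel_size=3, groups=37)
chars = torch.randn(8, 37, 10)           # (batch, char_embedding_dim, word_length)
out = char_cnn(chars)                    # (8, 148, 8): 37*4 output channels, length 10-3+1
pooled = out.max(dim=2).values           # one 148-dim vector per word (assumed pooling)
print(pooled.shape)                      # torch.Size([8, 148])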