# Check which device is currently in use
print('current device:', torch.cuda.current_device())
n_gpu = 1
params.n_gpu = n_gpu

# Set the random seed for reproducible experiments
random.seed(args.seed)
torch.manual_seed(args.seed)
params.seed = args.seed
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

# Set the logger
utils.set_logger(save=True, log_path=os.path.join(params.params_path, 'train.log'))
logging.info("Model type: ")
logging.info("device: {}".format(params.device))

logging.info('Init pre-train model...')
bert_config = BertConfig.from_json_file(os.path.join(params.bert_model_dir, 'bert_config.json'))
model = BertForTokenClassification(config=bert_config, params=params)
nezha_utils.torch_init_model(model, os.path.join(params.bert_model_dir, 'pytorch_model.bin'))
# Save the bert config
model.to(params.device)
if params.n_gpu > 1 and args.multi_gpu:
    model = torch.nn.DataParallel(model)
logging.info('-done')

# Train and evaluate the model
logging.info("Starting training for {} epoch(s)".format(args.epoch_num))
train_and_evaluate(model, params, args.restore_file)
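
# NOTE: utils.set_logger is called above but is not shown in this snippet. The sketch
# below only illustrates what such a helper presumably does (assumption: it attaches a
# console handler and, when save=True, a file handler to the root logger); it is not
# the project's actual implementation.
def set_logger(save=False, log_path=None):
    """Configure the root logger to log to the console and, optionally, to a file."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        # Console output
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(asctime)s: %(message)s'))
        logger.addHandler(stream_handler)
        # Optional file output
        if save and log_path is not None:
            file_handler = logging.FileHandler(log_path)
            file_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
            logger.addHandler(file_handler)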
def train(train_iter, test_iter, config):
    """Fine-tune the pre-trained model on train_iter and evaluate on test_iter after each epoch."""
    # Prepare model; reload weights from restore_file if specified
    if config.pretrainning_model == 'nezha':  # NEZHA model
        Bert_config = BertConfig.from_json_file(config.bert_config_file)
        model = BertForTokenClassification(config=Bert_config, params=config)
        nezha_utils.torch_init_model(model, config.bert_file)
    elif config.pretrainning_model == 'albert':
        Bert_config = AlbertConfig.from_pretrained(config.model_path)
        model = BertForTokenClassification.from_pretrained(config.model_path, config=Bert_config)
    else:
        Bert_config = RobertaConfig.from_pretrained(config.bert_config_file, output_hidden_states=True)
        model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path=config.model_path,
                                                           config=Bert_config,
                                                           params=config)
    Bert_config.output_hidden_states = True  # expose the output of every layer

    model.to(device)
    # Multi-GPU training
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare the optimizer for fine-tuning
    # Collect the model parameters
    param_optimizer = list(model.named_parameters())
    # Pre-trained (backbone) parameters; NEZHA parameters are also named 'bert'
    param_pre = [(n, p) for n, p in param_optimizer if 'bert' in n or 'electra' in n]
    # Task-specific (non-backbone) parameters
    param_middle = [(n, p) for n, p in param_optimizer
                    if not any(s in n for s in ('bert', 'crf', 'electra', 'albert')) or 'dym_weight' in n]
    # Parameters that should not receive weight decay
    no_decay = ['bias', 'LayerNorm', 'dym_weight', 'layer_norm']
    # Group the parameters
    optimizer_grouped_parameters = [
        # Pre-trained parameters, with weight decay
        {'params': [p for n, p in param_pre if not any(nd in n for nd in no_decay)],
         'weight_decay': config.decay_rate, 'lr': config.embed_learning_rate},
        # Pre-trained parameters, without weight decay
        {'params': [p for n, p in param_pre if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': config.embed_learning_rate},
        # Task-specific parameters, with weight decay
        {'params': [p for n, p in param_middle if not any(nd in n for nd in no_decay)],
         'weight_decay': config.decay_rate, 'lr': config.learning_rate},
        # Task-specific parameters, without weight decay
        {'params': [p for n, p in param_middle if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'lr': config.learning_rate},
    ]
    num_train_optimization_steps = train_iter.num_records // config.gradient_accumulation_steps * config.train_epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         warmup=config.warmup_proportion,
                         schedule="warmup_cosine",
                         t_total=num_train_optimization_steps)

    logger.info("***** Running training *****")
    logger.info("  Batch size = %d", config.batch_size)
    logger.info("  Num epochs = %d", config.train_epoch)
    logger.info("  Learning rate = %f", config.learning_rate)

    cum_step = 0
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(config.save_model, "runs_" + str(gpu_id), timestamp))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print("Writing to {}\n".format(out_dir))

    draw_step_list = []
    draw_loss_list = []
    for i in range(config.train_epoch):
        model.train()
        for input_ids_list, input_mask_list, segment_ids_list, label_ids_list, tokens_list in tqdm(train_iter):
            # Convert the lists to tensors on the training device
            # (see the list2ts2device sketch after this function)
            loss = model(input_ids=list2ts2device(input_ids_list),
                         token_type_ids=list2ts2device(segment_ids_list),
                         attention_mask=list2ts2device(input_mask_list),
                         labels=list2ts2device(label_ids_list))
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-GPU
            # Gradient accumulation
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps
            if cum_step % 10 == 0:
                draw_step_list.append(cum_step)
                draw_loss_list.append(loss.item())
            if cum_step % 100 == 0:
                format_str = 'step {}, loss {:.4f} lr {:.5f}'
                print(format_str.format(cum_step, loss.item(), config.learning_rate))
            loss.backward()  # backpropagate to obtain the gradients
            if (cum_step + 1) % config.gradient_accumulation_steps == 0:
                # Perform an update using the accumulated gradients
                optimizer.step()
                model.zero_grad()
            cum_step += 1

        # Evaluate on the dev set after each epoch
        # (learning-rate decay is handled per step by BertAdam's warmup_cosine schedule)
        p, r, f1 = set_test(model, test_iter)
        print('dev set : step_{}, precision_{}, recall_{}, F1_{}'.format(cum_step, p, r, f1))
        # Save the model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(out_dir, 'model_{:.4f}_{:.4f}_{:.4f}_{}.bin'.format(p, r, f1, str(cum_step)))
        torch.save(model_to_save, output_model_file)

    # Persist the step/loss curve for later plotting
    with open(Config().processed_data + 'step_loss_data.pickle', 'wb') as mf:
        draw_dict = {'step': draw_step_list, 'loss': draw_loss_list}
        pickle.dump(draw_dict, mf)
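
# NOTE: list2ts2device is used in the training loop above but not defined in this
# snippet. A minimal sketch of what it presumably does (assumption: it converts a
# nested Python list of ids into a LongTensor and moves it to the global training
# device); the project's real helper may differ.
def list2ts2device(target_list):
    """Convert a batched list of ids into a LongTensor on the training device."""
    # e.g. list2ts2device(input_ids_list) -> tensor of shape (batch_size, seq_len)
    return torch.tensor(target_list, dtype=torch.long).to(device)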