def main(config, model_times, myProcessor, model_config=None):
    # print(os.path.join(config.output_dir, model_times))
    if not os.path.exists(os.path.join(config.output_dir, model_times)):
        os.makedirs(os.path.join(config.output_dir, model_times))
    if not os.path.exists(os.path.join(config.cache_dir, model_times)):
        os.makedirs(os.path.join(config.cache_dir, model_times))

    # Output files for the fine-tuned model weights and config
    output_model_file = os.path.join(config.output_dir, model_times, WEIGHTS_NAME)
    output_config_file = os.path.join(config.output_dir, model_times, CONFIG_NAME)

    # Device setup
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    config.train_batch_size = config.train_batch_size // config.gradient_accumulation_steps

    # Set random seeds
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)

    # Data preparation: the processor is the only thing that changes per dataset
    processor = myProcessor()
    tokenizer = BertTokenizer.from_pretrained(
        config.bert_vocab_file, do_lower_case=config.do_lower_case)  # tokenizer

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if True:  # Train and dev (constant gate kept from the original)
        train_dataloader, train_examples_len = load_data(
            config.data_dir, tokenizer, processor, config.max_seq_length,
            config.train_batch_size, "train")
        dev_dataloader, _ = load_data(
            config.data_dir, tokenizer, processor, config.max_seq_length,
            config.dev_batch_size, "dev")

        num_train_optimization_steps = int(
            train_examples_len / config.train_batch_size /
            config.gradient_accumulation_steps) * config.num_train_epochs

        # Model preparation
        print("model name is {}".format(config.model_name))
        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            model = BertOrigin.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertCNN":
            from BertCNN.BertCNN import BertCNN
            filter_sizes = [int(val) for val in model_config.filter_sizes.split()]
            model = BertCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels,
                n_filters=model_config.filter_num, filter_sizes=filter_sizes)
        elif config.model_name == "BertATT":
            from BertATT.BertATT import BertATT
            model = BertATT.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertRCNN":
            from BertRCNN.BertRCNN import BertRCNN
            model = BertRCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertRNNCNN":
            from BertRCNN.BertRNNCNN import BertRNNCNN
            model = BertRNNCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)

        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=gpu_ids)

        # Optimizer setup: no weight decay on bias and LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=config.warmup_proportion,
                             t_total=num_train_optimization_steps)

        # Loss function
        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)

        train(config.num_train_epochs, n_gpu, model, train_dataloader, dev_dataloader,
              optimizer, criterion, config.gradient_accumulation_steps, device, label_list,
              output_model_file, output_config_file, config.log_dir,
              config.eval_every, config.early_stop)

    if True:  # Test (constant gate kept from the original)
        test_dataloader, _ = load_data(
            config.data_dir, tokenizer, processor, config.max_seq_length,
            config.test_batch_size, "test")

        # Rebuild the model from the saved config; the fine-tuned weights are loaded below
        bert_config = BertConfig(output_config_file)

        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            model = BertOrigin(bert_config, num_labels=num_labels)
        elif config.model_name == "BertCNN":
            from BertCNN.BertCNN import BertCNN
            filter_sizes = [int(val) for val in model_config.filter_sizes.split()]
            model = BertCNN(bert_config, num_labels=num_labels,
                            n_filters=model_config.filter_num, filter_sizes=filter_sizes)
        elif config.model_name == "BertATT":
            from BertATT.BertATT import BertATT
            # the pretrained weights are overwritten by the fine-tuned state_dict below
            model = BertATT.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertRCNN":
            from BertRCNN.BertRCNN import BertRCNN
            model = BertRCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertRNNCNN":
            from BertRCNN.BertRNNCNN import BertRNNCNN
            model = BertRNNCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)

        model.load_state_dict(torch.load(output_model_file))
        model.to(device)

        # Loss function
        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)

        # Test the model
        test_loss, test_acc, test_report, test_auc = evaluate(
            model, test_dataloader, criterion, device, label_list)

        print("-------------- Test -------------")
        print(f'\t Loss: {test_loss: .3f} | Acc: {test_acc*100: .3f} % | AUC: {test_auc}')

        for label in label_list:
            print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
                label, test_report[label]['precision'], test_report[label]['recall'],
                test_report[label]['f1-score']))

        print_list = ['macro avg', 'weighted avg']
        for label in print_list:
            print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
                label, test_report[label]['precision'], test_report[label]['recall'],
                test_report[label]['f1-score']))
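# Illustrative entry point (not part of the original source): a minimal sketch of how this
# main() might be invoked. The `args.get_args()` helper, the "model_1/" run directory and the
# MyTaskProcessor import are assumptions standing in for the project's actual argument parser,
# run naming and dataset processor.
if __name__ == "__main__":
    model_name = "BertOrigin"  # or "BertCNN", "BertATT", "BertRCNN", "BertRNNCNN"
    from Processors.MyTaskProcessor import MyTaskProcessor  # hypothetical processor module

    if model_name == "BertCNN":
        from BertCNN import args
        config = args.get_args()  # assumed helper returning the config namespace
        # BertCNN additionally needs filter settings, passed here via model_config
        main(config, "model_1/", MyTaskProcessor, model_config=config)
    else:
        from BertOrigin import args
        config = args.get_args()  # assumed helper returning the config namespace
        main(config, "model_1/", MyTaskProcessor)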
def main(config, bert_vocab_file, bert_model_dir):

    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)
    if not os.path.exists(config.cache_dir):
        os.makedirs(config.cache_dir)

    # Output files for the fine-tuned model weights and config
    output_model_file = os.path.join(config.output_dir, config.weights_name)
    output_config_file = os.path.join(config.output_dir, config.config_name)

    # Device setup
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    config.train_batch_size = config.train_batch_size // config.gradient_accumulation_steps

    # Set random seeds
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)

    tokenizer = BertTokenizer.from_pretrained(
        bert_vocab_file, do_lower_case=config.do_lower_case)
    label_list = ["0", "1", "2", "3"]

    if config.do_train:
        # Data preparation
        train_file = os.path.join(config.data_dir, "train.json")
        dev_file = os.path.join(config.data_dir, "dev.json")

        train_dataloader, train_len = load_data(
            train_file, tokenizer, config.max_seq_length, config.train_batch_size)
        dev_dataloader, dev_len = load_data(
            dev_file, tokenizer, config.max_seq_length, config.dev_batch_size)

        num_train_steps = int(train_len / config.train_batch_size /
                              config.gradient_accumulation_steps * config.num_train_epochs)

        # Model preparation
        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            model = BertOrigin.from_pretrained(
                bert_model_dir, cache_dir=config.cache_dir, num_choices=4)

        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=gpu_ids)

        # Optimizer setup: drop the pooler and skip weight decay on bias/LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=config.warmup_proportion,
                             t_total=num_train_steps)

        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)

        train(config.num_train_epochs, n_gpu, train_dataloader, dev_dataloader, model,
              optimizer, criterion, config.gradient_accumulation_steps, device, label_list,
              output_model_file, output_config_file, config.log_dir, config.print_step)

    # Test
    test_file = os.path.join(config.data_dir, "test.json")
    test_dataloader, _ = load_data(
        test_file, tokenizer, config.max_seq_length, config.test_batch_size)

    # Rebuild the model from the saved config and load the fine-tuned weights
    bert_config = BertConfig(output_config_file)
    if config.model_name == "BertOrigin":
        from BertOrigin.BertOrigin import BertOrigin
        model = BertOrigin(bert_config, num_choices=len(label_list))

    model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    test_loss, test_acc, test_report = evaluate(
        model, test_dataloader, criterion, device, label_list)

    print("-------------- Test -------------")
    print(f'\t Loss: {test_loss: .3f} | Acc: {test_acc*100: .3f} %')

    for label in label_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'], test_report[label]['recall'],
            test_report[label]['f1-score']))

    print_list = ['macro avg', 'weighted avg']
    for label in print_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'], test_report[label]['recall'],
            test_report[label]['f1-score']))
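# Illustrative usage (not in the original source): a hedged sketch of calling this main()
# with explicit vocab and model paths. The path values and the `args.get_args()` helper are
# assumptions for demonstration only.
if __name__ == "__main__":
    from BertOrigin import args
    config = args.get_args()                                  # assumed config loader
    bert_vocab_file = "/path/to/bert-base-chinese/vocab.txt"  # placeholder path
    bert_model_dir = "/path/to/bert-base-chinese"             # placeholder path
    main(config, bert_vocab_file, bert_model_dir)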
def main(config, model_times, label_list):

    if not os.path.exists(config.output_dir + model_times):
        os.makedirs(config.output_dir + model_times)
    if not os.path.exists(config.cache_dir + model_times):
        os.makedirs(config.cache_dir + model_times)

    # Output files for the fine-tuned BERT model weights and config
    output_model_file = os.path.join(config.output_dir, model_times, WEIGHTS_NAME)
    output_config_file = os.path.join(config.output_dir, model_times, CONFIG_NAME)

    # Device setup
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    config.train_batch_size = config.train_batch_size // config.gradient_accumulation_steps

    # Set random seeds
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)

    # Data preparation
    tokenizer = BertTokenizer.from_pretrained(
        config.bert_vocab_file, do_lower_case=config.do_lower_case)  # tokenizer
    num_labels = len(label_list)

    # Train and dev
    if config.do_train:
        train_dataloader, train_examples_len = load_data(
            config.data_dir, tokenizer, config.max_seq_length,
            config.train_batch_size, "train", label_list)
        dev_dataloader, _ = load_data(
            config.data_dir, tokenizer, config.max_seq_length,
            config.dev_batch_size, "dev", label_list)

        num_train_optimization_steps = int(
            train_examples_len / config.train_batch_size /
            config.gradient_accumulation_steps) * config.num_train_epochs

        # Model preparation
        print("model name is {}".format(config.model_name))
        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            model = BertOrigin.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertCNN":
            from BertCNN.BertCNN import BertCNN
            filter_sizes = [int(val) for val in config.filter_sizes.split()]
            model = BertCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels,
                n_filters=config.filter_num, filter_sizes=filter_sizes)
        elif config.model_name == 'BertLSTM':
            from BertLSTM.BertLSTM import BertLSTM
            model = BertLSTM.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels,
                rnn_hidden_size=config.hidden_size, num_layers=config.num_layers,
                bidirectional=config.bidirectional, dropout=config.dropout)
        elif config.model_name == "BertATT":
            from BertATT.BertATT import BertATT
            model = BertATT.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels)
        elif config.model_name == "BertRCNN":
            from BertRCNN.BertRCNN import BertRCNN
            model = BertRCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels,
                rnn_hidden_size=config.hidden_size, num_layers=config.num_layers,
                bidirectional=config.bidirectional, dropout=config.dropout)
        elif config.model_name == "BertCNNPlus":
            from BertCNNPlus.BertCNNPlus import BertCNNPlus
            filter_sizes = [int(val) for val in config.filter_sizes.split()]
            model = BertCNNPlus.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels,
                n_filters=config.filter_num, filter_sizes=filter_sizes)
        elif config.model_name == "BertDPCNN":
            from BertDPCNN.BertDPCNN import BertDPCNN
            model = BertDPCNN.from_pretrained(
                config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels,
                filter_num=config.filter_num)

        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=gpu_ids)

        # Optimizer setup: no weight decay on bias and LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=config.warmup_proportion,
                             t_total=num_train_optimization_steps)

        # Loss function
        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)

        train(config.num_train_epochs, n_gpu, model, train_dataloader, dev_dataloader,
              optimizer, criterion, config.gradient_accumulation_steps, device, label_list,
              output_model_file, output_config_file, config.log_dir,
              config.print_step, config.early_stop)

    # Test data
    test_dataloader, _ = load_data(
        config.data_dir, tokenizer, config.max_seq_length,
        config.test_batch_size, "test", label_list)

    # Rebuild the model from the saved config and load the fine-tuned weights
    bert_config = BertConfig(output_config_file)
    if config.model_name == "BertOrigin":
        from BertOrigin.BertOrigin import BertOrigin
        model = BertOrigin(bert_config, num_labels=num_labels)
    elif config.model_name == "BertCNN":
        from BertCNN.BertCNN import BertCNN
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = BertCNN(bert_config, num_labels=num_labels,
                        n_filters=config.filter_num, filter_sizes=filter_sizes)
    elif config.model_name == 'BertLSTM':
        from BertLSTM.BertLSTM import BertLSTM
        model = BertLSTM(bert_config, num_labels, config.hidden_size,
                         config.num_layers, config.bidirectional, config.dropout)
    elif config.model_name == "BertATT":
        from BertATT.BertATT import BertATT
        model = BertATT(bert_config, num_labels=num_labels)
    elif config.model_name == "BertRCNN":
        from BertRCNN.BertRCNN import BertRCNN
        model = BertRCNN(bert_config, num_labels, config.hidden_size,
                         config.num_layers, config.bidirectional, config.dropout)
    elif config.model_name == "BertCNNPlus":
        from BertCNNPlus.BertCNNPlus import BertCNNPlus
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = BertCNNPlus(bert_config, num_labels=num_labels,
                            n_filters=config.filter_num, filter_sizes=filter_sizes)
    elif config.model_name == "BertDPCNN":
        from BertDPCNN.BertDPCNN import BertDPCNN
        model = BertDPCNN(bert_config, num_labels=num_labels, filter_num=config.filter_num)

    model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Test the model
    test_loss, test_acc, test_report, test_auc, all_idx, all_labels, all_preds = evaluate_save(
        model, test_dataloader, criterion, device, label_list)

    print("-------------- Test -------------")
    print(f'\t Loss: {test_loss: .3f} | Acc: {test_acc*100: .3f} % | AUC: {test_auc}')

    for label in label_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'], test_report[label]['recall'],
            test_report[label]['f1-score']))

    print_list = ['macro avg', 'weighted avg']
    for label in print_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'], test_report[label]['recall'],
            test_report[label]['f1-score']))
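# Illustrative usage (not in the original source): a sketch of running this main() for a
# specific dataset. The binary label list, run directory name and `args.get_args()` helper
# are assumptions; substitute the project's real label set and config loader.
if __name__ == "__main__":
    label_list = ["0", "1"]      # placeholder label set
    from BertOrigin import args
    config = args.get_args()     # assumed config loader
    main(config, "model_1/", label_list)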
def main(config, bert_vocab_file, do_prediction=False):

    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    # config.gpu_ids is a comma-separated string, e.g. "1,2,3"
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split(',')]
    print("gpu_ids: {}".format(gpu_ids))
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    # label_list = ["0", "1"]
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    if not do_prediction:
        # Data preparation
        train_file = os.path.join(config.data_dir, "train.csv")
        dev_file = os.path.join(config.data_dir, "valid.csv")

        train_dataloader, train_len = load_data(train_file, config.batch_size, train=True)
        print("Num train_set: {}".format(train_len))

        valid_train_dataloader, valid_train_len = load_data(train_file, config.batch_size)
        print("Num valid_train_set: {}".format(valid_train_len))

        dev_dataloader, dev_len = load_data(dev_file, config.batch_size)
        print("Num dev_set: {}".format(dev_len))

        num_train_steps = int(train_len / config.batch_size /
                              config.gradient_accumulation_steps * config.num_train_epochs)

        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            model = BertOrigin(config, num_classes=2)

        model.to(device)
        if n_gpu > 1:
            model = nn.DataParallel(model, device_ids=gpu_ids)

        # No weight decay for bias and LayerNorm (gamma/beta) parameters
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(optimizer_parameters, lr=config.learning_rate,
                          betas=(0.9, 0.999), weight_decay=1e-8, correct_bias=False)

        # BERT training trick: the learning rate is not constant; it warms up first and then
        # decays, and this scheduler sets up that linear warmup/decay schedule.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=config.num_warmup_steps,
            num_training_steps=num_train_steps)

        best_model_state_dict = train(
            config.num_train_epochs, n_gpu, train_dataloader, dev_dataloader,
            valid_train_dataloader, model, optimizer, criterion,
            config.gradient_accumulation_steps, config.max_grad_norm, device,
            scheduler, config.output_dir)

        torch.save(best_model_state_dict, config.best_model_file)

    else:
        print('---**Enter Test**---')
        # dev_dataloader, dev_examples, dev_features, dev_labels = dev[:-1]
        test_file = os.path.join(config.data_dir, "test.csv")
        test_dataloader, test_len = load_data(test_file, config.batch_size)
        print('Num test_set: {}'.format(test_len))

        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            test_model = BertOrigin(config, num_classes=2)

        pretrained_model_dict = torch.load(config.best_model_file)
        new_state_dict = OrderedDict()
        for k, value in pretrained_model_dict.items():
            # name = k[7:]  # strip the `module.` prefix if the weights were saved from DataParallel
            new_state_dict[k] = value

        test_model.load_state_dict(new_state_dict, strict=True)
        test_model.to(device)
        if n_gpu > 1:
            test_model = nn.DataParallel(test_model, device_ids=gpu_ids)

        test_acc, test_f1 = evaluate(test_model, test_dataloader, device)
        print(f'\t Acc: {test_acc*100: .3f}% | f1: {test_f1*100: .3f}%')
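# Illustrative usage (not in the original source): a sketch of a train-then-test run with this
# main(). The vocab path and `args.get_args()` helper are assumptions for demonstration only.
if __name__ == "__main__":
    from BertOrigin import args
    config = args.get_args()                                  # assumed config loader
    bert_vocab_file = "/path/to/bert-base-chinese/vocab.txt"  # placeholder path
    main(config, bert_vocab_file, do_prediction=False)        # train and validate
    main(config, bert_vocab_file, do_prediction=True)         # evaluate on the test set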