class Evaluator(object):

    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        print(f"Tokenizer from: {pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.model_type = model_conf.model
        if self.model_type == 'bert_seq':
            self.model = BertClassifier(model_conf)
            self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
            self.ds = SentimentDataset
        if self.model_type == 'GPT2':
            self.model = GPT2Classifier(model_conf)
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
            self.ds = GPT2Dataset
        self.model.load_state_dict(torch.load(train_conf.model_path))
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        with open(args.label_map_path) as fp:
            self.label_map = json.load(fp)
        self.id2label = {i: label_str for label_str, i in self.label_map.items()}

    def run(self, batch_size=64):
        test_path = self.conf.train_info.test_path
        test_loader = self.get_data_loader(test_path, batch_size)
        acc, recall, f1_score, cm, report, res = self.evaluate(test_loader)
        print(f"Accuracy score of the model is {acc}")
        print(f"Recall score of the model is {recall}")
        print(f"F1 score of the model is {f1_score}")
        print(f"Confusion matrix of the model is {cm}")
        print(report)
        # Write the misclassified examples to <run_dir>/logs/bad_case.json.
        dir_ = os.path.dirname(test_path)
        dir_ = os.path.dirname(dir_)
        dir_ = os.path.split(dir_)[0]
        new_path = os.path.join(dir_, 'logs', 'bad_case.json')
        with open(new_path, 'w') as f:
            for i in res:
                print(json.dumps(i, ensure_ascii=False), file=f)

    def evaluate(self, _loader):
        self.model.eval()
        y_true = []
        y_pred = []
        res = []  # misclassified samples, collected for error analysis
        with torch.no_grad():
            for batch in _loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                y = batch['labels']
                y = torch.squeeze(y, 1)
                y = y.to(self.device)
                logits = self.model(input_ids, attention_mask)
                y_true.append(y)
                y_pred.append(logits)
                pred_labels = torch.argmax(logits, dim=1)
                preds = pred_labels.cpu().numpy()
                # squeeze(1) rather than squeeze(): a plain squeeze() would
                # collapse a batch of size 1 to a 0-d array and break len().
                true = batch['labels'].squeeze(1).numpy()
                if len(true) < 1:
                    continue
                for i, c_y in enumerate(true):
                    if c_y != preds[i]:
                        res.append({
                            'true_label': self.id2label[c_y],
                            'pred_label': self.id2label[preds[i]],
                            'text': batch['text'][i]
                        })
        y_true = torch.cat(y_true)
        y_pred = torch.cat(y_pred)
        cm = metrics.cal_cm(y_true, y_pred)
        acc_score = metrics.cal_accuracy(y_true, y_pred)
        recall = metrics.cal_recall(y_true, y_pred)
        f1_score = metrics.cal_f1(y_true, y_pred)
        label_range = list(range(len(self.label_map)))
        # Class names ordered by their integer ids, for the report header.
        target_name = [x[0] for x in sorted(self.label_map.items(), key=lambda x: x[1])]
        report = metrics.get_classification_report(y_true, y_pred, label_range, target_name)
        return acc_score, recall, f1_score, cm, report, res

    def get_data_loader(self, f_path, batch_size):
        np.random.seed(14)
        texts, labels = prepare(f_path, self.label_map)
        ds = self.ds(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(ds,
                                     batch_size=batch_size,
                                     num_workers=self.conf.num_workers,
                                     shuffle=True)
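# --- Evaluator relies on a project-local `metrics` module that is not shown
# here. Below is a minimal sketch of those helpers, assuming they wrap
# sklearn.metrics and that `y_pred` arrives as a logits tensor; only the
# function names come from the calls above, the bodies are assumptions.
import torch
from sklearn import metrics as skm


def _to_labels(y_true, y_pred):
    # Convert tensors to flat numpy label arrays; argmax logits if needed.
    y_true = y_true.cpu().numpy()
    if y_pred.dim() > 1:
        y_pred = torch.argmax(y_pred, dim=1)
    return y_true, y_pred.cpu().numpy()


def cal_cm(y_true, y_pred):
    return skm.confusion_matrix(*_to_labels(y_true, y_pred))


def cal_accuracy(y_true, y_pred):
    return skm.accuracy_score(*_to_labels(y_true, y_pred))


def cal_recall(y_true, y_pred):
    return skm.recall_score(*_to_labels(y_true, y_pred), average='macro')


def cal_f1(y_true, y_pred):
    return skm.f1_score(*_to_labels(y_true, y_pred), average='macro')


def get_classification_report(y_true, y_pred, labels, target_names):
    return skm.classification_report(*_to_labels(y_true, y_pred),
                                     labels=labels, target_names=target_names)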
def main():
    # Hyper-parameters
    batch_size = 4
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    epochs = 10
    learning_rate = 5e-6  # the learning rate should not be too large

    # Build the datasets
    train_dataset = CNewsDataset('data/cnews/cnews.train.txt')
    valid_dataset = CNewsDataset('data/cnews/cnews.val.txt')
    # test_data = load_data('cnews/cnews.test.txt')

    # Batch the data
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    # test_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    # Load the BERT configuration
    bert_config = BertConfig.from_pretrained('bert-base-chinese')
    num_labels = len(train_dataset.labels)

    # Initialize the model
    model = BertClassifier(bert_config, num_labels).to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_acc = 0
    for epoch in range(1, epochs + 1):
        losses = 0    # running loss
        accuracy = 0  # running accuracy

        model.train()
        train_bar = tqdm(train_dataloader)
        for input_ids, token_type_ids, attention_mask, label_id in train_bar:
            model.zero_grad()
            train_bar.set_description('Epoch %i train' % epoch)

            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )

            loss = criterion(output, label_id.to(device))
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_id.to(device)).item() / len(pred_labels)
            accuracy += acc

            loss.backward()
            optimizer.step()
            train_bar.set_postfix(loss=loss.item(), acc=acc)

        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)

        # Validation
        model.eval()
        losses = 0
        accuracy = 0
        valid_bar = tqdm(valid_dataloader)
        with torch.no_grad():  # no gradients needed during validation
            for input_ids, token_type_ids, attention_mask, label_id in valid_bar:
                valid_bar.set_description('Epoch %i valid' % epoch)
                output = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    token_type_ids=token_type_ids.to(device),
                )

                loss = criterion(output, label_id.to(device))
                losses += loss.item()

                pred_labels = torch.argmax(output, dim=1)  # predicted labels
                acc = torch.sum(pred_labels == label_id.to(device)).item() / len(pred_labels)
                accuracy += acc
                valid_bar.set_postfix(loss=loss.item(), acc=acc)

        average_loss = losses / len(valid_dataloader)
        average_acc = accuracy / len(valid_dataloader)
        print('\tValid ACC:', average_acc, '\tLoss:', average_loss)

        # Keep the checkpoint with the best validation accuracy.
        if average_acc > best_acc:
            best_acc = average_acc
            torch.save(model.state_dict(), 'models/best_model.pkl')
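# --- `CNewsDataset` is defined elsewhere in the project. A minimal sketch of
# what the loop above assumes: each line of the cnews files is "label\ttext",
# and __getitem__ yields (input_ids, token_type_ids, attention_mask, label_id).
# The tokenizer name and max_len below are assumptions, not from the original.
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer


class CNewsDataset(Dataset):
    def __init__(self, path, max_len=128):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        self.labels = []   # distinct label names, in order of first appearance
        self.samples = []  # (text, label_id) pairs
        with open(path, encoding='utf-8') as f:
            for line in f:
                label, text = line.strip().split('\t', 1)
                if label not in self.labels:
                    self.labels.append(label)
                self.samples.append((text, self.labels.index(label)))
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label_id = self.samples[idx]
        enc = self.tokenizer(text, max_length=self.max_len,
                             padding='max_length', truncation=True,
                             return_tensors='pt')
        return (enc['input_ids'].squeeze(0),
                enc['token_type_ids'].squeeze(0),
                enc['attention_mask'].squeeze(0),
                torch.tensor(label_id))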
        progress_bar.set_postfix(postfix)
        # Periodically evaluate on the dev set (and after the final batch),
        # keeping the weights with the best dev accuracy.
        if (args.eval_every_n_batches > 0 and i % args.eval_every_n_batches == 0
                and len(train_dataloader) - i >= args.eval_every_n_batches // 2) or \
                i == len(train_dataloader):
            dev_metrics = initialize_metrics()
            dev_progress_bar = tqdm.tqdm(dev_dataloader)
            for j, batch in enumerate(dev_progress_bar):
                outputs = bert_classifier.validate_on_batch(batch)
                postfix = update_metrics(dev_metrics, outputs, batch["labels"])
                dev_progress_bar.set_postfix(postfix)
            if dev_metrics["accuracy"] > best_score:
                best_score = dev_metrics["accuracy"]
                best_weights = copy.deepcopy(bert_classifier.state_dict())

    bert_classifier.load_state_dict(best_weights)  # load the best state
    bert_classifier.eval()
    if args.save_file is not None:
        torch.save(best_weights, args.save_file)

    # Final pass over the dev set, restoring the original example order
    # via the "index" field of each batch.
    probs, labels = [None] * len(dev_data), [None] * len(dev_data)
    dev_dataloader = make_dataloader(dev_dataset,
                                     batch_size=args.dev_batch_size,
                                     shuffle=False)
    dev_progress_bar = tqdm.tqdm(dev_dataloader)
    for i, batch in enumerate(dev_progress_bar):
        outputs = bert_classifier.predict_on_batch(batch)
        for index, prob, label in zip(batch["index"], outputs["probs"], outputs["labels"]):
            probs[index], labels[index] = prob, label

    corr_labels = [int(elem[args.answer_field] == args.pos_label) for elem in dev_data]
    accuracy = accuracy_score(corr_labels, labels)
    metrics = precision_recall_fscore_support(corr_labels, labels)
    print("Accuracy: {:.2f}".format(100 * accuracy))
    for key, value in zip(["Precision", "Recall", "F1"], metrics):
        print("{}: Negative {:.2f}, Positive {:.2f}".format(key, *list(100 * value)))
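# --- `initialize_metrics` and `update_metrics` are project helpers not shown
# in the snippet. A minimal sketch consistent with how they are called above:
# a running-metrics dict whose summary doubles as the tqdm postfix. This is an
# assumption, not the original implementation, and it presumes `outputs`
# carries a per-batch `loss` and predicted `labels`.
import torch


def initialize_metrics():
    return {"n": 0, "correct": 0, "loss": 0.0, "accuracy": 0.0}


def update_metrics(metrics, outputs, labels):
    preds = torch.as_tensor(outputs["labels"])
    labels = torch.as_tensor(labels)
    metrics["n"] += len(labels)
    metrics["correct"] += int((preds == labels).sum())
    metrics["loss"] += float(outputs["loss"])
    metrics["accuracy"] = metrics["correct"] / metrics["n"]
    # Returned dict is what the progress bar displays.
    return {"loss": metrics["loss"], "acc": metrics["accuracy"]}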
    # (The snippet begins inside `comment_prediction`; the tokenization of
    # `comment` into the ids tensor `x` precedes these lines.)
    attention_mask = (x != 0).float().to(config.DEVICE).long()
    outputs = MODEL(x, attention_mask=attention_mask)
    return outputs.cpu().detach().numpy()


@app.route('/predict')
def predict():
    comment = request.args.get('comment')
    start_time = time.time()
    prediction = comment_prediction(comment)
    response = {
        'response': {
            label: str(prob)
            for label, prob in zip(config.CLASS_COLS, prediction[0])
        }
    }
    response['response']['comment'] = comment
    response['response']['time_taken'] = str(time.time() - start_time)
    return flask.jsonify(response)


if __name__ == '__main__':
    bert_config = BertConfig.from_pretrained(config.BERT_NAME)
    bert_config.num_labels = config.NUM_CLASSES
    MODEL = BertClassifier(bert_config)
    MODEL.load_state_dict(torch.load(config.TRAINED_MODEL_PATH))
    MODEL.to(config.DEVICE)
    MODEL.eval()
    app.run(host=config.HOST, port=config.PORT)
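# --- Only the tail of `comment_prediction` survives in the snippet above. A
# minimal sketch of the full function, assuming the tokenizer lives in
# `config.TOKENIZER` and `config.MAX_LEN` bounds the input (both names are
# assumptions); a torch.no_grad() context is added to skip gradient tracking.
def comment_prediction(comment):
    ids = config.TOKENIZER.encode(comment,
                                  add_special_tokens=True,
                                  max_length=config.MAX_LEN,
                                  truncation=True)
    x = torch.LongTensor([ids]).to(config.DEVICE)  # batch of one
    with torch.no_grad():
        attention_mask = (x != 0).float().to(config.DEVICE).long()
        outputs = MODEL(x, attention_mask=attention_mask)
    return outputs.cpu().detach().numpy()

# Example request once the server is running (hypothetical host/port):
#   curl "http://localhost:5000/predict?comment=great+service"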
def main():
    device = torch.device('cuda:3')

    # Build the datasets
    print('Loading training data')
    train_data = load_data('dataset/train.csv')
    print('Loading validation data')
    valid_data = load_data('dataset/test.csv')
    # test_data = load_data('cnews/cnews.test.txt')

    batch_size = 16
    # Batch the data
    print('Building batches')
    train_dataloader = DataLoader(train_data, batch_size=batch_size,
                                  shuffle=True, num_workers=3)
    valid_dataloader = DataLoader(valid_data, batch_size=batch_size,
                                  shuffle=False, num_workers=3)
    # test_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    # Load the BERT configuration
    bert_config = BertConfig.from_pretrained('./chinese_wwm_pytorch')
    bert_config.num_labels = num_labels  # num_labels is defined elsewhere in the script
    print(bert_config)

    # Initialize the model
    model = BertClassifier(bert_config)
    # model.to(device)

    # Hyper-parameters
    EPOCHS = 20
    learning_rate = 5e-6  # the learning rate should not be too large
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Cross-entropy loss
    criterion = nn.CrossEntropyLoss()

    with open('output.txt', 'w') as wf:
        wf.write('Batch Size: ' + str(batch_size) + '\tLearning Rate: ' +
                 str(learning_rate) + '\n')

    best_acc = 0
    # Set up data-parallel training. DataParallel keeps the master copy of the
    # parameters on device_ids[0], so device_ids[0] must match the `cuda:3`
    # device chosen above.
    net = torch.nn.DataParallel(model, device_ids=[3, 4])
    net.to(device)
    # model.module.avgpool = nn.AdaptiveAvgPool2d(7)

    # Start training
    for Epoch in range(1, EPOCHS + 1):
        losses = 0    # running loss
        accuracy = 0  # running accuracy
        print('Epoch:', Epoch)

        model.train()
        for batch_index, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            label_ids = batch[3].to(device)

            # Feed the three inputs to the model
            output = net(  # forward
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            loss = criterion(output, label_ids)
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_ids).item() / len(pred_labels)
            accuracy += acc

            # Per-batch accuracy and loss, if needed:
            # print('Epoch: %d | Train: | Batch: %d / %d | Acc: %f | Loss: %f' % (
            #     Epoch, batch_index + 1, len(train_dataloader), acc, loss.item()))

            # Zero the gradients, backpropagate, update the weights
            model.zero_grad()
            loss.backward()
            optimizer.step()
            # torch.cuda.empty_cache()

        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)
        # Print this epoch's training results
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)
        # with open('output.txt', 'a') as rf:
        #     output_to_file = '\nEpoch: ' + str(Epoch) + '\tTrain ACC:' + \
        #         str(average_acc) + '\tLoss: ' + str(average_loss)
        #     rf.write(output_to_file)

        # Validation
        model.eval()
        losses = 0    # running loss
        accuracy = 0  # running accuracy
        for batch_index, batch in enumerate(valid_dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            label_ids = batch[3].to(device)
            with torch.no_grad():
                output = model(  # forward
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                )

            loss = criterion(output, label_ids)
            losses += loss.item()

            # Both steps below operate directly on the output tensor
            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_ids).item() / len(pred_labels)
            accuracy += acc

        average_loss = losses / len(valid_dataloader)
        average_acc = accuracy / len(valid_dataloader)
        print('\tValid ACC:', average_acc, '\tLoss:', average_loss)
        # with open('output.txt', 'a') as rf:
        #     output_to_file = '\nEpoch: ' + str(Epoch) + '\tValid ACC:' + \
        #         str(average_acc) + '\tLoss: ' + str(average_loss) + '\n'
        #     rf.write(output_to_file)

        if average_acc > best_acc:
            best_acc = average_acc
            torch.save(model.state_dict(), 'best_model_on_trainset.pkl')
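# --- `load_data` is defined elsewhere. A minimal sketch matching how batches
# are unpacked above (batch[0..3] = input_ids, attention_mask, token_type_ids,
# label_ids); the CSV column names, tokenizer path, and max_len are
# assumptions, not from the original.
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from transformers import BertTokenizer


def load_data(path, max_len=128):
    df = pd.read_csv(path)
    tokenizer = BertTokenizer.from_pretrained('./chinese_wwm_pytorch')
    enc = tokenizer(list(df['text']), max_length=max_len,
                    padding='max_length', truncation=True,
                    return_tensors='pt')
    # Assumes an integer `label` column in the CSV.
    return TensorDataset(enc['input_ids'], enc['attention_mask'],
                         enc['token_type_ids'],
                         torch.tensor(df['label'].values))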
def run():

    def collate_fn(
            batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
            device: torch.device) -> Tuple[torch.LongTensor, torch.LongTensor]:
        x, y = list(zip(*batch))
        x = pad_sequence(x, batch_first=True, padding_value=0)
        y = torch.stack(y)
        return x.to(device), y.to(device)

    df = pd.read_csv("../inputs/Train.csv")
    # test = pd.read_csv("../inputs/Test.csv")
    train_df, val_df = train_test_split(df,
                                        stratify=df.label,
                                        test_size=VALID_SIZE,
                                        random_state=SEED)

    # One-hot encode the four target classes.
    labels = ["Depression", "Alcohol", "Suicide", "Drugs"]
    train = pd.concat([train_df["text"],
                       pd.get_dummies(train_df['label']).reindex(columns=labels)],
                      axis=1)  # .reset_index(drop=True)
    valid = pd.concat([val_df["text"],
                       pd.get_dummies(val_df['label']).reindex(columns=labels)],
                      axis=1)  # .reset_index(drop=True)

    if DEVICE == 'cpu':
        print('cpu')
    else:
        n_gpu = torch.cuda.device_count()
        print(torch.cuda.get_device_name(0))

    train_dataset = MentalHealthDataset(config.TOKENIZER, train, lazy=True)
    valid_dataset = MentalHealthDataset(config.TOKENIZER, valid, lazy=True)
    collate_fn = partial(collate_fn, device=DEVICE)
    train_sampler = RandomSampler(train_dataset)
    valid_sampler = RandomSampler(valid_dataset)
    train_iterator = DataLoader(train_dataset,
                                batch_size=config.TRAIN_BATCH_SIZE,
                                sampler=train_sampler,
                                collate_fn=collate_fn)
    valid_iterator = DataLoader(valid_dataset,
                                batch_size=config.VALID_BATCH_SIZE,
                                sampler=valid_sampler,
                                collate_fn=collate_fn)

    # model = BertClassifier().to(DEVICE)
    model = BertClassifier(BertModel.from_pretrained(config.BERT_PATH), 4).to(DEVICE)

    # Exclude bias and LayerNorm weights from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # Triangular learning rate: grows linearly until half of the first epoch,
    # then decays linearly.
    warmup_steps = 10 ** 3
    total_steps = len(train_iterator) * config.EPOCHS - warmup_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    # optimizer = torch.optim.Adam(model.parameters(), lr=LR)  # 1e-4)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min",
    #     patience=5, factor=0.3, min_lr=1e-10, verbose=True)

    for epoch in range(config.EPOCHS):
        print('=' * 5, f"EPOCH {epoch}", '=' * 5)
        engine.train_fn(train_iterator, model, optimizer, scheduler)
        engine.eval_fn(valid_iterator, model)
    model.eval()

    # Inference on the test set, batch by batch, writing class probabilities
    # into the submission frame.
    test_df = pd.read_csv("../inputs/Test.csv")
    submission = pd.read_csv('../inputs/SampleSubmission.csv')
    res = np.zeros((submission.shape[0], len(labels)))
    for i in tqdm(range(len(test_df) // config.TRAIN_BATCH_SIZE + 1)):
        batch_df = test_df.iloc[i * config.TRAIN_BATCH_SIZE:(i + 1) * config.TRAIN_BATCH_SIZE]
        assert (batch_df["ID"] == submission["ID"]
                [i * config.TRAIN_BATCH_SIZE:(i + 1) * config.TRAIN_BATCH_SIZE]).all(), \
            "Id mismatch"
        texts = []
        for text in batch_df["text"].tolist():
            text = config.TOKENIZER.encode(text, add_special_tokens=True)
            if len(text) > config.MAX_LEN:
                # Truncate but keep the final [SEP] token.
                text = text[:config.MAX_LEN - 1] + [config.TOKENIZER.sep_token_id]
            texts.append(torch.LongTensor(text))
        x = pad_sequence(texts,
                         batch_first=True,
                         padding_value=config.TOKENIZER.pad_token_id).to(DEVICE)
        mask = (x != config.TOKENIZER.pad_token_id).float().to(DEVICE)
        with torch.no_grad():
            _, outputs = model(x, attention_mask=mask)
        outputs = outputs.cpu().numpy()
        submission.loc[i * config.TRAIN_BATCH_SIZE:
                       (i * config.TRAIN_BATCH_SIZE + len(outputs) - 1), labels] = outputs
    submission.to_csv("../subs/submission_2.csv", index=False)
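# --- `BertClassifier` here is assumed to return a pair whose second element
# is per-class probabilities (the first element is discarded above). A minimal
# sketch of such a head with sigmoid outputs for the four labels; this is an
# assumption, not the original implementation, which may use softmax instead.
import torch
import torch.nn as nn


class BertClassifier(nn.Module):
    def __init__(self, bert, num_labels):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        out = self.bert(input_ids, attention_mask=attention_mask)
        pooled = out.last_hidden_state[:, 0]  # [CLS] representation
        logits = self.classifier(pooled)
        return logits, torch.sigmoid(logits)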