def evaluate(model, test_loader, device, criterion, n_class=41):
    """Evaluate an audio tagging model on `test_loader`.

    Labels come as pairs (`xlabel[0]`, `xlabel[1]`) that are turned into a
    two-hot target of width `n_class` for the loss; IoU is computed against
    the raw label pair.

    Returns:
        (mean loss over batches, IoU summed over samples / total samples).
    """
    total_iou = 0.0
    total_num_data = 0.0
    total_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['audio']
            xlabel = data['label']
            # number of samples in this batch (length of the first label list)
            num_data = len(xlabel[0])
            x = x.to(device)
            xlabel_enc = two_hot_encode(xlabel[0], xlabel[1], n_dim=n_class)
            xlabel_enc = xlabel_enc.to(device)
            out = model(x)
            # model returns (raw logits, discrete predictions)
            logit, pred = out
            loss = criterion(logit, xlabel_enc)
            total_iou += calculate_iou(pred, xlabel)
            total_num_data += num_data
            total_loss += loss.item()
            # free GPU memory between batches
            del x, xlabel
            torch.cuda.empty_cache()
    # NOTE(review): relies on `i` surviving the loop — raises NameError if
    # test_loader is empty.
    logger.info('test loss: {loss:.4f}\ttest iou: {iou:.4f}'.format(
        loss=total_loss / (i + 1), iou=total_iou / total_num_data))
    return total_loss / (i + 1), total_iou / total_num_data
def evaluate(model, test_loader, device, criterion):
    """Evaluate an image classifier on `test_loader`.

    Accumulates loss and accuracy per batch and collects all labels and
    predictions to compute a per-class F1 array, summarised by its
    geometric mean.

    Returns:
        (mean loss over batches, accuracy, geometric mean of per-class F1).
    """
    correct = 0.0
    num_data = 0.0
    total_loss = 0.0
    label = []
    prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['image']
            xlabel = data['label']
            x = x.to(device)
            xlabel = xlabel.to(device)
            out = model(x)
            # model returns (raw logits, argmax predictions)
            logit, pred = out
            loss = criterion(logit, xlabel)
            correct += torch.sum(pred == xlabel).item()
            num_data += xlabel.size(0)
            total_loss += loss.item()
            # accumulate for the epoch-level F1 computation below
            label = label + xlabel.tolist()
            prediction = prediction + pred.detach().cpu().tolist()
            # free GPU memory between batches
            del x, xlabel
            torch.cuda.empty_cache()
    f1_array = f1_score(label, prediction, average=None)
    f1_mean = gmean(f1_array)
    logger.info(
        'test loss: {loss:.4f}\ttest acc: {acc:.4f}\ttest F1: {f1:.4f}'.format(
            loss=total_loss / (i + 1), acc=correct / num_data, f1=f1_mean))
    return total_loss / (i + 1), correct / num_data, f1_mean
def unclassified_predict(model, unclassified_loader, device, n_class=5):
    """Run the model over unlabeled data and bucket results by predicted class.

    The model's call signature depends on its `name`: ensemble models take a
    second image, 'Trainable' models take the raw category index as well.

    Returns:
        List of `n_class` lists; bucket `c` holds tuples
        (probability/logit for class c, image name, c) for every sample
        predicted as class `c`.
    """
    predictedData = [[] for i in range(n_class)]
    lenul = len(unclassified_loader)
    x2 = None
    with torch.no_grad():
        for i, data in enumerate(unclassified_loader):
            img_name = data['image_name']
            x = data['image']
            # ensemble models consume a second view of the image
            if "Ensemble" in model.name:
                x2 = data['image_2']
                x2 = x2.to(device)
            category_oneh = data['category_onehot']
            category = data['category']
            category = category.to(device)
            category_oneh = category_oneh.to(device)
            x = x.to(device)
            # dispatch on model flavour (signature differs per variant)
            if 'Ensemble' in model.name:
                out = model(x, x2, category_oneh, category)
            elif 'Trainable' in model.name:
                out = model(x, category_oneh, category)
            else:
                out = model(x, category_oneh)
            logit, pred = out
            for item in zip(img_name, pred, logit):
                predict = int(item[1])
                predictedData[predict].append(
                    (float(item[2][predict]), item[0], predict))  # prob, fname, predict
            if i % 100 == 0:  # log progress
                logger.info(f'predict unclassied data {i} / {lenul}')
    return predictedData
def embedding_training(model, train_loader, optimizer, criterion, device,
                       epoch, total_epochs):
    """Train a category-embedding model for one epoch.

    The criterion may be a plain CrossEntropyLoss or a LabelSmoothingLoss
    that additionally consumes the per-sample category-possibility mask.

    Returns:
        (mean loss over batches, accuracy over the epoch).
    """
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        category_pos = data['category_possible']
        category = data['category']
        category_oneh = data['category_onehot']
        x = x.to(device)
        xlabel = xlabel.to(device)
        category = category.to(device)
        category_pos = category_pos.to(device)
        category_oneh = category_oneh.to(device)
        optimizer.zero_grad()  # zero_grad always pairs with step() below
        out = model(x, category_oneh, category)
        logit, pred = out
        # LabelSmoothingLoss needs the category mask; CE does not
        if isinstance(criterion, torch.nn.CrossEntropyLoss):
            loss = criterion(logit, xlabel)
        elif isinstance(criterion, LabelSmoothingLoss):
            loss = criterion(logit, xlabel, category_pos)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        # accuracy when logits are masked to category-possible classes
        category_pred = torch.argmax(logit * category_pos, dim=-1)
        category_correct += torch.sum(category_pred == xlabel).item()
        correct += torch.sum(pred == xlabel).item()
        num_data += xlabel.size(0)
        if i % 100 == 0:  # print every 100 mini-batches
            # BUGFIX: average over the batches accumulated since the last
            # log (100, or 1 on the very first step) instead of the stale
            # hard-coded 2000 divisor, which under-reported the loss 20x.
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec".
                format(epoch, total_epochs, i, len(train_loader),
                       running_loss / (100 if i else 1),
                       time.time() - start))
            running_loss = 0.0
    logger.info(
        '[{}/{}]\tloss: {:.4f}\tacc: {:.4f} \tcategory_acc : {:.4f}'.format(
            epoch, total_epochs, total_loss / (i + 1), correct / num_data,
            category_correct / num_data))
    # free GPU memory before returning
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), correct / num_data
def rotate_checkpoints(use_mtime=False) -> None:
    """Keep at most `training_args.save_total_limit` checkpoints on disk.

    Checkpoints beyond the limit are removed oldest-first, where age is
    determined by `sorted_checkpoints` (step number, or mtime when
    `use_mtime` is set). A missing or non-positive limit disables rotation.
    """
    limit = training_args.save_total_limit
    if limit is None or limit <= 0:
        return
    ordered = sorted_checkpoints(use_mtime=use_mtime)
    excess = len(ordered) - limit
    if excess <= 0:
        return
    for stale in ordered[:excess]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale))
        shutil.rmtree(stale)
def train(model, train_loader, optimizer, criterion, device, epoch,
          total_epochs, n_class=41):
    """Train an audio tagging model for one epoch.

    Labels arrive as pairs (`xlabel[0]`, `xlabel[1]`) encoded into a
    two-hot target of width `n_class`; IoU is tracked against the raw pair.

    Returns:
        (mean loss over batches, IoU summed over samples / total samples).
    """
    running_loss = 0.0
    total_loss = 0.0
    total_iou = 0.0
    total_num_data = 0
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['audio']
        xlabel = data['label']
        num_data = len(xlabel[0])
        xlabel_enc = two_hot_encode(xlabel[0], xlabel[1], n_dim=n_class)
        x = x.to(device)
        xlabel_enc = xlabel_enc.to(device)
        optimizer.zero_grad()  # zero_grad always pairs with step() below
        out, pred = model(x)
        logit = out
        loss = criterion(logit, xlabel_enc)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        total_iou += calculate_iou(pred, xlabel)
        total_num_data += num_data
        if i % 100 == 0:  # print every 100 mini-batches
            # BUGFIX: average over the batches accumulated since the last
            # log (100, or 1 on the very first step) instead of the stale
            # hard-coded 2000 divisor, which under-reported the loss 20x.
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec".
                format(epoch, total_epochs, i, len(train_loader),
                       running_loss / (100 if i else 1),
                       time.time() - start))
            running_loss = 0.0
    logger.info('[{}/{}]\tloss: {:.4f}\tiou: {:.4f}'.format(
        epoch, total_epochs, total_loss / (i + 1),
        total_iou / total_num_data))
    # free GPU memory before returning
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), total_iou / total_num_data
def select_samples(labeled_file, unlabeled_file, o_labeled_file,
                   o_unlabeled_file, query_strategy='random'):
    """Move `low_cnt` samples from the unlabeled pool into the labeled set.

    For a non-random strategy, the samples with the lowest model scores are
    taken; otherwise a uniform random subset is used. Selected samples have
    their prediction metadata stripped and the gold label restored before
    being appended to the labeled set. Results are written to the output
    JSON files under the module-level `data_dir`.
    """
    labeled_file = f'labeled_{labeled_file}.json'
    unlabeled_file = f'unlabeled_{unlabeled_file}.json'
    o_labeled_file = f'labeled_{o_labeled_file}.json'
    o_unlabeled_file = f'unlabeled_{o_unlabeled_file}.json'
    low_cnt = 5740  # number of samples promoted per round
    # BUGFIX: the original passed `path.open()` / `path.open('w')` straight
    # into json.load/json.dump and never closed the handles; output data was
    # only flushed when the GC happened to collect the file object. Use
    # context managers so every handle is closed deterministically.
    with (Path(data_dir) / unlabeled_file).open() as f:
        d_unlabeled = json.load(f)
    with (Path(data_dir) / labeled_file).open() as f:
        d_labeled = json.load(f)
    # lowest-confidence samples first
    d_unlabeled = sorted(d_unlabeled, key=lambda x: float(x['score']))
    if query_strategy != 'random':
        low_samples, d_unlabeled = d_unlabeled[:low_cnt], d_unlabeled[low_cnt:]
        random_samples = []
    else:
        low_samples = []
        random_samples = random.sample(d_unlabeled, low_cnt)
        # remove the sampled dicts from the pool (removal by value)
        for sample in random_samples:
            d_unlabeled.remove(sample)
    d_to_add = low_samples + random_samples

    def trans(data):
        # strip prediction metadata and restore the gold label in place
        for dic in data:
            dic['label'] = dic['true_label']
            dic.pop('pred_label')
            dic.pop('true_label')
            dic.pop('score')

    trans(d_to_add)
    trans(d_unlabeled)
    with (Path(data_dir) / o_labeled_file).open('w') as f:
        json.dump(d_to_add + d_labeled, f, ensure_ascii=False, indent=2)
    with (Path(data_dir) / o_unlabeled_file).open('w') as f:
        json.dump(d_unlabeled, f, ensure_ascii=False, indent=2)
    logger.info(f'input: {labeled_file} - {unlabeled_file}')
    logger.info(f'd_to_add size: {len(d_to_add)}')
    logger.info(f'd_as_unlabeled size: {len(d_unlabeled)}')
    logger.info(f'd_to_add + d_labeled size: {len(d_to_add) + len(d_labeled)}')
def train_process(args, model, train_loader, test_loader, optimizer,
                  criterion, device):
    """Full training driver: train/evaluate per epoch, report to NSML,
    checkpoint the best and every 5th model, and anneal the learning rate
    every `args.annealing_period` epochs.
    """
    best_acc = 0.0
    for epoch in range(args.num_epoch):
        model.train()
        train_loss, train_acc = train(model=model,
                                      train_loader=train_loader,
                                      optimizer=optimizer,
                                      criterion=criterion,
                                      device=device,
                                      epoch=epoch,
                                      total_epochs=args.num_epoch)
        model.eval()
        test_loss, test_acc, test_f1 = evaluate(model=model,
                                                test_loader=test_loader,
                                                device=device,
                                                criterion=criterion)
        # metrics forwarded to the NSML experiment dashboard
        report_dict = dict()
        report_dict["train__loss"] = train_loss
        report_dict["train__acc"] = train_acc
        report_dict["test__loss"] = test_loss
        report_dict["test__acc"] = test_acc
        report_dict["test__f1"] = test_f1
        report_dict["train__lr"] = optimizer.param_groups[0]['lr']
        nsml.report(False, step=epoch, **report_dict)
        # checkpoint whenever validation accuracy improves
        if best_acc < test_acc:
            checkpoint = 'best'
            logger.info(
                f'[{epoch}] Find the best model! Change the best model.')
            nsml.save(checkpoint)
            best_acc = test_acc
        # periodic checkpoint every 5 epochs regardless of quality
        if (epoch + 1) % 5 == 0:
            checkpoint = f'ckpt_{epoch + 1}'
            nsml.save(checkpoint)
        # divide the lr by `learning_anneal` every `annealing_period` epochs
        if (epoch + 1) % args.annealing_period == 0:
            for g in optimizer.param_groups:
                g['lr'] = g['lr'] / args.learning_anneal
            logger.info(
                'Learning rate annealed to : {lr:.6f} @epoch{epoch}'.format(
                    epoch=epoch, lr=optimizer.param_groups[0]['lr']))
def train(model, train_loader, optimizer, criterion, device, epoch,
          total_epochs):
    """Train an image classifier for one epoch.

    Returns:
        (mean loss over batches, accuracy over the epoch).
    """
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    num_data = 0.0
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        x = x.to(device)
        xlabel = xlabel.to(device)
        optimizer.zero_grad()  # zero_grad always pairs with step() below
        out = model(x)
        logit, pred = out
        loss = criterion(logit, xlabel)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        correct += torch.sum(pred == xlabel).item()
        num_data += xlabel.size(0)
        if i % 100 == 0:  # print every 100 mini-batches
            # BUGFIX: average over the batches accumulated since the last
            # log (100, or 1 on the very first step) instead of the stale
            # hard-coded 2000 divisor, which under-reported the loss 20x.
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec".
                format(epoch, total_epochs, i, len(train_loader),
                       running_loss / (100 if i else 1),
                       time.time() - start))
            running_loss = 0.0
    logger.info('[{}/{}]\tloss: {:.4f}\tacc: {:.4f}'.format(
        epoch, total_epochs, total_loss / (i + 1), correct / num_data))
    # free GPU memory before returning
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), correct / num_data
def save_model(output_dir, model):
    """Persist model weights, config, and training args under `output_dir`.

    Creates the directory if needed, unwraps a DataParallel-style `.module`
    wrapper, and writes `pytorch.bin` (state dict), `config.json`, and
    `training_args.bin`.
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f'Saving model checkpoint to {output_dir}')
    # unwrap DataParallel / DistributedDataParallel wrappers if present
    to_save = getattr(model, 'module', model)
    # record the concrete model class name in the saved config
    to_save.config.architectures = [to_save.__class__.__name__]
    weights_path = os.path.join(output_dir, 'pytorch.bin')
    torch.save(to_save.state_dict(), weights_path)
    logger.info(f'Model weights saved in {weights_path}')
    config_path = os.path.join(output_dir, 'config.json')
    to_save.config.to_json_file(config_path)
    logger.info(f'Configuration saved in {config_path}')
    torch.save(training_args, os.path.join(output_dir, 'training_args.bin'))
def evaluate(model, test_loader, device, criterion):
    """Evaluate a category-aware image classifier on `test_loader`.

    Tracks both the raw model accuracy and the accuracy when logits are
    masked by the per-sample `cat2possible` mask; confusion matrices and
    per-class F1 are computed against the masked predictions.

    Returns:
        (mean loss over batches, raw accuracy, geometric mean of per-class F1).
    """
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0
    total_loss = 0.0
    cat2correct = 0.0
    label = []
    prediction = []
    cat_prediction = []
    cat2_prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['image']
            xlabel = data['label']
            category_pos = data['category_possible']
            category_oneh = data['category_onehot']
            cat2possible = data['cat2possible']
            cat2possible = cat2possible.to(device)
            category_pos = category_pos.to(device)
            category_oneh = category_oneh.to(device)
            x = x.to(device)
            xlabel = xlabel.to(device)
            out = model(x, category_oneh)
            logit, pred = out
            # LabelSmoothingLoss needs the category mask; CE does not
            if isinstance(criterion, torch.nn.CrossEntropyLoss):
                loss = criterion(logit, xlabel)
            elif isinstance(criterion, LabelSmoothingLoss):
                loss = criterion(logit, xlabel, category_pos)
            correct += torch.sum(pred == xlabel).item()
            # category_pred = torch.argmax(logit*category_pos, dim=-1)
            # category_correct += torch.sum(category_pred == xlabel).item()
            # prediction restricted to classes allowed by cat2possible
            cat2pred = torch.argmax(logit * cat2possible, dim=-1)
            cat2correct += torch.sum(cat2pred == xlabel).item()
            num_data += xlabel.size(0)
            total_loss += loss.item()
            label = label + xlabel.tolist()
            prediction = prediction + pred.detach().cpu().tolist()
            # cat_prediction = cat_prediction + category_pred.cpu().tolist()
            cat2_prediction = cat2_prediction + cat2pred.cpu().tolist()
            # free GPU memory between batches
            del x, xlabel
            torch.cuda.empty_cache()
    confusion = confusion_matrix(label, cat2_prediction)
    confusion_norm = confusion_matrix(label, cat2_prediction, normalize='true')
    logger.info(f'\n{confusion}')
    logger.info(f'\n{confusion_norm}')
    f1_array = f1_score(label, cat2_prediction, average=None)
    logger.info(f"f1 score : {f1_array}")
    f1_mean = gmean(f1_array)
    # NOTE(review): '\v' (vertical tab) in this format string looks like a
    # typo for '\t' — confirm before changing, output format may be parsed.
    logger.info(
        'validation loss: {loss:.4f}\v validation acc: {acc:.4f} \t validation category acc: {cat_acc:.4f}\v validation F1: {f1:.4f}'
        .format(loss=total_loss / (i + 1),
                acc=correct / num_data,
                f1=f1_mean,
                cat_acc=cat2correct / num_data))
    return total_loss / (i + 1), correct / num_data, f1_mean
def main():
    """Entry point: parse CLI args, build the model and datasets, then run
    training (`--mode train`) or evaluation of a saved NSML checkpoint
    (`--mode test`).
    """
    # Argument Settings
    parser = argparse.ArgumentParser(
        description='Image Tagging Classification from Naver Shopping Reviews')
    parser.add_argument('--sess_name', default='example', type=str,
                        help='Session name that is loaded')
    parser.add_argument('--checkpoint', default='best', type=str,
                        help='Checkpoint')
    parser.add_argument('--batch_size', default=256, type=int,
                        help='batch size')
    parser.add_argument('--num_workers', default=16, type=int,
                        help='The number of workers')
    parser.add_argument('--num_epoch', default=100, type=int,
                        help='The number of epochs')
    parser.add_argument('--model_name', default='mobilenet_v2', type=str,
                        help='[resnet50, rexnet, dnet1244, dnet1222]')
    parser.add_argument('--weight_file', default='model.pth', type=str)
    parser.add_argument('--optimizer', default='SGD', type=str)
    parser.add_argument('--lr', default=1e-2, type=float)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--learning_anneal', default=1.1, type=float)
    parser.add_argument('--annealing_period', default=10, type=int)
    parser.add_argument('--num_gpu', default=1, type=int)
    parser.add_argument('--pretrain', action='store_true', default=False)
    parser.add_argument('--mode', default='train', help='Mode')
    parser.add_argument('--pause', default=0, type=int)
    parser.add_argument('--iteration', default=0, type=str)
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Model
    logger.info('Build Model')
    model = select_model(args.model_name, pretrain=args.pretrain, n_class=41)
    total_param = sum([p.numel() for p in model.parameters()])
    logger.info(f'Model size: {total_param} tensors')
    load_weight(model, args.weight_file)
    model = model.to(device)

    # NSML session wiring: bind model and handle the pause/resume protocol
    nu.bind_model(model)
    nsml.save('best')
    if args.pause:
        nsml.paused(scope=locals())
    if args.num_epoch == 0:
        return

    # Set the dataset (80/20 train/validation split of the training csv)
    logger.info('Set the dataset')
    df = pd.read_csv(f'{DATASET_PATH}/train/train_label')
    train_size = int(len(df) * 0.8)
    trainset = TagImageDataset(data_frame=df[:train_size],
                               root_dir=f'{DATASET_PATH}/train/train_data',
                               transform=train_transform)
    testset = TagImageDataset(data_frame=df[train_size:],
                              root_dir=f'{DATASET_PATH}/train/train_data',
                              transform=test_transform)
    train_loader = DataLoader(dataset=trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = select_optimizer(model.parameters(), args.optimizer, args.lr,
                                 args.weight_decay)
    criterion = criterion.to(device)

    if args.mode == 'train':
        logger.info('Start to train!')
        train_process(args=args,
                      model=model,
                      train_loader=train_loader,
                      test_loader=test_loader,
                      optimizer=optimizer,
                      criterion=criterion,
                      device=device)
    elif args.mode == 'test':
        nsml.load(args.checkpoint, session=args.sess_name)
        logger.info('[NSML] Model loaded from {}'.format(args.checkpoint))
        model.eval()
        logger.info('Start to test!')
        test_loss, test_acc, test_f1 = evaluate(model=model,
                                                test_loader=test_loader,
                                                device=device,
                                                criterion=criterion)
        # BUGFIX: the original `logger.info(test_loss, test_acc, test_f1)`
        # passed floats as %-format args to a non-format message, raising a
        # logging error at emit time instead of printing the metrics.
        logger.info(f'test loss: {test_loss:.4f}\ttest acc: {test_acc:.4f}'
                    f'\ttest F1: {test_f1:.4f}')
from vaal.solver import VAE, Discriminator
from vaal.training_args import TrainingArguments

# Active-learning hyper-parameters: size of the initial labeled seed set and
# the batch size used when iterating the unlabeled pool.
num_init_samples = 500
pool_batch_size = 128


@dataclass
class DataTrainingArguments:
    # maximum token length per example
    max_seq_length: int = field(default=200)


# Parse CLI flags into the two dataclasses (HuggingFace-style argument parsing).
parser = HfArgumentParser((DataTrainingArguments, TrainingArguments))
data_args, training_args = parser.parse_args_into_dataclasses()
logger.info(f'n_gpu: {training_args.n_gpu}')


#######################################
# data prepare
#######################################
class IntentDataset(data.Dataset):
    """Dataset of (text, label) pairs read from a JSON file.

    Entries labeled '负样本' (negative samples) are filtered out at load time.
    """

    def __init__(self, file_name):
        # file_name is a Path-like; presumably the JSON is a list of dicts
        # with 'text' and 'label' keys — TODO confirm against the data files.
        self.data, self.targets = zip(*[(_['text'], _['label'])
                                        for _ in json.load(file_name.open())
                                        if _['label'] != '负样本'])
        # NOTE(review): _label2id is not visible in this chunk — presumably
        # defined further down in the class; verify it maps targets to ids.
        self._label2id()

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]
def train(train_generator, dev_generator, pool_generator, task_model, vae,
          discriminator, args):
    """Jointly train the task model, VAE and discriminator (VAAL loop).

    Each labeled/unlabeled batch pair drives three updates:
      1. task model on the labeled batch,
      2. VAE on labeled + unlabeled reconstruction plus an adversarial term
         that tries to make the discriminator mark everything as labeled,
      3. discriminator on detached latents (labeled=1, unlabeled=0).
    After each epoch the task model is evaluated on `dev_generator`.

    Returns:
        Best dev accuracy seen across epochs.
    """
    num_epoch = args.epoch_num
    device = args.device
    n_gpu = args.n_gpu
    beta = args.beta
    optim_task = optim.Adam(task_model.parameters(), lr=5e-5)
    optim_vae = optim.Adam(vae.parameters(), lr=5e-5)
    optim_discriminator = optim.Adam(discriminator.parameters(), lr=5e-5)
    task_model.zero_grad()
    best_epoch, best_acc = 0, 0
    for e in range(num_epoch):
        if args.check_debug and e > 0:
            break
        task_model.train()
        vae.train()
        discriminator.train()
        for idx, (labeled_batch, unlabeld_batch) in enumerate(
                zip(train_generator, pool_generator)):
            if args.check_debug and idx > 0:
                break
            raw_text = labeled_batch[-1]
            labeled_batch = [_.to(device) for _ in labeled_batch[:-1]]
            X_ids, Y_ids, V_ids, Mask = labeled_batch
            ####################
            # task model step
            ####################
            preds, task_loss = task_model(X_ids, Y_ids)
            if n_gpu > 0:
                task_loss = task_loss.mean()
            task_loss.backward()
            optim_task.step()
            task_model.zero_grad()
            #############
            # vae step
            #############
            recon, mu, logvar, z = vae(V_ids, Mask)
            vae_loss, mse_loss_value, kld_loss_value = vae_loss_func(
                V_ids, recon, mu, logvar, beta, Mask)
            if n_gpu > 0:
                vae_loss = vae_loss.mean()
            un_X_ids, _, un_V_ids, un_Mask = [
                _.to(device) for _ in unlabeld_batch[:-1]
            ]
            un_recon, un_mu, un_logvar, un_z = vae(un_V_ids, un_Mask)
            # BUGFIX: the unlabeled reconstruction loss was computed with the
            # *labeled* batch's Mask; use un_Mask so the mask matches
            # un_V_ids/un_recon.
            un_vae_loss, un_mse_loss_value, un_kld_loss_value = vae_loss_func(
                un_V_ids, un_recon, un_mu, un_logvar, beta, un_Mask)
            if n_gpu > 0:
                un_vae_loss = un_vae_loss.mean()
            # adversarial term: VAE wants the discriminator to predict
            # "labeled" (target 1) for both pools
            labeled_pred = discriminator(mu)
            unlabeled_pred = discriminator(un_mu)
            labeled_real_target = torch.ones(X_ids.size()[0], device=device)
            unlabeled_real_target = torch.ones(un_X_ids.size()[0],
                                               device=device)
            dsc_loss_in_vae = bce_loss(
                labeled_pred, labeled_real_target) + bce_loss(
                    unlabeled_pred, unlabeled_real_target)
            if n_gpu > 0:
                dsc_loss_in_vae = dsc_loss_in_vae.mean()
            total_loss = vae_loss + un_vae_loss + dsc_loss_in_vae
            vae.zero_grad()
            total_loss.backward()
            optim_vae.step()
            #####################
            # discriminate step
            #####################
            # detach latents so discriminator gradients don't reach the VAE
            mu_no_grad = mu.detach()
            un_mu_no_grad = un_mu.detach()
            labeled_pred = discriminator(mu_no_grad)
            unlabeled_pred = discriminator(un_mu_no_grad)
            labeled_real_target = torch.ones(X_ids.size()[0], device=device)
            unlabeled_fake_target = torch.zeros(un_X_ids.size()[0],
                                                device=device)
            dsc_loss = bce_loss(labeled_pred, labeled_real_target) + bce_loss(
                unlabeled_pred, unlabeled_fake_target)
            if n_gpu > 0:
                dsc_loss = dsc_loss.mean()
            discriminator.zero_grad()
            dsc_loss.backward()
            optim_discriminator.step()
            if idx % 10 == 0 and idx != 0:
                logger.info(
                    f'epoch: {e} - batch: {idx}/{len(train_generator)}')
                logger.info(f'task_model loss: {task_loss}')
                logger.info(f'labeled vae loss: {vae_loss}')
                logger.info(f'labeled mse loss: {mse_loss_value}')
                logger.info(f'labeled kld loss: {kld_loss_value}')
                logger.info(f'unlabeled vae loss: {un_vae_loss}')
                logger.info(f'unlabeled mse loss: {un_mse_loss_value}')
                logger.info(f'unlabeled kld loss: {un_kld_loss_value}')
                logger.info(f'dsc_loss_in_vae: {dsc_loss_in_vae}')
                logger.info(f'dsc_loss: {dsc_loss}')
        # end-of-epoch evaluation of the task model on the dev set
        task_model.eval()
        correct = 0
        for idx, batch in enumerate(dev_generator):
            if args.check_debug and idx > 1:
                break
            raw_text = batch[-1]
            batch = [_.to(device) for _ in batch[:-1]]
            X_ids, Y_ids, _, _ = batch
            with torch.no_grad():
                logits, _ = task_model(X_ids, Y_ids)
            logits = torch.argmax(logits, dim=-1)
            correct += logits.eq(Y_ids).sum()
        acc = correct.item() / dev_generator.total_data_size
        if acc > best_acc:
            best_acc = acc
            best_epoch = e
        logger.info(
            f'epoch {e} - acc: {acc} - best_acc: {best_acc} - best_epoch: {best_epoch}'
        )
    return best_acc
X_ids, num_inference_samples).double(), non_blocking=True) with torch.no_grad(): candidate_batch = batchbald.get_batchbald_batch( logits_N_K_C.exp_(), acquisition_batch_size, num_samples, dtype=torch.double, device=training_args.device) targets = get_targets(active_learning_data.pool_dataset) dataset_indices = active_learning_data.get_dataset_indices( candidate_batch.indices) logger.info(f"Dataset indices: {dataset_indices}") logger.info(f"Scores: {candidate_batch.scores}") logger.info(f"Labels: {targets[candidate_batch.indices]}") logger.info( f"Labels name: {[intent_labels[idx] for idx in targets[candidate_batch.indices].detach().cpu().numpy()]}" ) logger.info('Metric: ') logger.info(metric) active_learning_data.acquire(candidate_batch.indices) added_indices.append(dataset_indices) pbar.update(len(dataset_indices)) record.append({ 'added indices': dataset_indices,
def binary_train(model, train_loader, optimizer, device, epoch, total_epochs):
    """Train the two-stage model: a binary "class 4 vs rest" gate plus a
    4-way classifier for the gated-through samples.

    The model itself splits each batch into `unclass_idx` (routed to the
    binary head) and `class_idx` (routed to the classifier head); samples
    the classifier sees whose true label is 4 are fed back to the binary
    head as positives. Uses `model.criterion_1` (binary) and
    `model.criterion_2` (classification).

    Returns:
        (mean loss over batches, accuracy over the epoch).
    """
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        # combined prediction buffer filled from both heads
        pred = torch.zeros(xlabel.shape[0]).long().to(device)
        category_pos = data['category_possible']
        category_oneh = data['category_onehot']
        category = data['category']
        x = x.to(device)
        xlabel = xlabel.to(device)
        category = category.to(device)
        category_pos = category_pos.to(device)
        category_oneh = category_oneh.to(device)
        optimizer.zero_grad()  # zero_grad always pairs with step() below
        out = model(x, category_oneh, category if model.cat_embed else None)
        b_out, class_out, unclass_idx, class_idx = out
        if class_idx.shape[0] > 0:
            pred[class_idx] = torch.argmax(class_out[class_idx], dim=-1)
        pred[unclass_idx] = 4  # gated-out samples predicted as class 4
        binary_label = (xlabel[unclass_idx] == 4).float()
        class_label = xlabel[class_idx]
        # classifier-routed samples whose true label is 4 are binary positives
        falpos_idx = (class_label == 4).nonzero().squeeze(1)
        trupos_idx = (class_label < 4).nonzero().squeeze(1)
        binary_label = torch.cat(
            [binary_label, torch.ones(falpos_idx.shape[0]).to(device)])
        b_out = torch.cat([b_out[unclass_idx], b_out[falpos_idx]])
        class_out = class_out[trupos_idx]
        class_label = class_label[trupos_idx]
        bin_loss = model.criterion_1(b_out, binary_label)
        class_loss = model.criterion_2(class_out, class_label)
        loss = bin_loss + class_loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        correct += torch.sum(pred == xlabel).item()
        num_data += xlabel.size(0)
        if i % 100 == 0:  # print every 100 mini-batches
            # BUGFIX: average over the batches accumulated since the last
            # log (100, or 1 on the very first step) instead of the stale
            # hard-coded 2000 divisor, which under-reported the loss 20x.
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec \t binary_loss {:.4f} \t class_loss {:.4f}"
                .format(epoch + 1, total_epochs, i, len(train_loader),
                        running_loss / (100 if i else 1),
                        time.time() - start, bin_loss, class_loss))
            running_loss = 0.0
    logger.info('[{}/{}]\tloss: {:.4f}\tacc: {:.4f}'.format(
        epoch + 1, total_epochs, total_loss / (i + 1), correct / num_data))
    # free GPU memory before returning
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), correct / num_data
def train_main(p):
    """Fine-tune a BERT intent classifier and save the best checkpoint.

    `p` is either an int (active-learning round: reads labeled_{p}.json under
    `data_dir`) or a file name under `common_data_path`/intent_data. The best
    model (by dev accuracy) is saved to data_dir/cls_model_{p}.pt.
    """
    in_file = Path(data_dir) / f'labeled_{p}.json' if isinstance(
        p, int) else Path(common_data_path) / 'intent_data' / p

    ###############################################
    # args
    ###############################################
    @dataclass
    class ModelArguments:
        model_path_or_name: str = field(default=str(bert_model_path))
        # model_path_or_name: str = field(default=str(roberta_model_path))
        # model_path_or_name: str = field(default=str(Path(data_dir)/'checkpoints'/'checkpoint-6000'))

    @dataclass
    class DataTrainingArguments:
        max_seq_length: int = field(default=200)

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    global_step = 0

    ###############################################
    # distant debug
    ###############################################
    if training_args.server_ip and training_args.server_port:
        import ptvsd
        print('Waiting for debugger attach')
        ptvsd.enable_attach(address='')

    ###############################################
    # model
    ###############################################
    num_labels = len(intent_labels)
    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=model_args.model_path_or_name,
        num_labels=num_labels)
    model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_args.model_path_or_name,
        config=config,
        num_labels=num_labels)

    ###############################################
    # data process
    ###############################################
    train = [(_['text'], _['label']) for _ in json.load(in_file.open())]
    dev = [(_['text'], _['label'])
           for _ in json.load((Path(common_data_path) / 'intent_data' /
                               'dev_data.json').open())]
    vocabulary = load_vocab()
    # vocabulary = load_vocab(vocab_file=(Path(roberta_model_path) / 'vocab.txt'))
    train_loader = DataGenerator(train,
                                 training_args,
                                 data_args,
                                 vocabulary,
                                 intent_labels,
                                 shuffle=True)
    dev_loader = DataGenerator(dev, training_args, data_args, vocabulary,
                               intent_labels)

    ###############################################
    # optimizer
    ###############################################
    def get_optimizer(num_training_steps):
        # AdamW with weight decay on everything except bias/LayerNorm weights,
        # plus a linear warmup schedule.
        no_decay = ['bias', 'LayerNorm.weight']
        optimize_group_params = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': training_args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimize_group_params,
                          lr=training_args.learning_rate,
                          weight_decay=training_args.weight_decay)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=training_args.warmup_steps,
            num_training_steps=num_training_steps)
        return optimizer, scheduler

    optimizer, scheduler = get_optimizer(num_training_steps=len(train_loader) *
                                         training_args.epoch_num /
                                         training_args.batch_size)

    ###############################################
    # continue training from checkpoints
    ###############################################
    # When resuming from a checkpoint dir, restore optimizer/scheduler state
    # and derive how many epochs/steps have already been consumed.
    if ('checkpoint' in model_args.model_path_or_name and os.path.isfile(
            os.path.join(model_args.model_path_or_name, 'optimizer.pt'))
            and os.path.isfile(
                os.path.join(model_args.model_path_or_name, 'scheduler.pt'))):
        optimizer.load_state_dict(
            torch.load(
                os.path.join(model_args.model_path_or_name, "optimizer.pt"),
                map_location='cuda' if torch.cuda.is_available() else 'cpu'))
        scheduler.load_state_dict(
            torch.load(
                os.path.join(model_args.model_path_or_name, "scheduler.pt"),
                map_location='cuda' if torch.cuda.is_available() else 'cpu'))
    epoch_trained = 0
    step_trained_cur_epoch = 0
    if 'checkpoint' in model_args.model_path_or_name:
        # checkpoint dirs are named "...checkpoint-<global_step>"
        global_step = int(
            str(Path(
                model_args.model_path_or_name)).split('-')[-1].split('/')[0])
        epoch_trained = global_step // (
            train_loader.steps // training_args.gradient_accumulation_steps)
        step_trained_cur_epoch = global_step % (
            train_loader.steps // training_args.gradient_accumulation_steps)
        logger.info(
            ' Continuing Training from checkpoint, will skip to saved global_step'
        )
        logger.info(f' Continuing Training from epoch {epoch_trained}')
        logger.info(f' Continuing Training from global step {global_step}')
        logger.info(
            f' Will skip the first {step_trained_cur_epoch} steps in the first epoch'
        )

    ###############################################
    # tensorboard
    ###############################################
    tb_writer = SummaryWriter(log_dir=Path(data_dir) / 'logs')

    def tb_log(logs):
        # closure over global_step: always logs at the current step
        for k_, v_ in logs.items():
            tb_writer.add_scalar(k_, v_, global_step)

    tb_writer.add_text('args', training_args.to_json_string())
    tb_writer.add_hparams(training_args.to_sanitized_dict(), metric_dict={})

    ###############################################
    # save
    ###############################################
    def save_model(output_dir, model):
        # Persist weights, config and training args under output_dir.
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f'Saving model checkpoint to {output_dir}')
        model_to_save = model.module if hasattr(model, 'module') else model
        # record the concrete model class name in the saved config
        model_to_save.config.architectures = [
            model_to_save.__class__.__name__
        ]
        output_model_file = os.path.join(output_dir, 'pytorch.bin')
        torch.save(model_to_save.state_dict(), output_model_file)
        logger.info(f'Model weights saved in {output_model_file}')
        output_config_file = os.path.join(output_dir, 'config.json')
        model_to_save.config.to_json_file(output_config_file)
        logger.info(f'Configuration saved in {output_config_file}')
        torch.save(training_args,
                   os.path.join(output_dir, 'training_args.bin'))

    def sorted_checkpoints(checkpoint_prefix="checkpoint", use_mtime=False):
        # List checkpoint dirs ordered oldest-first (by step number, or
        # mtime when use_mtime is set).
        ordering_and_checkpoint_path = []
        glob_checkpoints = [
            str(x) for x in Path(training_args.output_dir).glob(
                f"{checkpoint_prefix}-*")
        ]
        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append(
                    (os.path.getmtime(path), path))
            else:
                regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)",
                                       path)
                if regex_match and regex_match.groups():
                    ordering_and_checkpoint_path.append(
                        (int(regex_match.groups()[0]), path))
        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [
            checkpoint[1] for checkpoint in checkpoints_sorted
        ]
        return checkpoints_sorted

    def rotate_checkpoints(use_mtime=False) -> None:
        # Keep at most save_total_limit checkpoints, deleting oldest first.
        if training_args.save_total_limit is None or training_args.save_total_limit <= 0:
            return
        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = sorted_checkpoints(use_mtime=use_mtime)
        if len(checkpoints_sorted) <= training_args.save_total_limit:
            return
        number_of_checkpoints_to_delete = max(
            0,
            len(checkpoints_sorted) - training_args.save_total_limit)
        checkpoints_to_be_deleted = checkpoints_sorted[:
                                                       number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info(
                "Deleting older checkpoint [{}] due to args.save_total_limit".
                format(checkpoint))
            shutil.rmtree(checkpoint)

    ###############################################
    # train
    ###############################################
    model.to(training_args.device)
    if training_args.n_gpu > 1:
        model = nn.DataParallel(model)
    best_acc = 0
    best_epoch = 0
    model.zero_grad()
    for e in range(epoch_trained, training_args.epoch_num):
        # for e in range(1):  # debug
        model.train()
        t_loss = 0
        logging_loss = 0
        for step, batch in enumerate(train_loader):
            # if step > 0: break  # debug
            # skip steps already consumed by a resumed checkpoint
            if step_trained_cur_epoch > 0:
                step_trained_cur_epoch -= 1
                continue
            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids, Mask = batch
            if step < 5:
                logger.info(f'batch_size: {X_ids.size()}')
            loss, logits = model(X_ids, Y_ids, Mask)
            if training_args.n_gpu > 1:
                loss = loss.mean()
            loss.backward()
            t_loss += loss.item()
            if training_args.gradient_accumulation_steps > 1:
                loss = loss / training_args.gradient_accumulation_steps
            # step the optimizer every gradient_accumulation_steps batches
            # (or at the end of a short epoch)
            if ((step + 1) % training_args.gradient_accumulation_steps == 0 or
                (train_loader.steps <=
                 training_args.gradient_accumulation_steps)
                    and step + 1 == train_loader.steps):
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    max_norm=training_args.max_gradient_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                epoch = e + (step + 1) / train_loader.steps
                if global_step % training_args.logging_steps == 0:
                    train_logs = {
                        'loss': (t_loss - logging_loss) /
                                training_args.logging_steps,
                        'learning_rate': scheduler.get_lr()[0],
                        'epoch': epoch
                    }
                    logging_loss = t_loss
                    tb_log(train_logs)
                    logger.info(
                        f'epoch: {e} - batch: {step}/{train_loader.steps} - loss: {t_loss / (step + 1): 6f}'
                    )
            # if global_step % training_args.saving_steps == 0:
            #     output_dir = os.path.join(training_args.output_dir, f'checkpoint-{global_step}')
            #     save_model(output_dir, model)
            #     rotate_checkpoints()
            #     torch.save(optimizer.state_dict(), Path(output_dir)/'optimizer.pt')
            #     torch.save(scheduler.state_dict(), Path(output_dir)/'scheduler.pt')
            #     logger.info(f'Saving optimizer and scheduler states to {output_dir}')
        # end-of-epoch evaluation on the dev set
        model.eval()
        dev_acc = 0
        eval_loss = 0
        err = []
        # per-class counters: _A = correct, _B = true count, _C = predicted
        # count (1e-10 default avoids division by zero)
        cat = defaultdict(lambda: 1e-10)
        for k, batch in enumerate(dev_loader):
            # if k > 0: break  # debug
            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids, Mask = batch
            with torch.no_grad():
                loss, logits = model(X_ids, Y_ids, Mask)
                if training_args.n_gpu > 1:
                    loss = loss.mean()
                eval_loss += loss.item()
            for logit, y_id, t in zip(logits, Y_ids, raw_text):
                logit = logit.detach().cpu().numpy()
                true_label = y_id.detach().cpu().numpy()
                pred_label = np.argmax(logit)
                # metric 1
                if true_label == pred_label:
                    dev_acc += 1
                else:
                    score = max(logit)
                    err.append({
                        'text': t,
                        'pred': intent_labels[pred_label],
                        'true': intent_labels[true_label],
                        'score': f'{score: .4f}'
                    })
                # metric 2
                cat[f'{intent_labels[true_label]}_A'] += int(
                    pred_label == true_label)
                cat[f'{intent_labels[true_label]}_B'] += 1
                cat[f'{intent_labels[pred_label]}_C'] += 1
        acc = dev_acc / len(dev_loader)
        eval_logs = {
            'eval_acc': acc,
            'eval_loss': eval_loss / dev_loader.steps,
        }
        tb_log(eval_logs)
        if acc > best_acc:
            # if acc >= best_acc:  # debug
            best_acc = acc
            best_epoch = e
            # save
            model_to_save = model.module if hasattr(model,
                                                    'module') else model
            torch.save(model_to_save.state_dict(),
                       Path(data_dir) / f'cls_model_{p}.pt')
            # save
            # json.dump(err, (Path(data_dir) / 'err.json').open('w'), ensure_ascii=False, indent=4)
        logger.info(
            f'epoch: {e} - dev_acc: {acc:.5f} {dev_acc}/{len(dev_loader)} - best_score: {best_acc:.5f} - best_epoch: {best_epoch} '
        )
        for t in intent_labels:
            logger.info(
                f'cat: {t} - '
                f'precision: {cat[t + "_A"] / cat[t + "_C"]:.5f} - '
                f'recall: {cat[t + "_A"] / cat[t + "_B"]:.5f} - '
                f'f1: {2 * cat[t + "_A"] / (cat[t + "_B"] + cat[t + "_C"]):.5f}'
            )
    tb_writer.close()
def binary_evaluate(model, test_loader, device):
    """Validate a two-stage (binary gate + classifier) model.

    The model routes each sample either to `unclass_idx` (forced to the
    reject class, label 4) or to `class_idx` (argmax over the class head).
    Total loss is the binary-gate loss (`model.criterion_1`) plus the
    classification loss (`model.criterion_2`).

    Args:
        model: two-headed network; forward returns
            (b_out, class_out, unclass_idx, class_idx) and exposes
            `cat_embed`, `criterion_1`, `criterion_2`.
        test_loader: yields dicts with 'image', 'label', 'category_onehot',
            'category' (and 'category_possible', unused here).
        device: torch device to run on.

    Returns:
        (mean loss, accuracy, geometric mean of per-class F1 scores).
    """
    correct = 0.0
    num_data = 0.0
    total_loss = 0.0
    label = []
    prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['image'].to(device)
            xlabel = data['label'].to(device)
            category_oneh = data['category_onehot'].to(device)
            category = data['category'].to(device)
            pred = torch.zeros(xlabel.shape[0]).long().to(device)
            out = model(x, category_oneh,
                        category if model.cat_embed else None)
            b_out, class_out, unclass_idx, class_idx = out
            if class_idx.shape[0] > 0:
                pred[class_idx] = torch.argmax(class_out[class_idx], dim=-1)
            # Label 4 is the "unclassified"/reject class.
            pred[unclass_idx] = 4
            # Binary target: 1 where the true label is the reject class.
            binary_label = (xlabel[unclass_idx] == 4).float()
            class_label = xlabel[class_idx]
            # Samples routed to the classifier whose true label is the
            # reject class are false positives of the binary gate.
            falpos_idx = (class_label == 4).nonzero().squeeze(1)
            trupos_idx = (class_label < 4).nonzero().squeeze(1)
            binary_label = torch.cat(
                [binary_label, torch.ones(falpos_idx.shape[0]).to(device)])
            # NOTE(review): falpos_idx is relative to the class_idx subset,
            # yet it indexes the full b_out here — confirm this should not
            # be b_out[class_idx[falpos_idx]].
            b_out = torch.cat([b_out[unclass_idx], b_out[falpos_idx]])
            class_out = class_out[trupos_idx]
            class_label = class_label[trupos_idx]
            bin_loss = model.criterion_1(b_out, binary_label)
            class_loss = model.criterion_2(class_out, class_label)
            loss = bin_loss + class_loss
            correct += torch.sum(pred == xlabel).item()
            num_data += xlabel.size(0)
            total_loss += loss.item()
            label = label + xlabel.tolist()
            prediction = prediction + pred.detach().cpu().tolist()
            del x, xlabel
            torch.cuda.empty_cache()
    confusion = confusion_matrix(label, prediction)
    confusion_norm = confusion_matrix(label, prediction, normalize='true')
    logger.info(f'\n{confusion}')
    logger.info(f'\n{confusion_norm}')
    f1_array = f1_score(label, prediction, average=None)
    logger.info(f"f1 score : {f1_array}")
    f1_mean = gmean(f1_array)
    # BUG FIX: separators were vertical tabs ('\v'); use '\t' to match the
    # other evaluation loggers in this file.
    logger.info(
        'validation loss: {loss:.4f}\tvalidation acc: {acc:.4f}\tvalidation F1: {f1:.4f}'
        .format(loss=total_loss / (i + 1), acc=correct / num_data,
                f1=f1_mean))
    return total_loss / (i + 1), correct / num_data, f1_mean
def train_main(train_loader, vocabulary, id2embeddings):
    """Train the word2vec-embedding TextCNN intent classifier.

    Trains on `train_loader`, evaluates on the dev set after every epoch
    using 10 MC-dropout forward passes averaged in log space, and
    checkpoints the best model (by dev accuracy) to ``data_dir/cls_model.pt``.

    Args:
        train_loader: batch generator yielding (X_ids, Y_ids, raw_text),
            with a `.steps` attribute (batches per epoch).
        vocabulary: token -> id mapping for the dev-set generator.
        id2embeddings: pretrained embedding matrix indexed by token id.

    Returns:
        (metric, model_to_save): best ``{'epoch', 'acc'}`` seen and the
        corresponding unwrapped model. If no epoch improves on acc == 0,
        the initial model and a zeroed metric are returned (bug fix: the
        original raised NameError in that case).
    """
    # -------------------- args --------------------
    @dataclass
    class ModelArguments:
        model_path_or_name: str = field(default=str(bert_model_path))

    @dataclass
    class DataTrainingArguments:
        max_seq_length: int = field(default=200)

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    global_step = 0

    # -------------------- distant debug --------------------
    if training_args.server_ip and training_args.server_port:
        import ptvsd
        print('Waiting for debugger attach')
        ptvsd.enable_attach(address='')

    # -------------------- model --------------------
    id2embeddings = torch.tensor(id2embeddings,
                                 dtype=torch.float).to(training_args.device)
    num_labels = len(intent_labels)
    model = TextCnnMC(id2embeddings, num_labels)

    # -------------------- data process --------------------
    dev = [(_['text'], _['label'])
           for _ in json.load((Path(common_data_path) / 'intent_data' /
                               'dev_data.json').open())]
    dev_loader = DataGeneratorW2V(dev, training_args.batch_size, data_args,
                                  vocabulary, intent_labels)

    # -------------------- optimizer --------------------
    optimizer = torch.optim.Adam(
        [p for n, p in list(model.named_parameters())], lr=5e-5)

    # -------------------- train --------------------
    model.to(training_args.device)
    logger.info(f'gpu num: {training_args.n_gpu}')
    if training_args.n_gpu > 1:
        model = nn.DataParallel(model)
    loss_func = LabelSmoothLoss(num_labels)
    best_acc = 0
    best_epoch = 0
    # BUG FIX: bind the returned values up-front so the final `return` does
    # not raise NameError when no epoch ever beats best_acc.
    model_to_save = model.module if hasattr(model, 'module') else model
    metric = {'epoch': best_epoch, 'acc': best_acc}
    model.zero_grad()
    for e in range(training_args.epoch_num):
        model.train()
        t_loss = 0
        for step, batch in enumerate(train_loader):
            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids = batch
            if step < 1:
                logger.info(f'batch_size: {X_ids.size()[0]}')
            # Single stochastic forward pass during training.
            logits = model(X_ids, 1).squeeze(1)
            loss = loss_func(Y_ids, logits)
            if training_args.n_gpu > 1:
                loss = loss.mean()  # average across DataParallel replicas
            # Log the unscaled loss so reported values stay comparable.
            t_loss += loss.item()
            # BUG FIX: scale BEFORE backward(); the original divided the loss
            # after backward(), so accumulated gradients were never averaged.
            if training_args.gradient_accumulation_steps > 1:
                loss = loss / training_args.gradient_accumulation_steps
            loss.backward()
            if ((step + 1) % training_args.gradient_accumulation_steps == 0
                    or (train_loader.steps <=
                        training_args.gradient_accumulation_steps)
                    and step + 1 == train_loader.steps):
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    max_norm=training_args.max_gradient_norm)
                optimizer.step()
                model.zero_grad()
                global_step += 1
            logger.info(
                f'epoch: {e} - batch: {step}/{train_loader.steps} - loss: {t_loss / (step + 1): 6f}'
            )

        # -------------------- eval --------------------
        model.eval()
        dev_acc = 0
        eval_loss = 0
        err = []  # misclassified samples; feeds an optional err.json dump
        # Per-class counts: _A = true positives, _B = support, _C = predicted.
        # Tiny prior avoids division by zero in optional P/R/F1 reporting.
        cat = defaultdict(lambda: 1e-10)
        for k, batch in enumerate(dev_loader):
            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids = batch
            with torch.no_grad():
                # 10 MC-dropout samples, averaged in log space:
                # log(mean(p)) = logsumexp(log p) - log(10).
                logits = model(X_ids, 10).squeeze(1)
                logits = torch.logsumexp(logits, dim=1) - math.log(10)
                loss = loss_func(Y_ids, logits)
                if training_args.n_gpu > 1:
                    loss = loss.mean()
                eval_loss += loss.item()
            for logit, y_id, t in zip(logits, Y_ids, raw_text):
                logit = logit.detach().cpu().numpy()
                true_label = y_id.detach().cpu().numpy()
                pred_label = np.argmax(logit)
                # metric 1: accuracy, and keep the errors for inspection
                if true_label == pred_label:
                    dev_acc += 1
                else:
                    score = max(logit)
                    err.append({
                        'text': t,
                        'pred': intent_labels[pred_label],
                        'true': intent_labels[true_label],
                        'score': f'{score: .4f}'
                    })
                # metric 2: per-class counters
                cat[f'{intent_labels[true_label]}_A'] += int(
                    pred_label == true_label)
                cat[f'{intent_labels[true_label]}_B'] += 1
                cat[f'{intent_labels[pred_label]}_C'] += 1
        acc = dev_acc / (len(dev_loader) * training_args.batch_size)
        if acc > best_acc:
            best_acc = acc
            best_epoch = e
            metric = {'epoch': best_epoch, 'acc': best_acc}
            # Unwrap DataParallel before saving so the checkpoint keys are
            # loadable without the 'module.' prefix.
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(),
                       Path(data_dir) / 'cls_model.pt')
        logger.info(
            f'epoch: {e} - dev_acc: {acc:.5f} {dev_acc}/{len(dev_loader)*training_args.batch_size} - '
            f'best_score: {best_acc:.5f} - best_epoch: {best_epoch} ')
    return metric, model_to_save
def ensemble_training(model, train_loader, optimizer, criterion, device,
                      epoch, total_epochs):
    """Run one training epoch for an ensemble model.

    Two operating modes:
      * gradient modes: backprop through `criterion` on the ensemble logits
        every batch;
      * ``model.mode == "xgb"``: no backprop; logits/labels are accumulated
        for the whole epoch and ``model.xgb_classifier`` is fit on them once
        at the end.

    Args:
        model: ensemble network; forward(x, category_onehot, category)
            returns (logit, pred); exposes `mode` (and, for xgb mode,
            `xgb_classifier`; for "hard" mode, `num_model` and `w`).
        train_loader: yields dicts with 'image', 'label',
            'category_possible', 'category_onehot', 'category',
            'cat2possible'.
        optimizer: stepped once per batch in gradient modes.
        criterion: ``torch.nn.CrossEntropyLoss`` or ``LabelSmoothingLoss``
            (the latter also receives the category-possibility mask).
        device: torch device.
        epoch: current epoch index (0-based, for logging).
        total_epochs: total epoch count (for logging).

    Returns:
        (mean loss, category-masked accuracy) for the epoch.
    """
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    num_data = 0.0
    cat2correct = 0.0
    ytrues = []
    ypreds = []
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image'].to(device)
        xlabel = data['label'].to(device)
        category_pos = data['category_possible'].to(device)
        category_oneh = data['category_onehot'].to(device)
        category = data['category'].to(device)
        cat2possible = data['cat2possible'].to(device)
        optimizer.zero_grad()  # zero_grad is paired with step() below
        logit, pred = model(x, category_oneh, category)
        num_data += xlabel.size(0)
        if model.mode == "xgb":
            # Defer fitting: collect the epoch's logits/labels and fit the
            # XGBoost head once after the loop.
            ytrues.append(xlabel)
            ypreds.append(logit)
        else:
            if isinstance(criterion, torch.nn.CrossEntropyLoss):
                loss = criterion(logit, xlabel)
            elif isinstance(criterion, LabelSmoothingLoss):
                loss = criterion(logit, xlabel, category_pos)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            total_loss += loss.item()
        # Accuracy with logits masked to the categories possible per sample.
        # (In xgb mode this is recomputed from the fitted classifier below.)
        cat2pred = torch.argmax(logit * cat2possible, dim=-1)
        cat2correct += torch.sum(cat2pred == xlabel).item()
        correct += torch.sum(pred == xlabel).item()
        if i % 100 == 0:  # log every 100 mini-batches
            # BUG FIX: average over the actual 100-batch logging interval
            # (the original divided by 2000, under-reporting the loss).
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec"
                .format(epoch + 1, total_epochs, i, len(train_loader),
                        running_loss / 100, time.time() - start))
            running_loss = 0.0
    if model.mode == "xgb":
        ypreds = torch.cat(ypreds, axis=0).detach().cpu().numpy()
        ytrues = torch.cat(ytrues, axis=0).detach().cpu().numpy()
        model.xgb_classifier.fit(ypreds, ytrues)
        ypreds = model.xgb_classifier.predict(ypreds)
        cat2correct = np.sum((ytrues == ypreds).astype(int))
    elif model.mode == "hard":
        logger.info(
            f"\n###############\nEnsemble {model.num_model} number of models weight = {model.w} \n##############"
        )
    logger.info(
        '[{}/{}]\tloss: {:.4f}\tacc: {:.4f} \tcategory_acc : {:.4f}'.format(
            epoch + 1, total_epochs, total_loss / (i + 1), correct / num_data,
            cat2correct / num_data))
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), cat2correct / num_data