_opt = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
optimizer = get_optimizer(_opt)

if LOAD_MODEL:
    logging.info('---------- Load Models ----------')
    save_obj = torch.load(f'{OUTPUT_DIR}/{FN}.pth')
    model.module.load(save_obj['model'])
    optimizer.load(save_obj['opt'], save_obj['param'])
    start_epoch = save_obj['epoch']

logging.info('---------- Preparing training data ----------')
with open(TRAIN_DATA_PATH, 'rb') as f:
    train_data = pickle.load(f)
with open(TEST_DATA_PKL_PATH, 'rb') as f:
    valid_data = pickle.load(f)

train_dataset = DialogDataset(train_data)
valid_dataset = DialogDataset(valid_data)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

logging.info('---------- Start Training ----------')
one_cycle(model, optimizer, criterion, train_data_loader, valid_data_loader, sp, device, start_epoch)
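The resume branch above expects a checkpoint dict with the keys 'model', 'opt', 'param', and 'epoch', read back through the custom model.module.load and optimizer.load wrappers. As a minimal, self-contained sketch of that checkpoint round trip, plain state_dict() calls stand in for those wrappers and the 'param' payload is a hypothetical placeholder:

import torch
import torch.nn as nn
import torch.optim as optim

# Toy model/optimizer so the snippet runs on its own.
model = nn.Linear(8, 8)
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)

save_obj = {
    'model': model.state_dict(),    # model weights
    'opt': optimizer.state_dict(),  # optimizer state (moments, step counts)
    'param': {'step': 0},           # hypothetical scheduler bookkeeping
    'epoch': 3,                     # epoch to resume from
}
torch.save(save_obj, 'checkpoint.pth')

save_obj = torch.load('checkpoint.pth')
model.load_state_dict(save_obj['model'])
optimizer.load_state_dict(save_obj['opt'])
start_epoch = save_obj['epoch']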
seed_everything(Config.seed)
device = torch.device(Config.device)
start_epoch = 0

logging.info('Define Models')
model = build_model(Config).to(device)
tokenizer = Tokenizer.from_pretrained(Config.model_name)

logging.info('Define Loss and Optimizer')
criterion = LabelSmoothing(tokenizer.vocab_size, pad_id=tokenizer.pad_token_id, smoothing=Config.smoothing)
_opt = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
optimizer = get_optimizer(_opt, factor=Config.factor, warmup=Config.warmup)

logging.info('Preparing training data')
if Config.use_pickle:
    with open(f'{Config.pickle_path}', 'rb') as f:
        train_data = pickle.load(f)
else:
    train_data = make_train_data_from_txt(Config, tokenizer)
dataset = DialogDataset(train_data, tokenizer)

logging.info('Start Training')
for epoch in range(start_epoch, Config.n_epoch):
    one_cycle(epoch, Config, model, optimizer, criterion,
              BalancedDataLoader(dataset, tokenizer.pad_token_id), tokenizer, device)
    # Generate a sample reply after each epoch; the prompt 'もう疲れたー' means "I'm tired already".
    evaluate(Config, 'もう疲れたー', tokenizer, model, device)
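Passing lr=0 to Adam only makes sense if get_optimizer wraps it in a warmup scheduler that overwrites the learning rate on every step, as the factor/warmup arguments suggest. The actual internals of get_optimizer are not shown here, so the following is only a sketch under the assumption that it implements the Noam schedule from the original Transformer paper:

import torch.nn as nn
import torch.optim as optim

class NoamOpt:
    """lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)."""

    def __init__(self, optimizer, d_model, factor, warmup):
        self.optimizer = optimizer
        self.d_model = d_model
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step):
        return self.factor * (self.d_model ** -0.5) * min(step ** -0.5, step * self.warmup ** -1.5)

    def step(self):
        self._step += 1
        lr = self.rate(self._step)
        for group in self.optimizer.param_groups:
            group['lr'] = lr  # overwrite the dummy lr=0 set at construction
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

def get_optimizer(base_optimizer, factor=2, warmup=4000, d_model=512):
    # Assumed shape of the helper used above; d_model would normally come from Config.
    return NoamOpt(base_optimizer, d_model=d_model, factor=factor, warmup=warmup)

# Usage mirroring the training script:
model = nn.Linear(512, 512)
optimizer = get_optimizer(optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9),
                          factor=2, warmup=4000)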
{ "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=Config.learning_rate, eps=Config.adam_epsilon) scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=t_total * 0.2, num_training_steps=t_total) if Config.load: state_dict = torch.load(f'{Config.data_dir}/{Config.fn}.pth', map_location=device) start_epoch = 0 print(f'Start Epoch: {start_epoch}') model.load_state_dict(state_dict['model'], strict=False) # optimizer.load_state_dict(state_dict['opt'],strict=False) # scheduler.load_state_dict(state_dict['scheduler']) logging.info('Start Training') print("Total steps: ", t_total) for epoch in range(start_epoch, Config.num_train_epochs): one_cycle(epoch, Config, model, optimizer, scheduler, data_loader, tokenizer, device)