def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master:
        wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)
    if master:
        wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    # Evaluate on the dev split (the original passed the train file here, which scores on training data).
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master:
            wandb.log({"loss": loss, "accuracy": score})

        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, score={best_score:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
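# A minimal launcher sketch for the per-rank train_model(rank, world_size, args) entry points in
# this file, assuming init_process_group/destroy_process_group wrap torch.distributed as shown
# below. The master address/port, the argparse stub, and the call layout are illustrative
# assumptions, not the repository's actual runner.
import argparse
import os

import torch
import torch.multiprocessing as mp


def init_process_group(rank, world_size):
    # One process per GPU; NCCL is the usual backend for CUDA training.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "12355")
    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)


def destroy_process_group():
    torch.distributed.destroy_process_group()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_gpu", type=int, default=torch.cuda.device_count())
    # ... the real scripts define many more options (vocab, config, save, epoch, ...).
    args = parser.parse_args()
    if 1 < args.n_gpu:
        # Spawn one worker per GPU; mp.spawn passes each worker its rank as the first argument.
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu)
    else:
        train_model(0, 0, args)  # world_size == 0 marks a single-process run as "master"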
def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.999), eps=1e-8)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, self.n_warmup_steps, self.n_training_steps)
    # Assuming this is a PyTorch Lightning hook: the warmup schedule counts optimizer steps,
    # so tell Lightning to step it per batch rather than per epoch (the default).
    return {'optimizer': optimizer, 'lr_scheduler': {'scheduler': lr_scheduler, 'interval': 'step'}}
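# A small self-contained check of the schedule wired up above: the learning rate climbs linearly
# to its peak over the warmup steps, then decays linearly toward zero. The tiny Linear module and
# the step counts are illustrative only.
import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)

for step in range(10):
    optimizer.step()   # normally preceded by a forward/backward pass
    scheduler.step()   # one scheduler step per optimizer step
    print(step, scheduler.get_last_lr()[0])
# Ramps from 0 up to 1e-3 over the first 3 steps, then decays linearly to 0 at step 10.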
def train_model(rank, world_size, args):
    """Run BERT pretraining."""
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    # Check whether a GPU is available.
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    # Create the BERTPretrain model.
    model = bert.BERTPretrain(config)
    # If a previously trained checkpoint exists, load it.
    if os.path.isfile(args.save):
        best_epoch, best_loss = model.bert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}, epoch={best_epoch}, loss={best_loss}")
        best_epoch += 1
    # Move BERTPretrain to the GPU or CPU.
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)

    # Declare the MLM loss (criterion_lm) and NSP loss (criterion_cls).
    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader = data.build_pretrain_loader(vocab, args, epoch=best_epoch, shuffle=True)
    t_total = len(train_loader) * args.epoch

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # Declare the optimizer and a linear warmup/decay schedule.
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    losses = []
    for step in trange(args.epoch, desc="Epoch"):
        epoch = step + offset
        # Rebuild the train_loader for each epoch; step 0 reuses the loader built above.
        if 0 < step:
            del train_loader
            train_loader = data.build_pretrain_loader(vocab, args, epoch=epoch, shuffle=True)

        # Train for one epoch.
        loss = train_epoch(config, rank, epoch, model, criterion_lm, criterion_cls, optimizer, scheduler, train_loader)
        losses.append(loss)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.bert.save(best_epoch, best_loss, args.save)
            else:
                model.bert.save(best_epoch, best_loss, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}")

    print(f">>>> rank: {rank} losses: {losses}")
    if 1 < args.n_gpu:
        destroy_process_group()
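# Why criterion_lm uses ignore_index=-1: in BERT-style pretraining only ~15% of positions are
# masked, and the label tensor carries -1 at every position that was not selected, so those
# positions contribute nothing to the MLM loss. A minimal sketch with made-up sizes
# (vocab of 8, sequence of 5):
import torch

criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')

logits = torch.randn(1, 5, 8)                # (batch, seq_len, vocab)
labels = torch.tensor([[-1, 3, -1, -1, 7]])  # only positions 1 and 4 were masked
loss = criterion_lm(logits.view(-1, 8), labels.view(-1))
print(loss)  # averaged over the two masked positions only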
def train_model(rank, world_size, args):
    """Train the model."""
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu"
    print(config)

    best_epoch, best_loss = 0, 0
    train_model = ALBERTPretrain(config)
    if os.path.isfile(args.pretrain_save):
        try:
            best_epoch, best_loss = train_model.albert.load(args.pretrain_save)
            print(f"load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={best_epoch}, loss={best_loss:.4f}")
        except Exception:
            print(f'load {os.path.basename(args.pretrain_save)} failed.')

    if 1 < args.n_gpu:
        train_model.to(config.device)
        # noinspection PyArgumentList
        train_model = DistributedDataParallel(train_model, device_ids=[rank], find_unused_parameters=True)
    else:
        train_model.to(config.device)
    if master and args.wandb:
        wandb.watch(train_model)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader: DataLoader = data.build_pretrain_loader(vocab, args, shuffle=True)
    t_total = len(train_loader) * args.epoch

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in train_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in train_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)

    start_epoch = best_epoch + 1
    losses = []
    with trange(args.epoch, desc="Epoch", position=0) as pbar:
        pbar.set_postfix_str(f"best epoch: {best_epoch}, loss: {best_loss:.4f}")
        for step in pbar:
            epoch = step + start_epoch

            loss = train_epoch(config, rank, train_model, criterion_lm, criterion_cls, optimizer, scheduler, train_loader)
            losses.append(loss)
            if master and args.wandb:
                wandb.log({"loss": loss})

            if master:
                best_epoch, best_loss = epoch, loss
                if isinstance(train_model, DistributedDataParallel):
                    train_model.module.albert.save(best_epoch, best_loss, args.pretrain_save)
                else:
                    train_model.albert.save(best_epoch, best_loss, args.pretrain_save)
                pbar.set_postfix_str(f"best epoch: {best_epoch}, loss: {best_loss:.4f}")

    if 1 < args.n_gpu:
        destroy_process_group()
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    model = albert.ALBERTPretrain(config)
    if os.path.isfile(args.save):
        model.albert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_pretrain_loader(vocab, args, shuffle=True)
    t_total = len(train_loader) * args.epoch

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion_lm, criterion_cls, optimizer, scheduler, train_loader)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.albert.save(best_epoch, best_loss, args.save)
            else:
                model.albert.save(best_epoch, best_loss, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
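# The same decay/no-decay parameter grouping recurs in every training function in this file. A small
# helper sketch that produces the identical two groups; the function name is ours, not the repository's:
def group_parameters(model, weight_decay, no_decay=('bias', 'LayerNorm.weight')):
    # Biases and LayerNorm weights are conventionally excluded from weight decay
    # when fine-tuning transformers.
    return [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]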
def main(args):
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logger.info("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except OSError:
            logger.info("failed to create the output directory")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)

    if args.double_ori == 0:
        num_train_optimization_steps = int(
            len(ori_train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    else:
        num_train_optimization_steps = int(
            len(ori_train_examples) * 2 / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    config_class, tokenizer_class = (RobertaConfig, RobertaTokenizer)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        if args.co_training:
            model_class = RobertaForNSP_co
            model = model_class.from_pretrained(bert_saved_dir, args=args)
        elif args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(bert_saved_dir)
            tokenizer = tokenizer_class.from_pretrained(bert_saved_dir)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(bert_saved_dir, args=args)
    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=task_name,
            cache_dir=args.cache_dir if args.cache_dir else None)
        if args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None,
                args=args)

    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Parameter count in millions.
    cnt = sum(np.prod(v.size()) for _, v in model.named_parameters()) / 1e6
    logger.info("cnt %s", str(cnt))

    if args.do_first_eval:
        args.do_train = False
        res_file = os.path.join(args.output_dir, "first_test.tsv")

        eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
            do_evaluate(args, processor, label_list, tokenizer, model, 0,
                        output_mode, num_labels, task_name, eval_examples, type="dev")
        eval_res.update(res_parts)
        for key in sorted(eval_res.keys()):
            logger.info("first evaluation: %s = %s", key, str(eval_res[key]))

        idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model)
        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
        dataframe.to_csv(res_file, index=False, sep='\t')
        logger.info("  Num test length = %d", idx)
        logger.info("  Done ")

        # write mm test results
        if task_name == "mnli":
            res_file = os.path.join(args.output_dir, "first_test_mm.tsv")
            idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model, do_mm=True)
            dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
            dataframe.to_csv(res_file, index=False, sep='\t')
            logger.info("  Num test length = %d", idx)
            logger.info("  Done write mm")

    if args.do_train:
        # Prepare optimizer
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_rate * num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info("***** Running training *****")
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            if args.only_bert:
                train_features = convert_examples_to_features(
                    ori_train_examples, label_list, args.max_seq_length, tokenizer,
                    num_show=args.num_show, output_mode=output_mode, args=args,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                    do_roberta=1)
            else:
                logger.info("epoch=%d, aug_ratio = %f, aug_seed=%d", epoch, aug_ratio, aug_seed)
                train_examples = Aug_each_ckpt(
                    ori_train_examples, label_list, model, tokenizer, args=args,
                    num_show=args.num_show, output_mode=output_mode, seed=aug_seed,
                    aug_ratio=aug_ratio, use_bert=False, do_roberta=1, ssa_roberta=1,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])
                if aug_ratio + args.aug_ratio_each < 1.0:
                    aug_ratio += args.aug_ratio_each
                aug_seed += 1
                train_features = convert_examples_to_features(
                    train_examples, label_list, args.max_seq_length, tokenizer,
                    num_show=args.num_show, output_mode=output_mode, args=args,
                    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                    do_roberta=1)
                logger.info("Done convert features")

            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
            token_real_label = torch.tensor([f.token_real_label for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, token_real_label)

            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

            logger.info("begin training")
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                if args.only_bert:
                    outputs = model(input_ids, input_mask)
                    seq_logits = outputs[0]
                else:
                    seq_logits, aug_logits, aug_loss = model(input_ids, input_mask, labels=None, token_real_label=token_real_label)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1), label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                w = args.aug_loss_weight
                if args.only_bert:
                    loss = seq_loss
                else:
                    loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                if args.only_bert == 0:
                    aug_logits = aug_logits.detach().cpu().numpy()
                    tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits, token_real_label, type="aug")
                    train_aug_accuracy += tmp_train_aug_accuracy
                    nb_tr_tokens += tmp_tokens
                    tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)
                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " lr={:<9.7f}".format(scheduler.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    # log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += " " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and epoch % 1 == 0:
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch,
                                output_mode, num_labels, task_name, eval_examples, type="dev")

                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"

                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_dir = os.path.join(args.output_dir, "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    model_to_save.save_pretrained(output_model_dir)
                    tokenizer.save_pretrained(output_model_dir)
                    output_model_file = os.path.join(output_model_dir, 'pytorch_model.bin')
                    torch.save(model_to_save.state_dict(), output_model_file)

                    result = {
                        'eval_total_loss': eval_loss,
                        'eval_seq_loss': eval_seq_loss,
                        'eval_aug_loss': eval_aug_loss,
                        'eval_aug_accuracy': eval_aug_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss,
                        'best_epoch': epoch,
                        'train_batch_size': args.train_batch_size,
                        'args': args
                    }
                    result.update(eval_res)
                    result.update(res_parts)

                    output_eval_file = os.path.join(args.output_dir, dev_test + "_results_" + str(tmp_acc) + ".txt")
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Test results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    # write test results
                    if args.do_test:
                        res_file = os.path.join(args.output_dir, "test_" + str(tmp_acc) + ".tsv")
                        idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model)
                        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                        dataframe.to_csv(res_file, index=False, sep='\t')
                        logger.info("  Num test length = %d", idx)
                        logger.info("  Done ")

                        # write mm test results
                        if task_name == "mnli":
                            res_file = os.path.join(args.output_dir, "mm_roberta_results_b_" + str(tmp_acc) + ".tsv")
                            idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model, do_mm=True)
                            dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                            dataframe.to_csv(res_file, index=False, sep='\t')
                            logger.info("  Num test length = %d", idx)
                            logger.info("  Done write mm")
                else:
                    logger.info("  tmp_val_acc = %f", tmp_acc)
def train_model(rank, world_size, args):
    """Train the model."""
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project, resume=args.name, tags=args.tags)

    if 1 < args.n_gpu:
        init_process_group(rank, world_size)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model: MovieClassification = transformer.MovieClassification(config)
    if args.resume and os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank}, last epoch: {best_epoch} load state dict from: {os.path.basename(args.save)}")
    model.to(config.device)
    if master and args.wandb:
        wandb.watch(model)
    if 1 < args.n_gpu:
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_train.json")), args, shuffle=True)
    test_loader, test_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_test.json")), args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optimization.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optimization.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, last_epoch=best_epoch)

    print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')

    with tqdm(initial=best_epoch + 1, total=args.epoch, position=0) as pbar:
        for epoch in range(best_epoch + 1, args.epoch + 1):
            if train_sampler:
                train_sampler.set_epoch(epoch)

            train_loss = train_epoch(args, config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
            test_loss, test_accuracy = eval_epoch(config, rank, model, test_loader, test_sampler)

            if master and args.wandb:
                wandb.config.update(args)
                # Pass the metrics dict positionally; the row= keyword is the deprecated wandb.log signature.
                wandb.log({"train loss": train_loss, "accuracy": test_accuracy}, step=epoch)

            if master:
                if best_score < test_accuracy:
                    best_epoch, best_loss, best_score = epoch, train_loss, test_accuracy
                    pbar.set_description(f'Best (score={best_score:.3f}, epoch={best_epoch})')
                    if isinstance(model, DistributedDataParallel):
                        model.module.save(best_epoch, best_loss, best_score, args.save)
                    else:
                        model.save(best_epoch, best_loss, best_score, args.save)
                elif best_epoch + 5 < epoch:  # early stop
                    break
            pbar.update()

    print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')
    if master and args.wandb:
        wandb.save(args.name)
    if 1 < args.n_gpu:
        destroy_process_group()
def train(config):
    # Create the ELECTRA config object.
    electra_config = ElectraConfig.from_pretrained(config["train_model_path"],
                                                   num_labels=config["num_labels"],
                                                   cache_dir=config["cache_dir_path"])
    # Create the ELECTRA tokenizer object.
    electra_tokenizer = ElectraTokenizer.from_pretrained(config["train_model_path"],
                                                         do_lower_case=False,
                                                         cache_dir=config["cache_dir_path"])
    # Create the ELECTRA model object.
    electra_model = ElectraForSequenceClassification.from_pretrained(config["train_model_path"],
                                                                     config=electra_config,
                                                                     cache_dir=config["cache_dir_path"])
    # electra_model.cuda()

    # Read the training data.
    train_datas = read_data(file_path=config["train_data_path"])

    # Preprocess the training data.
    train_dataset = convert_data2dataset(datas=train_datas, tokenizer=electra_tokenizer, max_length=config["max_length"])

    # DataLoader that draws training data in batches.
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config["batch_size"])

    # Read the evaluation data.
    test_datas = read_data(file_path=config["test_data_path"])

    # Preprocess the evaluation data.
    test_dataset = convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer, max_length=config["max_length"])

    # DataLoader that draws evaluation data in batches.
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    # Total number of training steps (in batches).
    t_total = len(train_dataloader) // config["gradient_accumulation_steps"] * config["epoch"]

    # Optimizer and schedule for model training.
    optimizer = AdamW(electra_model.parameters(), lr=config["learning_rate"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config["warmup_steps"], num_training_steps=t_total)

    if os.path.isfile(os.path.join(config["model_dir_path"], "optimizer.pt")) and os.path.isfile(
            os.path.join(config["model_dir_path"], "scheduler.pt")):
        # Restore previously saved optimizer and scheduler state.
        optimizer.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "scheduler.pt")))

    global_step = 0
    electra_model.zero_grad()
    max_test_accuracy = 0
    for epoch in range(config["epoch"]):
        electra_model.train()

        # Accuracy and mean loss on the training data.
        train_accuracy, average_loss, global_step = do_train(config=config, electra_model=electra_model,
                                                             optimizer=optimizer, scheduler=scheduler,
                                                             train_dataloader=train_dataloader,
                                                             epoch=epoch + 1, global_step=global_step)
        print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4)))

        electra_model.eval()

        # Accuracy on the evaluation data.
        test_accuracy = do_evaluate(electra_model=electra_model, test_dataloader=test_dataloader, mode=config["mode"])
        print("test_accuracy : {}\n".format(round(test_accuracy, 4)))

        # Save a checkpoint whenever accuracy improves on the previous best.
        if max_test_accuracy < test_accuracy:
            max_test_accuracy = test_accuracy
            output_dir = os.path.join(config["model_dir_path"], "checkpoint-{}".format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("save model in checkpoint-{}\n".format(global_step))
            electra_config.save_pretrained(output_dir)
            electra_tokenizer.save_pretrained(output_dir)
            electra_model.save_pretrained(output_dir)
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
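# Loading one of the checkpoints saved above for inference: from_pretrained on the checkpoint
# directory restores the config, tokenizer, and weights together. The directory name and the
# sample sentence are illustrative only.
import torch
from transformers import ElectraForSequenceClassification, ElectraTokenizer

output_dir = "checkpoint-1000"  # hypothetical checkpoint-{global_step} directory
tokenizer = ElectraTokenizer.from_pretrained(output_dir)
model = ElectraForSequenceClassification.from_pretrained(output_dir)
model.eval()

# Korean input ("This movie is really fun"), matching the Korean data the model was trained on.
inputs = tokenizer("이 영화 정말 재밌어요", return_tensors="pt", truncation=True, max_length=128)
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1).item())  # predicted label index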