def get_model():
    # Load the model class defined in recycle_model.py.
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # Move the model parameters to GPU memory.
    model.cuda()

    # Let wandb watch the model.
    wandb.watch(model)

    # Print the number of trainable parameters.
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If two or more GPUs are available, enable DataParallel training.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Build the criterion defined in loss.py.
    criterion = create_criterion(CFG.criterion)

    # Build the optimizers defined in optimizer.py.
    optimizer_encoder = create_optimizer(
        CFG.optimizer,
        params=model.seg_model.encoder.parameters(),
        lr=1e-8)
    optimizer_decoder = create_optimizer(
        CFG.optimizer,
        params=[
            {"params": model.seg_model.decoder.parameters()},
            {"params": model.seg_model.segmentation_head.parameters()},
        ],
        lr=1e-8)

    # Build the schedulers defined in scheduler.py.
    scheduler_encoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_encoder,
                                         T_0=30,
                                         T_mult=2,
                                         eta_max=CFG.learning_rate * 0.1,
                                         T_up=5,
                                         gamma=0.3)
    scheduler_decoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_decoder,
                                         T_0=30,
                                         T_mult=2,
                                         eta_max=CFG.learning_rate,
                                         T_up=5,
                                         gamma=0.3)

    return model, criterion, optimizer_encoder, optimizer_decoder, scheduler_encoder, scheduler_decoder
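# The create_* factories used above live in loss.py / optimizer.py / scheduler.py and are not
# shown in this section. A minimal sketch of the pattern they presumably follow (name-to-class
# lookup plus keyword pass-through); the registry contents are assumptions, not the project's code:
import torch

_CRITERIONS = {"cross_entropy": torch.nn.CrossEntropyLoss}  # hypothetical registry

def create_criterion(name, **kwargs):
    # Look the loss class up by name and instantiate it with any extra keyword arguments.
    return _CRITERIONS[name](**kwargs)

def create_optimizer(name, params, **kwargs):
    # torch.optim exposes optimizers by class name (e.g. "Adam", "SGD", "RMSprop").
    return getattr(torch.optim, name)(params, **kwargs)

def create_scheduler(name, optimizer, **kwargs):
    # Built-in schedulers can be resolved from torch.optim.lr_scheduler; custom ones such as
    # CosineAnnealingWarmupRestarts would need their own registry.
    return getattr(torch.optim.lr_scheduler, name)(optimizer, **kwargs)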
def main(args):
    wandb.init(project="stage-1", reinit=True)
    wandb.run.name = args.MODEL
    wandb.config.update(args)
    args = wandb.config

    train_loader, val_loader = get_loader(args.BATCH_SIZE)
    print("Get loader")

    model = get_res_pre_trained(args.MODEL).to(args.device)
    print("Load model")
    wandb.watch(model)

    criterion = create_criterion(args.LOSS)
    optimizer = optim.Adam(model.parameters(), lr=args.LEARNING_RATE)

    print("Run")
    run(args, model, criterion, optimizer, train_loader, val_loader)
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)  # default: MaskBaseDataset
    dataset = dataset_module(data_dir=data_dir)
    num_classes = dataset.num_classes  # 18

    # -- augmentation
    transform_module = getattr(import_module("dataset"), args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()
    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=8,
        shuffle=True,
        pin_memory=use_cuda,
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=args.valid_batch_size,
        num_workers=8,
        shuffle=False,
        pin_memory=use_cuda,
        drop_last=True,
    )

    # -- model
    model_module = getattr(import_module("model"), args.model)  # default: BaseModel
    model = model_module(num_classes=num_classes).to(device)
    model = torch.nn.DataParallel(model)

    # -- loss & metric
    criterion = create_criterion(args.criterion)  # default: cross_entropy
    opt_module = getattr(import_module("torch.optim"), args.optimizer)  # default: SGD
    optimizer = opt_module(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=args.lr,
                           weight_decay=5e-4)
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    # -- logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        # train loop
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + idx)

                loss_value = 0
                matches = 0

        scheduler.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)

                if figure is None:
                    inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(inputs_np, labels, preds,
                                        args.dataset != "MaskSplitByProfileDataset")

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            best_val_loss = min(best_val_loss, val_loss)
            if val_acc > best_val_acc:
                print(f"New best model for val accuracy : {val_acc:4.2%}! saving the best model..")
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_acc = val_acc
            torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2} || "
                f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
            )
            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            logger.add_figure("results", figure, epoch)
            print()
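# get_lr is used throughout these loops to report the current learning rate. It is defined
# elsewhere in the project; a minimal sketch of the usual implementation, given as an assumption:
def get_lr(optimizer):
    # Return the learning rate of the first parameter group.
    for param_group in optimizer.param_groups:
        return param_group['lr']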
def get_model(train_iter):
    # Load the model class defined in mask_model.py and instantiate it.
    model_module = getattr(import_module("mask_model"), CFG.model)
    model = model_module()

    # Move the model parameters to GPU memory.
    model.cuda()

    # Print the number of trainable parameters (weights).
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If two or more GPUs are available, enable DataParallel training.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Build the criteria defined in loss.py, one per task head.
    criterion_mask = create_criterion(CFG.criterion, classes=3, smoothing=0.05)
    criterion_gender = create_criterion('cross_entropy')
    criterion_age = create_criterion(CFG.criterion, classes=3, smoothing=0.05)

    # Build the optimizers defined in optimizer.py: the backbone gets a 10x smaller LR
    # than the classifier heads.
    optimizer_backbone = create_optimizer(
        CFG.optimizer,
        params=model.backbone.parameters(),
        lr=CFG.learning_rate * 0.1,
        momentum=0.9,
        weight_decay=1e-2,
    )
    optimizer_classifier = create_optimizer(
        CFG.optimizer,
        params=[
            {"params": model.mask_layer.parameters()},
            {"params": model.gender_layer.parameters()},
            {"params": model.age_layer.parameters()},
        ],
        lr=CFG.learning_rate,
        momentum=0.9,
        weight_decay=1e-2,
    )

    # Build the schedulers defined in scheduler.py.
    scheduler_backbone = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_backbone,
        max_lr=CFG.learning_rate * 0.1,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5 / CFG.nepochs,
        anneal_strategy='cos',
    )
    scheduler_classifier = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_classifier,
        max_lr=CFG.learning_rate,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5 / CFG.nepochs,
        anneal_strategy='cos',
    )

    return model, criterion_mask, criterion_gender, criterion_age, optimizer_backbone, optimizer_classifier, scheduler_backbone, scheduler_classifier
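# The scheduler arguments above (max_lr, epochs, steps_per_epoch, pct_start, anneal_strategy)
# match torch.optim.lr_scheduler.OneCycleLR, so CFG.scheduler presumably resolves to that class.
# A small self-contained reference sketch of the equivalent direct construction; the function
# name and warmup_epochs parameter are illustrative, not part of the project:
from torch.optim.lr_scheduler import OneCycleLR

def make_one_cycle(optimizer, max_lr, nepochs, steps_per_epoch, warmup_epochs=5):
    # One warm-up ramp over the first warmup_epochs, then cosine annealing for the rest.
    return OneCycleLR(
        optimizer,
        max_lr=max_lr,
        epochs=nepochs,
        steps_per_epoch=steps_per_epoch,
        pct_start=warmup_epochs / nepochs,
        anneal_strategy='cos',
    )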
def train(data_dir, model_dir, args): seed_everything(args.seed) save_dir = increment_path(os.path.join(model_dir, args.name)) # -- settings use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") # -- dataset dataset_module = getattr(import_module("dataset"), args.dataset) # MaskBaseDataset dataset = dataset_module(data_dir=data_dir, val_ratio=args.val_ratio) num_classes = dataset.num_classes # 18 # -- augmentation transform_module = getattr(import_module("dataset"), args.augmentation) # default: BaseAugmentation transform = transform_module( resize=args.resize, mean=dataset.mean, std=dataset.std, ) dataset.set_transform(transform) # -- data_loader train_set, val_set = dataset.split_dataset() train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=8, shuffle=True, pin_memory=use_cuda, drop_last=True) val_loader = DataLoader(val_set, batch_size=args.batch_size, num_workers=8, shuffle=False, pin_memory=use_cuda, drop_last=True) # -- model models = [] model_module_gender = getattr(import_module("model"), args.model_gender) # default: BaseModel model_gender = model_module_gender(num_classes=args.num_classes_gender, grad_point=args.grad_point).to(device) model_gender = torch.nn.DataParallel(model_gender) # -- loss & metric criterion_gender = create_criterion( args.criterion_gender, classes=args.num_classes_gender) # default: f1 if args.optimizer == "AdamP": optimizer_gender = AdamP(filter(lambda p: p.requires_grad, model_gender.parameters()), lr=args.lr, weight_decay=5e-4) else: opt_module = getattr(import_module('torch.optim'), args.optimizer) # default: Adam optimizer_gender = opt_module(filter(lambda p: p.requires_grad, model_gender.parameters()), lr=args.lr, weight_decay=5e-4) scheduler_gender = StepLR(optimizer_gender, args.lr_decay_step, gamma=0.5) # -- logging logger_gender = SummaryWriter(log_dir=os.path.join(save_dir, 'gender')) with open(Path(save_dir) / 'gender' / 'config.json', 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, indent=4) best_val_acc_gender = 0 best_val_loss_gender = np.inf for epoch in range(args.epochs): # train loop model_gender.train() loss_value_gender = 0 matches_gender = 0 for idx, train_batch in enumerate(train_loader): inputs, labels_mask, labels_gender, labels_age = train_batch inputs = inputs.to(device) labels_gender = labels_gender.to(device) optimizer_gender.zero_grad() outs_gender = model_gender(inputs) preds_gender = torch.argmax(outs_gender, dim=-1) loss_gender = criterion_gender(outs_gender, labels_gender) loss_gender.backward() optimizer_gender.step() loss_value_gender += loss_gender.item() matches_gender += (preds_gender == labels_gender).sum().item() if (idx + 1) % args.log_interval == 0: train_loss_gender = loss_value_gender / args.log_interval train_acc_gender = matches_gender / args.batch_size / args.log_interval current_lr_gender = get_lr(optimizer_gender) print( f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss_gender:4.4} || training accuracy {train_acc_gender:4.2%} || lr {current_lr_gender}" ) logger_gender.add_scalar("Train/loss", train_loss_gender, epoch * len(train_loader) + idx) logger_gender.add_scalar("Train/accuracy", train_acc_gender, epoch * len(train_loader) + idx) loss_value_gender = 0 matches_gender = 0 scheduler_gender.step() #val loop with torch.no_grad(): print("Calculating validation results...") model_gender.eval() val_loss_items_gender = [] val_acc_items_gender = [] figure = None for val_batch in 
val_loader: inputs, labels_mask, labels_gender, labels_age = val_batch inputs = inputs.to(device) labels_gender = labels_gender.to(device) outs_gender = model_gender(inputs) preds_gender = torch.argmax(outs_gender, dim=-1) loss_item_gender = criterion_gender(outs_gender, labels_gender).item() acc_item_gender = (labels_gender == preds_gender).sum().item() val_loss_items_gender.append(loss_item_gender) val_acc_items_gender.append(acc_item_gender) if figure is None: # inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy() inputs_np = torch.clone(inputs).detach().cpu() inputs_np = inputs_np.permute(0, 2, 3, 1).numpy() inputs_np = dataset_module.denormalize_image( inputs_np, dataset.mean, dataset.std) figure = grid_image( inputs_np, labels_mask, preds_gender, args.dataset != "MaskSplitByProfileDataset") plt.show() val_loss_gender = np.sum(val_loss_items_gender) / len(val_loader) val_acc_gender = np.sum(val_acc_items_gender) / len(val_set) if val_loss_gender < best_val_loss_gender or val_acc_gender > best_val_acc_gender: save_model(model_gender, epoch, val_loss_gender, val_acc_gender, os.path.join(save_dir, "gender"), args.model_gender) if val_loss_gender < best_val_loss_gender and val_acc_gender > best_val_acc_gender: print( f"New best model_gender for val acc and val loss : {val_acc_gender:4.2%} {val_loss_gender:4.2}! saving the best model_gender.." ) best_val_loss_gender = val_loss_gender best_val_acc_gender = val_acc_gender elif val_loss_gender < best_val_loss_gender: print( f"New best model_gender for val loss : {val_loss_gender:4.2}! saving the best model_gender.." ) best_val_loss_gender = val_loss_gender elif val_acc_gender > best_val_acc_gender: print( f"New best model_gender for val accuracy : {val_acc_gender:4.2%}! saving the best model_gender.." ) best_val_acc_gender = val_acc_gender print( f"[Val] acc: {val_acc_gender:4.2%}, loss: {val_loss_gender:4.2} || " f"best acc: {best_val_acc_gender:4.2%}, best loss: {best_val_loss_gender:4.2}" ) logger_gender.add_scalar("Val/loss", val_loss_gender, epoch) logger_gender.add_scalar("Val/accuracy", val_acc_gender, epoch) logger_gender.add_figure("results", figure, epoch) print()
def train(data_dir, model_dir, args): seed_everything(args.seed) # args.__dict__ == vars(args) save_dir = increment_path(os.path.join(model_dir, args.name)) # -- settings use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") # -- dataset dataset_module = getattr(import_module("dataset"), args.dataset) # MaskBaseDataset dataset = dataset_module(data_dir=data_dir, val_ratio=args.val_ratio) num_classes = dataset.num_classes # 18 # -- augmentation transform_module = getattr(import_module("dataset"), args.augmentation) # default: BaseAugmentation transform = transform_module( resize=args.resize, mean=dataset.mean, std=dataset.std, ) dataset.set_transform(transform) # -- data_loader train_set, val_set = dataset.split_dataset() train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=8, shuffle=True, pin_memory=use_cuda, drop_last=True) val_loader = DataLoader(val_set, batch_size=args.batch_size, num_workers=8, shuffle=False, pin_memory=use_cuda, drop_last=True) # -- model model_module = getattr(import_module("model"), args.model) # default: BaseModel model = model_module(num_classes=num_classes, grad_point=args.grad_point).to(device) model = torch.nn.DataParallel(model) # if want model train begin from args.continue_epoch checkpoint. if args.continue_train: try_dir = find_dir_try(args.continue_try_num, model_dir, args.continue_name) epoch_dir = find_dir_epoch(args.continue_epoch, try_dir) model.load_state_dict(torch.load(epoch_dir)) # -- loss & metric if args.criterion == "cross_entropy": criterion = create_criterion(args.criterion) # default: cross_entropy else: criterion = create_criterion( args.criterion, classes=num_classes) # default: cross_entropy if args.optimizer == "AdamP": optimizer = AdamP(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=5e-4) else: opt_module = getattr(import_module('torch.optim'), args.optimizer) # default: Adam optimizer = opt_module(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=5e-4) scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5) # -- logging if not os.path.exists(save_dir): os.mkdir(save_dir) with open(Path(save_dir) / 'config.json', 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, indent=4) best_val_acc = 0 best_val_loss = np.inf for epoch in range(args.epochs): # train loop model.train() loss_value = 0 matches = 0 for idx, train_batch in enumerate(train_loader): inputs, labels = train_batch inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() outs = model(inputs) preds = torch.argmax(outs, dim=-1) loss = criterion(outs, labels) loss.backward() optimizer.step() loss_value += loss.item() matches += (preds == labels).sum().item() if (idx + 1) % args.log_interval == 0: train_loss = loss_value / args.log_interval train_acc = matches / args.batch_size / args.log_interval current_lr = get_lr(optimizer) print( f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}" ) loss_value = 0 matches = 0 scheduler.step() #val loop with torch.no_grad(): print("Calculating validation results...") model.eval() val_loss_items = [] val_acc_items = [] figure = None for val_batch in val_loader: inputs, labels = val_batch inputs = inputs.to(device) labels = labels.to(device) outs = model(inputs) preds = torch.argmax(outs, dim=-1) loss_item = criterion(outs, labels).item() acc_item = (labels == 
preds).sum().item() val_loss_items.append(loss_item) val_acc_items.append(acc_item) if figure is None: # inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy() inputs_np = torch.clone(inputs).detach().cpu() inputs_np = inputs_np.permute(0, 2, 3, 1).numpy() inputs_np = dataset_module.denormalize_image( inputs_np, dataset.mean, dataset.std) figure = grid_image( inputs_np, labels, preds, args.dataset != "MaskSplitByProfileDataset") plt.show() val_loss = np.sum(val_loss_items) / len(val_loader) val_acc = np.sum(val_acc_items) / len(val_set) if val_loss < best_val_loss or val_acc > best_val_acc: save_model(model, epoch, val_loss, val_acc, save_dir, args.model) if val_loss < best_val_loss and val_acc > best_val_acc: print( f"New best model for val acc and val loss : {val_acc:4.2%} {val_loss:4.2}! saving the best model.." ) best_val_loss = val_loss best_val_acc = val_acc elif val_loss < best_val_loss: print( f"New best model for val loss : {val_loss:4.2}! saving the best model.." ) save_model(model, epoch, val_loss, val_acc, save_dir, args.model) best_val_loss = val_loss elif val_acc > best_val_acc: print( f"New best model for val accuracy : {val_acc:4.2%}! saving the best model.." ) save_model(model, epoch, val_loss, val_acc, save_dir, args.model) best_val_acc = val_acc print( f"[Val] acc: {val_acc:4.2%}, loss: {val_loss:4.2} || " f"best acc: {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}" ) print()
def train(self): train_sampler = RandomSampler(self.train_dataset) train_dataloader = DataLoader( self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size, ) if self.args.max_steps > 0: t_total = self.args.max_steps self.args.num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": self.args.weight_decay, }, { "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW( optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon, ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(self.train_dataset)) logger.info(" Num Epochs = %d", self.args.num_train_epochs) logger.info(" Total train batch size = %d", self.args.train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) logger.info(" Logging steps = %d", self.args.logging_steps) logger.info(" Save steps = %d", self.args.save_steps) global_step = 0 tr_loss = 0.0 self.model.zero_grad() train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch") criterion1 = create_criterion('cross_entropy') criterion2 = create_criterion('f1') criterion3 = create_criterion('focal') criterion4 = create_criterion('label_smoothing') for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): self.model.train() # batch = tuple(t.to(self.device) for t in batch) # GPU or CPU # inputs = { # "input_ids": batch[0], # "attention_mask": batch[1], # "token_type_ids": batch[2], # "labels": batch[3], # "e1_mask": batch[4], # "e2_mask": batch[5], # } # outputs = self.model(**inputs) # loss = outputs[0] batch = tuple(t.to(self.device) for t in batch) # GPU or CPU outputs = self.model(input_ids=batch[0], attention_mask=batch[1], e1_mask=batch[4], e2_mask=batch[5]) _, preds = torch.max(outputs, 1) loss1 = criterion3(outputs, batch[3]) loss2 = criterion4(outputs, batch[3]) loss = loss1 + loss2 if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % self.args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule self.model.zero_grad() global_step += 1 if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0: self.evaluate("test") # There is no dev set for semeval task # if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: # self.save_model() if 0 < self.args.max_steps < global_step: epoch_iterator.close() break if 0 < self.args.max_steps < global_step: train_iterator.close() break return global_step, tr_loss / global_step
def evaluate(self, mode): # We use test dataset because semeval doesn't have dev dataset if mode == "test": dataset = self.test_dataset elif mode == "dev": dataset = self.dev_dataset else: raise Exception("Only dev and test dataset available") eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size) # Eval! logger.info("***** Running evaluation on %s dataset *****", mode) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", self.args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None criterion1 = create_criterion('cross_entropy') criterion2 = create_criterion('f1') criterion3 = create_criterion('focal') criterion4 = create_criterion('label_smoothing') self.model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(self.device) for t in batch) with torch.no_grad(): # inputs = { # "input_ids": batch[0], # "attention_mask": batch[1], # "token_type_ids": batch[2], # "labels": batch[3], # "e1_mask": batch[4], # "e2_mask": batch[5], # } # outputs = self.model(**inputs) # tmp_eval_loss, logits = outputs[:2] # print(batch) logits = self.model(input_ids=batch[0], attention_mask=batch[1], e1_mask=batch[4], e2_mask=batch[5]) loss1 = criterion3(logits, batch[3]) loss2 = criterion4(logits, batch[3]) tmp_eval_loss = loss1 + loss2 eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() # out_label_ids = inputs["labels"].detach().cpu().numpy() out_label_ids = batch[3].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) # out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, batch[3].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps results = {"loss": eval_loss} preds = np.argmax(preds, axis=1) write_prediction(self.args, os.path.join(self.args.eval_dir, "proposed_answers.txt"), preds) result = compute_metrics(preds, out_label_ids) print(f'evaluate acc:{result}') results.update(result) logger.info("***** Eval results *****") for key in sorted(results.keys()): logger.info(" {} = {:.4f}".format(key, results[key])) return results
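# compute_metrics and write_prediction come from the project's utility modules and are not shown
# in this section. A minimal sketch of what compute_metrics appears to return here (a metric
# dict keyed by name, at least containing accuracy); this is an assumption, not the real helper:
import numpy as np

def compute_metrics(preds, labels):
    # Fraction of predictions that match the gold labels.
    return {"acc": float((preds == labels).mean())}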
def main(args):
    seed_everything(21)
    load_dotenv()

    if WANDB:
        if args.ENCODER:
            run_name = args.MODEL + "_" + args.ENCODER
        else:
            run_name = args.MODEL

    if args.KFOLD > 1:
        if args.KFOLD != 5:
            print("Only 5 KFOLD is available")
            return
        # Create the folder where the .pt checkpoints are saved.
        path_pair = args.MODEL_PATH.split(".")
        os.makedirs(path_pair[0], exist_ok=True)
        # Copy args so they can be reused for every fold.
        args_origin = copy.deepcopy(args)

    for fold in range(args.KFOLD):
        # Configure the dataloader differently for hold-out vs. k-fold.
        if args.KFOLD > 1:
            args = copy.deepcopy(args_origin)
            path_pair = args_origin.MODEL_PATH.split(".")
            # Adjust MODEL_PATH for this fold.
            args.MODEL_PATH = (path_pair[0] + f"/kfold_{fold+1}." + path_pair[1])
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name + f"_k{fold+1}",
                    config=args,
                    reinit=True,
                )
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE, fold_index=fold)
            print(f"\nfold {fold+1} start")
        else:
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name,
                    reinit=True,
                )
                wandb.config.update(args)
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE)
        print("Get loader")

        model = get_model(args.MODEL, args.ENCODER).to(args.device)
        print("Load model")
        if WANDB:
            wandb.watch(model)

        # Encode how two losses are combined: "+" to add, "-" to subtract, "0" for a single loss.
        criterion = []
        if "+" in args.LOSS:
            criterion.append("+")
            criterion.append(create_criterion(args.LOSS.split("+")[0]))
            criterion.append(create_criterion(args.LOSS.split("+")[1]))
        elif "-" in args.LOSS:
            criterion.append("-")
            criterion.append(create_criterion(args.LOSS.split("-")[0]))
            criterion.append(create_criterion(args.LOSS.split("-")[1]))
        else:
            criterion.append("0")
            criterion.append(create_criterion(args.LOSS))

        optimizer = create_optimizer(args.OPTIMIZER, model, args.LEARNING_RATE)
        if args.SCHEDULER:
            scheduler = create_scheduler(args.SCHEDULER, optimizer)
        else:
            scheduler = None
        # optimizer = optim.Adam(params=model.parameters(), lr=args.LEARNING_RATE, weight_decay=1e-6)

        print("Run")
        run(args, model, criterion, optimizer, dataloader, fold, scheduler)
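# The criterion list built above is consumed by run(), which is defined elsewhere. A minimal
# sketch of the intended combination logic (an assumption about run(), with an illustrative
# helper name), shown here only to make the "+"/"-"/"0" convention concrete:
def combine_loss(criterion, outputs, targets):
    op = criterion[0]
    if op == "+":
        # Sum of the two component losses.
        return criterion[1](outputs, targets) + criterion[2](outputs, targets)
    if op == "-":
        # Difference of the two component losses.
        return criterion[1](outputs, targets) - criterion[2](outputs, targets)
    # Single loss.
    return criterion[1](outputs, targets)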
def get_model():
    # Load the model class defined in recycle_model.py.
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # Move the model parameters to GPU memory.
    model.cuda()

    # Let wandb watch the model.
    wandb.watch(model)

    # Print the number of trainable parameters.
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If two or more GPUs are available, enable DataParallel training.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Build the criterion defined in loss.py.
    criterion = create_criterion(CFG.criterion)

    # Build the optimizer defined in optimizer.py. Every optimizer choice uses the same
    # parameter groups: the encoder gets a 10x smaller LR than the decoder and segmentation head.
    param_groups = [
        {"params": model.seg_model.encoder.parameters(), "lr": CFG.learning_rate * 0.1},
        {"params": model.seg_model.decoder.parameters()},
        {"params": model.seg_model.segmentation_head.parameters()},
    ]
    if CFG.optimizer == "Adam":
        optimizer = create_optimizer(CFG.optimizer, params=param_groups,
                                     lr=CFG.learning_rate, weight_decay=1e-6)
    elif CFG.optimizer == "RAdam":
        optimizer = create_optimizer(CFG.optimizer, params=param_groups,
                                     lr=CFG.learning_rate, betas=(0.9, 0.999),
                                     eps=1e-8, weight_decay=0)
    elif CFG.optimizer == "AdamP":
        optimizer = create_optimizer(CFG.optimizer, params=param_groups,
                                     lr=CFG.learning_rate, betas=(0.9, 0.999),
                                     eps=1e-8, weight_decay=0)
    elif CFG.optimizer == "AdamW":
        optimizer = create_optimizer(CFG.optimizer, params=param_groups,
                                     lr=CFG.learning_rate, amsgrad=True)
    elif CFG.optimizer == "RMSprop":
        optimizer = create_optimizer(CFG.optimizer, params=param_groups,
                                     lr=CFG.learning_rate)

    # Build the scheduler defined in scheduler.py.
    if CFG.scheduler == "StepLR":
        scheduler = create_scheduler(CFG.scheduler, optimizer=optimizer,
                                     step_size=5, gamma=0.95)
    elif CFG.scheduler == "CosineAnnealingWarmupRestarts":
        scheduler = create_scheduler(
            CFG.scheduler,
            optimizer=optimizer,
            first_cycle_steps=5,
            cycle_mult=1.,
            max_lr=1e-4,
            min_lr=1e-7,
        )

    return model, criterion, optimizer, scheduler
def train(data_dir, model_dir, args): seed_everything(args.seed) #def: 42 save_dir = increment_path(os.path.join(model_dir, args.name)) # -- settings use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") # -- dataset dataset_module = getattr(import_module("dataset"), args.dataset) # default: MaskSplitbyProfile dataset = dataset_module(data_dir=data_dir, ) num_classes = dataset.num_classes # 18 # -- data_loader #train_csv = pd.read_csv("/opt/ml/input/data/train/train.csv") labels = [] print("Get Labels from dataset...") for i in tqdm(range(len(dataset))): _, label = dataset[i] labels.append(label) labels = np.array(labels) # -- augmentation ''' transform_module = getattr(import_module("dataset"), args.augmentation) # default: BaseAugmentation transform = transform_module( resize=args.resize, mean=dataset.mean, std=dataset.std, ) dataset.set_transform(transform) ''' ''' stratifiedkfold = StratifiedKFold(n_splits = 5,random_state = 42, shuffle = True) folds = [] print("Total img counts : ", len(labels)) for fold_index, (train_idx, valid_idx) in tqdm(enumerate(stratifiedkfold.split(range(len(labels)), labels))) : folds.append({'train' : train_idx, 'valid' : valid_idx}) print() print(f'[fold: {fold_index+1}, total fold: {len(folds)}]') print(len(train_idx), len(valid_idx)) print(train_idx) print(valid_idx) for fold in folds : train_subset = Subset(dataset=dataset, indices=train_idx) valid_subset = Subset(dataset=dataset, indices=valid_idx) train_loader = DataLoader(dataset=train_subset, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=use_cuda, drop_last=True, ) val_loader = DataLoader(dataset=valid_subset, batch_size=args.valid_batch_size, shuffle=True, num_workers=4, pin_memory=use_cuda, drop_last=True, ) ''' train_set, val_set = dataset.split_dataset() train_loader = DataLoader( train_set, batch_size=args.batch_size, num_workers=4, shuffle=True, pin_memory=use_cuda, drop_last=True, ) val_loader = DataLoader( val_set, batch_size=args.valid_batch_size, num_workers=4, shuffle=False, pin_memory=use_cuda, drop_last=True, ) # -- model model_module = getattr(import_module("model"), args.model) # default: BaseModel model = model_module(num_classes=num_classes).to(device) model = torch.nn.DataParallel(model) #torch.nn.DataParallel : https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html # -- loss & metric #criterion = create_criterion(args.criterion) # default: cross_entropy df_label = pd.Series(labels) label_sorted = df_label.value_counts().sort_index() n_label = torch.Tensor(label_sorted.values) gamma = 2 normed_weights = [1 - (gamma * x / sum(n_label)) for x in n_label] normed_weights = torch.FloatTensor(normed_weights).to(device) criterion = torch.nn.CrossEntropyLoss(weight=normed_weights) criterion = create_criterion(args.criterion) #optimizer = madgrad.MADGRAD(params : any, lr = 0.001, momentum = 0.9, weight_decay = 0, eps = 1e-06) try: opt_module = getattr(import_module("torch.optim"), args.optimizer) # default: SGD except AttributeError: opt_module = getattr(import_module("madgrad"), args.optimizer) optimizer = opt_module(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=0) #scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=4, eta_min=0.000005) scheduler = StepLR(optimizer, args.lr_decay_step, gamma=args.gamma) # -- logging logger = SummaryWriter(log_dir=save_dir) with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, 
indent=4) best_val_acc = 0 best_val_loss = np.inf best_val_f1 = 0 #train starts for epoch in tqdm(range(args.epochs)): # train loop print() model.train() loss_value = 0 matches = 0 for idx, train_batch in enumerate(train_loader): inputs, labels = train_batch inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() outs = model(inputs) preds = torch.argmax(outs, dim=-1) loss = criterion(outs, labels) + criterion2(outs, labels) #loss = criterion(outs, labels) loss.backward() optimizer.step() loss_value += loss.item() matches += (preds == labels).sum().item() labels = labels.cpu().detach().numpy() preds = preds.cpu().detach().numpy() train_f1 = f1_score(labels, preds, average='macro') if (idx + 1) % args.log_interval == 0: train_loss = loss_value / args.log_interval train_acc = matches / args.batch_size / args.log_interval current_lr = get_lr(optimizer) print( f"Epoch[{epoch+1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss:4.4} || training accuracy {train_acc:4.4%} || lr {current_lr} || " f"F1_score {train_f1:4.4} ") logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + idx) logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + idx) logger.add_scalar("Train/f1_score", train_f1, epoch * len(train_loader) + idx) loss_value = 0 matches = 0 scheduler.step() #lr scheduler # val loop with torch.no_grad(): print("Calculating validation results...") model.eval() val_loss_items = [] val_acc_items = [] val_f1_items = [] figure = None for val_batch in val_loader: inputs, labels = val_batch inputs = inputs.to(device) labels = labels.to(device) outs = model(inputs) preds = torch.argmax(outs, dim=-1) loss_item = criterion(outs, labels).item() loss_item2 = criterion2(outs, labels).item() loss_item = list( np.add(np.array(loss_item), np.array(loss_item2))) acc_item = (labels == preds).sum().item() labels = labels.cpu().detach().numpy() preds = preds.cpu().detach().numpy() f1_item = f1_score(labels, preds, average='macro') val_loss_items.append(loss_item) val_acc_items.append(acc_item) val_f1_items.append(f1_item) if figure is None: inputs_np = torch.clone(inputs).detach().cpu().permute( 0, 2, 3, 1).numpy() inputs_np = dataset_module.denormalize_image( inputs_np, dataset.mean, dataset.std) figure = grid_image( inputs_np, labels, preds, args.dataset != "MaskSplitByProfileDataset") val_loss = np.sum(val_loss_items) / len(val_loader) val_acc = np.sum(val_acc_items) / len(val_set) val_f1 = np.sum(val_f1_items) / len(val_loader) best_val_acc = max(best_val_acc, val_acc) if val_loss < best_val_loss: print( f"New best model for val_loss : {val_loss:4.4}! saving the best loss model.." ) torch.save( model.module.state_dict(), f"{save_dir}/{args.model}_epoch{epoch}_loss_{val_loss}.pth" ) best_val_loss = val_loss if val_f1 > best_val_f1: print( f"New best model for val_F1_score : {val_f1:4.4}! saving the best F1_score model.." ) torch.save( model.module.state_dict(), f"{save_dir}/{args.model}_epoch{epoch}_f1_{val_f1}.pth") best_val_f1 = val_f1 print( f"[Val] loss: {val_loss:4.4}, F1_score {val_f1:4.4}, acc : {val_acc:4.4%} || " f"best loss: {best_val_loss:4.4}, best_F1_score {best_val_f1:4.4} , best acc : {best_val_acc:4.4%} " ) logger.add_scalar("Val/loss", val_loss, epoch) logger.add_scalar("Val/accuracy", val_acc, epoch) logger.add_scalar("Val/f1_score", val_f1, epoch) logger.add_figure("results", figure, epoch) print()
def train(data_dir, model_dir, args): seed_everything(args.seed) save_dir = increment_path(os.path.join(model_dir, args.name)) # -- settings use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") info = pd.read_csv('/opt/ml/input/data/train/train.csv') info['gender_age'] = info.apply(lambda x: convert_gender_age(x.gender, x.age), axis = 1) n_fold = int(1 / args.val_ratio) skf = StratifiedKFold(n_splits = n_fold, shuffle=True) info.loc[:, 'fold'] = 0 for fold_num, (train_index, val_index) in enumerate(skf.split(X = info.index, y = info.gender_age.values)): info.loc[info.iloc[val_index].index, 'fold'] = fold_num fold_idx = 0 train = info[info.fold != fold_idx].reset_index(drop=True) val = info[info.fold == fold_idx].reset_index(drop=True) # -- dataset dataset_module = getattr(import_module("dataset"), args.dataset) # default: MaskDataset # -- augmentation train_transform_module = getattr(import_module("dataset"), args.train_augmentation) # default: BaseAugmentation val_transform_module = getattr(import_module("dataset"), args.val_augmentation) # default: BaseAugmentation train_transform = train_transform_module( resize=args.resize, mean=MEAN, std=STD, ) val_transform = val_transform_module( resize=args.resize, mean=MEAN, std=STD, ) print(train_transform.transform, val_transform.transform) if args.dataset == 'MaskDataset' or args.dataset == 'MaskOldDataset': if args.dataset == 'MaskOldDataset': old_transform_module = getattr(import_module('dataset'), args.old_augmentation) old_transform = old_transform_module( resize=args.resize, mean=MEAN, std=STD, ) train_dataset = dataset_module(data_dir, train, train_transform, old_transform) if args.val_old: val_dataset = dataset_module(data_dir, val, val_transform, old_transform) else: val_dataset = dataset_module(data_dir, val, val_transform) else: train_dataset = dataset_module(data_dir, train, train_transform) val_dataset = dataset_module(data_dir, val, val_transform) else: dataset = dataset_module( data_dir=data_dir, ) # dataset.set_transform(transform) # -- data_loader train_set, val_set = dataset.split_dataset() train_dataset = DatasetFromSubset( train_set, transform = train_transform ) val_dataset = DatasetFromSubset( val_set, transform = val_transform ) train_loader = DataLoader( train_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True, pin_memory=use_cuda, #drop_last=True, ) val_loader = DataLoader( val_dataset, batch_size=args.valid_batch_size, num_workers=1, shuffle=False, pin_memory=use_cuda, #drop_last=True, ) # -- model model_module = getattr(import_module("model"), args.model) # default: BaseModel model = model_module( num_classes=args.num_classes ).to(device) model = torch.nn.DataParallel(model) # -- loss & metric if args.criterion == 'f1' or args.criterion == 'label_smoothing': criterion = create_criterion(args.criterion, classes = args.num_classes) else: criterion = create_criterion(args.criterion) opt_module = getattr(import_module("torch.optim"), args.optimizer) # default: SGD optimizer = opt_module( filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=5e-4 ) if args.scheduler == 'cosine': scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6) elif args.scheduler == 'reduce': scheduler = ReduceLROnPlateau(optimizer, factor = 0.5, patience = 5) elif args.scheduler == 'step': scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5) else: scheduler = None # -- logging logger = SummaryWriter(log_dir=save_dir) with open(os.path.join(save_dir, 
'config.json'), 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, indent=4) best_val_acc = 0 best_val_loss = np.inf print("This notebook use [%s]."%(device)) early_stopping = EarlyStopping(patience = args.patience, verbose = True) for epoch in range(args.epochs): # train loop model.train() loss_value = 0 matches = 0 train_loss, train_acc = AverageMeter(), AverageMeter() for idx, train_batch in enumerate(train_loader): inputs, labels = train_batch if args.dataset == 'MaskDataset' or args.dataset == 'MaskOldDataset': labels = labels.argmax(dim = -1) inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() outs = model(inputs) preds = torch.argmax(outs, dim=-1) loss = criterion(outs, labels) loss.backward() optimizer.step() #loss_value += loss.item() #matches += (preds == labels).sum().item() acc = (preds == labels).sum().item() / len(labels) train_loss.update(loss.item(), len(labels)) train_acc.update(acc, len(labels)) if (idx + 1) % args.log_interval == 0: #train_loss = loss_value / args.log_interval #train_acc = matches / args.batch_size / args.log_interval train_f1_acc = f1_score(preds.cpu().detach().type(torch.int), labels.cpu().detach().type(torch.int), average='macro') current_lr = get_lr(optimizer) print( f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || train_f1_acc {train_f1_acc:.4} || lr {current_lr}" ) logger.add_scalar("Train/loss", train_loss.avg, epoch * len(train_loader) + idx) logger.add_scalar("Train/accuracy", train_acc.avg, epoch * len(train_loader) + idx) loss_value = 0 matches = 0 scheduler.step() val_loss, val_acc = AverageMeter(), AverageMeter() # val loop with torch.no_grad(): print("Calculating validation results...") model.eval() val_labels_items = np.array([]) val_preds_items = np.array([]) figure = None for val_batch in val_loader: inputs, labels = val_batch if args.dataset == 'MaskDataset' or args.dataset == 'MaskOldDataset': labels = labels.argmax(dim = -1) inputs = inputs.to(device) labels = labels.to(device) outs = model(inputs) preds = torch.argmax(outs, dim=-1) #loss_item = criterion(outs, labels).item() #acc_item = (labels == preds).sum().item() #val_loss_items.append(loss_item) #val_acc_items.append(acc_item) loss = criterion(outs, labels) acc = (preds == labels).sum().item() / len(labels) val_loss.update(loss.item(), len(labels)) val_acc.update(acc, len(labels)) val_labels_items = np.concatenate([val_labels_items, labels.cpu().numpy()]) val_preds_items = np.concatenate([val_preds_items, preds.cpu().numpy()]) if figure is None: if epoch % 2: images, labels, preds = get_all_datas(model, device, val_loader) figure = log_confusion_matrix(labels.cpu().numpy(), np.argmax(preds.cpu().numpy(), axis=1), args.num_classes) # figure2 = plots_result(images.cpu().numpy()[:36], labels.cpu().numpy()[:36], preds.cpu().numpy()[:36], args.num_classes, title="plots_result") else: inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy() inputs_np = val_dataset.denormalize_image(inputs_np, MEAN, STD) figure = grid_image(inputs_np, labels, preds, 9, False) # val_loss = np.sum(val_loss_items) / len(val_loader) # val_acc = np.sum(val_acc_items) / len(val_set) val_f1_acc = f1_score(val_labels_items.astype(np.int), val_preds_items.astype(np.int), average='macro') best_val_acc = max(best_val_acc, val_acc.avg) # best_val_loss = min(best_val_loss, val_loss) if val_loss.avg < best_val_loss: print(f"New best model for val 
loss : {val_loss.avg:4.2%}! saving the best model..") torch.save(model.module.state_dict(), f"{save_dir}/best.pth") best_val_loss = val_loss.avg torch.save(model.module.state_dict(), f"{save_dir}/last.pth") print( f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || val_f1_acc : {val_f1_acc:.4} || " f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}" ) logger.add_scalar("Val/loss", val_loss.avg, epoch) logger.add_scalar("Val/accuracy", val_acc.avg, epoch) logger.add_figure("results", figure, epoch) # logger.add_figure("results1", figure2, epoch) early_stopping(val_loss.avg, model) if early_stopping.early_stop: print('Early stopping...') break print()
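# EarlyStopping is imported from elsewhere in the project. A minimal sketch of the behaviour the
# loop above relies on (count epochs without val-loss improvement and raise a flag); the real
# class may also checkpoint the model, which is not reproduced here:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = float("inf")
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            # Improvement: remember the new best loss and reset the counter.
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True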
def train_model(config, wandb): seed_everything(config.seed) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model_module = getattr(import_module("model"), config.model) model = model_module(num_classes=18).to(device) #model = torch.nn.DataParallel(model) ######## DataSet transform = DataAugmentation(type=config.transform) #center_384_1 dataset = MaskDataset(config.data_dir, transform=transform) len_valid_set = int(config.data_ratio * len(dataset)) len_train_set = len(dataset) - len_valid_set dataloaders, batch_num = {}, {} train_dataset, valid_dataset = torch.utils.data.random_split( dataset, [len_train_set, len_valid_set]) if config.random_split == 0: print("tbd") sampler = None if config.sampler == 'ImbalancedDatasetSampler': sampler = ImbalancedDatasetSampler(train_dataset) use_cuda = torch.cuda.is_available() dataloaders['train'] = torch.utils.data.DataLoader( train_dataset, batch_size=config.batch_size, sampler=sampler, shuffle=False, num_workers=4, pin_memory=use_cuda) dataloaders['valid'] = torch.utils.data.DataLoader( valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, pin_memory=use_cuda) batch_num['train'], batch_num['valid'] = len(dataloaders['train']), len( dataloaders['valid']) #Loss criterion = create_criterion(config.criterion) #Optimizer optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=0.9) if config.optim == "AdamP": optimizer = AdamP(model.parameters(), lr=config.lr, betas=(0.9, 0.999), weight_decay=config.weight_decay) elif config.optim == "AdamW": optimizer = optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay) #Scheduler # Decay LR by a factor of 0.1 every 7 epochs #exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1) scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9) if config.lr_scheduler == "cosine": print('cosine') Q = math.floor(len(train_dataset) / config.batch_size + 1) * config.epochs / 7 scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q) #ConsineAnnealingWarmRestarts since = time.time() low_train = 0 best_model_wts = copy.deepcopy(model.state_dict()) best_acc = 0.0 train_loss, train_acc, valid_loss, valid_acc = [], [], [], [] num_epochs = config.epochs for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train', 'valid']: if phase == 'train': model.train() # Set model to training mode else: model.eval() # Set model to evaluate mode running_loss, running_corrects, num_cnt = 0.0, 0, 0 runnnig_f1 = 0 # Iterate over data. 
idx = 0 for inputs, labels in dataloaders[phase]: idx += 1 inputs = inputs.to(device) labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward # track history if only in train with torch.set_grad_enabled(phase == 'train'): outputs = model(inputs) _, preds = torch.max(outputs, 1) loss = criterion(outputs, labels) # backward + optimize only if in training phase if phase == 'train': loss.backward() optimizer.step() else: runnnig_f1 += f1_score(labels.data.detach().cpu(), preds.detach().cpu(), average='macro') # statistics val_loss = loss.item() * inputs.size(0) running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) num_cnt += len(labels) if idx % 100 == 0: _loss = loss.item() / config.batch_size print( f"Epoch[{epoch}/{config.epochs}]({idx}/{batch_num[phase]}) || " f"{phase} loss {_loss:4.4} ") if phase == 'train': scheduler.step() epoch_loss = float(running_loss / num_cnt) epoch_acc = float( (running_corrects.double() / num_cnt).cpu() * 100) epoch_f1 = float(runnnig_f1 / num_cnt) if phase == 'train': train_loss.append(epoch_loss) train_acc.append(epoch_acc) if config.wandb: wandb.log({"Train acc": epoch_acc}) else: valid_loss.append(epoch_loss) valid_acc.append(epoch_acc) if config.wandb: wandb.log({"Valid acc": epoch_acc}) wandb.log({"F1 Score": epoch_f1}) print('{} Loss: {:.2f} Acc: {:.1f} f1 :{:.3f}'.format( phase, epoch_loss, epoch_acc, epoch_f1)) # deep copy the model if phase == 'valid': if epoch_acc > best_acc: best_idx = epoch best_acc = epoch_acc best_model_wts = copy.deepcopy(model.state_dict()) print('==> best model saved - %d / %.1f' % (best_idx, best_acc)) low_train = 0 elif epoch_acc < best_acc: print('==> model finish') low_train += 1 if low_train > 0 and epoch > 4: break if phase == 'valid': if epoch_acc < 80: print('Stop valid is so low') break time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best valid Acc: %d - %.1f' % (best_idx, best_acc)) # load best model weights model.load_state_dict(best_model_wts) #torch.save(model.state_dict(), 'mask_model.pt') torch.save(model.state_dict(), config.name + '.pt') print('model saved') if config.wandb: wandb.finish() return model, best_idx, best_acc, train_loss, train_acc, valid_loss, valid_acc
def train(data_dir, model_dir, args): seed_everything(args.seed) s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(args.epochs) + \ '-' + args.criterion + '-' + args.scheduler + '-' + args.optimizer + '-' + args.dataset + '-' + args.tokenize if args.name: s_dir += '-' + args.name save_dir = increment_path(os.path.join(model_dir, s_dir)) use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") print("This notebook use [%s]." % (device)) # load model and tokenizer MODEL_NAME = args.model if MODEL_NAME == "monologg/kobert": tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME) else: tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load dataset dataset = load_data("/opt/ml/input/data/train/train.tsv") labels = dataset['label'].values # setting model hyperparameter bert_config = BertConfig.from_pretrained(MODEL_NAME) bert_config.num_labels = args.num_labels bert_config.num_hidden_layers = args.num_hidden_layers model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config) model.dropout = nn.Dropout(p=args.drop) model.to(device) summary(model) # loss & optimizer if args.criterion == 'f1' or args.criterion == 'label_smoothing' or args.criterion == 'f1cross': criterion = create_criterion(args.criterion, classes=args.num_labels, smoothing=0.1) else: criterion = create_criterion(args.criterion) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.optimizer == 'AdamP': optimizer = AdamP(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay) else: opt_module = getattr(import_module("torch.optim"), args.optimizer) # default: SGD optimizer = opt_module( optimizer_grouped_parameters, lr=args.lr, ) # logging logger = SummaryWriter(log_dir=save_dir) with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, indent=4) set_neptune(save_dir, args) # preprocess dataset if args.preprocess != 'no': pre_module = getattr(import_module("preprocess"), args.preprocess) dataset = pre_module(dataset, model, tokenizer) # train, val split kfold = StratifiedKFold(n_splits=5) for train_idx, val_idx in kfold.split(dataset, labels): train_dataset, val_dataset = dataset.loc[train_idx], dataset.loc[ val_idx] break tok_module = getattr(import_module("load_data"), args.tokenize) train_tokenized = tok_module(train_dataset, tokenizer, max_len=args.max_len) val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len) # make dataset for pytorch. 
RE_train_dataset = RE_Dataset( train_tokenized, train_dataset['label'].reset_index(drop='index')) RE_val_dataset = RE_Dataset(val_tokenized, val_dataset['label'].reset_index(drop='index')) train_loader = DataLoader( RE_train_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True, pin_memory=use_cuda, ) val_loader = DataLoader( RE_val_dataset, batch_size=12, num_workers=1, shuffle=False, pin_memory=use_cuda, ) if args.scheduler == 'cosine': scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6) elif args.scheduler == 'reduce': scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5) elif args.scheduler == 'step': scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5) elif args.scheduler == 'cosine_warmup': t_total = len(train_loader) * args.epochs warmup_step = int(t_total * args.warmup_ratio) scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total) else: scheduler = None print("Training Start!!!") best_val_acc = 0 best_val_loss = np.inf for epoch in range(args.epochs): # train loop model.train() train_loss, train_acc = AverageMeter(), AverageMeter() for idx, train_batch in enumerate(train_loader): optimizer.zero_grad() try: inputs, token_types, attention_mask, labels = train_batch.values( ) inputs = inputs.to(device) token_types = token_types.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, token_type_ids=token_types, attention_mask=attention_mask) except: inputs, attention_mask, labels = train_batch.values() inputs = inputs.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, attention_mask=attention_mask) preds = torch.argmax(outs.logits, dim=-1) loss = criterion(outs.logits, labels) acc = (preds == labels).sum().item() / len(labels) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7) optimizer.step() if scheduler: scheduler.step() neptune.log_metric('learning_rate', get_lr(optimizer)) train_loss.update(loss.item(), len(labels)) train_acc.update(acc, len(labels)) if (idx + 1) % args.log_interval == 0: current_lr = get_lr(optimizer) print( f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || lr {current_lr}" ) logger.add_scalar("Train/loss", train_loss.avg, epoch * len(train_loader) + idx) logger.add_scalar("Train/accuracy", train_acc.avg, epoch * len(train_loader) + idx) neptune.log_metric(f'Train_loss', train_loss.avg) neptune.log_metric(f'Train_avg', train_acc.avg) neptune.log_metric('learning_rate', current_lr) val_loss, val_acc = AverageMeter(), AverageMeter() # val loop with torch.no_grad(): print("Calculating validation results...") model.eval() for val_batch in val_loader: try: inputs, token_types, attention_mask, labels = val_batch.values( ) inputs = inputs.to(device) token_types = token_types.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, token_type_ids=token_types, attention_mask=attention_mask) except: inputs, attention_mask, labels = val_batch.values() inputs = inputs.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, attention_mask=attention_mask) preds = torch.argmax(outs.logits, dim=-1) loss = criterion(outs.logits, labels) acc = (preds == labels).sum().item() / len(labels) val_loss.update(loss.item(), len(labels)) 
val_acc.update(acc, len(labels)) if val_acc.avg > best_val_acc: print( f"New best model for val acc : {val_acc.avg:4.2%}! saving the best model.." ) torch.save(model.state_dict(), f"{save_dir}/best.pth") best_val_acc = val_acc.avg best_val_loss = min(best_val_loss, val_loss.avg) print( f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || " f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}" ) logger.add_scalar("Val/loss", val_loss.avg, epoch) logger.add_scalar("Val/accuracy", val_acc.avg, epoch) neptune.log_metric(f'Val_loss', val_loss.avg) neptune.log_metric(f'Val_avg', val_acc.avg) print()
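# AverageMeter is the usual running-average helper used in the loops above, with
# update(value, n) and an .avg attribute. A minimal sketch under that assumption:
class AverageMeter:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # Accumulate a batch-level value weighted by the batch size n.
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)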
def train_no_val(img_dir, model_dir, args): seed_everything(args.seed) start = time.time() get_current_time() save_dir = increment_path(os.path.join(model_dir, args.name)) # settings device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # dataset dataset_module = getattr(import_module("dataset"), args.dataset) dataset = dataset_module( img_dir=img_dir, val_ratio=args.val_ratio, ) num_classes = dataset.num_classes transform_module = getattr(import_module("dataset"), args.augmentation) transform = transform_module(mean=dataset.mean, std=dataset.std) dataset.set_transform(transform["train"]) train_loader = DataLoader( dataset, batch_size=args.batch_size, num_workers=2, shuffle=True, pin_memory=torch.cuda.is_available(), drop_last=True, ) model_module = getattr(import_module("model"), args.model) model = model_module(num_classes=num_classes).to(device) model = torch.nn.DataParallel(model) criterion = create_criterion(args.criterion) optimizer = None if args.optimizer == "AdamP": optimizer = AdamP(model.parameters()) else: opt_module = getattr(import_module("torch.optim"), args.optimizer) optimizer = opt_module( model.parameters(), # filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, # weight_decay=5e-4, ) # scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5) logger = SummaryWriter(log_dir=save_dir) best_val_acc = 0 best_val_loss = np.inf best_val_f1 = 0 for epoch in range(args.epochs): model.train() train_loss = 0 train_acc = 0 train_f1 = 0 for i, data in enumerate(tqdm(train_loader)): imgs, labels = data imgs = imgs.float().to(device) labels = labels.long().to(device) optimizer.zero_grad() outputs = model(imgs) loss = criterion(outputs, labels) loss.backward() optimizer.step() preds = torch.argmax(outputs, 1) acc = (preds == labels).sum().item() / len(imgs) t_f1_score = f1_score( labels.cpu().detach().numpy(), preds.cpu().detach().numpy(), average="macro", ) train_loss += loss train_acc += acc train_f1 += t_f1_score if (i + 1) % args.log_interval == 0: train_loss /= args.log_interval train_acc /= args.log_interval train_f1 /= args.log_interval current_lr = get_lr(optimizer) print( f"Epoch[{epoch + 1}/{args.epochs}]({i + 1}/{len(train_loader)}) || trainin_loss {train_loss:.4f} || training acc {train_acc:.4f} || train f1_score {train_f1:.4f} || lr {current_lr}" ) logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + i) logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + i) logger.add_scalar("Train/F1-score", train_f1, epoch * len(train_loader) + i) train_loss = 0 train_acc = 0 train_f1 = 0 torch.save(model.module.state_dict(), f"{save_dir}/last.pth") # How much time training taken times = time.time() - start minute, sec = divmod(times, 60) print(f"Finish Training! Taken time is {minute} minutes {sec} seconds")
def train(data_dir, model_dir, args):
    seed_everything(args.seed)
    save_dir = increment_path(os.path.join(model_dir, args.name))

    # gpu setting
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(data_dir=data_dir)
    num_classes = dataset.num_classes  # 18

    # apply transform to dataset
    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # create dataloaders
    train_set, val_set = dataset.split_dataset()
    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=1,
        shuffle=True,
        pin_memory=use_cuda,
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=args.valid_batch_size,
        num_workers=1,
        shuffle=False,
        pin_memory=use_cuda,
        drop_last=True,
    )

    # create model
    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)

    # load weights of the pretrained model
    weight_path = f"{model_dir}/efficientnetb4_sgd2/last.pth"
    model.load_state_dict(torch.load(weight_path))
    model = torch.nn.DataParallel(model)

    # create criterion, optimizer and scheduler
    criterion = create_criterion(args.criterion)
    if args.optimizer == "madgrad":
        opt_module = MADGRAD
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
    optimizer = opt_module(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.lr,
        weight_decay=5e-4,
    )
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0
    for epoch in range(args.epochs):
        # train the model
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )
                loss_value = 0
                matches = 0

        scheduler.step()

        # validate the model
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            val_f1_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                f1_item = f1_score(labels.cpu().numpy(),
                                   preds.cpu().numpy(),
                                   average="macro")
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)
                val_f1_items.append(f1_item)

                if figure is None:
                    inputs_np = (torch.clone(inputs).detach().cpu()
                                 .permute(0, 2, 3, 1).numpy())
                    inputs_np = dataset_module.denormalize_image(
                        inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(
                        inputs_np,
                        labels,
                        preds,
                        args.dataset != "MaskSplitByProfileDataset",
                    )

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            val_f1 = np.sum(val_f1_items) / len(val_loader)

            best_val_loss = min(best_val_loss, val_loss)  # update the minimum loss
            if val_acc > best_val_acc:
                best_val_acc = val_acc
            if val_f1 > best_val_f1:
                print(
                    f"New best model for val f1 : {val_f1:4.2f}! saving the best model.."
                )
                # save the checkpoint with the best f1 score so far
                torch.save(model.module.state_dict(),
                           f"{save_dir}/best_eph_{epoch}.pth")
                best_val_f1 = val_f1  # update the maximum f1 score
            torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2}, f1: {val_f1:4.2f} || "
                f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}, best f1: {best_val_f1:4.2f}"
            )
            print()
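# Every entry point above starts with seed_everything(args.seed), whose body is
# defined elsewhere in the repository. The sketch below is only an assumption of
# the usual implementation: it fixes the Python, NumPy, and PyTorch RNGs and
# makes cuDNN deterministic so runs are reproducible.
import os
import random

import numpy as np
import torch


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False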
def train(img_dir, model_dir, args):
    seed_everything(args.seed)
    start = time.time()
    get_current_time()
    save_dir = increment_path(os.path.join(model_dir, args.name))

    # settings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(img_dir=img_dir, val_ratio=args.val_ratio)
    num_classes = dataset.num_classes

    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(mean=dataset.mean, std=dataset.std)
    train_dataset, val_dataset = dataset.split_dataset()
    train_dataset.dataset.set_transform(transform["train"])
    val_dataset.dataset.set_transform(transform["val"])

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=2,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=args.valid_batch_size,
        num_workers=2,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )

    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)
    model = torch.nn.DataParallel(model)

    criterion = create_criterion(args.criterion)
    if args.optimizer == "AdamP":
        optimizer = AdamP(model.parameters(), lr=args.lr)
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
        optimizer = opt_module(
            model.parameters(),
            # filter(lambda p: p.requires_grad, model.parameters()),
            lr=args.lr,
            # weight_decay=5e-4,
        )
    # scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    logger = SummaryWriter(log_dir=save_dir)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0
    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        train_f1 = 0
        for i, data in enumerate(tqdm(train_loader)):
            imgs, labels = data
            imgs = imgs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = torch.argmax(outputs, 1)
            acc = (preds == labels).sum().item() / len(imgs)
            t_f1_score = f1_score(
                labels.cpu().detach().numpy(),
                preds.cpu().detach().numpy(),
                average="macro",
            )
            # accumulate a Python float so no CUDA tensor is kept across iterations
            train_loss += loss.item()
            train_acc += acc
            train_f1 += t_f1_score

            if (i + 1) % args.log_interval == 0:
                train_loss /= args.log_interval
                train_acc /= args.log_interval
                train_f1 /= args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({i + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:.4f} || training acc {train_acc:.4f} || "
                    f"train f1_score {train_f1:.4f} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss,
                                  epoch * len(train_loader) + i)
                logger.add_scalar("Train/accuracy", train_acc,
                                  epoch * len(train_loader) + i)
                logger.add_scalar("Train/F1-score", train_f1,
                                  epoch * len(train_loader) + i)
                train_loss = 0
                train_acc = 0
                train_f1 = 0

        # scheduler.step()  # an epoch of training is only complete at this point
        # At the end of each epoch, keep the checkpoint with the best score so far.
        with torch.no_grad():
            print("Validation step---------------------")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            val_f1_items = []
            for data in tqdm(val_loader):
                imgs, labels = data
                imgs = imgs.float().to(device)
                labels = labels.long().to(device)

                outputs = model(imgs)
                preds = torch.argmax(outputs, 1)
                loss = criterion(outputs, labels).item()
                acc = (labels == preds).sum().item()
                val_f1 = f1_score(
                    labels.cpu().detach().numpy(),
                    preds.cpu().detach().numpy(),
                    average="macro",
                )
                val_loss_items.append(loss)
                val_acc_items.append(acc)
                val_f1_items.append(val_f1)

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_dataset)
            val_f1 = np.sum(val_f1_items) / len(val_loader)
            print(f"val_loader: {len(val_loader)} | val_dataset: {len(val_dataset)}")

            best_val_loss = min(best_val_loss, val_loss)
            best_val_acc = max(val_acc, best_val_acc)
            # if val_acc > best_val_acc:
            #     print(f"New best model for val acc: {val_acc:4.2%}! saving the best model...")
            #     torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
            #     best_val_acc = val_acc
            # best_val_f1 must only be updated inside this check; updating it
            # before the comparison would make the condition unreachable and the
            # best model would never be saved.
            if val_f1 > best_val_f1:
                print(
                    f"New best model for val f1: {val_f1:.4f}! saving the best model..."
                )
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_f1 = val_f1
            # TODO: is this the right place to save the last model?
            # torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc: {val_acc:.4f}, loss: {val_loss:.4f} || "
                f"best acc: {best_val_acc:.4f}, best loss: {best_val_loss:.4f}"
            )
            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            logger.add_scalar("Val/f1-score", val_f1, epoch)
            print()

    torch.save(model.module.state_dict(), f"{save_dir}/last.pth")

    # report how long training took
    times = time.time() - start
    minute, sec = divmod(times, 60)
    print(f"Finished training! Elapsed time: {int(minute)} minutes {sec:.0f} seconds")
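# Note: val_f1 above is the mean of per-batch macro-F1 scores, which generally
# differs from the macro F1 computed over the whole validation set (rare classes
# may be missing from individual batches). If dataset-level F1 is wanted, one
# option -- sketched here as an alternative, not what the script above does -- is
# to accumulate predictions and labels and call f1_score once per epoch. The
# helper name macro_f1_over_dataset is hypothetical; model, val_loader, device,
# and f1_score are the same objects used in the loop above.
def macro_f1_over_dataset(model, val_loader, device):
    all_preds, all_labels = [], []
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.float().to(device)
            preds = torch.argmax(model(imgs), 1)
            all_preds.append(preds.cpu())
            all_labels.append(labels.long().cpu())
    # a single macro-F1 over every validation sample
    return f1_score(torch.cat(all_labels).numpy(),
                    torch.cat(all_preds).numpy(),
                    average="macro")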
def start(config, wandb):
    # dataset paths
    dataset_path = '/opt/ml/input/data'
    test_path = dataset_path + '/test.json'
    num = config.data_ratio
    if num == -1:
        train_path = dataset_path + '/train.json'
        val_path = dataset_path + '/val.json'
    else:
        train_path = dataset_path + '/train_data' + str(num) + '.json'
        val_path = dataset_path + '/valid_data' + str(num) + '.json'
    print(train_path)
    print(val_path)

    seed_everything(config.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print('pytorch version: {}'.format(torch.__version__))
    print('GPU available: {}'.format(torch.cuda.is_available()))
    print(torch.cuda.get_device_name(0))
    print(torch.cuda.device_count())

    # train dataset
    train_transform = getattr(import_module("dataset"),
                              "data_" + config.transform)()
    train_dataset = CustomDataLoader(data_dir=train_path,
                                     mode='train',
                                     transform=train_transform)
    # train_dataset = CutMix(train_dataset, num_class=12, beta=1.0, prob=0.5, num_mix=2)

    # validation dataset
    val_transform = getattr(import_module("dataset"),
                            "data_" + config.vtransform)()
    val_dataset = CustomDataLoader(data_dir=val_path,
                                   mode='val',
                                   transform=val_transform)

    batch_size = config.batch_size

    # DataLoader
    # create own Dataset 1 (skip)
    # To split the validation set yourself, divide the dataset 8:2 with random_split:
    # train_size = int(0.8 * len(dataset))
    # val_size = len(dataset) - train_size
    # dataset = CustomDataLoader(data_dir=train_path, mode='train', transform=transform)
    # train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=True,
                                               collate_fn=collate_fn)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             drop_last=True,
                                             num_workers=2,
                                             collate_fn=collate_fn)

    # model
    if config.enc_name == "basic":
        mode_str = "model." + config.model.lower()
        model_module = getattr(import_module(mode_str), config.model)
        model = model_module(num_classes=12).to(device)
    else:
        model_module = get_smp_model(config.model, config.enc_name)
        model = model_module.to(device)

    # loss
    criterion = create_criterion(config.criterion)
    # criterion = [SoftCrossEntropyLoss(smooth_factor=0.1), JaccardLoss('multiclass', classes=12)]

    # optimizer (SGD is the default, overridden by config.optim below)
    optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=0.9)
    if config.optim == "AdamP":
        optimizer = AdamP(model.parameters(),
                          lr=config.lr,
                          betas=(0.9, 0.999),
                          weight_decay=config.weight_decay)
    elif config.optim == "AdamW":
        optimizer = optim.AdamW(model.parameters(),
                                lr=config.lr,
                                weight_decay=config.weight_decay)
    elif config.optim == "Adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=config.lr,
                               weight_decay=config.weight_decay)
    lookahead = Lookahead(optimizer, k=5, alpha=0.5)  # initialize Lookahead

    # scheduler (StepLR is the default, overridden by config.lr_scheduler below)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
    if config.lr_scheduler == "cosine":
        print('cosine')
        # Q = 2
        Q = config.epochs
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q, eta_min=1e-7)
    elif config.lr_scheduler == "cosinew":
        print("CosineAnnealingWarmRestarts")
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(lookahead,
                                                             T_0=30,
                                                             T_mult=2,
                                                             eta_min=0)
    elif config.lr_scheduler == "cosinew_custom":
        print("https://gaussian37.github.io/dl-pytorch-lr_scheduler/#cosineannealingwarmrestarts-1")
        optimizer = torch.optim.Adam(model.parameters(), lr=0)
        lookahead = Lookahead(optimizer, k=5, alpha=0.5)
        scheduler = CustomCosineAnnealingWarmUpRestarts(optimizer,
                                                        T_0=config.epochs,
                                                        T_mult=1,
                                                        eta_max=config.lr,
                                                        T_up=8,
                                                        gamma=0.5)
    elif config.lr_scheduler == "gradual_warmuplr":
        print("https://www.kaggle.com/pukkinming/pytorchgradualwarmuplr")

    train(model, train_loader, val_loader, criterion, optimizer, scheduler,
          config, device, lookahead)
    psudo_labeling(model, train_loader, val_loader, criterion, optimizer,
                   scheduler, config, device, lookahead)
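# Both segmentation loaders above pass collate_fn, which is defined elsewhere in
# the repository. The sketch below assumes (without confirmation here) that
# CustomDataLoader returns (image, mask, image_info) tuples, as is common for
# COCO-style segmentation datasets; in that case the batch is simply transposed
# so the training loop receives (images, masks, image_infos).
def collate_fn(batch):
    return tuple(zip(*batch))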