def _predict_fold(self, fold_num, test_df):
    """Predict ``test_df`` with one fold's best checkpoint, averaging TTA passes.

    The test loader is iterated ``cfg.data.test.tta.iter_num`` times. Each
    pass writes its ``n_classes`` outputs into its own column group of a
    scratch matrix, and the passes are then averaged per class.
    (Presumably the dataset applies random augmentation so that passes
    differ -- not visible from this function.)

    Args:
        fold_num: Fold index used to locate
            ``../logs/{run_name}/weight_best_{fold_num}.pt``.
        test_df: Data handed to ``factory.get_dataloader``.

    Returns:
        Array of shape ``(len(test_loader.dataset), n_classes)`` holding
        the mean prediction over all TTA passes.
    """
    n_classes = self.cfg.model.n_classes
    tta_iters = self.cfg.data.test.tta.iter_num

    test_loader = factory.get_dataloader(test_df, self.cfg.data.test)
    n_samples = len(test_loader.dataset)
    # Column layout: [pass 0 classes | pass 1 classes | ...].
    test_preds = np.zeros((n_samples, n_classes * tta_iters))

    model = factory.get_nn_model(self.cfg, is_train=False).to(device)
    # map_location lets a checkpoint saved on GPU be restored on whatever
    # device inference runs on (e.g. a CPU-only machine).
    model.load_state_dict(
        torch.load(f'../logs/{self.run_name}/weight_best_{fold_num}.pt',
                   map_location=device))
    model.eval()

    with torch.no_grad():
        for t in range(tta_iters):
            col_start = t * n_classes
            # Track the row offset from the actual batch lengths instead of
            # assuming every batch has ``test_loader.batch_size`` rows
            # (that attribute is None when a batch sampler is used).
            row_start = 0
            for images in test_loader:
                images = images.to(device)
                preds = model(images.float()).cpu().detach().numpy()
                row_end = row_start + len(preds)
                test_preds[row_start:row_end,
                           col_start:col_start + n_classes] = preds
                row_start = row_end

    # (samples, passes, classes) -> mean over the pass axis.
    return test_preds.reshape(n_samples, tta_iters, n_classes).mean(axis=1)
def __init__(self, cfg):
    """Store ``cfg`` and build the objects this instance works with.

    Creates a ``LightningModuleSeg`` from ``cfg``, a dataloader for the
    ``'test'`` phase, copies a few config values onto the instance and
    selects CUDA when available (CPU otherwise).
    """
    self.cfg = cfg
    self.pl_model = LightningModuleSeg(cfg)
    self.test_loader = get_dataloader(cfg, phase='test')

    # Values copied out of the config for convenient access.
    self.fp16 = cfg.General.fp16
    self.test_df_path = cfg.Data.dataset.test_df
    self.labels = cfg.General.labels

    if torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)
def _predict_fold(self, fold_num, test_df):
    """Predict ``test_df`` with one fold's best checkpoint.

    Args:
        fold_num: Fold index used to locate
            ``../logs/{run_name}/weight_best_{fold_num}.pt``.
        test_df: Data handed to ``factory.get_dataloader``.

    Returns:
        The sigmoid of the model's first output for every batch,
        concatenated along axis 0 into one NumPy array.

    Raises:
        ValueError: From ``np.concatenate`` if the loader yields no batches.
    """
    test_loader = factory.get_dataloader(test_df, self.cfg.data.test)

    model = factory.get_nn_model(self.cfg, is_train=False).to(device)
    # map_location lets a checkpoint saved on GPU be restored on whatever
    # device inference runs on (e.g. a CPU-only machine).
    model.load_state_dict(
        torch.load(f'../logs/{self.run_name}/weight_best_{fold_num}.pt',
                   map_location=device))
    model.eval()

    all_preds = []
    with torch.no_grad():
        for feats in test_loader:
            # Batches are either a mapping of named tensors or one tensor.
            # isinstance also accepts dict subclasses, and a new dict is
            # built rather than mutating the loader's batch in place.
            if isinstance(feats, dict):
                feats = {name: value.to(device)
                         for name, value in feats.items()}
            else:
                feats = feats.to(device)

            # The model returns a pair; only the first element is used.
            preds, _ = model(feats)
            all_preds.append(preds.sigmoid().cpu().detach().numpy())

    return np.concatenate(all_preds)
def train(self, train_df, target_df):
    """Train one model per fold and collect out-of-fold predictions.

    For every column of ``self.fold_df``, rows with value ``0`` form the
    training split and rows with value ``> 0`` the validation split. The
    epoch with the highest validation score is kept; its weights are saved
    to ``../logs/{run_name}/weight_best_{fold}.pt`` and its validation
    predictions are written into the out-of-fold matrix.

    Args:
        train_df: Feature frame; row selection uses ``self.fold_df`` masks.
        target_df: Targets aligned with ``train_df``.

    Returns:
        The CV score: the sum over folds of ``best_val_score`` multiplied
        by the fold column's maximum value.

    Side effects:
        Sets ``self.oof``, writes checkpoints and loss plots, prints and
        logs progress.
    """
    # Local import so this block does not depend on the file's import list.
    import copy

    oof = np.zeros((len(train_df), self.cfg.model.n_classes))
    cv = 0

    for fold_, col in enumerate(self.fold_df.columns):
        fold_banner = f'\n========================== FOLD {fold_} ... ==========================\n'
        print(fold_banner)
        logging.debug(fold_banner)

        is_valid = self.fold_df[col] > 0
        trn_x = train_df[self.fold_df[col] == 0]
        val_x = train_df[is_valid]
        val_y = target_df[is_valid].values

        train_loader = factory.get_dataloader(trn_x, self.cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, self.cfg.data.valid)

        model = factory.get_nn_model(self.cfg).to(device)
        criterion = factory.get_loss(self.cfg)
        optimizer = factory.get_optim(self.cfg, model.parameters())
        scheduler = factory.get_scheduler(self.cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(self.cfg.model.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = self._train_epoch(model, train_loader,
                                                criterion, optimizer, mb)
            valid_preds, avg_val_loss = self._val_epoch(
                model, valid_loader, criterion)
            val_score = factory.get_metrics(self.cfg.common.metrics.name)(
                val_y, valid_preds)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau needs the monitored value; other schedulers
            # are stepped without arguments.
            if self.cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                net = model.module if self.cfg.model.multi_gpu else model
                # state_dict() returns references to the live parameters,
                # which later epochs keep updating. Snapshot them so the
                # checkpoint saved below really is the best epoch.
                best_model = copy.deepcopy(net.state_dict())

        # NOTE(review): indexes oof positionally with the frame's index
        # labels -- assumes train_df has a default 0..n-1 index; confirm.
        oof[val_x.index, :] = best_valid_preds
        cv += best_val_score * self.fold_df[col].max()

        torch.save(best_model,
                   f'../logs/{self.run_name}/weight_best_{fold_}.pt')
        self._save_loss_png(train_loss_list, val_loss_list, val_score_list,
                            fold_)

        best_msg = f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}'
        print(best_msg)
        logging.debug(best_msg)

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    # NOTE(review): the width 5 is hard-coded rather than taken from
    # cfg.model.n_classes; kept as in the original -- confirm intent.
    self.oof = oof.reshape(-1, 5)

    return cv
def train_model(run_name, df, fold_df, cfg):
    """Train one model per fold and save out-of-fold predictions.

    For every column of ``fold_df``, rows with value ``0`` form the
    training split and rows with value ``> 0`` the validation split. The
    validation metric is computed only on rows whose index label is
    ``<= 33126``. The epoch with the highest score is kept; its weights are
    saved to ``../logs/{run_name}/weight_best_{fold}.pt``.

    Args:
        run_name: Name of the run directory under ``../logs``.
        df: Frame holding features and the ``cfg.common.target`` column.
        fold_df: One column per fold, aligned with ``df``.
        cfg: Configuration object.

    Returns:
        ``{'cv': cv}`` where ``cv`` is the sum over folds of
        ``best_val_score`` multiplied by the fold column's maximum value.

    Side effects:
        Writes checkpoints, loss plots and ``../logs/{run_name}/oof.npy``;
        prints and logs progress.
    """
    # Local import so this block does not depend on the file's import list.
    import copy

    oof = np.zeros(len(df))
    cv = 0

    for fold_, col in enumerate(fold_df.columns):
        fold_banner = f'\n========================== FOLD {fold_} ... ==========================\n'
        print(fold_banner)
        logging.debug(fold_banner)

        trn_x, val_x = df[fold_df[col] == 0], df[fold_df[col] > 0]
        # NOTE(review): 33126 looks like the last index label of the
        # original training rows (later rows presumably being extra data
        # excluded from scoring) -- confirm against the caller.
        val_y = val_x.loc[:33126][cfg.common.target]
        val_org_idx = np.where(val_x.index <= 33126)[0]

        train_loader = factory.get_dataloader(trn_x, cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, cfg.data.valid)

        model = factory.get_model(cfg).to(device)
        criterion = factory.get_loss(cfg)
        optimizer = factory.get_optim(cfg, model.parameters())
        scheduler = factory.get_scheduler(cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(cfg.data.train.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = train_epoch(model, train_loader, criterion,
                                          optimizer, mb, cfg)
            valid_preds, avg_val_loss = val_epoch(model, valid_loader,
                                                  criterion, cfg)
            val_score = factory.get_metrics(cfg.common.metrics.name)(
                val_y, valid_preds[val_org_idx])

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau needs the monitored value; other schedulers
            # are stepped without arguments.
            if cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                net = model.module if cfg.model.multi_gpu else model
                # state_dict() returns references to the live parameters,
                # which later epochs keep updating. Snapshot them so the
                # checkpoint saved below really is the best epoch.
                best_model = copy.deepcopy(net.state_dict())

        # NOTE(review): indexes oof positionally with the frame's index
        # labels -- assumes df has a default 0..n-1 index; confirm.
        oof[val_x.index] = best_valid_preds.reshape(-1)
        cv += best_val_score * fold_df[col].max()

        torch.save(best_model, f'../logs/{run_name}/weight_best_{fold_}.pt')
        save_png(run_name, cfg, train_loss_list, val_loss_list,
                 val_score_list, fold_)

        best_msg = f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}'
        print(best_msg)
        logging.debug(best_msg)

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    result = {
        'cv': cv,
    }

    np.save(f'../logs/{run_name}/oof.npy', oof)

    return result
def test_dataloader(self):
    """Return the dataloader that ``get_dataloader`` builds for phase ``'test'``."""
    phase = 'test'
    loader = get_dataloader(self.cfg, phase)
    return loader
def val_dataloader(self):
    """Return the dataloader that ``get_dataloader`` builds for phase ``'valid'``."""
    phase = 'valid'
    loader = get_dataloader(self.cfg, phase)
    return loader
def train_dataloader(self):
    """Return the dataloader that ``get_dataloader`` builds for phase ``'train'``."""
    phase = 'train'
    loader = get_dataloader(self.cfg, phase)
    return loader
def train(self, train_df, target_df):
    """Train one model per fold and collect out-of-fold predictions.

    For every column of ``self.fold_df``, rows with value ``0`` form the
    training split and rows with value ``> 0`` the validation split. When
    ``cfg.model.backbone`` contains ``'transformer'``, per-user sequences
    are grouped from the training split and passed to the transformer
    dataloaders; otherwise the regular dataloaders are used. The epoch
    with the highest validation score is kept and its weights are saved to
    ``../logs/{run_name}/weight_best_{fold}.pt``.

    Args:
        train_df: Feature frame; row selection uses ``self.fold_df`` masks.
        target_df: Targets aligned with ``train_df``.

    Returns:
        The CV score: the sum over folds of ``best_val_score`` multiplied
        by the fold column's maximum value.

    Side effects:
        Sets ``self.oof``, writes checkpoints, prints and logs progress.
    """
    # Local import so this block does not depend on the file's import list.
    import copy

    oof = np.zeros((len(train_df), self.cfg.model.n_classes))
    cv = 0

    for fold_, col in enumerate(self.fold_df.columns):
        fold_banner = f'\n========================== FOLD {fold_ + 1} / {self.n_splits} ... ==========================\n'
        print(fold_banner)
        logging.debug(fold_banner)

        is_valid = self.fold_df[col] > 0
        trn_x = train_df[self.fold_df[col] == 0]
        val_x = train_df[is_valid]
        val_y = target_df[is_valid].values

        if 'transformer' in self.cfg.model.backbone:
            usecols = ['user_id', 'content_id', 'task_container_id',
                       'timestamp', 'prior_question_elapsed_time',
                       'prior_question_had_explanation', 'part',
                       'answered_correctly',
                       'te_content_id_by_answered_correctly',
                       'answered_correctly_avg_u']
            # One tuple of per-column value arrays per user, built from the
            # training split only; the same groups feed both loaders.
            group = (trn_x[usecols]
                     .groupby('user_id')
                     .apply(lambda r: (r['content_id'].values,
                                       r['answered_correctly'].values,
                                       r['timestamp'].values,
                                       r['prior_question_elapsed_time'].values,
                                       r['part'].values,
                                       r['te_content_id_by_answered_correctly'].values,
                                       r['task_container_id'].values)))
            train_loader = factory.get_transformer_dataloader(
                samples=group, df=trn_x, cfg=self.cfg.data.train)
            valid_loader = factory.get_transformer_dataloader(
                samples=group, df=val_x, cfg=self.cfg.data.valid)
        else:
            train_loader = factory.get_dataloader(trn_x, self.cfg.data.train)
            valid_loader = factory.get_dataloader(val_x, self.cfg.data.valid)

        model = factory.get_nn_model(self.cfg).to(device)
        criterion = factory.get_loss(self.cfg)
        optimizer = factory.get_optim(self.cfg, model.parameters())
        scheduler = factory.get_scheduler(self.cfg, optimizer)

        best_epoch = -1
        best_val_score = -np.inf
        mb = master_bar(range(self.cfg.model.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []

        for epoch in mb:
            start_time = time.time()

            # NOTE(review): the extent of this context manager is not
            # recoverable from the collapsed source; it is applied to the
            # training pass here -- confirm against the original layout.
            with detect_anomaly():
                model, avg_loss = self._train_epoch(
                    model, train_loader, criterion, optimizer, mb)

            valid_preds, avg_val_loss = self._val_epoch(
                model, valid_loader, criterion)
            val_score = factory.get_metrics(self.cfg.common.metrics.name)(
                val_y, valid_preds)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)
            val_score_list.append(val_score)

            # ReduceLROnPlateau needs the monitored value; other schedulers
            # are stepped without arguments.
            if self.cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.6f} avg_val_loss: {avg_val_loss:.6f} val_score: {val_score:.6f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if val_score > best_val_score:
                best_epoch = epoch + 1
                best_val_score = val_score
                best_valid_preds = valid_preds
                net = model.module if self.cfg.model.multi_gpu else model
                # state_dict() returns references to the live parameters,
                # which later epochs keep updating. Snapshot them so the
                # checkpoint saved below really is the best epoch.
                best_model = copy.deepcopy(net.state_dict())

        # NOTE(review): indexes oof positionally with the frame's index
        # labels -- assumes train_df has a default 0..n-1 index; confirm.
        oof[val_x.index, :] = best_valid_preds
        cv += best_val_score * self.fold_df[col].max()

        torch.save(best_model,
                   f'../logs/{self.run_name}/weight_best_{fold_}.pt')
        # self._save_loss_png(train_loss_list, val_loss_list, val_score_list, fold_)

        best_msg = f'\nEpoch {best_epoch} - val_score: {best_val_score:.6f}'
        print(best_msg)
        logging.debug(best_msg)

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    logging.debug(f'\n\nCV: {cv:.6f}')
    print('\n===================================\n\n')

    self.oof = oof

    return cv
def train_cnn(run_name, trn_x, val_x, trn_y, val_y, cfg):
    """Train a single model on one train/validation split, scored by QWK.

    With ``cfg.model.n_classes > 1`` the predicted class is the arg-max of
    the outputs. Otherwise the scalar output is converted to classes with
    a ``QWKOptimizedRounder`` fitted on the validation predictions each
    epoch. The epoch with the highest quadratic weighted kappa is kept.

    Args:
        run_name: Name of the run directory under ``../logs``.
        trn_x, trn_y: Training inputs and targets.
        val_x, val_y: Validation inputs and targets.
        cfg: Configuration object.

    Returns:
        ``{'cv': best_val_score}``.

    Side effects:
        Writes ``oof.npy``, ``best_coef.npy``, ``weight_best.pt`` and the
        loss plot under ``../logs/{run_name}``; prints and logs progress.
    """
    # Local import so this block does not depend on the file's import list.
    import copy

    train_loader = factory.get_dataloader(trn_x, trn_y, cfg.data.train)
    valid_loader = factory.get_dataloader(val_x, val_y, cfg.data.valid)

    model = factory.get_model(cfg).to(device)
    criterion = factory.get_loss(cfg)
    optimizer = factory.get_optim(cfg, model.parameters())
    scheduler = factory.get_scheduler(cfg, optimizer)

    best_epoch = -1
    best_val_score = -np.inf
    best_coef = []
    mb = master_bar(range(cfg.data.train.epochs))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []

    # Starting thresholds handed to the rounder's fit.
    initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]

    for epoch in mb:
        start_time = time.time()

        model, avg_loss = train_epoch(model, train_loader, criterion,
                                      optimizer, mb, cfg)
        valid_preds, avg_val_loss = val_epoch(model, valid_loader,
                                              criterion, cfg)

        if cfg.model.n_classes > 1:
            pred_labels = valid_preds.argmax(1)
            val_score = quadratic_weighted_kappa(val_y, pred_labels)
            cm = confusion_matrix(val_y, pred_labels)
        else:
            optR = QWKOptimizedRounder()
            optR.fit(valid_preds.copy(), val_y, initial_coef)
            coef = optR.coefficients()
            valid_preds_class = optR.predict(valid_preds.copy(), coef)
            val_score = quadratic_weighted_kappa(val_y, valid_preds_class)
            cm = confusion_matrix(val_y, valid_preds_class)

        train_loss_list.append(avg_loss)
        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)

        # ReduceLROnPlateau needs the monitored value; other schedulers are
        # stepped without arguments.
        if cfg.scheduler.name == 'ReduceLROnPlateau':
            scheduler.step(avg_val_loss)
        else:
            scheduler.step()

        elapsed = time.time() - start_time
        epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
        mb.write(epoch_msg)
        logging.debug(epoch_msg)

        if val_score > best_val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_valid_preds = valid_preds
            net = model.module if cfg.model.multi_gpu else model
            # state_dict() returns references to the live parameters, which
            # later epochs keep updating. Snapshot them so the checkpoint
            # saved below really is the best epoch.
            best_model = copy.deepcopy(net.state_dict())
            if cfg.model.n_classes == 1:
                best_coef = coef
            best_cm = cm

    # Report the confusion matrix of the best epoch so that it matches the
    # score printed below (previously the last epoch's matrix was shown).
    print('\n\nCONFUSION MATRIX')
    logging.debug('\n\nCONFUSION MATRIX')
    print(best_cm)
    logging.debug(best_cm)

    print('\n\n===================================\n')
    print(f'CV: {best_val_score:.6f}')
    print(f'BEST EPOCH: {best_epoch}')
    logging.debug(f'\n\nCV: {best_val_score:.6f}')
    logging.debug(f'BEST EPOCH: {best_epoch}\n\n')
    print('\n===================================\n\n')

    result = {
        'cv': best_val_score,
    }

    np.save(f'../logs/{run_name}/oof.npy', best_valid_preds)
    np.save(f'../logs/{run_name}/best_coef.npy', best_coef)
    torch.save(best_model, f'../logs/{run_name}/weight_best.pt')
    save_png(run_name, cfg, train_loss_list, val_loss_list, val_score_list)

    return result
def train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg):
    """Train one binary model per ordinal target column and score by QWK.

    A separate model is trained for every column of ``trn_y`` except the
    first. For each, the epoch with the lowest validation loss is kept,
    its raw validation outputs are saved, and their sigmoid is stored as
    one column of a probability matrix. The row sums of that matrix are
    converted to classes with a ``QWKOptimizedRounder`` and scored against
    ``sum(val_y, axis=1) - 1``.

    Args:
        run_name: Name of the run directory under ``../logs``.
        trn_x, val_x: Training and validation inputs.
        trn_y, val_y: Target frames with one column per ordinal threshold.
        cfg: Configuration object.

    Returns:
        ``{'cv': best_val_score}`` (quadratic weighted kappa).

    Side effects:
        Writes ``oof_{col}.npy`` and ``weight_best_{col}.pt`` per column
        and ``best_coef.npy`` under ``../logs/{run_name}``; prints and logs
        progress.
    """
    # Local import so this block does not depend on the file's import list.
    import copy

    # Explicit float buffer: np.zeros_like(val_y) would inherit the target
    # dtype, and with integer targets the sigmoid outputs stored below
    # would all be truncated to 0.
    ordinal_val_preds = np.zeros(val_y.shape, dtype=np.float64)

    # Starting thresholds handed to the rounder's fit; independent of the
    # column loop, so defined once up front.
    initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]

    # The first target column is skipped, so the last column of
    # ordinal_val_preds stays zero and does not affect the row sums.
    for i, col in enumerate(trn_y.columns[1:]):
        col_banner = f'\n\n==================== {col} ===================='
        print(col_banner)
        logging.debug(col_banner)

        train_loader = factory.get_dataloader(trn_x, trn_y[col],
                                              cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, val_y[col],
                                              cfg.data.valid)

        model = factory.get_model(cfg).to(device)
        criterion = factory.get_loss(cfg)
        optimizer = factory.get_optim(cfg, model.parameters())
        scheduler = factory.get_scheduler(cfg, optimizer)

        best_epoch = -1
        best_val_loss = np.inf
        mb = master_bar(range(cfg.data.train.epochs))

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = train_epoch(model, train_loader, criterion,
                                          optimizer, mb, cfg)
            valid_preds, avg_val_loss = val_epoch(model, valid_loader,
                                                  criterion, cfg)

            # ReduceLROnPlateau needs the monitored value; other schedulers
            # are stepped without arguments.
            if cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            epoch_msg = f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s'
            mb.write(epoch_msg)
            logging.debug(epoch_msg)

            if avg_val_loss < best_val_loss:
                best_epoch = epoch + 1
                best_val_loss = avg_val_loss
                best_valid_preds = valid_preds
                net = model.module if cfg.model.multi_gpu else model
                # state_dict() returns references to the live parameters,
                # which later epochs keep updating. Snapshot them so the
                # checkpoint saved below really is the best epoch.
                best_model = copy.deepcopy(net.state_dict())

        print(f'epoch: {best_epoch} loss: {best_val_loss}')

        # Sigmoid of the raw outputs of the best epoch.
        ordinal_val_preds[:, i] = 1 / (1 + np.exp(-1 * best_valid_preds))

        np.save(f'../logs/{run_name}/oof_{col}.npy', best_valid_preds)
        torch.save(best_model, f'../logs/{run_name}/weight_best_{col}.pt')

    valid_preds = np.sum(ordinal_val_preds, axis=1)
    val_y = (np.sum(val_y.values, axis=1) - 1).astype(int)

    optR = QWKOptimizedRounder()
    optR.fit(valid_preds.copy(), val_y, initial_coef)
    best_coef = optR.coefficients()
    valid_preds_class = optR.predict(valid_preds.copy(), best_coef)
    best_val_score = quadratic_weighted_kappa(val_y, valid_preds_class)
    cm = confusion_matrix(val_y, valid_preds_class)

    print('\n\nCONFUSION MATRIX')
    logging.debug('\n\nCONFUSION MATRIX')
    print(cm)
    logging.debug(cm)

    print('\n\n===================================\n')
    print(f'CV: {best_val_score:.6f}')
    logging.debug(f'\n\nCV: {best_val_score:.6f}')
    print('\n===================================\n\n')

    result = {
        'cv': best_val_score,
    }

    np.save(f'../logs/{run_name}/best_coef.npy', best_coef)

    return result