def main():
    """Train InceptionV4GRU2 per fold with snapshot ensembling (apex amp O1),
    then compute a local CV from the snapshot ensemble and write test
    predictions per fold.

    Hyper-parameters come from ../params/exp{EXP_NO}.yaml. If param['resume']
    points at a previous run directory, training resumes that run's fold from
    its info.pth/latest.pth checkpoint.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')
    # stream_handler = logging.StreamHandler()
    # stream_handler.setLevel(logging.DEBUG)
    # stream_handler.setFormatter(handler_format)
    # logger.addHandler(stream_handler)
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour,
                                     now.minute, now.second))
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)

    # When resuming, reuse the original run's timestamp so all outputs land
    # in the same directory tree, and remember which fold to pick up.
    resume_fold = -1
    if param['resume'] is not None:
        print('--- resume ---')
        info = torch.load(os.path.join(param['resume'], 'info.pth'))
        now_date = info['now_date']
        resume_fold = info['fold']
    param['date'] = now_date

    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    local_cv = dict()
    for fold in param['fold']:
        # /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            # FIX: include the offending path instead of raising a bare class.
            raise FileNotFoundError(param['save path'])

        file_handler = logging.FileHandler(
            os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)
        logger.debug('============= FOLD {} ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute,
                                                now.second))

        # Dataset -- scale the batch size with the number of GPUs.
        param['batch size'] = max(param['batch size'],
                                  param['batch size'] * param['GPU'])
        if param['debug']:
            # Debug mode: small slices so a full loop finishes quickly.
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid != @fold').iloc[:param['batch size'] * 12],
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train')
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid == @fold').iloc[:param['batch size'] * 12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid')
        else:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold'),
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train',
                margin_augmentation=True)
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold'),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid',
                margin_augmentation=False)
        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=True)
        valid_dataloader = DataLoader(valid_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=False)
        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # model
        model = InceptionV4GRU2(num_classes=48,
                                hidden_size=512,
                                bidirectional=True,
                                load_weight=None,
                                dropout=param['dropout'])
        param['model'] = model.__class__.__name__

        # optim
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=0.001,
                                    momentum=0.9,
                                    weight_decay=1e-5,
                                    nesterov=False)
        # scheduler
        # Mixed precision: amp.initialize must wrap the bare model before
        # it is wrapped in DataParallel.
        model = model.to(param['device'])
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if param['GPU'] > 0:
            model = nn.DataParallel(model)
        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(
            EXP_NO, now_date, fold))
        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

        # Global bests for the fold.
        max_char_acc = -1e-5
        max_3char_acc = -1e-5
        min_loss = 1e+5
        # Per-snapshot bests, reset at the start of each snapshot cycle.
        snapshot = 0
        snapshot_loss_list = list()
        snapshot_eval_list = list()
        snapshot_eval3_list = list()
        snapshot_loss = 1e+5
        snapshot_eval = -1e-5
        snapshot_eval3 = -1e-5
        # Run an intermediate validation roughly 3 times per epoch.
        val_iter = math.ceil(len(train_dataloader) / 3)
        print('val_iter: {}'.format(val_iter))

        # Hyper params
        cycle_iter = 5  # epochs per cosine-annealing cycle
        snap_start = 2  # warm-up cycles before snapshotting starts
        n_snap = 8      # number of snapshot cycles
        mb = master_bar(range((n_snap + snap_start) * cycle_iter))
        scheduler = CosineAnnealingWarmUpRestarts(
            optimizer,
            T_0=len(train_dataloader) * cycle_iter,
            T_mult=1,
            T_up=500,
            eta_max=0.1)

        resume = False
        if resume_fold == fold:
            logger.debug('########################')
            logger.debug('## RESUME ##')
            logger.debug('########################')
            resume = True
            resume_epoch = info['epoch']
            snapshot = info['snapshot']
            min_loss = info['min_loss']
            max_char_acc = info['max_char_acc']
            # FIX: restore the 3-char best from its own key. The original
            # read info['max_char_acc'] here, clobbering max_3char_acc with
            # the 1-char metric; the info.pth saved below does contain
            # 'max_3char_acc'.
            max_3char_acc = info['max_3char_acc']
            snapshot_loss = info['snapshot_loss']
            snapshot_eval = info['snapshot_eval']
            snapshot_eval3 = info['snapshot_eval3']
            logger.debug(f'epoch : {resume_epoch}')
            logger.debug(f'snapshot : {snapshot}')
            logger.debug('########################')
            model.load_state_dict(
                torch.load(os.path.join(outdir, 'latest.pth')))

        for epoch in mb:
            if resume and epoch <= resume_epoch:
                # Fast-forward past epochs already trained; on the last
                # skipped epoch, put the scheduler back in the right phase.
                if epoch == resume_epoch:
                    print('set scheduler state')
                    scheduler.step(epoch * len(train_dataloader))
                    print(f'lr : {scheduler.get_lr()}')
                continue
            if epoch % cycle_iter == 0 and epoch >= snap_start * cycle_iter:
                # New snapshot cycle: archive the previous snapshot's bests
                # and reset the per-snapshot trackers.
                if snapshot > 1:
                    snapshot_loss_list.append(snapshot_loss)
                    snapshot_eval_list.append(snapshot_eval)
                    snapshot_eval3_list.append(snapshot_eval3)
                snapshot += 1
                snapshot_loss = 10**5
                snapshot_eval = 0.0
                snapshot_eval3 = 0.0

            model.train()
            # FIX: running sums must start at zero -- avg_train_loss was
            # initialised to 10**5, inflating the reported average loss by
            # 10**5 / len(train_dataloader) every epoch.
            avg_train_loss = 0.0
            avg_train_accuracy = 0.0
            avg_three_train_acc = 0.0
            for step, (inputs, targets, indice) in enumerate(
                    progress_bar(train_dataloader, parent=mb)):
                model.train()
                inputs = inputs.to(param['device'])
                targets = targets.to(param['device'])
                optimizer.zero_grad()
                logits = model(inputs)  # logits.size() = (batch*3, 48)
                preds = logits.view(targets.size(0), 3, -1).softmax(dim=2)
                loss = loss_fn(
                    logits,
                    targets.view(-1, targets.size(2)).argmax(dim=1))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # loss.backward()
                # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                avg_train_loss += loss.item()
                _avg_accuracy = eval_fn(preds, targets.argmax(dim=2)).item()
                avg_train_accuracy += _avg_accuracy
                _three_char_accuracy = accuracy_three_character(
                    preds, targets.argmax(dim=2), mean=True).item()
                avg_three_train_acc += _three_char_accuracy
                writer.add_scalar("data/learning rate",
                                  scheduler.get_lr()[0],
                                  step + epoch * len(train_dataloader))
                scheduler.step()
                writer.add_scalars(
                    "data/metric/train", {
                        'loss': loss.item(),
                        'accuracy': _avg_accuracy,
                        '3accuracy': _three_char_accuracy
                    }, step + epoch * len(train_dataloader))

                # Intermediate validation + checkpointing a few times/epoch.
                if step % val_iter == 0 and step != 0:
                    avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                        model, valid_dataloader, param['device'], loss_fn,
                        eval_fn)
                    writer.add_scalars(
                        "data/metric/valid", {
                            'loss': avg_valid_loss,
                            'accuracy': avg_valid_accuracy,
                            '3accuracy': avg_three_valid_acc
                        }, epoch)
                    logger.debug(
                        '======================== epoch {} | step {} ========================'
                        .format(epoch + 1, step + 1))
                    logger.debug('lr : {:.5f}'.format(
                        scheduler.get_lr()[0]))
                    logger.debug(
                        'loss : test={:.5f}'.format(avg_valid_loss))
                    logger.debug('acc(per 1 char) : test={:.3%}'.format(
                        avg_valid_accuracy))
                    logger.debug('acc(per 3 char) : test={:.3%}'.format(
                        avg_three_valid_acc))
                    if min_loss > avg_valid_loss:
                        logger.debug(
                            'update best loss: {:.5f} ---> {:.5f}'.format(
                                min_loss, avg_valid_loss))
                        min_loss = avg_valid_loss
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_loss.pth'))
                    if max_char_acc < avg_valid_accuracy:
                        logger.debug(
                            'update best acc per 1 char: {:.3%} ---> {:.3%}'.
                            format(max_char_acc, avg_valid_accuracy))
                        max_char_acc = avg_valid_accuracy
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_acc.pth'))
                    if max_3char_acc < avg_three_valid_acc:
                        logger.debug(
                            'update best acc per 3 char: {:.3%} ---> {:.3%}'.
                            format(max_3char_acc, avg_three_valid_acc))
                        max_3char_acc = avg_three_valid_acc
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_3acc.pth'))
                    if snapshot > 0:
                        # Per-snapshot checkpoints feed the snapshot
                        # ensemble used for local CV / test prediction.
                        if snapshot_loss > avg_valid_loss:
                            logger.debug(
                                '[snap] update best loss: {:.5f} ---> {:.5f}'.
                                format(snapshot_loss, avg_valid_loss))
                            snapshot_loss = avg_valid_loss
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_loss_{snapshot}.pth'))
                        if snapshot_eval < avg_valid_accuracy:
                            logger.debug(
                                '[snap] update best acc per 1 char: {:.3%} ---> {:.3%}'
                                .format(snapshot_eval, avg_valid_accuracy))
                            snapshot_eval = avg_valid_accuracy
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_acc_{snapshot}.pth'))
                        if snapshot_eval3 < avg_three_valid_acc:
                            logger.debug(
                                '[snap] update best acc per 3 char: {:.3%} ---> {:.3%}'
                                .format(snapshot_eval3, avg_three_valid_acc))
                            snapshot_eval3 = avg_three_valid_acc
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_3acc_{snapshot}.pth'))

            # End-of-epoch: average the train metrics and validate once more.
            avg_train_loss /= len(train_dataloader)
            avg_train_accuracy /= len(train_dataloader)
            avg_three_train_acc /= len(train_dataloader)
            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)
            writer.add_scalars(
                "data/metric/valid", {
                    'loss': avg_valid_loss,
                    'accuracy': avg_valid_accuracy,
                    '3accuracy': avg_three_valid_acc
                }, epoch)
            logger.debug(
                '======================== epoch {} ========================'.
                format(epoch + 1))
            logger.debug('lr : {:.5f}'.format(
                scheduler.get_lr()[0]))
            logger.debug(
                'loss : train={:.5f} , test={:.5f}'.format(
                    avg_train_loss, avg_valid_loss))
            logger.debug(
                'acc(per 1 char) : train={:.3%} , test={:.3%}'.format(
                    avg_train_accuracy, avg_valid_accuracy))
            logger.debug(
                'acc(per 3 char) : train={:.3%} , test={:.3%}'.format(
                    avg_three_train_acc, avg_three_valid_acc))
            if epoch == cycle_iter * snap_start:
                # Keep the warm-up weights just before snapshotting starts.
                torch.save(
                    model.state_dict(),
                    os.path.join(outdir,
                                 f'model_epoch_{cycle_iter * snap_start}.pth'))
            if min_loss > avg_valid_loss:
                logger.debug('update best loss: {:.5f} ---> {:.5f}'.format(
                    min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_loss.pth'))
            if max_char_acc < avg_valid_accuracy:
                logger.debug(
                    'update best acc per 1 char: {:.3%} ---> {:.3%}'.format(
                        max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_acc.pth'))
            if max_3char_acc < avg_three_valid_acc:
                logger.debug(
                    'update best acc per 3 char: {:.3%} ---> {:.3%}'.format(
                        max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_3acc.pth'))
            if snapshot > 0:
                if snapshot_loss > avg_valid_loss:
                    logger.debug(
                        '[snap] update best loss: {:.5f} ---> {:.5f}'.format(
                            snapshot_loss, avg_valid_loss))
                    snapshot_loss = avg_valid_loss
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_loss_{snapshot}.pth'))
                if snapshot_eval < avg_valid_accuracy:
                    logger.debug(
                        '[snap] update best acc per 1 char: {:.3%} ---> {:.3%}'
                        .format(snapshot_eval, avg_valid_accuracy))
                    snapshot_eval = avg_valid_accuracy
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_acc_{snapshot}.pth'))
                if snapshot_eval3 < avg_three_valid_acc:
                    logger.debug(
                        '[snap] update best acc per 3 char: {:.3%} ---> {:.3%}'
                        .format(snapshot_eval3, avg_three_valid_acc))
                    snapshot_eval3 = avg_three_valid_acc
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_3acc_{snapshot}.pth'))

            # Always persist the latest weights + the resume metadata read
            # back at the top of main() when param['resume'] is set.
            torch.save(model.state_dict(), os.path.join(outdir, 'latest.pth'))
            torch.save(
                {
                    'now_date': now_date,
                    'epoch': epoch,
                    'fold': fold,
                    'snapshot': snapshot,
                    'min_loss': min_loss,
                    'max_char_acc': max_char_acc,
                    'max_3char_acc': max_3char_acc,
                    'snapshot_loss': snapshot_loss,
                    'snapshot_eval': snapshot_eval,
                    'snapshot_eval3': snapshot_eval3,
                }, os.path.join(outdir, 'info.pth'))

        # Archive the final snapshot's bests as well.
        snapshot_loss_list.append(snapshot_loss)
        snapshot_eval_list.append(snapshot_eval)
        snapshot_eval3_list.append(snapshot_eval3)
        writer.add_scalars(
            "data/metric/valid", {
                'best loss': min_loss,
                'best accuracy': max_char_acc,
                'best 3accuracy': max_3char_acc
            })
        logger.debug('================ FINISH TRAIN ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()

        # Local cv: average the logits of the n_snap snapshot checkpoints
        # over the validation fold (shuffle=False keeps target order stable).
        target_list = list()
        for _, targets, _ in valid_dataloader:
            targets = targets.argmax(dim=2)
            target_list.append(targets)
        target_list = torch.cat(target_list)
        mb = master_bar(range(n_snap))
        valid_logit_dict = dict()
        init = True
        for i in mb:
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model,
                            valid_dataloader,
                            param['device'],
                            valid_logit_dict,
                            div=n_snap,
                            init=init)
            init = False
        pred_list = torch.stack(list(valid_logit_dict.values()))
        pred_list = pred_list.softmax(dim=2)
        local_accuracy = accuracy_three_character(pred_list, target_list)
        logger.debug('LOCAL CV : {:5%}'.format(local_accuracy))
        torch.save(valid_logit_dict,
                   os.path.join(outdir, f'fold{fold}_valid_logit.pth'))
        local_cv['fold{}'.format(fold)] = {
            'accuracy': local_accuracy,
            'valid_size': len(valid_dataset)
        }
        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        del valid_logit_dict, target_list
        gc.collect()

        logger.debug('=========== Prediction phrase ===========')
        if param['debug']:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']).iloc[:param['batch size'] *
                                                       12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        else:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=param['batch size'],
                                     num_workers=param['thread'],
                                     pin_memory=False,
                                     drop_last=False,
                                     shuffle=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))

        # Snapshot-ensemble the test logits the same way as for local CV.
        test_logit_dict = dict()
        init = True
        for i in range(n_snap):
            logger.debug('load weight : {}'.format(
                os.path.join(outdir, f'best_loss_{i+1}.pth')))
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model,
                            test_dataloader,
                            param['device'],
                            test_logit_dict,
                            div=n_snap,
                            init=init)
            init = False
        torch.save(test_logit_dict, os.path.join(outdir, 'prediction.pth'))
        output_list = make_submission(test_logit_dict)
        pd.DataFrame(output_list).sort_values('ID').set_index('ID').to_csv(
            os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)
        del test_dataset, test_dataloader
        gc.collect()
    print('success!')
def main():
    """Train ResNetLSTM per fold (cosine warm restarts), predict the test set
    with each fold's best-3char checkpoint, average the per-fold logits, and
    write the final submission csv + zip.

    Hyper-parameters come from ../params/exp{EXP_NO}.yaml.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')
    # stream_handler = logging.StreamHandler()
    # stream_handler.setLevel(logging.DEBUG)
    # stream_handler.setFormatter(handler_format)
    # logger.addHandler(stream_handler)
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour,
                                     now.minute, now.second))
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date

    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    local_cv = dict()
    for fold in param['fold']:
        # /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            # FIX: include the offending path instead of raising a bare class.
            raise FileNotFoundError(param['save path'])

        file_handler = logging.FileHandler(
            os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)
        logger.debug('============= FOLD {} ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute,
                                                now.second))

        # Dataset -- scale the batch size with the number of GPUs.
        param['batch size'] = max(param['batch size'],
                                  param['batch size'] * param['GPU'])
        if param['debug']:
            # Debug mode: one-batch slices so a full loop finishes quickly.
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid != @fold').iloc[:param['batch size']],
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train')
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid == @fold').iloc[:param['batch size']],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid')
        else:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold'),
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train')
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold'),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid')
        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader
        # FIX: the training loader was built without shuffle, so every epoch
        # iterated the fold in the same (dataframe) order; shuffle=True
        # matches the other training scripts in this file.
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=True)
        valid_dataloader = DataLoader(valid_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False)
        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # model
        model = ResNetLSTM(num_classes=48,
                           hidden_size=512,
                           bidirectional=True,
                           load_weight=param['load weight'],
                           dropout=param['dropout'])
        param['model'] = model.__class__.__name__
        cycle_iter = 5  # epochs per cosine-annealing cycle

        # optim
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=param['lr'],
                                    momentum=0.9,
                                    weight_decay=1e-5,
                                    nesterov=False)
        # scheduler
        scheduler = CosineAnnealingWarmUpRestarts(
            optimizer,
            T_0=cycle_iter * len(train_dataloader),
            T_mult=1,
            eta_max=0.1,
            T_up=500)
        model = model.to(param['device'])
        if param['GPU'] > 0:
            model = nn.DataParallel(model)
        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        max_char_acc = -1.
        max_3char_acc = -1.
        min_loss = 10**5
        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(
            EXP_NO, now_date, fold))
        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

        mb = master_bar(range(param['epoch']))
        cycle = 0
        # val_iter == -1 disables the mid-epoch validation branch below.
        val_iter = -1
        val_write_iter = 0
        for epoch in mb:
            if epoch % cycle_iter == 0:
                # New cosine cycle: reset the best trackers so each cycle's
                # best_*.pth reflects that cycle only.
                cycle += 1
                max_char_acc = -1.
                max_3char_acc = -1.
                min_loss = 10**5

            # Train Step
            model.train()
            train_avg_loss = 0
            train_avg_accuracy = 0
            train_three_char_accuracy = 0
            for step, (inputs, targets, indices) in enumerate(
                    progress_bar(train_dataloader, parent=mb)):
                inputs = inputs.to(param['device'])
                targets = targets.to(param['device'])
                optimizer.zero_grad()
                logits = model(inputs)  # logits.size() = (batch*3, 48)
                preds = logits.view(targets.size(0), 3, -1).softmax(dim=2)
                loss = loss_fn(
                    logits,
                    targets.view(-1, targets.size(2)).argmax(dim=1))
                loss.backward()
                optimizer.step()
                train_avg_loss += loss.item()
                _avg_accuracy = eval_fn(preds, targets.argmax(dim=2)).item()
                train_avg_accuracy += _avg_accuracy
                _three_char_accuracy = accuracy_three_character(
                    preds, targets.argmax(dim=2), mean=True).item()
                train_three_char_accuracy += _three_char_accuracy
                writer.add_scalar("data/learning rate",
                                  scheduler.get_lr()[0],
                                  step + epoch * len(train_dataloader))
                scheduler.step()
                writer.add_scalars(
                    "data/metric/train", {
                        'loss': loss.item(),
                        'accuracy': _avg_accuracy,
                        '3accuracy': _three_char_accuracy
                    }, step + epoch * len(train_dataloader))

                # Mid-epoch validation (currently disabled, val_iter == -1).
                if val_iter != -1 and step % val_iter == 0 and step != 0:
                    avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                        model, valid_dataloader, param['device'], loss_fn,
                        eval_fn)
                    writer.add_scalars(
                        "data/metric/valid", {
                            'loss': avg_valid_loss,
                            'accuracy': avg_valid_accuracy,
                            '3accuracy': avg_three_valid_acc
                        }, val_write_iter)
                    val_write_iter += 1
                    logger.debug(
                        '============ epoch {} | iter {} | cycle {} =========='
                        .format(epoch + 1, step + 1, cycle))
                    logger.debug('lr : {:.5f}'.format(
                        scheduler.get_lr()[0]))
                    # NOTE(review): train_* here are still running sums, not
                    # yet divided by len(train_dataloader).
                    logger.debug(
                        'loss : train={:.5f} , test={:.5f}'.format(
                            train_avg_loss, avg_valid_loss))
                    logger.debug(
                        'acc(per 1 char) : train={:.3%} , test={:.3%}'.format(
                            train_avg_accuracy, avg_valid_accuracy))
                    logger.debug(
                        'acc(per 3 char) : train={:.3%} , test={:.3%}'.format(
                            train_three_char_accuracy, avg_three_valid_acc))
                    if min_loss > avg_valid_loss:
                        logger.debug(
                            'update best loss: {:.5f} ---> {:.5f}'.format(
                                min_loss, avg_valid_loss))
                        min_loss = avg_valid_loss
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_loss.pth'))
                    if max_char_acc < avg_valid_accuracy:
                        logger.debug(
                            'update best acc per 1 char: {:.3%} ---> {:.3%}'.
                            format(max_char_acc, avg_valid_accuracy))
                        max_char_acc = avg_valid_accuracy
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_acc.pth'))
                    if max_3char_acc < avg_three_valid_acc:
                        logger.debug(
                            'update best acc per 3 char: {:.3%} ---> {:.3%}'.
                            format(max_3char_acc, avg_three_valid_acc))
                        max_3char_acc = avg_three_valid_acc
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_3acc.pth'))

            # End-of-epoch: average train metrics, then validate once.
            train_avg_loss /= len(train_dataloader)
            train_avg_accuracy /= len(train_dataloader)
            train_three_char_accuracy /= len(train_dataloader)
            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)
            writer.add_scalars(
                "data/metric/valid", {
                    'loss': avg_valid_loss,
                    'accuracy': avg_valid_accuracy,
                    '3accuracy': avg_three_valid_acc
                }, val_write_iter)
            val_write_iter += 1
            logger.debug(
                '============ epoch {} | iter {} | cycle {} =========='.format(
                    epoch + 1, len(train_dataloader), cycle))
            logger.debug('lr : {:.5f}'.format(
                scheduler.get_lr()[0]))
            logger.debug(
                'loss : train={:.5f} , test={:.5f}'.format(
                    train_avg_loss, avg_valid_loss))
            logger.debug(
                'acc(per 1 char) : train={:.3%} , test={:.3%}'.format(
                    train_avg_accuracy, avg_valid_accuracy))
            logger.debug(
                'acc(per 3 char) : train={:.3%} , test={:.3%}'.format(
                    train_three_char_accuracy, avg_three_valid_acc))
            if min_loss > avg_valid_loss:
                logger.debug('update best loss: {:.5f} ---> {:.5f}'.format(
                    min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_loss.pth'))
            if max_char_acc < avg_valid_accuracy:
                logger.debug(
                    'update best acc per 1 char: {:.3%} ---> {:.3%}'.format(
                        max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_acc.pth'))
            if max_3char_acc < avg_three_valid_acc:
                logger.debug(
                    'update best acc per 3 char: {:.3%} ---> {:.3%}'.format(
                        max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_3acc.pth'))

        writer.add_scalars(
            "data/metric/valid", {
                'best loss': min_loss,
                'best accuracy': max_char_acc,
                'best 3accuracy': max_3char_acc
            })
        logger.debug('================ FINISH TRAIN ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()
        # NOTE(review): best trackers are reset every cycle, so this records
        # the LAST cycle's best, not the global best of the fold.
        local_cv['fold{}'.format(fold)] = {
            'accuracy': max_3char_acc,
            'valid_size': len(valid_dataset)
        }
        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        gc.collect()

        logger.debug('=========== Prediction phrase ===========')
        logger.debug('load weight : {}'.format(
            os.path.join(outdir, 'best_3acc.pth')))
        model.load_state_dict(torch.load(os.path.join(outdir,
                                                      'best_3acc.pth')))
        if param['debug']:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']).iloc[:param['batch size']],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        else:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=param['batch size'],
                                     num_workers=param['thread'],
                                     pin_memory=False,
                                     drop_last=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))
        output_list = pred_alcon_rnn(model, test_dataloader, param['device'])
        torch.save(output_list, os.path.join(outdir, 'prediction.pth'))
        pd.DataFrame(output_list).drop(
            'logit', axis=1).sort_values('ID').set_index('ID').to_csv(
                os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)
        del test_dataset, test_dataloader
        gc.collect()

    # Ensemble: average the per-fold test logits saved above.
    print('======== Ensemble phase =========')
    prediction_dict = dict()
    mb = master_bar(param['fold'])
    print('======== Load Vector =========')
    for i, fold in enumerate(mb):
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        prediction = torch.load(os.path.join(outdir, 'prediction.pth'))
        # prediction is list
        # prediction[0] = {'ID' : 0, 'logit' torch.tensor, ...}
        if i == 0:
            for preds in progress_bar(prediction, parent=mb):
                prediction_dict[preds['ID']] = preds['logit'] / len(
                    param['fold'])
        else:
            for preds in progress_bar(prediction, parent=mb):
                prediction_dict[preds['ID']] += preds['logit'] / len(
                    param['fold'])

    outdir = os.path.join(param['save path'], EXP_NAME, now_date)
    file_handler = logging.FileHandler(os.path.join(outdir, 'result.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(handler_format)
    logger.addHandler(file_handler)
    logger.info(' ========== RESULT ========== \n')
    # Weight each fold's accuracy by its validation size.
    cv = 0.0
    train_data_size = 0
    for fold in param['fold']:
        acc = local_cv['fold{}'.format(fold)]['accuracy']
        valid_size = local_cv['fold{}'.format(fold)]['valid_size']
        train_data_size += valid_size
        logger.info(' fold {} : {:.3%} \n'.format(fold, acc))
        cv += acc * valid_size
    logger.info(' Local CV : {:.3%} \n'.format(cv / train_data_size))
    logger.info(' ============================== \n')
    logger.removeHandler(file_handler)
    torch.save(prediction_dict, os.path.join(outdir, 'prediction.pth'))

    print('======== make submittion file =========')
    vocab = get_vocab(param['vocabdir'])
    submit_list = list()
    for ID, logits in progress_bar(prediction_dict.items()):
        submit_dict = dict()
        submit_dict["ID"] = ID
        preds = logits.softmax(dim=1).argmax(dim=1)
        # index2uni presumably maps class index -> unicode codepoint string.
        submit_dict["Unicode1"] = vocab['index2uni'][preds[0]]
        submit_dict["Unicode2"] = vocab['index2uni'][preds[1]]
        submit_dict["Unicode3"] = vocab['index2uni'][preds[2]]
        submit_list.append(submit_dict)
    print()
    pd.DataFrame(submit_list).sort_values('ID').set_index('ID').to_csv(
        os.path.join(outdir, 'test_prediction.csv'))
    import zipfile
    with zipfile.ZipFile(
            os.path.join(outdir,
                         'submit_{}_{}.zip'.format(EXP_NAME, now_date)),
            'w') as zf:
        # FIX: without arcname the zip stores the file under its full
        # on-disk directory path; the submission needs it at archive root.
        zf.write(os.path.join(outdir, 'test_prediction.csv'),
                 arcname='test_prediction.csv')
    print('success!')
def main():
    # NOTE(review): prediction-only variant. It rebuilds the datasets and
    # model, then skips training entirely and runs the snapshot-ensemble
    # local CV + test prediction against weights of a previous run whose
    # output directory is hard-coded below.
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour,
                                     now.minute, now.second))
    # Load hyper-parameters for this experiment number.
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date
    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    local_cv = dict()
    for fold in [0]:  # change point
        # /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        # Hard-coded to the fold-0 checkpoints of one specific earlier run.
        outdir = '/mnt/hdd1/alcon2019/exp9/2019-08-01_01-41-16/fold0/'
        file_handler = logging.FileHandler(
            os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)
        # Dataset -- scale the batch size with the number of GPUs.
        param['batch size'] = max(param['batch size'],
                                  param['batch size'] * param['GPU'])
        if param['debug']:
            # Debug mode: small slices so a full loop finishes quickly.
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid != @fold').iloc[:param['batch size'] * 12],
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train')
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid == @fold').iloc[:param['batch size'] * 12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid')
        else:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold'),
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train',
                margin_augmentation=True)
            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold'),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid',
                margin_augmentation=False)
        # Dataloader
        # NOTE(review): train_dataloader is built but never iterated here;
        # it only feeds len() into val_iter / scheduler T_0 below.
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=True)
        valid_dataloader = DataLoader(valid_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=False)
        # change point
        # model -- must match the architecture of the checkpoints loaded
        # below, otherwise load_state_dict fails.
        model = InceptionV4GRU2(num_classes=48,
                                hidden_size=512,
                                bidirectional=True,
                                load_weight=None,
                                dropout=param['dropout'])
        param['model'] = model.__class__.__name__
        # optim -- only needed so amp.initialize can be called the same way
        # as in training; the optimizer itself is never stepped.
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=0.001,
                                    momentum=0.9,
                                    weight_decay=1e-5,
                                    nesterov=False)
        # scheduler
        model = model.to(param['device'])
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if param['GPU'] > 0:
            model = nn.DataParallel(model)
        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character
        # NOTE(review): the next three assignments are dead -- immediately
        # overwritten by the -1e-5 / 1e+5 block that follows.
        max_char_acc = -1.
        max_3char_acc = -1.
        min_loss = 10**5
        max_char_acc = -1e-5
        max_3char_acc = -1e-5
        min_loss = 1e+5
        # NOTE(review): the snapshot/scheduler setup below is leftover from
        # the training variant of this script; only n_snap is actually used
        # (and scheduler/optimizer exist only for the del further down).
        snapshot = 0
        snapshot_loss_list = list()
        snapshot_eval_list = list()
        snapshot_eval3_list = list()
        snapshot_loss = 1e+5
        snapshot_eval = -1e-5
        snapshot_eval3 = -1e-5
        val_iter = math.ceil(len(train_dataloader) / 3)
        print('val_iter: {}'.format(val_iter))
        # Hyper params
        cycle_iter = 5
        snap_start = 2
        n_snap = 8
        mb = master_bar(range((n_snap + snap_start) * cycle_iter))
        scheduler = CosineAnnealingWarmUpRestarts(optimizer,
                                                  T_0=len(train_dataloader) *
                                                  cycle_iter,
                                                  T_mult=1,
                                                  T_up=500,
                                                  eta_max=0.1)
        # local CV -- collect the validation targets in loader order
        # (shuffle=False keeps them aligned with the logits below).
        target_list = list()
        for _, targets, _ in valid_dataloader:
            targets = targets.argmax(dim=2)
            target_list.append(targets)
        target_list = torch.cat(target_list)
        # Average the logits of the n_snap snapshot checkpoints.
        mb = master_bar(range(n_snap))
        valid_logit_dict = dict()
        init = True
        for i in mb:
            print('load weight : {}'.format(
                os.path.join(outdir, f'best_loss_{i+1}.pth')))
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model,
                            valid_dataloader,
                            param['device'],
                            valid_logit_dict,
                            div=n_snap,
                            init=init)
            init = False
        pred_list = torch.stack(list(valid_logit_dict.values()))
        pred_list = pred_list.softmax(dim=2)
        local_accuracy = accuracy_three_character(pred_list, target_list)
        logger.debug('LOCAL CV : {:5%}'.format(local_accuracy))
        torch.save(valid_logit_dict,
                   os.path.join(outdir, f'fold{fold}_valid_logit.pth'))
        local_cv['fold{}'.format(fold)] = {
            'accuracy': local_accuracy,
            'valid_size': len(valid_dataset)
        }
        # Free the fold's training objects before the test pass.
        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        del valid_logit_dict, target_list
        gc.collect()
        logger.debug('=========== Prediction phrase ===========')
        if param['debug']:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']).iloc[:param['batch size'] *
                                                       12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        else:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=param['batch size'],
                                     num_workers=param['thread'],
                                     pin_memory=False,
                                     drop_last=False,
                                     shuffle=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))
        # Snapshot-ensemble the test logits the same way as for local CV.
        test_logit_dict = dict()
        init = True
        for i in range(n_snap):
            print('load weight : {}'.format(
                os.path.join(outdir, f'best_loss_{i+1}.pth')))
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model,
                            test_dataloader,
                            param['device'],
                            test_logit_dict,
                            div=n_snap,
                            init=init)
            init = False
        torch.save(test_logit_dict, os.path.join(outdir, 'prediction.pth'))
        output_list = make_submission(test_logit_dict)
        pd.DataFrame(output_list).sort_values('ID').set_index('ID').to_csv(
            os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)
        del test_dataset, test_dataloader
        gc.collect()
    print('success!')
def main():
    """Train a ResNet-18 classifier on each fold listed in the config,
    predict the test set with each fold's best checkpoint, then average
    the per-fold logits into a single zipped submission.

    Config is read from ``../params/exp{EXP_NO}.yaml``; all artefacts
    (weights, logs, predictions) are written under
    ``<save path>/<script name>/<timestamp>/fold<N>/``.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)

    # set logger: DEBUG to console now, and to a per-fold file later.
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(handler_format)
    logger.addHandler(stream_handler)

    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date

    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    for fold in param['fold']:
        # <save path>/ + <script name>/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'],
                              str(os.path.basename(__file__).split('.')[-2]),
                              now_date, 'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            raise FileNotFoundError

        file_handler = logging.FileHandler(os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)

        logger.debug('============= FOLD {} ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute, now.second))

        # Dataset: hold out the rows whose 'valid' column equals this fold.
        train_dataset = AlconDataset(df=get_train_df().query('valid != @fold'),
                                     augmentation=get_train_augmentation(),
                                     datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                                     mode='train')
        valid_dataset = AlconDataset(df=get_train_df().query('valid == @fold'),
                                     augmentation=get_test_augmentation(),
                                     datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                                     mode='valid')
        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader
        train_dataloader = DataLoader(train_dataset, batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False, drop_last=False)
        valid_dataloader = DataLoader(valid_dataset, batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False, drop_last=False)
        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # model: ImageNet-pretrained ResNet-18 with a 48-class head.
        model = resnet18(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, 48)
        param['model'] = model.__class__.__name__

        # optim
        if param['optim'].lower() == 'sgd':
            optimizer = torch.optim.SGD(model.parameters(), lr=param['lr'],
                                        momentum=0.9, weight_decay=1e-5,
                                        nesterov=False)
        elif param['optim'].lower() == 'adam':
            # FIX: this branch previously constructed torch.optim.SGD, so
            # requesting 'adam' silently trained with plain SGD.
            optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'])
        else:
            raise NotImplementedError

        # scheduler
        # NOTE(review): eval() on a config string executes arbitrary code --
        # only acceptable while the yaml file is fully trusted.
        scheduler = eval(param['scheduler'])
        model = model.to(param['device'])
        loss_fn = torch.nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy

        # Best-so-far trackers for the three checkpoint criteria.
        max_char_acc = -1.
        max_3char_acc = -1.
        min_loss = 10**5

        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(EXP_NO, now_date, fold))
        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

        mb = master_bar(range(param['epoch']))
        for epoch in mb:
            avg_train_loss, avg_train_accuracy, avg_three_train_acc = train_alcon(
                model, optimizer, train_dataloader, param['device'], loss_fn,
                eval_fn, epoch, scheduler=None, writer=writer, parent=mb)

            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)

            writer.add_scalars("data/metric/valid", {
                'loss': avg_valid_loss,
                'accuracy': avg_valid_accuracy,
                '3accuracy': avg_three_valid_acc
            }, epoch)

            logger.debug('======================== epoch {} ========================'.format(epoch+1))
            logger.debug('lr : {:.5f}'.format(scheduler.get_lr()[0]))
            logger.debug('loss : train={:.5f} , test={:.5f}'.format(avg_train_loss, avg_valid_loss))
            logger.debug('acc(per 1 char) : train={:.3%} , test={:.3%}'.format(avg_train_accuracy, avg_valid_accuracy))
            logger.debug('acc(per 3 char) : train={:.3%} , test={:.3%}'.format(avg_three_train_acc, avg_three_valid_acc))

            # Checkpoint whenever any validation criterion improves.
            if min_loss > avg_valid_loss:
                logger.debug('update best loss: {:.5f} ---> {:.5f}'.format(min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(), os.path.join(outdir, 'best_loss.pth'))
            if max_char_acc < avg_valid_accuracy:
                logger.debug('update best acc per 1 char: {:.3%} ---> {:.3%}'.format(max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(), os.path.join(outdir, 'best_acc.pth'))
            if max_3char_acc < avg_three_valid_acc:
                logger.debug('update best acc per 3 char: {:.3%} ---> {:.3%}'.format(max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(), os.path.join(outdir, 'best_3acc.pth'))

            # Scheduler is stepped here (per epoch), not inside train_alcon.
            if scheduler is not None:
                if writer is not None:
                    writer.add_scalar("data/learning rate", scheduler.get_lr()[0], epoch)
                scheduler.step()

        writer.add_scalars("data/metric/valid", {
            'best loss': min_loss,
            'best accuracy': max_char_acc,
            'best 3accuracy': max_3char_acc
        })
        logger.debug('================ FINISH TRAIN ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()

        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        gc.collect()

        logger.debug('=========== Prediction phrase ===========')
        logger.debug('load weight : {}'.format(os.path.join(outdir, 'best_3acc.pth')))
        # Predict with the checkpoint that maximised 3-character accuracy.
        model.load_state_dict(torch.load(os.path.join(outdir, 'best_3acc.pth')))
        test_dataset = AlconDataset(df=get_test_df(),
                                    augmentation=get_test_augmentation(),
                                    datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                                    mode='test')
        test_dataloader = DataLoader(test_dataset, batch_size=param['batch size'],
                                     num_workers=param['thread'],
                                     pin_memory=False, drop_last=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))
        output_list = pred_alcon(model, test_dataloader, param['device'])
        torch.save(output_list, os.path.join(outdir, 'prediction.pth'))
        pd.DataFrame(output_list).drop('logit', axis=1).sort_values('ID').set_index('ID').to_csv(
            os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)
        del test_dataset, test_dataloader
        gc.collect()

    # Ensemble: average per-fold logits saved in prediction.pth.
    print('======== Ensemble phase =========')
    prediction_dict = dict()
    mb = master_bar(param['fold'])
    print('======== Load Vector =========')
    for i, fold in enumerate(mb):
        outdir = os.path.join(param['save path'],
                              str(os.path.basename(__file__).split('.')[-2]),
                              now_date, 'fold{}'.format(fold))
        prediction = torch.load(os.path.join(outdir, 'prediction.pth'))
        # prediction is a list; prediction[0] = {'ID': 0, 'logit': torch.tensor, ...}
        # NOTE(review): keyword is `parents=` here; fastprogress's own
        # progress_bar takes `parent=` -- confirm this wrapper's signature.
        if i == 0:
            for preds in progress_bar(prediction, parents=mb):
                prediction_dict[preds['ID']] = preds['logit'] / len(param['fold'])
        else:
            for preds in progress_bar(prediction, parents=mb):
                prediction_dict[preds['ID']] += preds['logit'] / len(param['fold'])

    print('======== make submittion file =========')
    vocab = get_vocab()
    submit_list = list()
    for ID, logits in progress_bar(prediction_dict.items()):
        submit_dict = dict()
        submit_dict["ID"] = ID
        # Averaged logits -> per-position class indices -> unicode codes.
        preds = logits.softmax(dim=1).argmax(dim=1)
        submit_dict["Unicode1"] = vocab['index2uni'][preds[0]]
        submit_dict["Unicode2"] = vocab['index2uni'][preds[1]]
        submit_dict["Unicode3"] = vocab['index2uni'][preds[2]]
        submit_list.append(submit_dict)
    outdir = os.path.join(param['save path'],
                          str(os.path.basename(__file__).split('.')[-2]),
                          now_date)
    pd.DataFrame(submit_list).sort_values('ID').set_index('ID').to_csv(
        os.path.join(outdir, 'test_prediction.csv'))
    import zipfile
    with zipfile.ZipFile('submit_{}_{}.zip'.format(
            str(os.path.basename(__file__).split('.')[-2]), now_date), 'w') as zf:
        zf.write(os.path.join(outdir, 'test_prediction.csv'))
    print('success!')
def main():
    """Train a ResNet-18 classifier on a single fold (exp0 config) and
    write a test-set submission with the best 3-char-accuracy checkpoint.

    Config is read from ``../params/exp0.yaml``; artefacts are written to
    ``<save path>/<script name>_fold<N>/<timestamp>/``.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                     now.hour, now.minute, now.second))

    with open('../params/exp0.yaml', "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date

    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    # Single fold here (param['fold'] is a scalar, unlike the multi-fold scripts).
    fold = param['fold']
    outdir = os.path.join(
        param['save path'],
        str(os.path.basename(__file__).split('.')[-2]) + '_fold{}'.format(fold),
        now_date)
    if os.path.exists(param['save path']):
        os.makedirs(outdir, exist_ok=True)
    else:
        print("Not find {}".format(param['save path']))
        raise FileNotFoundError

    # Dataset: hold out the rows whose 'valid' column equals this fold.
    train_dataset = AlconDataset(df=get_train_df().query('valid != @fold'),
                                 augmentation=get_train_augmentation(),
                                 datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                                 mode='train')
    valid_dataset = AlconDataset(df=get_train_df().query('valid == @fold'),
                                 augmentation=get_test_augmentation(),
                                 datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                                 mode='valid')
    print('train dataset size: {}'.format(len(train_dataset)))
    print('valid dataset size: {}'.format(len(valid_dataset)))

    # Dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=param['batch size'],
                                  num_workers=param['thread'],
                                  pin_memory=False, drop_last=False)
    valid_dataloader = DataLoader(valid_dataset, batch_size=param['batch size'],
                                  num_workers=param['thread'],
                                  pin_memory=False, drop_last=False)
    print('train loader size: {}'.format(len(train_dataloader)))
    print('valid loader size: {}'.format(len(valid_dataloader)))

    # model: ImageNet-pretrained ResNet-18 with a 48-class head.
    model = resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 48)
    param['model'] = model.__class__.__name__

    # optim
    if param['optim'].lower() == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=param['lr'],
                                    momentum=0.9, weight_decay=1e-5,
                                    nesterov=False)
    elif param['optim'].lower() == 'adam':
        # FIX: this branch previously constructed torch.optim.SGD, so
        # requesting 'adam' silently trained with plain SGD.
        optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'])
    else:
        raise NotImplementedError

    # scheduler
    # NOTE(review): eval() on a config string executes arbitrary code --
    # only acceptable while the yaml file is fully trusted.
    scheduler = eval(param['scheduler'])
    model = model.to(param['device'])
    loss_fn = torch.nn.CrossEntropyLoss().to(param['device'])
    eval_fn = accuracy

    # Best-so-far trackers for the three checkpoint criteria.
    max_char_acc = 0.
    max_3char_acc = 0.
    min_loss = 10**5

    writer = tbx.SummaryWriter("../log/exp0")
    for key, val in param.items():
        writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

    mb = master_bar(range(param['epoch']))
    for epoch in mb:
        avg_train_loss, avg_train_accuracy, avg_three_train_acc = train_alcon(
            model, optimizer, train_dataloader, param['device'], loss_fn,
            eval_fn, epoch, scheduler=None, writer=writer, parent=mb)

        avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon(
            model, valid_dataloader, param['device'], loss_fn, eval_fn)

        writer.add_scalars("data/metric/valid", {
            'loss': avg_valid_loss,
            'accuracy': avg_valid_accuracy,
            '3accuracy': avg_three_valid_acc
        }, epoch)

        print('======================== epoch {} ========================'.format(epoch + 1))
        print('lr : {:.5f}'.format(scheduler.get_lr()[0]))
        print('loss : train={:.5f} , test={:.5f}'.format(avg_train_loss, avg_valid_loss))
        print('acc(per 1 char) : train={:.3%} , test={:.3%}'.format(avg_train_accuracy, avg_valid_accuracy))
        print('acc(per 3 char) : train={:.3%} , test={:.3%}'.format(avg_three_train_acc, avg_three_valid_acc))

        # Checkpoint whenever any validation criterion improves.
        if min_loss > avg_valid_loss:
            print('update best loss: {:.5f} ---> {:.5f}'.format(min_loss, avg_valid_loss))
            min_loss = avg_valid_loss
            torch.save(model.state_dict(), os.path.join(outdir, 'best_loss.pth'))
        if max_char_acc < avg_valid_accuracy:
            print('update best acc per 1 char: {:.3%} ---> {:.3%}'.format(max_char_acc, avg_valid_accuracy))
            max_char_acc = avg_valid_accuracy
            torch.save(model.state_dict(), os.path.join(outdir, 'best_acc.pth'))
        if max_3char_acc < avg_three_valid_acc:
            print('update best acc per 3 char: {:.3%} ---> {:.3%}'.format(max_3char_acc, avg_three_valid_acc))
            max_3char_acc = avg_three_valid_acc
            torch.save(model.state_dict(), os.path.join(outdir, 'best_3acc.pth'))

        # Scheduler is stepped here (per epoch), not inside train_alcon.
        if scheduler is not None:
            if writer is not None:
                writer.add_scalar("data/learning rate", scheduler.get_lr()[0], epoch)
            scheduler.step()

    writer.add_scalars("data/metric/valid", {
        'best loss': min_loss,
        'best accuracy': max_char_acc,
        'best 3accuracy': max_3char_acc
    })
    print('finish train')
    print('result')
    print('best loss : {}'.format(min_loss))
    print('best 1 acc : {}'.format(max_char_acc))
    print('best 3 acc : {}'.format(max_3char_acc))
    writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
    writer.close()

    del train_dataset, valid_dataset
    del train_dataloader, valid_dataloader
    del scheduler, optimizer
    gc.collect()

    print('load weight')
    # Predict with the checkpoint that maximised 3-character accuracy.
    model.load_state_dict(torch.load(os.path.join(outdir, 'best_3acc.pth')))
    test_dataset = AlconDataset(df=get_test_df(),
                                augmentation=get_test_augmentation(),
                                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                                mode='test')
    test_dataloader = DataLoader(test_dataset, batch_size=param['batch size'],
                                 num_workers=param['thread'],
                                 pin_memory=False, drop_last=False)
    print('test dataset size: {}'.format(len(test_dataset)))
    print('test loader size: {}'.format(len(test_dataloader)))
    output_list = pred_alcon(model, test_dataloader, param['device'])
    torch.save(output_list, os.path.join(outdir, 'prediction.pth'))
    pd.DataFrame(output_list).drop('logit', axis=1).sort_values('ID').set_index('ID').to_csv(
        os.path.join(outdir, 'submission.csv'))
def main():
    """Prediction/ensemble runner: for each of 5 folds, load the chosen
    checkpoint(s), compute validation and test logits, report local CV,
    then merge the fold logits into one submission CSV.

    Config is read from ``../params/prediction.yaml``; ``param['sse']``
    switches between snapshot ensembling (several ``best_{mode}_{i}.pth``
    weights averaged) and a single ``best_{mode}.pth`` weight.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)

    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)
    handler_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                     now.hour, now.minute, now.second))

    with open('../params/prediction.yaml', "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date
    EXP_NO = param['exp']
    EXP_NAME = f'exp{EXP_NO}'

    outdir = os.path.join(param['save path'], EXP_NAME, now_date)
    if os.path.exists(param['save path']):
        os.makedirs(outdir, exist_ok=True)
    else:
        print("Not find {}".format(param['save path']))
        raise FileNotFoundError

    file_handler = logging.FileHandler(os.path.join(outdir, 'experiment.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(handler_format)
    logger.addHandler(file_handler)
    logger.debug('============= Prediction =============')
    logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                            now.hour, now.minute, now.second))

    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    local_cv = dict()
    rootdirs = get_root_dir(EXP_NO)
    snap_range = param['snap_range']
    # Scale the batch size with the GPU count (GPU == 0 keeps the base size).
    param['batch size'] = max(param['batch size'],
                              param['batch size'] * param['GPU'])

    # The test loader is shared by every fold; build it once.
    test_dataset = AlconDataset(df=get_test_df(param['tabledir']),
                                augmentation=get_test_augmentation(
                                    *get_resolution(param['resolution'])),
                                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                                mode='test')
    logger.debug('test dataset size: {}'.format(len(test_dataset)))
    test_dataloader = DataLoader(test_dataset, batch_size=param['batch size'],
                                 num_workers=param['thread'], pin_memory=False,
                                 drop_last=False, shuffle=False)
    logger.debug('test loader size: {}'.format(len(test_dataloader)))

    sse = param['sse']
    mode = param['mode']
    if mode not in {'loss', 'acc', '3acc'}:
        # FIX: this previously did `raise "..."` -- raising a plain string
        # is a TypeError in Python 3, masking the real configuration error.
        raise ValueError("you choice mode in loss, acc, 3acc")

    for fold in range(5):
        logger.debug(f'========= FOLD : {fold} =========')
        rootdir = rootdirs[fold]

        # Build the model exactly as in training (amp + DataParallel) so
        # the saved state_dicts load without key remapping.
        model = get_model(EXP_NO)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9,
                                    weight_decay=1e-5, nesterov=False)
        model = model.to(param['device'])
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if param['GPU'] > 0:
            model = nn.DataParallel(model)

        # Set Loader
        valid_dataset = AlconDataset(
            df=get_train_df(param['tabledir']).query('valid == @fold'),
            augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
            datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
            mode='valid', margin_augmentation=False)
        valid_dataloader = DataLoader(valid_dataset, batch_size=param['batch size'],
                                      num_workers=param['thread'], pin_memory=False,
                                      drop_last=False, shuffle=False)
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # Collect ground-truth class indices for the local CV score.
        target_list = list()
        for _, targets, _ in valid_dataloader:
            targets = targets.argmax(dim=2)
            target_list.append(targets)
        target_list = torch.cat(target_list)

        valid_logit_dict = dict()
        test_logit_dict = dict()
        init = True
        if sse:
            # Snapshot ensemble: average logits over snap_range checkpoints.
            mb = master_bar(range(snap_range[0], snap_range[1]))
            n_div = len(range(snap_range[0], snap_range[1]))
            for i in mb:
                print('load weight: {}'.format(
                    os.path.join(rootdir, f'best_{mode}_{i+1}.pth')))
                model.load_state_dict(
                    torch.load(os.path.join(rootdir, f'best_{mode}_{i+1}.pth')))
                logit_alcon_rnn(model, valid_dataloader, param['device'],
                                valid_logit_dict, div=n_div, init=init)
                logit_alcon_rnn(model, test_dataloader, param['device'],
                                test_logit_dict, div=n_div, init=init)
                init = False
        else:
            # Single checkpoint per fold.
            print('load weight: {}'.format(os.path.join(rootdir, f'best_{mode}.pth')))
            model.load_state_dict(
                torch.load(os.path.join(rootdir, f'best_{mode}.pth')))
            logit_alcon_rnn(model, valid_dataloader, param['device'],
                            valid_logit_dict, div=1, init=init)
            logit_alcon_rnn(model, test_dataloader, param['device'],
                            test_logit_dict, div=1, init=init)
            init = False

        # calcurate local score
        pred_list = torch.stack(list(valid_logit_dict.values()))
        pred_list = pred_list.softmax(dim=2)
        local_accuracy = accuracy_three_character(pred_list, target_list)
        logger.debug('LOCAL CV : {:5%}'.format(local_accuracy))
        torch.save(valid_logit_dict, os.path.join(outdir, f'fold{fold}_valid_logit.pth'))
        torch.save(test_logit_dict, os.path.join(outdir, f'fold{fold}_prediction.pth'))
        local_cv['fold{}'.format(fold)] = {
            'accuracy': local_accuracy,
            'valid_size': len(valid_dataset)
        }

        del valid_dataset, valid_dataloader, valid_logit_dict, test_logit_dict, pred_list
        del model, optimizer
        gc.collect()

    # Merge the per-fold validation logits (out-of-fold predictions cover
    # the whole training set) and average the per-fold test logits.
    valid_logits = dict()
    test_logits = dict()
    for fold in range(5):
        path = os.path.join(outdir, f'fold{fold}_valid_logit.pth')
        logits = torch.load(path)
        for k, v in logits.items():
            valid_logits[k] = v
    valid_logits = dict(sorted(valid_logits.items()))
    torch.save(valid_logits, os.path.join(outdir, 'valid_logits.pth'))

    for fold in range(5):
        path = os.path.join(outdir, f'fold{fold}_prediction.pth')
        logits = torch.load(path)
        if fold == 0:
            for k, v in logits.items():
                test_logits[k] = v / 5
        else:
            for k, v in logits.items():
                test_logits[k] += v / 5
    torch.save(test_logits, os.path.join(outdir, 'test_logits.pth'))
    submission_to_df(make_submission(test_logits)).to_csv(
        os.path.join(outdir, 'test_prediction.csv'))
    print('success!')