def get_LR(model, trainloader, optimizer, criterion, device):
    print("########## Tweaked version from fastai ###########")
    lr_find = LRFinder(model, optimizer, criterion, device=device)
    lr_find.range_test(trainloader, end_lr=1, num_iter=100)
    lr_find.plot()  # to inspect the loss-learning rate graph
    lr_find.reset()

    # index of the iteration that reached the best (lowest) loss
    for index in range(len(lr_find.history['loss'])):
        item = lr_find.history['loss'][index]
        if item == lr_find.best_loss:
            min_val_index = index
            print(f"{min_val_index}")

    lr_find.plot(show_lr=lr_find.history['lr'][75])
    lr_find.plot(show_lr=lr_find.history['lr'][min_val_index])

    # take the midpoint between a fixed reference iteration (75) and the best-loss iteration
    val_index = 75
    mid_val_index = math.floor((val_index + min_val_index) / 2)
    show_lr = [
        {'data': lr_find.history['lr'][val_index], 'linestyle': 'dashed'},
        {'data': lr_find.history['lr'][mid_val_index], 'linestyle': 'solid'},
        {'data': lr_find.history['lr'][min_val_index], 'linestyle': 'dashed'},
    ]
    # lr_find.plot_best_lr(skip_start=10, skip_end=5, log_lr=True, show_lr=show_lr, ax=None)

    best_lr = lr_find.history['lr'][mid_val_index]
    print(f"LR to be used: {best_lr}")
    return best_lr
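A minimal, self-contained sketch of how get_LR might be exercised. Everything below except get_LR itself is illustrative: the toy data, the toy model, and the torch-lr-finder and math imports the helper relies on are assumptions, not part of the original snippet.

import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch_lr_finder import LRFinder  # pip install torch-lr-finder

device = "cuda" if torch.cuda.is_available() else "cpu"

# toy classification data, just enough to drive the 100-iteration range test
x = torch.randn(512, 20)
y = torch.randint(0, 4, (512,))
trainloader = DataLoader(TensorDataset(x, y), batch_size=32, shuffle=True)

model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 4)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-7, momentum=0.9)

best_lr = get_LR(model, trainloader, optimizer, criterion, device)

# rebuild the optimizer with the suggested LR before the real training run
optimizer = optim.SGD(model.parameters(), lr=best_lr, momentum=0.9)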
def lr_finder(model, train_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer_ft = optim.Adam(model.parameters(), lr=0.0000001)
    lr_finder = LRFinder(model, optimizer_ft, criterion, device=device)
    lr_finder.range_test(train_loader, end_lr=1, num_iter=1000)
    lr_finder.reset()  # restore the model and optimizer to their initial state
    lr_finder.plot()   # inspect the loss vs. learning-rate curve
def lr_finder(model, optimizer, criterion, trainloader):
    lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
    lr_finder.range_test(trainloader, end_lr=100, num_iter=100, step_mode="exp")
    lr_finder.plot()   # to plot the loss vs. learning-rate curve
    lr_finder.reset()  # to reset the lr_finder
def executeLr_finder(model, optimizer, device, trainloader, criterion):
    # find and plot the best LR
    lr_finder = LRFinder(model, optimizer, criterion, device=device)
    lr_finder.range_test(trainloader, end_lr=100, num_iter=100, step_mode="exp")
    lr_finder.plot()   # to inspect the loss-learning rate graph
    lr_finder.reset()  # to reset the model and optimizer to their initial state
def lr_finder(net, optimizer, loss_fun, trainloader, testloader):
    # Using LRFinder
    lr_finder = LRFinder(net, optimizer, loss_fun, device='cuda')
    lr_finder.range_test(trainloader, val_loader=testloader, start_lr=1e-3, end_lr=0.1,
                         num_iter=100, step_mode='exp')
    lr_finder.plot(log_lr=False)
    lr_finder.reset()  # important: restore the model and optimizer's parameters to their initial state
    return lr_finder.history
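Since this variant hands back the raw lr_finder.history rather than a single number, the caller still has to pick a learning rate from it. The helper below is an illustrative post-processing step, not part of the original code: it takes the LR at the minimum recorded loss and backs off by a factor of 10, a common rule of thumb; the helper name and the skip margins are assumptions.

import numpy as np

def pick_lr_from_history(history, skip_start=10, skip_end=5):
    # trim the noisy head and the divergent tail of the curve
    lrs = np.asarray(history['lr'][skip_start:-skip_end])
    losses = np.asarray(history['loss'][skip_start:-skip_end])
    lr_at_min_loss = lrs[np.argmin(losses)]
    return lr_at_min_loss / 10.0  # stay well below the divergence point

# usage:
# history = lr_finder(net, optimizer, loss_fun, trainloader, testloader)
# suggested_lr = pick_lr_from_history(history)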
def get_LR(model, trainloader, optimizer, criterion, device, testloader=None):
    print("########## Leslie Smith's approach ###########")
    lr_find = LRFinder(model, optimizer, criterion, device=device)
    lr_find.range_test(trainloader, val_loader=testloader, end_lr=1, num_iter=100,
                       step_mode="linear")
    best_lr = lr_find.plot(log_lr=False)
    lr_find.reset()
    return best_lr
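The value returned by this variant is typically used as the peak rate of a one-cycle schedule. A hedged sketch of that hand-off follows, using PyTorch's built-in OneCycleLR; the epoch count and the per-batch training loop are placeholders, and rebuilding the optimizer at best_lr / 10 is an assumption rather than something the original code does.

from torch.optim import SGD
from torch.optim.lr_scheduler import OneCycleLR

best_lr = get_LR(model, trainloader, optimizer, criterion, device, testloader)

epochs = 20  # placeholder
optimizer = SGD(model.parameters(), lr=best_lr / 10, momentum=0.9)
scheduler = OneCycleLR(optimizer, max_lr=best_lr,
                       steps_per_epoch=len(trainloader), epochs=epochs)

for epoch in range(epochs):
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()  # OneCycleLR is stepped once per batch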
def main(args):
    np.random.seed(432)
    torch.random.manual_seed(432)

    try:
        os.makedirs(args.outpath)
    except OSError:
        pass

    experiment_path = utils.get_new_model_path(args.outpath)
    print(experiment_path)

    train_writer = SummaryWriter(os.path.join(experiment_path, 'train_logs'))
    val_writer = SummaryWriter(os.path.join(experiment_path, 'val_logs'))
    trainer = train.Trainer(train_writer, val_writer)

    # todo: add config
    train_transform = data.build_preprocessing()
    eval_transform = data.build_preprocessing()

    trainds, evalds = data.build_dataset(args.datadir, None)
    trainds.transform = train_transform
    evalds.transform = eval_transform

    model = models.resnet34()
    opt = torch.optim.Adam(model.parameters(), lr=1e-8)

    trainloader = DataLoader(trainds, batch_size=args.batch_size, shuffle=True,
                             num_workers=8, pin_memory=True)
    evalloader = DataLoader(evalds, batch_size=args.batch_size, shuffle=False,
                            num_workers=16, pin_memory=True)

    # find lr (fastai-style range test)
    criterion = torch.nn.BCEWithLogitsLoss()
    lr_finder = LRFinder(model, opt, criterion, device="cuda")
    # lr_finder.range_test(trainloader, val_loader=evalloader, end_lr=1, num_iter=10, step_mode="exp")
    lr_finder.range_test(trainloader, end_lr=100, num_iter=100, step_mode="exp")

    # plot the range-test curve (fastai-style)
    skip_start = 6
    skip_end = 3
    lrs = lr_finder.history["lr"]
    losses = lr_finder.history["loss"]
    grad_norm = lr_finder.history["grad_norm"]
    # ind = grad_norm.index(min(grad_norm))
    # opt_lr = lrs[ind]
    # print('LR with min grad_norm =', opt_lr)
    lrs = lrs[skip_start:-skip_end]
    losses = losses[skip_start:-skip_end]

    fig = plt.figure(figsize=(12, 9))
    plt.plot(lrs, losses)
    plt.xscale("log")
    plt.xlabel("Learning rate")
    plt.ylabel("Loss")
    train_writer.add_figure('loss_vs_lr', fig)

    lr_finder.reset()

    # fixed_lr = 1e-3
    fixed_lr = 3e-4
    opt = torch.optim.Adam(model.parameters(), lr=fixed_lr)

    # # new
    # lr = 1e-3
    # eta_min = 1e-5
    # t_max = 10
    # opt = torch.optim.Adam(model.parameters(), lr=lr)
    # scheduler = CosineAnnealingLR(opt, T_max=t_max, eta_min=eta_min)
    # # new

    # one cycle for 5 epochs
    # scheduler = CosineAnnealingLR(opt, 519*4, eta_min=1e-4)
    scheduler = CosineAnnealingLR(opt, args.epochs)
    # scheduler = CosineAnnealingLR(opt, 519, eta_min=1e-5)
    # scheduler = StepLR(opt, step_size=3, gamma=0.1)

    state_list = []
    for epoch in range(args.epochs):
        # t = epoch / args.epochs
        # lr = np.exp((1 - t) * np.log(lr_begin) + t * np.log(lr_end))
        # set the lr for all parameter groups
        trainer.train_epoch(model, opt, trainloader, fixed_lr, scheduler)
        # trainer.train_epoch(model, opt, trainloader, 3e-4, scheduler)
        # trainer.train_epoch(model, opt, trainloader, 9.0451e-4, scheduler)
        metrics = trainer.eval_epoch(model, evalloader)

        state = dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=opt.state_dict(),
            loss=metrics['loss'],
            lwlrap=metrics['lwlrap'],
            global_step=trainer.global_step,
        )
        state_copy = copy.deepcopy(state)
        state_list.append(state_copy)
        export_path = os.path.join(experiment_path, 'last.pth')
        torch.save(state, export_path)

    # save the best checkpoint (highest lwlrap)
    best_export_path = os.path.join(experiment_path, 'best.pth')
    max_lwlrap = 0
    max_lwlrap_ind = 0
    for i in range(args.epochs):
        if state_list[i]['lwlrap'] > max_lwlrap:
            max_lwlrap = state_list[i]['lwlrap']
            max_lwlrap_ind = i

    best_state = state_list[max_lwlrap_ind]
    torch.save(best_state, best_export_path)
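main() above only plots the trimmed curve; if an automatic suggestion is wanted, one option (not in the original script) is the fastai-style heuristic of taking the LR where the loss falls fastest. A small sketch under that assumption, reusing the lrs/losses lists already built in main():

import numpy as np

def suggest_lr(lrs, losses):
    lrs = np.asarray(lrs)
    losses = np.asarray(losses)
    grads = np.gradient(losses)          # finite-difference slope of the loss curve
    return float(lrs[np.argmin(grads)])  # LR at the steepest descent

# e.g. inside main(), right after the skip_start/skip_end trimming:
# print('suggested lr:', suggest_lr(lrs, losses))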
def train_loop(folds, fold):

    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds, transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor,
                                          patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr,
                                          last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1,
                                                    eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CustomModel(CFG.model_name, pretrained=False)
    model = torch.nn.DataParallel(model)
    model.load_state_dict(
        torch.load(f'{CFG.model_name}_student_fold{fold}_best_score.pth',
                   map_location=torch.device('cpu'))['model'])
    # model.load_state_dict(torch.load(f'0.9647/{CFG.model_name}_no_hflip_fold{fold}_best_score.pth', map_location=torch.device('cpu'))['model'])
    model.to(device)

    # criterion = nn.BCEWithLogitsLoss()
    criterion = FocalLoss(alpha=1, gamma=6)
    # optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    optimizer = SGD(model.parameters(), lr=1e-2, weight_decay=CFG.weight_decay, momentum=0.9)

    find_lr = False
    if find_lr:
        from lr_finder import LRFinder
        lr_finder = LRFinder(model, optimizer, criterion, device=device)
        lr_finder.range_test(train_loader, start_lr=1e-2, end_lr=1e0,
                             num_iter=100, accumulation_steps=1)
        fig_name = f'{CFG.model_name}_lr_finder.png'
        lr_finder.plot(fig_name)
        lr_finder.reset()
        return

    scheduler = get_scheduler(optimizer)

    swa_model = torch.optim.swa_utils.AveragedModel(model)
    swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=1e-3)
    swa_start = 9

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval (run before the scheduler step so ReduceLROnPlateau can use avg_val_loss)
        avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion, device)

        if epoch > swa_start:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            if isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(avg_val_loss)
            elif isinstance(scheduler, CosineAnnealingLR):
                scheduler.step()
            elif isinstance(scheduler, CosineAnnealingWarmRestarts):
                scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s'
        )
        LOGGER.info(
            f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}'
        )

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict()},
                       OUTPUT_DIR + f'{CFG.model_name}_no_hflip_fold{fold}_best_score.pth')

        # if avg_val_loss < best_loss:
        #     best_loss = avg_val_loss
        #     LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
        #     torch.save({'model': model.state_dict(),
        #                 'preds': preds},
        #                OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')

    # SWA: recompute BN statistics, then evaluate and save the averaged model
    torch.optim.swa_utils.update_bn(train_loader, swa_model)
    avg_val_loss, preds, _ = valid_fn(valid_loader, swa_model, criterion, device)
    score, scores = get_score(valid_labels, preds)
    LOGGER.info(f'Save swa Score: {score:.4f} Model')
    torch.save({'model': swa_model.state_dict()},
               OUTPUT_DIR + f'swa_{CFG.model_name}_fold{fold}_{score:.4f}.pth')

    # if CFG.nprocs != 8:
    #     check_point = torch.load(OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
    #     for c in [f'pred_{c}' for c in CFG.target_cols]:
    #         valid_folds[c] = np.nan
    #     try:
    #         valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']
    #     except:
    #         pass

    return
def main(args):
    # load train data into ram
    # data_path = '/mntlong/lanl_comp/data/'
    file_dir = os.path.dirname(__file__)
    data_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'data'))
    train_info_path = os.path.join(data_path, 'train_info.csv')
    train_data_path = os.path.join(data_path, 'train_compressed.npz')

    train_info = pd.read_csv(train_info_path, index_col='Unnamed: 0')
    train_info['exp_len'] = train_info['indx_end'] - train_info['indx_start']

    train_signal = np.load(train_data_path)['signal']
    train_quaketime = np.load(train_data_path)['quake_time']

    # use the last two waves (parts of the experiment) for validation
    val_start_idx = train_info.iloc[-2, :]['indx_start']

    val_signal = train_signal[val_start_idx:]
    val_quaketime = train_quaketime[val_start_idx:]

    train_signal = train_signal[:val_start_idx]
    train_quaketime = train_quaketime[:val_start_idx]

    # training params
    large_ws = 1500000
    overlap_size = int(large_ws * 0.5)
    small_ws = 150000
    num_bins = 17

    cpc_meta_model = models.CPCv1(out_size=num_bins - 1)

    # logs_path = '/mntlong/scripts/logs/'
    logs_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'logs'))
    current_datetime = datetime.today().strftime('%b-%d_%H-%M-%S')
    log_writer_path = os.path.join(logs_path, 'runs',
                                   current_datetime + '_' + args.model_name)

    train_dataset = data.SignalCPCDataset(
        train_signal, train_quaketime,
        num_bins=num_bins,
        idxs_wave_end=train_info['indx_end'].values,
        large_ws=large_ws, overlap_size=overlap_size, small_ws=small_ws)
    val_dataset = data.SignalCPCDataset(
        val_signal, val_quaketime,
        num_bins=num_bins,
        idxs_wave_end=train_info['indx_end'].values,
        large_ws=large_ws, overlap_size=overlap_size, small_ws=small_ws)

    print('x_t size:', train_dataset[0][0].size())

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=5,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=5,
                            pin_memory=True)

    if args.find_lr:
        from lr_finder import LRFinder
        optimizer = optim.Adam(cpc_meta_model.parameters(), lr=1e-6)
        lr_find = LRFinder(cpc_meta_model, optimizer, criterion=None,
                           is_cpc=True, device='cuda')
        lr_find.range_test(train_loader, end_lr=2, num_iter=75, step_mode='exp')
        best_lr = lr_find.get_best_lr()
        lr_find.plot()
        lr_find.reset()
        print('best lr found: {:.2e}'.format(best_lr))
    else:
        best_lr = 3e-4

    # sys.exit()

    # model_path = os.path.join(logs_path, 'cpc_no_target_head_cont_last_state.pth')
    # cpc_meta_model.load_state_dict(torch.load(model_path)['model_state_dict'])
    # cpc_meta_model.to(torch.device('cuda'))

    optimizer = optim.Adam(cpc_meta_model.parameters(), lr=best_lr)
    # optimizer.load_state_dict(torch.load(model_path)['optimizer_state_dict'])
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5,
                                                    patience=3, threshold=0.005)

    log_writer = SummaryWriter(log_writer_path)

    utils.train_cpc_model(cpc_meta_model=cpc_meta_model,
                          optimizer=optimizer,
                          num_bins=num_bins,
                          lr_scheduler=lr_sched,
                          train_loader=train_loader,
                          val_loader=val_loader,
                          num_epochs=args.num_epochs,
                          model_name=args.model_name,
                          logs_path=logs_path,
                          log_writer=log_writer)
def main(args):
    # load train data into ram
    # data_path = '/mntlong/lanl_comp/data/'
    file_dir = os.path.dirname(__file__)
    data_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'data'))
    train_info_path = os.path.join(data_path, 'train_info.csv')
    train_data_path = os.path.join(data_path, 'train_compressed.npz')

    train_info = pd.read_csv(train_info_path, index_col='Unnamed: 0')
    train_info['exp_len'] = train_info['indx_end'] - train_info['indx_start']

    train_signal = np.load(train_data_path)['signal']
    train_quaketime = np.load(train_data_path)['quake_time']

    # use the last two waves (parts of the experiment) for validation
    val_start_idx = train_info.iloc[-2, :]['indx_start']

    val_signal = train_signal[val_start_idx:]
    val_quaketime = train_quaketime[val_start_idx:]

    train_signal = train_signal[:val_start_idx]
    train_quaketime = train_quaketime[:val_start_idx]

    # training params
    window_size = 150000
    overlap_size = int(window_size * 0.5)
    num_bins = 17

    model = models.BaselineNetRawSignalCnnRnnV1(out_size=num_bins - 1)
    loss_fn = nn.CrossEntropyLoss()  # L1Loss() SmoothL1Loss() MSELoss()

    # logs_path = '/mntlong/scripts/logs/'
    logs_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'logs'))
    current_datetime = datetime.today().strftime('%b-%d_%H-%M-%S')
    log_writer_path = os.path.join(logs_path, 'runs',
                                   current_datetime + '_' + args.model_name)

    train_dataset = data.SignalDataset(train_signal, train_quaketime,
                                       num_bins=num_bins,
                                       idxs_wave_end=train_info['indx_end'].values,
                                       window_size=window_size,
                                       overlap_size=overlap_size)
    val_dataset = data.SignalDataset(val_signal, val_quaketime,
                                     num_bins=num_bins,
                                     idxs_wave_end=train_info['indx_end'].values,
                                     window_size=window_size,
                                     overlap_size=overlap_size)

    print('wave size:', train_dataset[0][0].size())

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=5,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=5,
                            pin_memory=True)

    if args.find_lr:
        from lr_finder import LRFinder
        optimizer = optim.Adam(model.parameters(), lr=1e-6)
        lr_find = LRFinder(model, optimizer, loss_fn, device='cuda')
        lr_find.range_test(train_loader, end_lr=1, num_iter=50, step_mode='exp')
        best_lr = lr_find.get_best_lr()
        lr_find.plot()
        lr_find.reset()
        print('best lr found: {:.2e}'.format(best_lr))
    else:
        best_lr = 3e-4

    optimizer = optim.Adam(model.parameters(), lr=best_lr)  # weight_decay=0.1
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5,
                                                    patience=3, threshold=0.005)

    log_writer = SummaryWriter(log_writer_path)

    utils.train_clf_model(model=model,
                          optimizer=optimizer,
                          lr_scheduler=lr_sched,
                          train_loader=train_loader,
                          val_loader=val_loader,
                          num_epochs=args.num_epochs,
                          model_name=args.model_name,
                          logs_path=logs_path,
                          log_writer=log_writer,
                          loss_fn=loss_fn,
                          num_bins=num_bins)
import os

from torch import optim
from torch.nn import CrossEntropyLoss, MSELoss

from lr_finder import LRFinder
from src.model_lib.MultiFTNet import MultiFTNet
from src.model_lib.MiniFASNet import MiniFASNetV1, MiniFASNetV2, MiniFASNetV1SE, MiniFASNetV2SE
from src.utility import get_kernel
from src.data_io.dataset_loader import get_train_loader, get_eval_loader
from src.default_config import get_default_config, update_config
from train import parse_args

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

kernel_size = get_kernel(80, 60)
model = MultiFTNet(conv6_kernel=kernel_size)

cls_criterion = CrossEntropyLoss()
FT_criterion = MSELoss()

# optimizer = optim.SGD(model.parameters(),
#                       lr=0.1,
#                       weight_decay=5e-4,
#                       momentum=0.9)
optimizer = optim.AdamW(model.parameters())

# note: this local LRFinder is constructed with both the classification and the FT criteria
lr_finder = LRFinder(model, optimizer, cls_criterion, FT_criterion)

conf = get_default_config()
args = parse_args()
conf = update_config(args, conf)

trainloader = get_train_loader(conf)
val_loader = get_eval_loader(conf)

lr_finder.range_test(trainloader, end_lr=1, num_iter=100, step_mode="linear")
lr_finder.plot(log_lr=False)
lr_finder.reset()