def train(model_dir=None, params='default', data_dir='data', epochs=15,
          batch_size=500, retrain=None, train_steps=None, test_steps=None,
          debug_mode=False):
    """Train a model and evaluate it on the test set after every epoch.

    :param model_dir: where to store training results (when debug_mode=False);
        if None, a timestamped directory name is generated
    :param params: dict with train and feature params;
        if 'default', taken from params.py
    :param data_dir: directory with: npy/ , data.csv
    :param epochs: number of training epochs
    :param batch_size: batch size for the train and test loaders
    :param retrain: path/to/model.pt that we need to re-train, or None
    :param train_steps: how many batches to run per epoch; if None, all batches
    :param test_steps: how many test batches to run after each epoch;
        if None, the whole test set
    :param debug_mode: if True, run without saving model, summary and logs
    """
    # get train params
    if params == 'default':
        params = parametres  # see params.py

    if not debug_mode:
        if not model_dir:
            # create model_dir from the current date/time
            model_dir = datetime.now().strftime("%b%d-%H:%M_run")
            if retrain:
                model_dir = model_dir.replace('run', 'retrain')
        os.makedirs(os.path.join(model_dir, 'saves'))
        print('Model will store in: {}'.format(model_dir), flush=True)
        # -model_dir/saves
        # -model_dir/train.log
        # -model_dir/test.log
        # -model_dir/test.csv
        # -model_dir/train.csv
    else:
        print('Debug mode. No saves and no logs')

    # logging
    if not debug_mode:
        logfile = os.path.join(model_dir, 'train.log')
        # FIX: was `log_file` (NameError) — the variable is `logfile`
        print('\nTrain logs to: {}\n'.format(logfile), flush=True)
    else:
        logfile = None  # logs to console
    if logging.getLogger().hasHandlers():
        # a logger already exists: redirect it to the new logfile
        change_logger(logging, logfile)
    else:
        logging.basicConfig(filename=logfile, format="%(message)s",
                            level=logging.INFO)

    # info about parametres
    logging.info('Parametres:\n {}\n'.format(params))

    # split train and test sets
    # NOTE(review): in debug mode model_dir may be None here — confirm
    # split_train_test accepts that
    logging.info('Split train and test Sets...')
    train_csv, test_csv = split_train_test(data_dir, model_dir)

    # load train data
    # FIX: the loaders were bound to `train`/`test`, shadowing this function
    # and the test() function below, which made the later `test(...)` call
    # invoke the DataLoader instead — renamed to train_loader/test_loader
    train_set = AudioDataset(train_csv, data_dir, params)
    input_shape = train_set.get_input_shape()
    logging.info('Input shape: {}\n'.format(input_shape))
    train_sampler = BucketingSampler(train_set, batch_size)
    train_loader = DataLoaderCuda(train_set, collate_fn=collate_audio,
                                  batch_sampler=train_sampler)

    # load test data
    test_set = AudioDataset(test_csv, data_dir, params)
    test_sampler = BucketingSampler(test_set, batch_size)
    test_loader = DataLoaderCuda(test_set, collate_fn=collate_audio,
                                 batch_sampler=test_sampler)

    # init model
    model = model_init(params, train=True, model_path=retrain,
                       use_cuda=True, logger=logging)

    # select optimizer
    if params['opt'] == 'Adam':
        # FIX: `lr` was undefined — take the learning rate from params
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'],
                                     weight_decay=params['weight_decay'])
    else:
        raise Exception('No optimizer: {}'.format(params['opt']))

    # reduce learning rate every params['lr_reduce_ep'] epochs
    scheduler = StepLR(optimizer, step_size=params['lr_reduce_ep'], gamma=0.1)

    # summary writers
    if not debug_mode:
        log_dir = os.path.join(params['logdir'], model_dir)
        writer_train = SummaryWriter(log_dir=os.path.join(log_dir, 'train'))
        writer_test = SummaryWriter(log_dir=os.path.join(log_dir, 'test'))
        writer_train.add_graph(model, torch.rand(1, *input_shape))
        logging.info('Logs for this model restored at {}'.format(log_dir))
    else:
        writer_test = writer_train = None

    loss = torch.nn.CrossEntropyLoss()

    # number of batches per epoch
    # FIX: the condition referenced undefined `train_step`; must be `train_steps`
    n_train = train_steps if train_steps else len(train_loader)
    n_test = test_steps if test_steps else len(test_loader)
    # write a train summary point roughly once per test batch
    summary_every = max(1, round(n_train / n_test))

    # global train and test step counters
    train_step, test_step = 0, 0
    # best-metric init
    best_loss = 1000

    for ep in range(1, epochs + 1):
        logging.info('\n-------------- {} epoch --------------'.format(ep))
        print('{}/{} Epoch...'.format(ep, epochs))
        model.train()
        train_loader.shuffle(ep)
        for i, (x, target) in enumerate(train_loader):
            optimizer.zero_grad()  # reset gradients from the previous step
            # logits - before activation (for loss)
            # probs  - after activation (for acc)
            logits, probs = model(x)

            # CrossEntropy loss
            output = loss(logits, target)  # graph node (for backward)
            loss_value = output.item()     # plain float

            # in case of learning crash
            # FIX: `tensor.isnan(...)` used an undefined name, and loss_value
            # is already a float — math_isnan alone is the correct check
            if math_isnan(loss_value):
                message = 'Loss is nan on {} train step. Learning crash!'.format(
                    train_step)
                logging.info(message)
                print(message)
                return

            # accuracy
            acc_value = accuracy(probs, target)

            # summary
            if not debug_mode and train_step % summary_every == 0:
                writer_train.add_scalar('Loss/steps', loss_value, train_step)
                writer_train.add_scalar('Accuracy/steps', acc_value, train_step)

            # backprop: computes w.grad for every model parameter w;
            # weights are NOT updated here
            output.backward()
            clip_grad_norm_(model.parameters(), params['grad_norm'])  # prevent exploding gradient
            # weight update: w_new = w_old - lr * w.grad
            optimizer.step()

            logging.info('| Epoch {}: {}/{} | Loss {:.3f} | Acc {:.2f}'.format(
                ep, i + 1, n_train, loss_value, acc_value))
            train_step += 1

            # interrupt
            if train_steps and i + 1 == train_steps:
                break

        scheduler.step()
        new_lr = float(optimizer.param_groups[0]['lr'])
        logging.info('Updated learning rate: {}'.format(new_lr))

        # saving: model_dir/saves/ep_1.pt
        # FIX: save_name was built from model_dir (None in debug mode) even
        # though nothing is saved then — test() works on the in-memory model
        if not debug_mode:
            save_name = os.path.join(model_dir, 'saves', 'ep_{}.pt'.format(ep))
            save_weights(model, save_name, train_step)
        else:
            save_name = None

        logging.info('\n------------- Test ---------------')
        # test logger setup
        if not debug_mode:
            test_logfile = os.path.join(model_dir, 'test.log')
            change_logger(logging, test_logfile)
            logging.info('Test results to: {}'.format(test_logfile))

        # FIX: kwargs matched to test()'s actual signature
        # (`steps`, not `total_steps`; `logger`, not `logfile`)
        avg_metrics = test(model=model, model_path=save_name, params=params,
                           data_test=test_loader, data_dir=data_dir,
                           test_csv=test_csv, writer=writer_test,
                           step=test_step, batch_size=batch_size,
                           steps=test_steps, use_tb=not debug_mode,
                           logger=logging)
        # advance the global test step so next epoch's summary continues
        test_step += n_test

        if not debug_mode:
            # FIX: restore the train logger so the next epoch logs to
            # train.log again instead of test.log
            change_logger(logging, logfile)

        # FIX: iterating a dict yields keys only — use .items();
        # also stop shadowing the summary-interval variable `k`
        message = ''
        for key, value in avg_metrics.items():
            message += '{}: {}\n'.format(key, value)

        # check whether it's the best metrics
        if avg_metrics['loss'] < best_loss:
            best_loss = avg_metrics['loss']
            # FIX: append instead of overwriting the metrics message
            message += 'New best results'
        logging.info(message)
        print(message)

    if not debug_mode:
        writer_train.close()
        writer_test.close()
def test(model=None, model_path=None, params='default', data_test=None,
         data_dir='data', test_csv='data/data_test.csv', writer=None, step=0,
         batch_size=50, steps=None, use_tb=False, logger=False):
    """Run the model on the Test Dataset and return average metrics.

    :param model: torch model. If None it will be init from models.py
    :param model_path: path/to/model.pt. if None it will be loaded
        from params['restore']
    :param params: dict with all required params.
        if 'default' it will be loaded from params.py
    :param data_test: torch DataLoader. if None it will be loaded from test_csv
    :param data_dir: path/to/npy/
    :param test_csv: path/to/data_test.csv
    :param writer: existing SummaryWriter, or None to create one here
    :param step: global step at which the SummaryWriter starts
    :param batch_size: batch size when the loader is built here
    :param steps: how many test batches to calc; if None, the whole set
    :param use_tb: save summary graph or not
    :param logger: True: log to <model_path>_test.log
                   False: log to console
                   otherwise: an already-configured logger object/module
    :return: dict of average metrics, including key 'loss'
    """
    if params == 'default':
        params = parametres
    if model_path is None:
        model_path = params['restore']
        assert model_path is not None, 'if default params used, model .pt must be defined'

    # FIX: `model_name` was never defined — derive it from model_path
    # (path without extension), used for log and summary paths
    model_name = os.path.splitext(model_path)[0] if model_path else 'model'

    # logging
    if isinstance(logger, bool):
        if logger:
            # ../saves/ep_20_test.log
            logfile = model_name + '_test.log'
            # FIX: was `log_file` (NameError) — the variable is `logfile`
            print('\nlogs to: {}\n'.format(logfile))
        else:
            logfile = None
        logging.basicConfig(filename=logfile, format="%(message)s",
                            level=logging.INFO)
        # FIX: getLogger was referenced, not called
        logger = logging.getLogger()
    elif hasattr(logger, 'info'):
        # FIX: `module` was an undefined name — accept anything that can log
        # (the logging module itself, or a Logger instance)
        pass
    else:
        raise Exception('logger must be bool or module')

    # log info about data
    logger.info('Info about data: {} \n -data_dir: {}\n'.format(
        test_csv, data_dir))
    # log info about features
    # FIX: .format() had no argument (IndexError at runtime) — log the params,
    # which carry the feature configuration
    logger.info('Info about features:\n -{}'.format(params))

    # test data
    if data_test:
        test_loader = data_test
    else:
        # FIX: pass params, consistent with how train() builds its dataset
        dataset = AudioDataset(test_csv, data_dir, params)
        sampler = BucketingSampler(dataset, batch_size=batch_size)
        test_loader = DataLoaderCuda(dataset, collate_fn=collate_audio,
                                     batch_sampler=sampler)

    # summary writer
    if use_tb and writer is None:
        summary_dir = os.path.join('dev/test_logs', model_name)
        logger.info('Summary writer: {}\n'.format(summary_dir))
        writer = SummaryWriter(log_dir=summary_dir, purge_step=step)
        close_tb = True  # we own this writer, close it at the end
    else:
        # writer belongs to the caller (or tensorboard is off)
        close_tb = False

    # init
    if model is None:
        model = model_init(params, model_path=model_path, train=True,
                           use_cuda=True, logger=logging)

    loss = torch.nn.CrossEntropyLoss()
    n_test = len(test_loader)
    test_loader.shuffle(43)
    sum_loss = 0
    n_batches = 0  # batches actually processed (may be < n_test if `steps` set)
    metrics = Metrics(acc=True, another_metrics=False)
    for x, target in test_loader:
        with torch.no_grad():
            logits, probs = model(x)
        # loss
        loss_value = loss(logits, target).item()
        sum_loss += loss_value
        # FIX: `acc_value` was never assigned in this loop
        # NOTE(review): assumes Metrics.__call__ returns the batch accuracy —
        # confirm against the Metrics implementation
        acc_value = metrics(probs, target)

        # summary
        # FIX: `plot` was undefined — this is gated by use_tb
        if use_tb:
            writer.add_scalar('Loss/steps', loss_value, step)
            writer.add_scalar('Accuracy/steps', acc_value, step)

        logger.info('{}/{}: Test loss {:.3f} | Test acc {:.2f}'.format(
            step + 1, n_test, loss_value, acc_value))

        n_batches += 1
        step += 1
        # interrupt
        # FIX: was undefined `total_steps` — the parameter is `steps`; count
        # processed batches so a non-zero incoming `step` doesn't break it
        if steps and n_batches == steps:
            break

    # get average metrics
    # FIX: divide by the number of batches actually run, not the full set size
    # (the old n_test divisor under-reported the loss on early interrupt)
    avg_loss = sum_loss / max(n_batches, 1)
    avg_metrics = metrics.get_avg()
    avg_metrics['loss'] = avg_loss

    # FIX: iterating a dict yields keys only — use .items()
    message = ''
    for key, value in avg_metrics.items():
        message += '{}: {}\n'.format(key, value)

    # Summary
    logger.info('{:-^10}'.format('Average Metrics'))
    logger.info(message)

    if use_tb and close_tb:
        writer.close()
    return avg_metrics