def is_finite(s):
    """Return True if `s` parses as a float that is neither NaN nor infinite.

    Any value that cannot be converted to float (None, non-numeric strings,
    arbitrary objects) yields False rather than raising.
    """
    import math  # local import: the module-level math aliases live elsewhere in this file

    try:
        # math.isfinite covers both the NaN and the +/-inf cases that the
        # original checked separately via isnan / abs(f) == INFINITY.
        return math.isfinite(float(s))
    except Exception:
        # unparseable input — deliberately broad, matching original contract
        return False
def is_number(s):
    """Return True if `s` is numeric: convertible to float and not NaN.

    Booleans and None are explicitly rejected (bool would otherwise convert
    to 0.0 / 1.0). Infinity counts as a number, matching the original.
    """
    import math  # local import: the module-level math aliases live elsewhere in this file

    # bools are ints in Python; reject them before the float() conversion.
    # `is None` instead of `== None` — identity check is the correct idiom.
    if s is True or s is False or s is None:
        return False
    try:
        # do not rebind the parameter; convert into a fresh local
        value = float(s)
        return not math.isnan(value)
    except Exception:
        return False
def SUM(values):
    """Sum the members of `values`, ignoring None/Null and float NaN.

    Returns Null when the iterable yields no usable values. Comparisons use
    `== None` deliberately: the project's Null sentinel compares equal to None.
    """
    total = Null
    for item in values:
        # skip missing values (Null compares equal to None)
        if item == None:
            continue
        # skip float NaN entries
        if isinstance(item, float) and math_isnan(item):
            continue
        if total == None:
            # first usable value seeds the accumulator
            total = item
        else:
            total += item
    return total
def PRODUCT(values, *others):
    """Multiply the members of `values`, ignoring None/Null and float NaN.

    Returns Null when the iterable yields no usable values. Passing extra
    positional arguments is a hard error (legacy calling convention).
    Comparisons use `== None` deliberately: Null compares equal to None.
    """
    if len(others) > 0:
        # legacy var-args form was removed; fail loudly
        from mo_logs import Log
        Log.error("no longer accepting args, use a single list")
    product = Null
    for item in values:
        # skip missing values (Null compares equal to None)
        if item == None:
            continue
        # skip float NaN entries
        if isinstance(item, float) and math_isnan(item):
            continue
        if product == None:
            # first usable value seeds the accumulator
            product = item
        else:
            product *= item
    return product
def is_nan(s):
    """Return True when `s` is missing (None/Null) or a float NaN.

    Uses `== None` deliberately: the project's Null sentinel compares equal
    to None but is not the None singleton. Non-float, non-missing inputs are
    passed straight to math_isnan (which may raise for non-numeric types,
    same as the original).
    """
    if s == None:
        return True
    return math_isnan(s)
def train(model_dir=None, params='default', data_dir='data', epochs=15,
          batch_size=500, retrain=None, train_steps=None, test_steps=None,
          debug_mode=False):
    """Train a model for `epochs` epochs, evaluating after each one.

    :param model_dir: where training results are stored (when debug_mode=False);
        if None, a directory named from the current date/time is created
    :param params: dict with train and feature params;
        if params == 'default', take params from params.py
    :param data_dir: dir with: npy/ , data.csv
    :param epochs: number of training epochs
    :param batch_size: batch size for both train and test loaders
    :param retrain: path/to/model.pt that we need to re-train
    :param train_steps: how many batches to run per epoch; if None, all batches
    :param test_steps: how many test batches to run after each epoch;
        if None, the whole test set
    :param debug_mode: if True, run without saving model, summary and logs
    """
    # get train params
    if params == 'default':
        params = parametres  # see params.py

    if not debug_mode:
        if not model_dir:
            # create model_dir from the current timestamp
            model_dir = datetime.now().strftime("%b%d-%H:%M_run")
            if retrain:
                model_dir = model_dir.replace('run', 'retrain')
        os.makedirs(os.path.join(model_dir, 'saves'))
        print('Model will store in: {}'.format(model_dir), flush=True)
        # -model_dir/saves
        # -model_dir/train.log
        # -model_dir/test.log
        # -model_dir/test.csv
        # -model_dir/train.csv
    else:
        print('Debug mode. No saves and no logs')

    # logging
    if not debug_mode:
        logfile = os.path.join(model_dir, 'train.log')
        # FIX: original referenced undefined name `log_file` (NameError)
        print('\nTrain logs to: {}\n'.format(logfile), flush=True)
    else:
        logfile = None  # logs to console
    if logging.getLogger().hasHandlers():
        # a logger already exists — redirect it to our file
        change_logger(logging, logfile)
    else:
        logging.basicConfig(filename=logfile, format="%(message)s",
                            level=logging.INFO)

    # info about parametres
    logging.info('Parametres:\n {}\n'.format(params))

    # split train and test Sets
    logging.info('Split train and test Sets...')
    train_csv, test_csv = split_train_test(data_dir, model_dir)

    # load train data
    # FIX: locals renamed from `train`/`test` — the originals shadowed this
    # function and the imported `test` function, so the `test(...)` call
    # below would have invoked a DataLoader instead of the test routine.
    train_set = AudioDataset(train_csv, data_dir, params)
    input_shape = train_set.get_input_shape()
    logging.info('Input shape: {}\n'.format(input_shape))
    train_sampler = BucketingSampler(train_set, batch_size)
    train_loader = DataLoaderCuda(train_set, collate_fn=collate_audio,
                                  batch_sampler=train_sampler)

    # load test data
    test_set = AudioDataset(test_csv, data_dir, params)
    test_sampler = BucketingSampler(test_set, batch_size)
    test_loader = DataLoaderCuda(test_set, collate_fn=collate_audio,
                                 batch_sampler=test_sampler)

    # init model
    model = model_init(params, train=True, model_path=retrain,
                       use_cuda=True, logger=logging)

    # select optimizer
    if params['opt'] == 'Adam':
        # FIX: original used undefined name `lr`; take it from params
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'],
                                     weight_decay=params['weight_decay'])
    else:
        raise Exception('No optimizer: {}'.format(params['opt']))

    # reduce learning rate every `lr_reduce_ep` epochs
    scheduler = StepLR(optimizer, step_size=params['lr_reduce_ep'], gamma=0.1)

    # summary writer
    if not debug_mode:
        log_dir = os.path.join(params['logdir'], model_dir)
        writer_train = SummaryWriter(log_dir=os.path.join(log_dir, 'train'))
        writer_test = SummaryWriter(log_dir=os.path.join(log_dir, 'test'))
        writer_train.add_graph(model, torch.rand(1, *input_shape))
        logging.info('Logs for this model restored at {}'.format(log_dir))
    else:
        writer_test = writer_train = None

    loss = torch.nn.CrossEntropyLoss()

    # n batches
    # FIX: original tested undefined `train_step` instead of `train_steps`
    n_train = train_steps if train_steps else len(train_loader)
    n_test = test_steps if test_steps else len(test_loader)
    k = round(n_train / n_test)  # summary-writing interval (in train steps)

    # train and test step counters
    train_step, test_step = 0, 0
    # best_metric init
    best_loss = 1000

    for ep in range(1, epochs + 1):
        logging.info('\n-------------- {} epoch --------------'.format(ep))
        print('{}/{} Epoch...'.format(ep, epochs))
        model.train()
        train_loader.shuffle(ep)
        for i, (x, target) in enumerate(train_loader):
            optimizer.zero_grad()  # clear gradients from the previous step
            logits, probs = model(x)
            # logits - before activation (for loss)
            # probs - after activation (for acc)

            # CrossEntropy loss
            output = loss(logits, target)  # graph node (for backward)
            loss_value = output.item()  # plain Python float

            # in case of learning crash
            # FIX: original also called tensor.isnan() — `tensor` was
            # undefined and loss_value is already a float
            if math_isnan(loss_value):
                message = 'Loss is nan on {} train step. Learning crash!'.format(
                    train_step)
                logging.info(message)
                print(message)
                return

            # accuracy
            acc_value = accuracy(probs, target)

            # summary
            if not debug_mode and train_step % k == 0:
                writer_train.add_scalar('Loss/steps', loss_value, train_step)
                writer_train.add_scalar('Accuracy/steps', acc_value, train_step)

            # backprop: computes w.grad for every model parameter;
            # weights are NOT updated here
            output.backward()
            clip_grad_norm_(model.parameters(), params['grad_norm'])  # prevent exploding gradient
            # weight update: w_new = w_old - lr * w.grad
            optimizer.step()

            logging.info('| Epoch {}: {}/{} | Loss {:.3f} | Acc {:.2f}'.format(
                ep, i + 1, n_train, loss_value, acc_value))
            train_step += 1
            # interrupt
            if train_steps and i + 1 == train_steps:
                break

        scheduler.step()
        new_lr = float(optimizer.param_groups[0]['lr'])
        logging.info('Updated learning rate: {}'.format(new_lr))

        # saving: model_dir/saves/ep_1.pt
        # FIX: guard against model_dir being None in debug mode
        save_name = (os.path.join(model_dir, 'saves', 'ep_{}.pt'.format(ep))
                     if model_dir else None)
        if not debug_mode:
            save_weights(model, save_name, train_step)

        logging.info('\n------------- Test ---------------')
        # test logger setup
        if not debug_mode:
            test_logfile = os.path.join(model_dir, 'test.log')
            change_logger(logging, test_logfile)
            logging.info('Test results to: {}'.format(test_logfile))

        avg_metrics = test(model=model, model_path=save_name, params=params,
                           data_test=test_loader, data_dir=data_dir,
                           test_csv=test_csv, writer=writer_test,
                           step=test_step, batch_size=batch_size,
                           total_steps=test_steps, use_tb=not debug_mode,
                           logfile=logging)

        # FIX: original unpacked the mapping without .items() (ValueError:
        # avg_metrics is indexed as a dict below) and its loop variable `k`
        # clobbered the summary interval computed above
        message = ''
        for metric_name, metric_value in avg_metrics.items():
            message += '{}: {}\n'.format(metric_name, metric_value)
        # check whether it's the best metrics
        # FIX: append instead of overwrite, so the metrics are still logged
        if avg_metrics['loss'] < best_loss:
            best_loss = avg_metrics['loss']
            message += 'New best results'
        logging.info(message)
        print(message)

    if not debug_mode:
        writer_train.close()
        writer_test.close()