import argparse
import os

# `cfg` and `Trainer` are project-level imports in the original module.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cfg", type=str, default='experiments/db_resnet50_s1.yml',
                        help="config file")
    parser.add_argument("--start", type=int, default=0, help="start iter")
    parser.add_argument("--gpu", type=int, default=0, help="gpu id")
    args, _ = parser.parse_known_args()
    # Pin training to the requested GPU before CUDA is initialized.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    cfg.merge_from_file(args.cfg)
    trainer = Trainer(cfg)
    trainer.train(args.start)
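# A minimal entry-point sketch (assumption; the snippet above does not show
# how main() is invoked): the conventional __main__ guard.
if __name__ == '__main__':
    main()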
import os

import numpy as np
import pandas as pd
import torch

# util, GWNet, Trainer, progress_bar, eval_, calc_tstep_metrics and summary
# are project-level imports in the original module.
def main(args, **model_kwargs):
    device = torch.device(args.device)
    # The same batch size is used for the train, validation and test loaders.
    data = util.load_dataset(args.data, args.batch_size, args.batch_size, args.batch_size,
                             n_obs=args.n_obs, fill_zeroes=args.fill_zeroes)
    scaler = data['scaler']
    aptinit, supports = util.make_graph_inputs(args, device)
    model = GWNet.from_args(args, device, supports, aptinit, **model_kwargs)
    if args.checkpoint:
        model.load_checkpoint(torch.load(args.checkpoint))
    model.to(device)
    engine = Trainer.from_args(model, scaler, args)
    metrics = []
    best_model_save_path = os.path.join(args.save, 'best_model.pth')
    lowest_mae_yet = 100  # high value, will get overwritten
    mb = progress_bar(list(range(1, args.epochs + 1)))
    epochs_since_best_mae = 0
    for _ in mb:
        train_loss, train_mape, train_rmse = [], [], []
        data['train_loader'].shuffle()
        for i, (x, y) in enumerate(data['train_loader'].get_iterator()):
            trainx = torch.Tensor(x).to(device).transpose(1, 3)
            trainy = torch.Tensor(y).to(device).transpose(1, 3)
            yspeed = trainy[:, 0, :, :]
            if yspeed.max() == 0:
                continue  # skip batches whose targets are all zero
            mae, mape, rmse = engine.train(trainx, yspeed)
            train_loss.append(mae)
            train_mape.append(mape)
            train_rmse.append(rmse)
            if args.n_iters is not None and i >= args.n_iters:
                break
        engine.scheduler.step()
        _, valid_loss, valid_mape, valid_rmse = eval_(data['val_loader'], device, engine)
        m = pd.Series(dict(train_loss=np.mean(train_loss), train_mape=np.mean(train_mape),
                           train_rmse=np.mean(train_rmse), valid_loss=np.mean(valid_loss),
                           valid_mape=np.mean(valid_mape), valid_rmse=np.mean(valid_rmse)))
        metrics.append(m)
        # Checkpoint on the best validation MAE; otherwise count toward early stopping.
        if m.valid_loss < lowest_mae_yet:
            torch.save(engine.model.state_dict(), best_model_save_path)
            lowest_mae_yet = m.valid_loss
            epochs_since_best_mae = 0
        else:
            epochs_since_best_mae += 1
        met_df = pd.DataFrame(metrics)
        mb.comment = (f'best val_loss: {met_df.valid_loss.min():.3f}, '
                      f'current val_loss: {m.valid_loss:.3f}, '
                      f'current train loss: {m.train_loss:.3f}')
        met_df.round(6).to_csv(f'{args.save}/metrics.csv')
        if epochs_since_best_mae >= args.es_patience:
            break
    # Metrics on test data
    engine.model.load_state_dict(torch.load(best_model_save_path))
    realy = torch.Tensor(data['y_test']).transpose(1, 3)[:, 0, :, :].to(device)
    test_met_df, yhat = calc_tstep_metrics(engine.model, device, data['test_loader'],
                                           scaler, realy, args.seq_length)
    test_met_df.round(6).to_csv(os.path.join(args.save, 'test_metrics.csv'))
    print(summary(args.save))
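# Hedged sketch (assumption, not the project's actual code) of the `eval_`
# helper called above: it assumes `engine.eval` mirrors `engine.train` and
# returns per-batch (mae, mape, rmse); the first return value stands in for
# predictions, which the caller discards.
def eval_(loader, device, engine):
    valid_loss, valid_mape, valid_rmse = [], [], []
    for x, y in loader.get_iterator():
        valx = torch.Tensor(x).to(device).transpose(1, 3)
        valy = torch.Tensor(y).to(device).transpose(1, 3)
        mae, mape, rmse = engine.eval(valx, valy[:, 0, :, :])
        valid_loss.append(mae)
        valid_mape.append(mape)
        valid_rmse.append(rmse)
    return None, valid_loss, valid_mape, valid_rmse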
print(f"LOGGING EVERY {config.LOG_EVERY} EPOCHS\n") epochs = config.EPOCHS model.to(config.DEVICE) # initialie `Trainer` if resuming training if args['resume_training'] == 'yes': if args['model_path'] == None: sys.exit('\nPLEASE PROVIDE A MODEL TO RESUME TRAINING FROM!') trainer = Trainer(model, train_data_loader, train_dataset, valid_data_loader, valid_dataset, config.CLASSES_TO_TRAIN, epochs, config.DEVICE, config.LR, args['resume_training'], model_path=args['model_path']) # initialie `Trainer` if training from beginning else: trainer = Trainer(model, train_data_loader, train_dataset, valid_data_loader, valid_dataset, config.CLASSES_TO_TRAIN, epochs, config.DEVICE, config.LR, args['resume_training']) trained_epochs = trainer.get_num_epochs() epochs_to_train = epochs - trained_epochs
def make_trainer(model, args):
    # `args` is accepted for interface compatibility but is currently unused;
    # the Trainer is constructed from the model alone.
    return Trainer(model)
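# Illustrative usage (hypothetical; `model` and `args` are supplied by the
# caller, and the no-argument train() call is an assumption about Trainer):
trainer = make_trainer(model, args)
trainer.train()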
parser.add_argument("--pre_trained_disc", default=None) parser.add_argument("--pre_trained_gen", default=None) parser.add_argument("--dataset", default="flowers") parser.add_argument("--split", default=0, type=int) parser.add_argument("--batch_size", default=64, type=int) parser.add_argument("--num_workers", default=8, type=int) parser.add_argument("--epochs", default=200, type=int) args = parser.parse_args() trainer = Trainer( type=args.type, dataset=args.dataset, split=args.split, lr=args.lr, diter=args.diter, vis_screen=args.vis_screen, save_path=args.save_path, l1_coef=args.l1_coef, l2_coef=args.l2_coef, pre_trained_disc=args.pre_trained_disc, pre_trained_gen=args.pre_trained_gen, batch_size=args.batch_size, num_workers=args.num_workers, epochs=args.epochs, ) if not args.inference: trainer.train(args.cls) else: trainer.predict()
import os

import numpy as np
import pandas as pd
import torch

# util, StaticNet, LSTMNet, GWNet, Trainer, progress_bar, eval_,
# evaluate_multiple_horizon, calc_tstep_metrics and summary are project-level
# imports in the original module.
def main(args, **model_kwargs):
    # Train on a subset of sensors (faster for isolated prediction):
    # incl_sensors = list(range(207))  # [17, 111, 12, 80, 200]
    # args.num_sensors = len(incl_sensors)
    device = torch.device(args.device)
    data = util.load_dataset(args.data, args.batch_size, args.batch_size, args.batch_size,
                             n_obs=args.n_obs, fill_zeroes=args.fill_zeroes)
    scaler = data['scaler']
    supports = []
    aptinit = 0
    # aptinit, supports = util.make_graph_inputs(args, device)
    # Derive the prediction length and sensor count from the validation split.
    args.seq_length = data['y_val'].shape[1]
    args.num_sensors = data['x_val'].shape[2]
    if args.static:
        print('Selected static prediction')
        model = StaticNet.from_args(args, device, supports, aptinit, **model_kwargs)
    elif args.lstm:
        print('Selected LSTM-FC model')
        args.nhid = 256
        args.weight_decay = 0.0005
        args.learning_rate = 0.001
        model = LSTMNet.from_args(args, device, supports, aptinit, **model_kwargs)
    else:
        print('Selected Graph Wavenet model')
        model = GWNet.from_args(args, device, supports, aptinit, **model_kwargs)
    print(args)
    if args.checkpoint:
        model.load_checkpoint(torch.load(args.checkpoint))
    model.to(device)
    engine = Trainer.from_args(model, scaler, args)
    metrics = []
    best_model_save_path = os.path.join(args.save, 'best_model.pth')
    lowest_mae_yet = 100  # high value, will get overwritten
    mb = progress_bar(list(range(1, args.epochs + 1)))
    epochs_since_best_mae = 0
    ep_count = 1
    for _ in mb:
        train_loss, train_mape, train_rmse = [], [], []
        data['train_loader'].shuffle()
        for i, (x, y) in enumerate(data['train_loader'].get_iterator()):
            trainx = torch.Tensor(x).to(device).transpose(1, 3)
            trainy = torch.Tensor(y).to(device).transpose(1, 3)
            # print(trainx.shape, trainy.shape)
            yspeed = trainy[:, 0, :, :]
            if yspeed.max() == 0:
                continue  # skip batches whose targets are all zero
            mae, mape, rmse = engine.train(trainx, yspeed)
            train_loss.append(mae)
            train_mape.append(mape)
            train_rmse.append(rmse)
            print('MAPE', mape)
            if i % 10 == 5:  # WARN: runs a full multi-horizon evaluation every 10 iterations
                evaluate_multiple_horizon(engine.model, device, data, args.seq_length)
            if args.n_iters is not None and i >= args.n_iters:
                break
        engine.scheduler.step()
        print('EPOCH', ep_count)
        ep_count += 1
        _, valid_loss, valid_mape, valid_rmse = eval_(data['val_loader'], device, engine)
        m = pd.Series(dict(train_loss=np.mean(train_loss), train_mape=np.mean(train_mape),
                           train_rmse=np.mean(train_rmse), valid_loss=np.mean(valid_loss),
                           valid_mape=np.mean(valid_mape), valid_rmse=np.mean(valid_rmse)))
        metrics.append(m)
        # Checkpoint on the best validation MAE; otherwise count toward early stopping.
        if m.valid_loss < lowest_mae_yet:
            torch.save(engine.model.state_dict(), best_model_save_path)
            lowest_mae_yet = m.valid_loss
            epochs_since_best_mae = 0
        else:
            epochs_since_best_mae += 1
        met_df = pd.DataFrame(metrics)
        mb.comment = (f'best val_loss: {met_df.valid_loss.min():.3f}, '
                      f'current val_loss: {m.valid_loss:.3f}, '
                      f'current train loss: {m.train_loss:.3f}')
        met_df.round(6).to_csv(f'{args.save}/metrics.csv')
        if epochs_since_best_mae >= args.es_patience:
            break
    # Metrics on test data
    engine.model.load_state_dict(torch.load(best_model_save_path))
    realy = torch.Tensor(data['y_test']).transpose(1, 3)[:, 0, :, :].to(device)
    test_met_df, yhat = calc_tstep_metrics(engine.model, device, data['test_loader'],
                                           scaler, realy, args.seq_length)
    test_met_df.round(6).to_csv(os.path.join(args.save, 'test_metrics.csv'))
    print(summary(args.save))
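# Hedged sketch (assumption; the real helper lives elsewhere in the project)
# of evaluate_multiple_horizon: per-horizon validation MAE, assuming model(x)
# returns [batch, seq_length, sensors, 1] to match the transpose convention
# above. The real metrics likely also apply scaler.inverse_transform, which
# this sketch omits for brevity.
def evaluate_multiple_horizon(model, device, data, seq_length):
    model.eval()
    errs = {h: [] for h in range(seq_length)}
    with torch.no_grad():
        for x, y in data['val_loader'].get_iterator():
            valx = torch.Tensor(x).to(device).transpose(1, 3)
            valy = torch.Tensor(y).to(device).transpose(1, 3)[:, 0, :, :]
            pred = model(valx).transpose(1, 3)[:, 0, :, :]
            for h in range(seq_length):
                errs[h].append((pred[..., h] - valy[..., h]).abs().mean().item())
    for h in range(seq_length):
        print(f'horizon {h + 1}: val MAE {np.mean(errs[h]):.3f}')
    model.train()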