def main():
    """Train a VOC segmentation model with Ray's TorchTrainer and checkpoint
    every epoch.

    NOTE(review): this variant takes no parameters and relies on module-level
    globals — `args`, `parser`, `voc`, `COCO_ROOT`, `model_creator`,
    `data_creator`, `optimizer_creator`, `SegOperator` — presumably set up by
    an argparse block elsewhere in the file; confirm against module top level.
    A second `main(args)` defined later in this file will shadow this one.
    """
    # Validate arguments BEFORE any side effects: parser.error() exits the
    # process, and the original order created args.output_dir first even when
    # the arguments were rejected.
    if args.dataset_root == COCO_ROOT:
        parser.error('Must specify dataset if specifying dataset_root')

    os.makedirs(args.output_dir, exist_ok=True)
    print(args)

    cfg = voc  # dataset config; only the VOC config is wired up here
    start_time = time.time()
    config = {"args": args, "num_workers": args.num_workers, "cfg": cfg}
    trainer = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        training_operator_cls=SegOperator,
        use_tqdm=True,
        use_fp16=False,
        num_workers=config["num_workers"],
        config=config,
        use_gpu=torch.cuda.is_available(),
    )
    for epoch in range(args.epochs):
        trainer.train()
        # Checkpoint the full trainer state (plus epoch and args) each epoch.
        state_dict = trainer.state_dict()
        state_dict.update(epoch=epoch, args=args)
        torch.save(
            state_dict,
            os.path.join(args.output_dir, f"model_{epoch}.pth"),
        )
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def main(args):
    """Train a segmentation model with Ray's TorchTrainer, validating and
    checkpointing after every epoch.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``output_dir``, ``num_workers`` and ``epochs``
        (presumably produced by the file's argument parser — confirm).

    Side effects: creates ``args.output_dir``, writes ``model_<epoch>.pth``
    files into it, and prints per-epoch confusion matrices and total time.
    """
    os.makedirs(args.output_dir, exist_ok=True)
    print(args)

    start_time = time.time()
    config = {"args": args, "num_workers": args.num_workers}
    trainer = TorchTrainer(
        training_operator_cls=SegOperator,
        use_tqdm=True,
        use_fp16=True,
        num_workers=config["num_workers"],
        config=config,
        use_gpu=torch.cuda.is_available(),
    )
    try:
        for epoch in range(args.epochs):
            trainer.train()
            # reduce_results=False returns one result per worker; report the
            # first worker's confusion matrix.
            confmat = trainer.validate(reduce_results=False)[0]
            print(confmat)
            state_dict = trainer.state_dict()
            state_dict.update(epoch=epoch, args=args)
            torch.save(
                state_dict,
                os.path.join(args.output_dir, f"model_{epoch}.pth"),
            )
    finally:
        # Fix: the worker group was never shut down, leaking Ray actors if
        # training raised (or even on clean exit). Always release them.
        trainer.shutdown()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
# NOTE(review): incomplete fragment — this chunk begins in the middle of the
# argument list of a `TorchTrainer(...)`-style call whose opening (and the
# `trainer1 = ...` assignment, plus the `if` matching the `else:` below) lies
# outside this view, so the code is left byte-identical. What is visible:
# a 100-iteration train loop printing `trainer1.train()` stats and
# `trainer1.validate()`, then saving "checkpoint.pt" (full trainer state) and
# "invariant_checkpoint.pt" (bare model state_dict); the `else:` branch
# instead rebuilds a 2-50-1 Tanh MLP and reloads it from
# "invariant_checkpoint.pt" on CPU. The trailing `# trained weight: ...`
# comments look like recorded experiment results — TODO: confirm and consider
# moving them to a log/README; also remove the commented-out
# `trainer1.shutdown()` once confirmed unnecessary.
num_workers=1, use_gpu=True, config={ "lr": 1e-2, # used in optimizer_creator "hidden_size": 1, # used in model_creator "batch_size": 1024, # used in data_creator "path": path1, # path to load the agent nn }, backend="auto", scheduler_step_freq="epoch") for i in range(100): stats = trainer1.train() print(stats) print(trainer1.validate()) torch.save(trainer1.state_dict(), "checkpoint.pt") torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt") m = trainer1.get_model() print(f"trained weight: torch.tensor([[{m[0].weight.data.cpu().numpy()[0][0]},{m[0].weight.data.cpu().numpy()[0][1]}]]), bias: torch.tensor({m[0].bias.data.cpu().numpy()})") # trainer1.shutdown() print("success!") else: m = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh()) checkpoint = torch.load("invariant_checkpoint.pt", torch.device("cpu")) m.load_state_dict(checkpoint) # trained weight: [[0.0018693 0.05228069]], bias: [-0.5533147] , train_loss = 0.0 # trained weight: [[-0.01369903 0.03511396]], bias: [-0.6535952] , train_loss = 0.0 # trained weight: [[0.00687088 0.26634103]], bias: [-0.6658108] , train_loss = 0.0 # trained weight: torch.tensor([[0.038166143000125885,0.16197167336940765]]), bias: torch.tensor([-2.3122551]) # %% m.cpu()
# NOTE(review): incomplete fragment — begins mid-argument-list of an unseen
# `TorchTrainer(...)`-style call and ends mid-call (the final
# `invariant_model.load_state_dict(` is unclosed and continues past this
# view), so the code is left byte-identical. What is visible: a 50-iteration
# train/validate loop, then checkpoints saved under `utils.get_save_dir()`
# ("checkpoint.pt" full state, "retrained_agent.pt" for model index [0]);
# `trainer1.get_model()` is unpacked into (agent_model, invariant_model).
# The `else:` branch instead converts a Ray policy to a sequential net,
# reloads "retrained_agent.pt" into it, and rebuilds a 2-50-1 Tanh MLP as the
# invariant model — presumably mirroring the training branch's pair; confirm
# against the continuation of this call.
config={ "lr": 1e-2, # used in optimizer_creator "hidden_size": 1, # used in model_creator "batch_size": 1024, # used in data_creator "path": path1, # path to load the agent nn "path_invariant": path_invariant, # the path to the invariant network }, backend="auto", scheduler_step_freq="epoch") for i in range(50): stats = trainer1.train() print(stats) print(trainer1.validate()) torch.save(trainer1.state_dict(), os.path.join(utils.get_save_dir(), "checkpoint.pt")) torch.save(trainer1.get_model()[0].state_dict(), os.path.join(utils.get_save_dir(), "retrained_agent.pt")) agent_model, invariant_model = trainer1.get_model() else: sequential_nn = convert_ray_policy_to_sequential(policy).cpu() sequential_nn.load_state_dict( torch.load(os.path.join(utils.get_save_dir(), "retrained_agent.pt"))) agent_model = sequential_nn invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh()) invariant_model.load_state_dict(