import json
import os
import sys
import time

import ray

# The remaining names used below (get_args, utils, VecEnv, PPO, OnPolicyActor,
# GAEBuffer, Scheme, Learner, get_feature_extractor and the pybullet env
# factories) come from the project's own modules; their imports are omitted here.


def main():

    args = get_args()
    utils.cleanup_log_dir(args.log_dir)
    args_dict = vars(args)
    json.dump(args_dict, open(os.path.join(args.log_dir, "training_arguments.json"), "w"), indent=4)

    # Connect to an existing Ray cluster or start a local one
    if args.cluster:
        ray.init(address="auto")
    else:
        ray.init()

    # Print the resources Ray can see
    resources = ""
    for k, v in ray.cluster_resources().items():
        resources += "{} {}, ".format(k, v)
    print(resources[:-2], flush=True)

    # 1. Define Train Vector of Envs
    train_envs_factory, action_space, obs_space = VecEnv.create_factory(
        vec_env_size=args.num_env_processes, log_dir=args.log_dir,
        env_fn=pybullet_train_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_skip": args.frame_skip,
            "frame_stack": args.frame_stack})

    # 2. Define Test Vector of Envs (Optional)
    test_envs_factory, _, _ = VecEnv.create_factory(
        vec_env_size=args.num_env_processes, log_dir=args.log_dir,
        env_fn=pybullet_test_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_skip": args.frame_skip,
            "frame_stack": args.frame_stack})

    # 3. Define RL training algorithm
    algo_factory = PPO.create_factory(
        lr=args.lr, eps=args.eps, num_epochs=args.ppo_epoch,
        clip_param=args.clip_param, entropy_coef=args.entropy_coef,
        value_loss_coef=args.value_loss_coef, max_grad_norm=args.max_grad_norm,
        num_mini_batch=args.num_mini_batch,
        use_clipped_value_loss=args.use_clipped_value_loss, gamma=args.gamma)

    # 4. Define RL Policy
    actor_factory = OnPolicyActor.create_factory(
        obs_space, action_space,
        feature_extractor_network=get_feature_extractor(args.nn),
        recurrent_policy=args.recurrent_policy,
        restart_model=args.restart_model)

    # 5. Define rollouts storage
    storage_factory = GAEBuffer.create_factory(
        size=args.num_steps, gae_lambda=args.gae_lambda)

    # 6. Define scheme
    params = {}

    # add core modules
    params.update({
        "algo_factory": algo_factory,
        "actor_factory": actor_factory,
        "storage_factory": storage_factory,
        "train_envs_factory": train_envs_factory,
        "test_envs_factory": test_envs_factory,
    })

    # add collection specs
    params.update({
        "num_col_workers": args.num_col_workers,
        "col_communication": args.com_col_workers,
        "col_worker_resources": {"num_cpus": 1, "num_gpus": 0.5},
        "sync_col_specs": {"fraction_samples": 1.0, "fraction_workers": 1.0}
    })

    # add gradient specs
    params.update({
        "num_grad_workers": args.num_grad_workers,
        "grad_communication": args.com_grad_workers,
        "grad_worker_resources": {"num_cpus": 1.0, "num_gpus": 0.5},
    })

    scheme = Scheme(**params)

    # 7. Define learner
    learner = Learner(scheme, target_steps=args.num_env_steps, log_dir=args.log_dir)

    # 8. Define train loop
    iterations = 0
    start_time = time.time()
    while not learner.done():

        learner.step()

        if iterations % args.log_interval == 0:
            learner.print_info()

        if iterations % args.save_interval == 0:
            save_name = learner.save_model()
            args_dict.update({"latest_model": save_name})
            args_path = os.path.join(args.log_dir, "training_arguments.json")
            json.dump(args_dict, open(args_path, "w"), indent=4)

        if args.max_time != -1 and (time.time() - start_time) > args.max_time:
            break

        iterations += 1

    print("Finished!")
    sys.exit()


if __name__ == "__main__":
    main()
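# The script above relies on a get_args() helper that is not shown in this
# example. The function below is a minimal, hypothetical sketch of such a
# helper, written with argparse and covering only the flags the PPO script
# actually reads. Flag names mirror the attributes accessed on `args`; the
# default values are illustrative assumptions, not the project's settings.
import argparse


def get_args():
    parser = argparse.ArgumentParser(description="PPO PyBullet training arguments (sketch)")

    # Logging and infrastructure
    parser.add_argument("--log-dir", default="/tmp/pybullet_ppo", help="directory for logs and checkpoints")
    parser.add_argument("--cluster", action="store_true", help="connect to an existing Ray cluster")
    parser.add_argument("--max-time", type=int, default=-1, help="wall-clock budget in seconds, -1 to disable")
    parser.add_argument("--log-interval", type=int, default=1, help="iterations between log prints")
    parser.add_argument("--save-interval", type=int, default=100, help="iterations between model saves")

    # Environment
    parser.add_argument("--env-id", default="HalfCheetahBulletEnv-v0", help="gym environment id")
    parser.add_argument("--frame-skip", type=int, default=0)
    parser.add_argument("--frame-stack", type=int, default=1)
    parser.add_argument("--num-env-processes", type=int, default=16, help="envs per vectorized env")
    parser.add_argument("--num-env-steps", type=int, default=10_000_000, help="total environment steps")

    # PPO hyperparameters
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--eps", type=float, default=1e-8, help="optimizer epsilon")
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--gae-lambda", type=float, default=0.95)
    parser.add_argument("--ppo-epoch", type=int, default=4)
    parser.add_argument("--clip-param", type=float, default=0.2)
    parser.add_argument("--entropy-coef", type=float, default=0.01)
    parser.add_argument("--value-loss-coef", type=float, default=0.5)
    parser.add_argument("--max-grad-norm", type=float, default=0.5)
    parser.add_argument("--num-mini-batch", type=int, default=4)
    parser.add_argument("--use-clipped-value-loss", action="store_true")
    parser.add_argument("--num-steps", type=int, default=2048, help="rollout length stored in the GAE buffer")

    # Actor network
    parser.add_argument("--nn", default="MLP", help="feature extractor name passed to get_feature_extractor")
    parser.add_argument("--recurrent-policy", action="store_true")
    parser.add_argument("--restart-model", default=None, help="checkpoint to resume from")

    # Distributed scheme
    parser.add_argument("--num-col-workers", type=int, default=1)
    parser.add_argument("--com-col-workers", default="synchronous")
    parser.add_argument("--num-grad-workers", type=int, default=1)
    parser.add_argument("--com-grad-workers", default="synchronous")

    return parser.parse_args()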
import json
import os
import sys
import time

import ray

# The remaining names used below (get_args, utils, VecEnv, SAC, OffPolicyActor,
# ReplayBuffer, Scheme, Learner, get_feature_extractor and the atari env
# factories) come from the project's own modules; their imports are omitted here.


def main():

    args = get_args()
    utils.cleanup_log_dir(args.log_dir)
    args_dict = vars(args)
    json.dump(args_dict, open(os.path.join(args.log_dir, "training_arguments.json"), "w"), indent=4)

    # Connect to an existing Ray cluster or start a local one
    if args.cluster:
        ray.init(address="auto")
    else:
        ray.init()

    # Print the resources Ray can see
    resources = ""
    for k, v in ray.cluster_resources().items():
        resources += "{} {}, ".format(k, v)
    print(resources[:-2], flush=True)

    # 1. Define Train Vector of Envs
    train_envs_factory, action_space, obs_space = VecEnv.create_factory(
        env_fn=atari_train_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_stack": args.frame_stack},
        vec_env_size=args.num_env_processes, log_dir=args.log_dir,
        info_keywords=('rr', 'rrr', 'lives'))

    # 2. Define Test Vector of Envs (Optional)
    test_envs_factory, _, _ = VecEnv.create_factory(
        env_fn=atari_test_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_stack": args.frame_stack},
        vec_env_size=args.num_env_processes, log_dir=args.log_dir)

    # 3. Define RL training algorithm
    algo_factory = SAC.create_factory(
        lr_pi=args.lr, lr_q=args.lr, lr_alpha=args.lr, initial_alpha=args.alpha,
        gamma=args.gamma, polyak=args.polyak, num_updates=args.num_updates,
        update_every=args.update_every, start_steps=args.start_steps,
        mini_batch_size=args.mini_batch_size)

    # 4. Define RL Policy
    actor_factory = OffPolicyActor.create_factory(
        obs_space, action_space,
        feature_extractor_network=get_feature_extractor(args.nn),
        recurrent_policy=args.recurrent_policy,
        restart_model=args.restart_model)

    # 5. Define rollouts storage
    storage_factory = ReplayBuffer.create_factory(size=args.buffer_size)

    # 6. Define scheme
    params = {}

    # add core modules
    params.update({
        "algo_factory": algo_factory,
        "actor_factory": actor_factory,
        "storage_factory": storage_factory,
        "train_envs_factory": train_envs_factory,
        "test_envs_factory": test_envs_factory,
    })

    # add collection specs
    params.update({
        "num_col_workers": args.num_col_workers,
        "col_communication": args.com_col_workers,
        "col_worker_resources": {"num_cpus": 1, "num_gpus": 0.5},
        "sync_col_specs": {"fraction_samples": 1.0, "fraction_workers": 1.0}
    })

    # add gradient specs
    params.update({
        "num_grad_workers": args.num_grad_workers,
        "grad_communication": args.com_grad_workers,
        "grad_worker_resources": {"num_cpus": 1.0, "num_gpus": 0.5},
    })

    scheme = Scheme(**params)

    # 7. Define learner
    learner = Learner(scheme, target_steps=args.num_env_steps, log_dir=args.log_dir)

    # 8. Define train loop
    iterations = 0
    start_time = time.time()
    while not learner.done():

        learner.step()

        if iterations % args.log_interval == 0:
            learner.print_info()

        if iterations % args.save_interval == 0:
            save_name = learner.save_model()
            args_dict.update({"latest_model": save_name})
            args_path = os.path.join(args.log_dir, "training_arguments.json")
            json.dump(args_dict, open(args_path, "w"), indent=4)

        if args.max_time != -1 and (time.time() - start_time) > args.max_time:
            break

        iterations += 1

    print("Finished!")
    sys.exit()


if __name__ == "__main__":
    main()
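# Both scripts record their configuration, plus the path returned by the most
# recent learner.save_model() call, in <log_dir>/training_arguments.json. The
# helper below is a small illustrative sketch (not part of the library) showing
# how that file can be read back later, e.g. to locate the latest checkpoint.
import json
import os


def load_latest_checkpoint(log_dir):
    """Return (args_dict, latest_model_path) recorded in training_arguments.json."""
    with open(os.path.join(log_dir, "training_arguments.json")) as f:
        train_args = json.load(f)
    # "latest_model" only appears after the first save_interval iteration has run.
    return train_args, train_args.get("latest_model")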