def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']
    expl_env = env_producer(domain, seed, goal)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Get producer functions for the policy and Q-value networks.
    # The Q-networks use a fixed, wide architecture; only the policy
    # width is controlled by variant['layer_size'].
    M = variant['layer_size']
    q_producer = get_q_producer(
        obs_dim, action_dim,
        hidden_sizes=[1024, 1024, 1024, 1024, 1024, 1024, 1024])
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=[M, M])
    # Finished getting producers

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal,
        policy_producer)

    expl_path_collector = MdpPathCollector(
        expl_env,
    )

    replay_buffer = ReplayBuffer(
        variant['replay_buffer_size'],
        ob_space=expl_env.observation_space,
        action_space=expl_env.action_space)

    trainer = SACTrainer(
        policy_producer,
        q_producer,
        action_space=expl_env.action_space,
        **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    # When resuming, restore the exploration collector, the remote
    # evaluation collector (including its RNG state), the replay buffer,
    # the trainer, and the global RNG state from the previous snapshot.
    if prev_exp_state is not None:
        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    # Resume from the epoch after the last completed one, or start at 0.
    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    algorithm.train(start_epoch)
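
# A minimal usage sketch for the experiment() above. The concrete values
# below (domain name, goal, network width, buffer size, and the empty kwargs
# dicts) are illustrative assumptions, not settings taken from this
# repository's launch scripts; only the dictionary keys are the ones the
# function actually reads from `variant`.
if __name__ == '__main__':
    # Note: ray is assumed to have been initialised by the launcher before
    # RemoteMdpPathCollector.remote(...) is created inside experiment().
    example_variant = dict(
        domain='ant',                 # assumed domain label for env_producer
        seed=0,
        goal=None,                    # assumed; forwarded to env_producer
        layer_size=256,               # M: width of the policy hidden layers
        replay_buffer_size=int(1e6),
        trainer_kwargs=dict(),        # forwarded to SACTrainer
        optimistic_exp=dict(),        # forwarded as optimistic_exp_hp
        algorithm_kwargs=dict(),      # forwarded to BatchRLAlgorithm
    )
    experiment(example_variant)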
def set_global_pkg_rng_state(self, state):
    # Restore the global RNG state inside this remote worker by delegating
    # to the module-level set_global_pkg_rng_state helper.
    set_global_pkg_rng_state(state)
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    num_parallel = variant['num_parallel']
    custom_initialization = variant['custom_initialization']

    expl_env = parallel_gibson_env_producer(num_env=num_parallel)
    # expl_env = parallel_gibson_stadium_env_producer(num_env=num_parallel)

    # obs_dim = expl_env.observation_space.low.size
    observation_space = expl_env.observation_space
    action_dim = expl_env.action_space.low.size

    # Get producer function for policy and value functions
    q_producer = get_q_producer(observation_space, action_dim,
                                custom_initialization)
    policy_producer = get_policy_producer(observation_space, action_dim,
                                          custom_initialization)
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1,
        policy_producer,
        max_num_epoch_paths_saved=1)

    expl_path_collector = MdpPathCollector(
        expl_env,
        max_num_epoch_paths_saved=1,
    )

    replay_buffer = ReplayBuffer(
        variant['replay_buffer_size'],
        ob_space=expl_env.observation_space,
        action_space=expl_env.action_space)

    trainer = SACTrainer(
        policy_producer,
        q_producer,
        action_space=expl_env.action_space,
        **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    if prev_exp_state is not None:
        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    algorithm.train(start_epoch)
def experiment(variant, prev_exp_state=None):
    domain = variant['domain']
    seed = variant['seed']
    expl_env = env_producer(domain, seed)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    # Override the env-derived dimensions with hard-coded values for the
    # supported domains.
    obs_dim, action_dim = {
        'GridGoal1': (2, 2),
        'GridGoal2': (2, 2),
        'GridGoal3': (2, 2),

        'AntEscape': (29, 8),
        'AntJump': (29, 8),
        'AntNavigate': (29, 8),

        'HumanoidUp': (47, 17),
    }[domain]

    # Get producer functions for the policy and Q-value networks
    M = variant['layer_size']
    q_producer = get_q_producer(obs_dim, action_dim, hidden_sizes=[M, M])
    policy_producer = get_policy_producer(obs_dim, action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producers

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1,
        policy_producer)

    expl_path_collector = MdpPathCollector(
        expl_env,
    )

    replay_buffer = ReplayBuffer(
        variant['replay_buffer_size'],
        ob_dim=obs_dim,
        ac_dim=action_dim)

    trainer = SACTrainer(
        policy_producer,
        q_producer,
        action_space=expl_env.action_space,
        **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        log_dir=variant['log_dir'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    # When resuming, restore all stateful components from the previous
    # snapshot before training continues.
    if prev_exp_state is not None:
        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = prev_exp_state['epoch'] + \
        1 if prev_exp_state is not None else 0

    algorithm.train(start_epoch)
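
# Hypothetical invocation of the experiment() above, including a resume path.
# The domain must be one of the keys in the hard-coded dimension table
# ('GridGoal1'-'GridGoal3', 'AntEscape', 'AntJump', 'AntNavigate',
# 'HumanoidUp'); every other value below is a placeholder assumption, not a
# setting taken from this repository.
example_variant = dict(
    domain='AntEscape',
    seed=0,
    layer_size=256,
    replay_buffer_size=int(1e6),
    log_dir='./logs/AntEscape_seed0',   # assumed path, forwarded to BatchRLAlgorithm
    trainer_kwargs=dict(),
    optimistic_exp=dict(),
    algorithm_kwargs=dict(),
)

# Fresh run:
# experiment(example_variant)
#
# Resuming: pass a snapshot dict whose keys match the ones read in the
# restore branch above ('exploration', 'evaluation_remote',
# 'evaluation_remote_rng_state', 'replay_buffer', 'trainer',
# 'global_pkg_rng_state', 'epoch'):
# experiment(example_variant, prev_exp_state=saved_state)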