import numpy as np

# NormalizedEnv, StepReplayBuffer, PathReplayBuffer, TensorboardLogger,
# TanhGaussian, Gaussian, dense, GoalConditioned, SAC, DDPG, PPO, PolicyAgent,
# HierarchyAgent, LocalSaver, ParallelSampler, and maybe_initialize_process
# are assumed to be imported from the surrounding package.


def hierarchy_sac(variant,
                  env_class,
                  env_kwargs=None,
                  observation_key="observation"):
    # initialize tensorflow and the multiprocessing interface
    maybe_initialize_process()

    # run an experiment with multiple agents
    if env_kwargs is None:
        env_kwargs = {}

    # initialize the environment to track the cardinality of actions
    env = NormalizedEnv(env_class, **env_kwargs)
    action_dim = env.action_space.low.size
    observation_dim = env.observation_space.spaces[observation_key].low.size

    # create a replay buffer to store data
    replay_buffer = StepReplayBuffer(max_num_steps=variant["max_num_steps"])

    # create a logging instance
    logger = TensorboardLogger(replay_buffer, variant["logging_dir"])

    # a dict to store models for saving to the disk
    models_dict = {}

    # build a hierarchical agent that uses sac
    levels = []
    for level in range(variant["num_hierarchy_levels"]):

        # create a policy for this level of the hierarchy
        policy = TanhGaussian(
            dense(observation_dim + (0 if level == 0 else observation_dim),
                  2 * (observation_dim if level == 0 else action_dim),
                  hidden_size=variant["hidden_size"],
                  num_hidden_layers=variant["num_hidden_layers"]),
            optimizer_kwargs=dict(
                learning_rate=variant["policy_learning_rate"]),
            tau=variant["tau"],
            std=None)

        # create the first critic for this level of the hierarchy
        qf1 = Gaussian(
            dense(observation_dim + (0 if level == 0 else observation_dim) +
                  (observation_dim if level == 0 else action_dim),
                  1,
                  hidden_size=variant["hidden_size"],
                  num_hidden_layers=variant["num_hidden_layers"]),
            optimizer_kwargs=dict(learning_rate=variant["qf_learning_rate"]),
            tau=variant["tau"],
            std=1.0)
        target_qf1 = qf1.clone()

        # create the second critic for this level of the hierarchy
        qf2 = Gaussian(
            dense(observation_dim + (0 if level == 0 else observation_dim) +
                  (observation_dim if level == 0 else action_dim),
                  1,
                  hidden_size=variant["hidden_size"],
                  num_hidden_layers=variant["num_hidden_layers"]),
            optimizer_kwargs=dict(learning_rate=variant["qf_learning_rate"]),
            tau=variant["tau"],
            std=1.0)
        target_qf2 = qf2.clone()

        # relabel the rewards of the lower level policies
        relabeled_buffer = (GoalConditioned(
            replay_buffer,
            reward_scale=0.0,
            goal_conditioned_scale=1.0) if level > 0 else replay_buffer)

        # train this level of the hierarchy using soft actor critic
        algorithm = SAC(policy,
                        qf1,
                        qf2,
                        target_qf1,
                        target_qf2,
                        relabeled_buffer,
                        reward_scale=variant["reward_scale"],
                        discount=variant["discount"],
                        initial_alpha=variant["initial_alpha"],
                        alpha_optimizer_kwargs=dict(
                            learning_rate=variant["policy_learning_rate"]),
                        target_entropy=(-action_dim),
                        observation_key=observation_key,
                        batch_size=variant["batch_size"],
                        logger=logger,
                        logging_prefix="sac_level{}/".format(level))

        # wrap the policy in an agent for this level of the hierarchy
        levels.append(
            PolicyAgent(
                policy,
                time_skip=variant["time_skip"]**(
                    variant["num_hierarchy_levels"] - 1 - level),
                goal_skip=(variant["time_skip"]**(
                    variant["num_hierarchy_levels"] - level)
                           if level > 0 else variant["max_path_length"]),
                algorithm=algorithm,
                observation_key=observation_key))

        models_dict["policy_level{}".format(level)] = policy
        models_dict["qf1_level{}".format(level)] = qf1
        models_dict["qf2_level{}".format(level)] = qf2
        models_dict["target_qf1_level{}".format(level)] = target_qf1
        models_dict["target_qf2_level{}".format(level)] = target_qf2

    # create a hierarchy agent using the list of agents
    agent = HierarchyAgent(levels)

    # create a saver to record training progress to the disk
    saver = LocalSaver(replay_buffer,
                       variant["logging_dir"],
                       **models_dict)

    # make a sampler to collect data to warm up the hierarchy
    sampler = ParallelSampler(env,
                              agent,
                              max_path_length=variant["max_path_length"],
                              num_workers=variant["num_workers"])

    # collect warm up samples before training starts
    sampler.set_weights(agent.get_weights())
    paths, returns, num_steps = sampler.collect(
        variant["num_warm_up_steps"],
        deterministic=False,
        keep_data=True,
        workers_to_use=variant["num_workers"])

    # insert the samples into the replay buffer
    for o, a, r in paths:
        replay_buffer.insert_path(o, a, r)

    # train for a specified number of iterations
    for iteration in range(variant["num_epochs"]):
        if iteration % variant["num_epochs_per_eval"] == 0:

            # evaluate the policy at this step
            sampler.set_weights(agent.get_weights())
            paths, eval_returns, num_steps = sampler.collect(
                variant["num_steps_per_eval"],
                deterministic=True,
                keep_data=False,
                workers_to_use=variant["num_workers"])
            logger.record("eval_mean_return", np.mean(eval_returns))

            # save the replay buffer and the policies
            saver.save()

        # collect more training samples
        sampler.set_weights(agent.get_weights())
        paths, train_returns, num_steps = sampler.collect(
            variant["num_steps_per_epoch"],
            deterministic=False,
            keep_data=True,
            workers_to_use=1)
        logger.record("train_mean_return", np.mean(train_returns))

        # insert the samples into the replay buffer
        for o, a, r in paths:
            replay_buffer.insert_path(o, a, r)

        # train once each for the number of steps collected
        for i in range(num_steps):
            agent.train()
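

# Hedged usage sketch: how hierarchy_sac above might be launched. Every value
# in this variant dict is a hypothetical placeholder chosen for illustration,
# not a default of this package; only the keys match what hierarchy_sac reads.
# env_class is expected to expose a dict observation space containing
# observation_key.
def example_hierarchy_sac_launch(env_class):
    hierarchy_sac(
        dict(max_num_steps=1000000,
             logging_dir="./hierarchy_sac",
             num_hierarchy_levels=2,
             hidden_size=256,
             num_hidden_layers=2,
             policy_learning_rate=3e-4,
             qf_learning_rate=3e-4,
             tau=5e-3,
             reward_scale=1.0,
             discount=0.99,
             initial_alpha=0.1,
             batch_size=256,
             time_skip=10,
             max_path_length=1000,
             num_workers=2,
             num_warm_up_steps=10000,
             num_epochs=10000,
             num_epochs_per_eval=10,
             num_steps_per_eval=10000,
             num_steps_per_epoch=1000),
        env_class,
        observation_key="observation")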


def ddpg(variant,
         env_class,
         env_kwargs=None,
         observation_key="observation"):
    # initialize tensorflow and the multiprocessing interface
    maybe_initialize_process()

    # run an experiment with multiple agents
    if env_kwargs is None:
        env_kwargs = {}

    # initialize the environment to track the cardinality of actions
    env = NormalizedEnv(env_class, **env_kwargs)
    action_dim = env.action_space.low.size
    observation_dim = env.observation_space.spaces[observation_key].low.size

    # create a replay buffer to store data
    replay_buffer = StepReplayBuffer(max_num_steps=variant["max_num_steps"])

    # create a logging instance
    logger = TensorboardLogger(replay_buffer, variant["logging_dir"])

    # create the policy and its target network
    policy = Gaussian(
        dense(observation_dim,
              action_dim,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"],
              output_activation="tanh"),
        optimizer_kwargs=dict(learning_rate=variant["policy_learning_rate"]),
        tau=variant["tau"],
        std=variant["exploration_noise_std"])
    target_policy = policy.clone()

    # create the critic and its target network
    qf = Gaussian(
        dense(observation_dim + action_dim,
              1,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"]),
        optimizer_kwargs=dict(learning_rate=variant["qf_learning_rate"]),
        tau=variant["tau"],
        std=1.0)
    target_qf = qf.clone()

    # train the agent using deep deterministic policy gradient
    algorithm = DDPG(policy,
                     target_policy,
                     qf,
                     target_qf,
                     replay_buffer,
                     reward_scale=variant["reward_scale"],
                     discount=variant["discount"],
                     observation_key=observation_key,
                     batch_size=variant["batch_size"],
                     logger=logger,
                     logging_prefix="ddpg/")

    # create a single agent to wrap the policy
    agent = PolicyAgent(policy,
                        algorithm=algorithm,
                        observation_key=observation_key)

    # create a saver to record training progress to the disk
    saver = LocalSaver(replay_buffer,
                       variant["logging_dir"],
                       policy=policy,
                       target_policy=target_policy,
                       qf=qf,
                       target_qf=target_qf)

    # load the networks if already trained
    saver.load()

    # make a sampler to collect data to warm up the agent
    sampler = ParallelSampler(env,
                              agent,
                              max_path_length=variant["max_path_length"],
                              num_workers=variant["num_workers"])

    # collect warm up samples before training starts
    sampler.set_weights(agent.get_weights())
    paths, returns, num_steps = sampler.collect(
        variant["num_warm_up_steps"],
        deterministic=False,
        keep_data=True,
        workers_to_use=variant["num_workers"])

    # insert the samples into the replay buffer
    for o, a, r in paths:
        replay_buffer.insert_path(o, a, r)

    # train for a specified number of iterations
    for iteration in range(variant["num_epochs"]):
        if iteration % variant["num_epochs_per_eval"] == 0:

            # evaluate the policy at this step
            sampler.set_weights(agent.get_weights())
            paths, eval_returns, num_steps = sampler.collect(
                variant["num_steps_per_eval"],
                deterministic=True,
                keep_data=False,
                workers_to_use=variant["num_workers"])
            logger.record("eval_mean_return", np.mean(eval_returns))

            # save the replay buffer and the policies
            saver.save()

        # collect more training samples
        sampler.set_weights(agent.get_weights())
        paths, train_returns, num_steps = sampler.collect(
            variant["num_steps_per_epoch"],
            deterministic=False,
            keep_data=True,
            workers_to_use=1)
        logger.record("train_mean_return", np.mean(train_returns))

        # insert the samples into the replay buffer
        for o, a, r in paths:
            replay_buffer.insert_path(o, a, r)

        # train once each for the number of steps collected
        for i in range(num_steps):
            agent.train()
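

# Hedged usage sketch: how ddpg above might be launched. The values in this
# variant dict are hypothetical placeholders, not defaults of this package;
# only the keys match what ddpg reads. env_class is expected to expose a dict
# observation space containing observation_key.
def example_ddpg_launch(env_class):
    ddpg(
        dict(max_num_steps=1000000,
             logging_dir="./ddpg",
             hidden_size=256,
             num_hidden_layers=2,
             policy_learning_rate=1e-3,
             qf_learning_rate=1e-3,
             tau=5e-3,
             exploration_noise_std=0.1,
             reward_scale=1.0,
             discount=0.99,
             batch_size=256,
             max_path_length=1000,
             num_workers=2,
             num_warm_up_steps=10000,
             num_epochs=10000,
             num_epochs_per_eval=10,
             num_steps_per_eval=10000,
             num_steps_per_epoch=1000),
        env_class,
        observation_key="observation")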


def ppo(variant,
        env_class,
        env_kwargs=None,
        observation_key="observation"):
    # initialize tensorflow and the multiprocessing interface
    maybe_initialize_process()

    # run an experiment with multiple agents
    if env_kwargs is None:
        env_kwargs = {}

    # initialize the environment to track the cardinality of actions
    env = NormalizedEnv(env_class, **env_kwargs)
    action_dim = env.action_space.low.size
    observation_dim = env.observation_space.spaces[observation_key].low.size

    # create a replay buffer to store data
    replay_buffer = PathReplayBuffer(
        max_path_length=variant["max_path_length"],
        max_num_paths=variant["max_num_paths"])

    # create a logging instance
    logger = TensorboardLogger(replay_buffer, variant["logging_dir"])

    # create the policy and a copy that holds the old policy weights
    policy = Gaussian(
        dense(observation_dim,
              action_dim * 2,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"]),
        optimizer_kwargs=dict(learning_rate=variant["policy_learning_rate"]),
        std=None)
    old_policy = policy.clone()

    # create the value function baseline
    vf = Gaussian(
        dense(observation_dim,
              1,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"]),
        optimizer_kwargs=dict(learning_rate=variant["vf_learning_rate"]),
        std=1.0)

    # train the agent using proximal policy optimization
    algorithm = PPO(
        policy,
        old_policy,
        vf,
        replay_buffer,
        reward_scale=variant["reward_scale"],
        discount=variant["discount"],
        epsilon=variant["epsilon"],
        lamb=variant["lamb"],
        off_policy_updates=variant["off_policy_updates"],
        critic_updates=variant["critic_updates"],
        observation_key=observation_key,
        batch_size=-1,  # sample everything in the buffer
        logger=logger,
        logging_prefix="ppo/")

    # create a single agent to wrap the policy
    agent = PolicyAgent(policy,
                        algorithm=algorithm,
                        observation_key=observation_key)

    # create a saver to record training progress to the disk
    saver = LocalSaver(replay_buffer,
                       variant["logging_dir"],
                       policy=policy,
                       old_policy=old_policy,
                       vf=vf)

    # load the networks if already trained
    saver.load()

    # make a sampler to collect on policy data
    sampler = ParallelSampler(env,
                              agent,
                              max_path_length=variant["max_path_length"],
                              num_workers=variant["num_workers"])

    # train for a specified number of iterations
    for iteration in range(variant["num_epochs"]):

        # discard all previous samples for on policy learning
        replay_buffer.empty()

        if iteration % variant["num_epochs_per_eval"] == 0:

            # evaluate the policy at this step
            sampler.set_weights(agent.get_weights())
            paths, eval_returns, num_steps = sampler.collect(
                variant["num_steps_per_eval"],
                deterministic=True,
                keep_data=False,
                workers_to_use=variant["num_workers"])
            logger.record("eval_mean_return", np.mean(eval_returns))

            # save the replay buffer and the policies
            saver.save()

        # collect more training samples
        sampler.set_weights(agent.get_weights())
        paths, train_returns, num_steps = sampler.collect(
            variant["num_steps_per_epoch"],
            deterministic=False,
            keep_data=True,
            workers_to_use=variant["num_workers"])
        logger.record("train_mean_return", np.mean(train_returns))

        # insert the samples into the replay buffer
        for o, a, r in paths:
            replay_buffer.insert_path(o, a, r)

        # train once with the on policy data
        agent.train()
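

# Hedged usage sketch: how ppo above might be launched. The values in this
# variant dict are hypothetical placeholders, not defaults of this package;
# only the keys match what ppo reads. Note that ppo collects fresh on policy
# data every epoch, so there is no num_warm_up_steps key.
def example_ppo_launch(env_class):
    ppo(
        dict(max_path_length=1000,
             max_num_paths=100,
             logging_dir="./ppo",
             hidden_size=256,
             num_hidden_layers=2,
             policy_learning_rate=3e-4,
             vf_learning_rate=3e-4,
             reward_scale=1.0,
             discount=0.99,
             epsilon=0.2,
             lamb=0.95,
             off_policy_updates=10,
             critic_updates=10,
             num_workers=2,
             num_epochs=10000,
             num_epochs_per_eval=10,
             num_steps_per_eval=10000,
             num_steps_per_epoch=10000),
        env_class,
        observation_key="observation")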