import numpy as np
import tensorflow as tf

# LocalMonitor, NormalizedEnv, Dense, TanhGaussian, QNetwork, DDPG, PPO, GAE,
# EntropyTuner, SoftValueNetwork, the relabelers, buffers, sampler, saver, and
# trainer are assumed to be imported from the surrounding package.


def hac(
        variant,
        env_class,
        observation_key="proprio_observation",
        goal_key="goal",
        **env_kwargs):
    # Let TensorFlow grow GPU memory on demand instead of reserving it all.
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    # Selectors pull the observation, the goal, or their concatenation out of
    # the observation dictionary; the hierarchy selector feeds the upper level
    # (index 1) the observation only and the lower level observation + goal.
    observation_selector = (
        lambda x: x[observation_key])
    goal_selector = (
        lambda x: x[goal_key])
    both_selector = (
        lambda x: np.concatenate([
            observation_selector(x), goal_selector(x)], -1))
    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    def relabel_goal(goal, observation):
        observation[goal_key] = goal
        return observation

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(
        env_class, reward_scale=variant["reward_scale"], **env_kwargs)
    action_dim = np.prod(env.action_space.shape)
    goal_dim = np.prod(env.observation_space[observation_key].shape)

    # Lower level: a goal-conditioned tanh-Gaussian policy trained with DDPG.
    lower_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))
    lower_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    lower_target_qf = lower_qf.clone()
    lower_critic = QNetwork(
        lower_policy,
        lower_qf,
        lower_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="lower_")
    lower_actor = DDPG(
        lower_policy,
        lower_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="lower_")

    # Lower-level replay buffer with hindsight goal relabeling.
    lower_buffer = GoalConditionedRelabeler(
        HindsightRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            time_skip=variant["time_skip"],
            observation_selector=observation_selector,
            goal_selector=goal_selector,
            goal_assigner=relabel_goal,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        goal_selector=goal_selector)
    lower_buffer = OffPolicyBuffer(lower_buffer)

    # Upper level: a policy that emits subgoals for the lower level.
    upper_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * goal_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))
    upper_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    upper_target_qf = upper_qf.clone()
    upper_critic = QNetwork(
        upper_policy,
        upper_qf,
        upper_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="upper_")
    upper_actor = DDPG(
        upper_policy,
        upper_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="upper_")

    # Upper-level replay buffer with HAC action relabeling and subgoal testing.
    upper_buffer = SubgoalTestingRelabeler(
        HACRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            observation_selector=observation_selector,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        threshold=variant["threshold"],
        penalty=variant["penalty"],
        relabel_probability=variant["relabel_probability"])
    upper_buffer = OffPolicyBuffer(upper_buffer)
    # Collect experience with both levels acting in the environment; the
    # upper level acts once every `time_skip` steps of the lower level.
    sampler = ParallelSampler(
        env,
        [lower_policy, upper_policy],
        [lower_buffer, upper_buffer],
        time_skips=(1, variant["time_skip"]),
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=hierarchy_selector,
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        lower_policy=lower_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    # One buffer reference per training algorithm, in matching order.
    trainer = LocalTrainer(
        sampler,
        [upper_buffer, upper_buffer, lower_buffer, lower_buffer],
        [upper_actor, upper_critic, lower_actor, lower_critic],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)
    trainer.train()
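

# A minimal usage sketch for hac(): the dict below lists exactly the
# hyperparameter keys hac() reads from `variant`. The values are illustrative
# placeholders rather than tuned settings, and the environment class passed to
# hac() must be a goal-conditioned env supplied by the caller (none is assumed
# here).
def example_hac_variant(logging_dir="./hac"):
    return dict(
        logging_dir=logging_dir,
        reward_scale=1.0,
        hidden_size=256,
        tau=0.005,
        learning_rate=0.0003,
        gamma=0.99,
        bellman_weight=1.0,
        discount_weight=1.0,
        batch_size=256,
        num_trains_per_step=1,
        max_size=1000000,
        max_path_length=1000,
        time_skip=10,
        relabel_probability=0.8,
        threshold=0.1,
        penalty=-1.0,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000)
# e.g. hac(example_hac_variant(), SomeGoalConditionedEnv)  # env class is caller-provided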


def ddpg(
        variant,
        env_class,
        observation_key="proprio_observation",
        **env_kwargs):
    # Let TensorFlow grow GPU memory on demand instead of reserving it all.
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(
        env_class, reward_scale=variant["reward_scale"], **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    # Tanh-Gaussian policy and Q-function, each a two-hidden-layer MLP.
    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))
    qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    target_qf = qf.clone()

    critic = QNetwork(
        policy,
        qf,
        target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor)
    actor = DDPG(
        policy,
        critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor)

    # Replay buffer storing whole paths, sampled as off-policy transitions.
    buffer = PathBuffer(
        max_size=variant["max_size"],
        max_path_length=variant["max_path_length"],
        selector=(lambda x: x[observation_key]),
        monitor=monitor)
    step_buffer = OffPolicyBuffer(buffer)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        policy=policy,
        qf=qf,
        target_qf=target_qf)

    trainer = LocalTrainer(
        sampler,
        [step_buffer, step_buffer],
        [actor, critic],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)
    trainer.train()
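

# A minimal usage sketch for ddpg(): the dict below lists exactly the
# hyperparameter keys ddpg() reads from `variant`; the values are illustrative
# placeholders rather than tuned settings, and the environment class is
# caller-provided (none is assumed here).
def example_ddpg_variant(logging_dir="./ddpg"):
    return dict(
        logging_dir=logging_dir,
        reward_scale=1.0,
        hidden_size=256,
        tau=0.005,
        learning_rate=0.0003,
        gamma=0.99,
        bellman_weight=1.0,
        discount_weight=1.0,
        batch_size=256,
        num_trains_per_step=1,
        max_size=1000000,
        max_path_length=1000,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000)
# e.g. ddpg(example_ddpg_variant(), SomeEnv)  # env class is caller-provided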


def ppo(
        variant,
        env_class,
        observation_key="proprio_observation",
        **env_kwargs):
    # Let TensorFlow grow GPU memory on demand instead of reserving it all.
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(
        env_class, reward_scale=variant["reward_scale"], **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    # Tanh-Gaussian policy and value function; the old policy is a frozen
    # copy used for the PPO importance ratio.
    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))
    vf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    old_policy = policy.clone()
    target_vf = vf.clone()

    # Entropy tuner drives the policy entropy toward the target of -action_dim.
    tuner = EntropyTuner(
        policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        target=(-action_dim),
        batch_size=variant["batch_size"],
        monitor=monitor)

    # Soft value critic wrapped with generalized advantage estimation.
    critic = SoftValueNetwork(
        policy,
        vf,
        target_vf,
        gamma=variant["gamma"],
        log_alpha=tuner.get_tuning_variable(),
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor)
    critic = GAE(
        critic,
        gamma=variant["gamma"],
        lamb=variant["lamb"])
    actor = PPO(
        policy,
        old_policy,
        critic,
        gamma=variant["gamma"],
        epsilon=variant["epsilon"],
        old_update_every=variant["num_trains_per_step"],
        batch_size=variant["batch_size"],
        monitor=monitor)

    # On-policy path buffer (no off-policy wrapper for PPO).
    buffer = PathBuffer(
        max_size=variant["max_size"],
        max_path_length=variant["max_path_length"],
        selector=(lambda x: x[observation_key]),
        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        policy=policy,
        old_policy=old_policy,
        vf=vf,
        target_vf=target_vf)

    trainer = LocalTrainer(
        sampler,
        [buffer, buffer, buffer],
        [actor, critic, tuner],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)
    trainer.train()
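

# A minimal usage sketch for ppo(): the dict below lists exactly the
# hyperparameter keys ppo() reads from `variant` (including the GAE lambda
# `lamb` and the PPO clipping `epsilon`); the values are illustrative
# placeholders rather than tuned settings, and the environment class is
# caller-provided (none is assumed here).
def example_ppo_variant(logging_dir="./ppo"):
    return dict(
        logging_dir=logging_dir,
        reward_scale=1.0,
        hidden_size=256,
        tau=0.005,
        learning_rate=0.0003,
        gamma=0.99,
        lamb=0.95,
        epsilon=0.2,
        bellman_weight=1.0,
        discount_weight=1.0,
        batch_size=256,
        num_trains_per_step=1,
        max_size=1000000,
        max_path_length=1000,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000)
# e.g. ppo(example_ppo_variant(), SomeEnv)  # env class is caller-provided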