Code Example #1
import numpy as np
import tensorflow as tf

# Dense, TanhGaussian, QNetwork, DDPG, the relabeling buffers, ParallelSampler,
# NormalizedEnv, LocalMonitor, LocalSaver, and LocalTrainer are assumed to be
# provided by the surrounding project and imported from it.


def hac(
    variant,
    env_class,
    observation_key="proprio_observation",
    goal_key="goal",
    **env_kwargs
):
    """Train a two-level Hierarchical Actor-Critic (HAC) agent."""

    # let TensorFlow allocate GPU memory incrementally instead of reserving it all
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    # selectors that pull the observation, the goal, or their concatenation
    # out of an observation dict
    observation_selector = (
        lambda x: x[observation_key])

    goal_selector = (
        lambda x: x[goal_key])

    both_selector = (
        lambda x: np.concatenate([observation_selector(x), goal_selector(x)], -1))

    # the upper level (index 1) sees only the observation; the lower level
    # sees the observation concatenated with its current goal
    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    def relabel_goal(goal, observation):
        # overwrite the goal entry of the observation dict (used when relabeling)
        observation[goal_key] = goal
        return observation

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class, reward_scale=variant["reward_scale"], **env_kwargs)
    action_dim = np.prod(env.action_space.shape)
    # upper-level subgoals live in the observation space, so the goal dimension
    # equals the observation dimension
    goal_dim = np.prod(env.observation_space[observation_key].shape)

    # lower-level policy: parameters of a tanh-squashed Gaussian over actions
    lower_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    lower_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    lower_target_qf = lower_qf.clone()

    lower_critic = QNetwork(
        lower_policy,
        lower_qf,
        lower_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="lower_")

    lower_actor = DDPG(
        lower_policy,
        lower_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="lower_")

    # lower-level buffer: goal-conditioned paths with hindsight goal relabeling
    lower_buffer = GoalConditionedRelabeler(
        HindsightRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            time_skip=variant["time_skip"],
            observation_selector=observation_selector,
            goal_selector=goal_selector,
            goal_assigner=relabel_goal,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        goal_selector=goal_selector)

    lower_buffer = OffPolicyBuffer(lower_buffer)

    # upper-level policy: parameters of a tanh-squashed Gaussian over subgoals
    # for the lower level
    upper_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * goal_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    upper_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    upper_target_qf = upper_qf.clone()

    upper_critic = QNetwork(
        upper_policy,
        upper_qf,
        upper_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="upper_")

    upper_actor = DDPG(
        upper_policy,
        upper_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="upper_")

    # upper-level buffer: relabels subgoal actions with the states actually
    # reached and penalizes missed subgoals (subgoal testing)
    upper_buffer = SubgoalTestingRelabeler(
        HACRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            observation_selector=observation_selector,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        threshold=variant["threshold"],
        penalty=variant["penalty"],
        relabel_probability=variant["relabel_probability"])

    upper_buffer = OffPolicyBuffer(upper_buffer)

    # collect experience with both levels: the lower policy acts every step,
    # the upper policy acts every `time_skip` steps
    sampler = ParallelSampler(
        env,
        [lower_policy, upper_policy],
        [lower_buffer, upper_buffer],
        time_skips=(1, variant["time_skip"]),
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=hierarchy_selector,
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        lower_policy=lower_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    # one replay buffer per training algorithm, paired by position
    trainer = LocalTrainer(
        sampler,
        [upper_buffer, upper_buffer, lower_buffer, lower_buffer],
        [upper_actor, upper_critic, lower_actor, lower_critic],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
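
The hac entry point above consumes a flat variant dictionary of hyperparameters. A minimal sketch of how it might be invoked follows; SomeGoalEnv and every value shown are hypothetical placeholders, not settings taken from the project.

# Hypothetical invocation of the hac() entry point defined above.
# SomeGoalEnv and all hyperparameter values below are illustrative placeholders.
variant = dict(
    logging_dir="./hac_logs",
    reward_scale=1.0,
    hidden_size=256,
    learning_rate=3e-4,
    tau=5e-3,
    gamma=0.99,
    bellman_weight=1.0,
    discount_weight=1.0,
    batch_size=256,
    max_size=1000000,
    max_path_length=1000,
    time_skip=10,
    relabel_probability=0.8,
    threshold=0.1,
    penalty=-1.0,
    num_warm_up_paths=10,
    num_exploration_paths=1,
    num_evaluation_paths=10,
    num_threads=4,
    num_steps=10000,
    num_trains_per_step=100)

hac(variant, SomeGoalEnv, observation_key="proprio_observation", goal_key="goal")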
Code Example #2
import numpy as np
import tensorflow as tf

# Dense, TanhGaussian, QNetwork, DDPG, PathBuffer, OffPolicyBuffer,
# ParallelSampler, NormalizedEnv, LocalMonitor, LocalSaver, and LocalTrainer
# are assumed to be provided by the surrounding project.


def ddpg(variant,
         env_class,
         observation_key="proprio_observation",
         **env_kwargs):
    """Train a flat DDPG agent with a tanh-Gaussian policy."""

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    qf = Dense([variant["hidden_size"], variant["hidden_size"], 1],
               tau=variant["tau"],
               optimizer_class=tf.keras.optimizers.Adam,
               optimizer_kwargs=dict(lr=variant["learning_rate"]))

    target_qf = qf.clone()

    critic = QNetwork(policy,
                      qf,
                      target_qf,
                      gamma=variant["gamma"],
                      bellman_weight=variant["bellman_weight"],
                      discount_weight=variant["discount_weight"],
                      batch_size=variant["batch_size"],
                      monitor=monitor)

    actor = DDPG(policy,
                 critic,
                 batch_size=variant["batch_size"],
                 update_every=variant["num_trains_per_step"],
                 monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

    # wrap the path buffer so individual transitions can be sampled off-policy
    step_buffer = OffPolicyBuffer(buffer)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"],
                       policy=policy,
                       qf=qf,
                       target_qf=target_qf)

    trainer = LocalTrainer(sampler, [step_buffer, step_buffer],
                           [actor, critic],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
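
The QNetwork and DDPG classes used above encapsulate the standard DDPG updates. As a point of reference, a self-contained sketch of that arithmetic in plain TensorFlow is given below; it uses a deterministic tanh actor for brevity, whereas the code above parameterizes a TanhGaussian policy, and all names and sizes are illustrative.

import tensorflow as tf

# Self-contained sketch of the core DDPG updates (illustrative only).
obs_dim, act_dim, batch = 8, 2, 32

def mlp(hidden, out):
    return tf.keras.Sequential(
        [tf.keras.layers.Dense(h, activation="relu") for h in hidden]
        + [tf.keras.layers.Dense(out)])

policy = mlp([256, 256], act_dim)      # deterministic actor mu(s)
qf = mlp([256, 256], 1)                # critic Q(s, a)
target_qf = mlp([256, 256], 1)         # target critic (synced slowly in practice)

obs = tf.random.normal([batch, obs_dim])
act = tf.random.uniform([batch, act_dim], -1.0, 1.0)
rew = tf.random.normal([batch, 1])
next_obs = tf.random.normal([batch, obs_dim])
gamma = 0.99

# Critic: regress Q(s, a) toward r + gamma * Q_target(s', mu(s')).
next_act = tf.tanh(policy(next_obs))
target = rew + gamma * target_qf(tf.concat([next_obs, next_act], -1))
with tf.GradientTape() as tape:
    q = qf(tf.concat([obs, act], -1))
    critic_loss = tf.reduce_mean(tf.square(q - tf.stop_gradient(target)))
critic_grads = tape.gradient(critic_loss, qf.trainable_variables)

# Actor: maximize Q(s, mu(s)) by minimizing its negation.
with tf.GradientTape() as tape:
    actor_loss = -tf.reduce_mean(qf(tf.concat([obs, tf.tanh(policy(obs))], -1)))
actor_grads = tape.gradient(actor_loss, policy.trainable_variables)

tf.keras.optimizers.Adam(1e-3).apply_gradients(zip(critic_grads, qf.trainable_variables))
tf.keras.optimizers.Adam(1e-3).apply_gradients(zip(actor_grads, policy.trainable_variables))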
Code Example #3
import numpy as np
import tensorflow as tf

# Dense, TanhGaussian, EntropyTuner, SoftValueNetwork, GAE, PPO, PathBuffer,
# ParallelSampler, NormalizedEnv, LocalMonitor, LocalSaver, and LocalTrainer
# are assumed to be provided by the surrounding project.


def ppo(variant,
        env_class,
        observation_key="proprio_observation",
        **env_kwargs):
    """Train a PPO agent with a tuned entropy bonus and GAE advantages."""

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    vf = Dense([variant["hidden_size"], variant["hidden_size"], 1],
               tau=variant["tau"],
               optimizer_class=tf.keras.optimizers.Adam,
               optimizer_kwargs=dict(lr=variant["learning_rate"]))

    # frozen policy copy used for PPO's clipped probability ratio
    old_policy = policy.clone()
    target_vf = vf.clone()

    # tune the entropy coefficient toward a target entropy of -action_dim
    tuner = EntropyTuner(policy,
                         optimizer_class=tf.keras.optimizers.Adam,
                         optimizer_kwargs=dict(lr=variant["learning_rate"]),
                         target=(-action_dim),
                         batch_size=variant["batch_size"],
                         monitor=monitor)

    critic = SoftValueNetwork(policy,
                              vf,
                              target_vf,
                              gamma=variant["gamma"],
                              log_alpha=tuner.get_tuning_variable(),
                              bellman_weight=variant["bellman_weight"],
                              discount_weight=variant["discount_weight"],
                              batch_size=variant["batch_size"],
                              monitor=monitor)

    # estimate advantages with Generalized Advantage Estimation
    critic = GAE(critic, gamma=variant["gamma"], lamb=variant["lamb"])

    actor = PPO(policy,
                old_policy,
                critic,
                gamma=variant["gamma"],
                epsilon=variant["epsilon"],
                old_update_every=variant["num_trains_per_step"],
                batch_size=variant["batch_size"],
                monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"],
                       policy=policy,
                       old_policy=old_policy,
                       vf=vf,
                       target_vf=target_vf)

    trainer = LocalTrainer(sampler, [buffer, buffer, buffer],
                           [actor, critic, tuner],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
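
The GAE and PPO wrappers above hide the two pieces of arithmetic that define the algorithm: the advantage recursion and the clipped surrogate objective. Minimal NumPy versions of both are sketched below for reference; they are independent of the project's classes, and the function names and default values are illustrative.

import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lamb=0.95):
    # rewards: [T]; values: [T + 1] state values with a bootstrap value appended.
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lamb * running
        advantages[t] = running
    return advantages

def ppo_clip_loss(log_prob, old_log_prob, advantages, epsilon=0.2):
    # Clipped surrogate objective, returned as a loss to minimize.
    ratio = np.exp(log_prob - old_log_prob)
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))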