Example 1: policy gradient (pg)
def pg(variant,
       env_class,
       observation_key="proprio_observation",
       **env_kwargs):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    actor = PolicyGradient(policy,
                           gamma=variant["gamma"],
                           batch_size=variant["batch_size"],
                           monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"], policy=policy)

    trainer = LocalTrainer(sampler, [buffer], [actor],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
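
A minimal sketch of how this launcher could be invoked. Only the dictionary keys below are taken from the body of pg; every value, and the SomeRoboticsEnv class, is an illustrative placeholder.

variant = dict(
    logging_dir="./pg_experiment",    # consumed by LocalMonitor and LocalSaver
    reward_scale=1.0,
    hidden_size=256,
    tau=5e-3,
    learning_rate=3e-4,
    gamma=0.99,
    batch_size=256,
    max_size=1000000,
    max_path_length=1000,
    num_warm_up_paths=10,
    num_exploration_paths=1,
    num_evaluation_paths=10,
    num_threads=4,
    num_steps=10000,
    num_trains_per_step=100)

pg(variant, SomeRoboticsEnv)    # SomeRoboticsEnv: placeholder gym-style environment class
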
Example 2: hierarchical SAC with HIRO relabeling on AntMaze (run_experiment)
def run_experiment(variant):

    #########
    # SETUP #
    #########

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    experiment_id = variant["experiment_id"]
    logging_dir = "./ant_maze/hiro/sac/{}".format(
        experiment_id)

    max_path_length = variant["max_path_length"]
    max_size = variant["max_size"]

    num_warm_up_paths = variant["num_warm_up_paths"]
    num_exploration_paths = variant["num_exploration_paths"]
    num_evaluation_paths = variant["num_evaluation_paths"]
    num_trains_per_step = variant["num_trains_per_step"]

    update_tuner_every = variant["update_tuner_every"]
    update_actor_every = variant["update_actor_every"]

    batch_size = variant["batch_size"]
    num_steps = variant["num_steps"]

    monitor = LocalMonitor(logging_dir)

    env = NormalizedEnv(
        AntMazeEnv(**variant["env_kwargs"]),
        reward_scale=(1 / max_path_length))

    ##################
    # LOWER POLICIES #
    ##################

    lower_policy = Dense(
        [256, 256, 4],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    lower_target_policy = Dense(
        [256, 256, 4],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    #########################
    # LOWER VALUE FUNCTIONS #
    #########################

    lower_qf = Dense(
        [256, 256, 1],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    lower_target_qf = Dense(
        [256, 256, 1],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    ##################
    # UPPER POLICIES #
    ##################

    upper_policy = Dense(
        [256, 256, 4],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    upper_target_policy = Dense(
        [256, 256, 4],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    #########################
    # UPPER VALUE FUNCTIONS #
    #########################

    upper_qf = Dense(
        [256, 256, 1],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    upper_target_qf = Dense(
        [256, 256, 1],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    ####################################
    # OBSERVATION DICTIONARY SELECTORS #
    ####################################

    observation_selector = (
        lambda x: x["proprio_observation"])

    goal_selector = (
        lambda x: x["goal"])

    both_selector = (
        lambda x: np.concatenate([observation_selector(x), goal_selector(x)], -1))

    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    ##################
    # REPLAY BUFFERS #
    ##################

    lower_buffer = GoalConditionedRelabeler(
        PathBuffer(
            max_size=max_size,
            max_path_length=max_path_length,
            monitor=monitor),
        observation_selector=observation_selector,
        goal_selector=goal_selector)

    upper_buffer = HIRORelabeler(
        lower_policy,
        PathBuffer(
            max_size=max_size,
            max_path_length=max_path_length,
            monitor=monitor),
        observation_selector=observation_selector,
        num_samples=8)

    ############
    # SAMPLERS #
    ############

    sampler = PathSampler(
        env,
        lower_policy,
        lower_buffer,
        upper_policy,
        upper_buffer,
        time_skips=(1, 5),
        max_path_length=max_path_length,
        num_warm_up_paths=num_warm_up_paths,
        num_exploration_paths=num_exploration_paths,
        num_evaluation_paths=num_evaluation_paths,
        selector=hierarchy_selector,
        monitor=monitor)

    #############################
    # LOWER TRAINING ALGORITHMS #
    #############################

    lower_tuner = EntropyTuner(
        lower_policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        target=(-2.0),
        update_every=update_tuner_every,
        batch_size=batch_size,
        selector=both_selector,
        monitor=monitor,
        logging_prefix="lower_")

    lower_critic = SoftQNetwork(
        lower_target_policy,
        lower_qf,
        lower_target_qf,
        gamma=0.99,
        clip_radius=0.2,
        std=0.1,
        log_alpha=lower_tuner.get_tuning_variable(),
        batch_size=batch_size,
        selector=both_selector,
        monitor=monitor,
        logging_prefix="lower_")

    lower_actor = SoftActorCritic(
        lower_policy,
        lower_target_policy,
        lower_critic,
        log_alpha=lower_tuner.get_tuning_variable(),
        update_every=update_actor_every,
        batch_size=batch_size,
        selector=both_selector,
        monitor=monitor,
        logging_prefix="lower_")

    lower_algorithm = MultiAlgorithm(lower_actor, lower_critic, lower_tuner)

    #############################
    # UPPER TRAINING ALGORITHMS #
    #############################

    upper_tuner = EntropyTuner(
        upper_policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        target=(-2.0),
        update_every=update_tuner_every,
        batch_size=batch_size,
        selector=observation_selector,
        monitor=monitor,
        logging_prefix="upper_")

    upper_critic = SoftQNetwork(
        upper_target_policy,
        upper_qf,
        upper_target_qf,
        gamma=0.99,
        clip_radius=0.2,
        std=0.1,
        log_alpha=upper_tuner.get_tuning_variable(),
        batch_size=batch_size,
        selector=observation_selector,
        monitor=monitor,
        logging_prefix="upper_")

    upper_actor = SoftActorCritic(
        upper_policy,
        upper_target_policy,
        upper_critic,
        log_alpha=upper_tuner.get_tuning_variable(),
        update_every=update_actor_every,
        batch_size=batch_size,
        selector=observation_selector,
        monitor=monitor,
        logging_prefix="upper_")

    upper_algorithm = MultiAlgorithm(upper_actor, upper_critic, upper_tuner)

    ##################
    # START TRAINING #
    ##################

    saver = Saver(
        logging_dir,
        lower_policy=lower_policy,
        lower_target_policy=lower_target_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_target_policy=upper_target_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    trainer = LocalTrainer(
        sampler,
        lower_buffer,
        lower_algorithm,
        upper_buffer,
        upper_algorithm,
        num_steps=num_steps,
        num_trains_per_step=num_trains_per_step,
        save_function=saver,
        monitor=monitor)

    trainer.train()
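
A hypothetical invocation of the launcher above. The keys mirror what run_experiment reads from variant; the values are placeholders, and env_kwargs is forwarded unchanged to AntMazeEnv.

variant = dict(
    experiment_id=0,
    max_path_length=1000,
    max_size=1000000,
    num_warm_up_paths=10,
    num_exploration_paths=1,
    num_evaluation_paths=10,
    num_trains_per_step=100,
    update_tuner_every=1,
    update_actor_every=1,
    batch_size=256,
    num_steps=10000,
    env_kwargs=dict())    # passed through to AntMazeEnv

run_experiment(variant)
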
Example 3: hierarchical actor-critic (hac)
def hac(
    variant,
    env_class,
    observation_key="proprio_observation",
    goal_key="goal",
    **env_kwargs
):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    observation_selector = (
        lambda x: x[observation_key])

    goal_selector = (
        lambda x: x[goal_key])

    both_selector = (
        lambda x: np.concatenate([observation_selector(x), goal_selector(x)], -1))

    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    def relabel_goal(goal, observation):
        observation[goal_key] = goal
        return observation

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class, reward_scale=variant["reward_scale"], **env_kwargs)
    action_dim = np.prod(env.action_space.shape)
    goal_dim = np.prod(env.observation_space[observation_key].shape)

    lower_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    lower_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    lower_target_qf = lower_qf.clone()

    lower_critic = QNetwork(
        lower_policy,
        lower_qf,
        lower_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="lower_")

    lower_actor = DDPG(
        lower_policy,
        lower_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="lower_")

    lower_buffer = GoalConditionedRelabeler(
        HindsightRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            time_skip=variant["time_skip"],
            observation_selector=observation_selector,
            goal_selector=goal_selector,
            goal_assigner=relabel_goal,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        goal_selector=goal_selector)

    lower_buffer = OffPolicyBuffer(lower_buffer)

    upper_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * goal_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    upper_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    upper_target_qf = upper_qf.clone()

    upper_critic = QNetwork(
        upper_policy,
        upper_qf,
        upper_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="upper_")

    upper_actor = DDPG(
        upper_policy,
        upper_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="upper_")

    upper_buffer = SubgoalTestingRelabeler(
        HACRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            observation_selector=observation_selector,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        threshold=variant["threshold"],
        penalty=variant["penalty"],
        relabel_probability=variant["relabel_probability"])

    upper_buffer = OffPolicyBuffer(upper_buffer)

    sampler = ParallelSampler(
        env,
        [lower_policy, upper_policy],
        [lower_buffer, upper_buffer],
        time_skips=(1, variant["time_skip"]),
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=hierarchy_selector,
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        lower_policy=lower_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    trainer = LocalTrainer(
        sampler,
        # one replay buffer per training algorithm, in matching order
        [upper_buffer, upper_buffer, lower_buffer, lower_buffer],
        [upper_actor, upper_critic, lower_actor, lower_critic],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
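
As with the other launchers, hac can be driven by a flat variant dictionary. The keys below are exactly those read inside hac; the values and the SomeGoalEnv class are placeholders for illustration.

variant = dict(
    logging_dir="./hac_experiment", reward_scale=1.0,
    hidden_size=256, tau=5e-3, learning_rate=3e-4,
    gamma=0.99, bellman_weight=1.0, discount_weight=1.0,
    time_skip=10, relabel_probability=0.8,
    threshold=0.1, penalty=-1.0,
    batch_size=256, max_size=1000000, max_path_length=1000,
    num_warm_up_paths=10, num_exploration_paths=1, num_evaluation_paths=10,
    num_threads=4, num_steps=10000, num_trains_per_step=100)

hac(variant, SomeGoalEnv)    # SomeGoalEnv: placeholder dict-observation environment with a "goal" key
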
Example 4: soft actor-critic (sac)
def sac(variant,
        env_class,
        observation_key="proprio_observation",
        **env_kwargs):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    qf1 = Dense([variant["hidden_size"], variant["hidden_size"], 1],
                tau=variant["tau"],
                optimizer_class=tf.keras.optimizers.Adam,
                optimizer_kwargs=dict(lr=variant["learning_rate"]))

    qf2 = qf1.clone()
    target_qf1 = qf1.clone()
    target_qf2 = qf2.clone()

    tuner = EntropyTuner(policy,
                         optimizer_class=tf.keras.optimizers.Adam,
                         optimizer_kwargs=dict(lr=variant["learning_rate"]),
                         target=(-action_dim),
                         batch_size=variant["batch_size"],
                         monitor=monitor)

    critic1 = SoftQNetwork(policy,
                           qf1,
                           target_qf1,
                           gamma=variant["gamma"],
                           log_alpha=tuner.get_tuning_variable(),
                           bellman_weight=variant["bellman_weight"],
                           discount_weight=variant["discount_weight"],
                           batch_size=variant["batch_size"],
                           monitor=monitor)

    critic2 = SoftQNetwork(policy,
                           qf2,
                           target_qf2,
                           gamma=variant["gamma"],
                           log_alpha=tuner.get_tuning_variable(),
                           bellman_weight=variant["bellman_weight"],
                           discount_weight=variant["discount_weight"],
                           batch_size=variant["batch_size"],
                           monitor=monitor)

    critic = TwinCritic(critic1, critic2)

    actor = SoftActorCritic(policy,
                            critic,
                            log_alpha=tuner.get_tuning_variable(),
                            batch_size=variant["batch_size"],
                            monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

    step_buffer = OffPolicyBuffer(buffer)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"],
                       policy=policy,
                       qf1=qf1,
                       target_qf1=target_qf1,
                       qf2=qf2,
                       target_qf2=target_qf2)

    trainer = LocalTrainer(sampler, [step_buffer, step_buffer, step_buffer],
                           [actor, critic, tuner],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
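
A usage sketch for the SAC launcher; it reads the same keys as pg plus bellman_weight and discount_weight. The values and the environment class are placeholders.

variant = dict(
    logging_dir="./sac_experiment", reward_scale=1.0,
    hidden_size=256, tau=5e-3, learning_rate=3e-4,
    gamma=0.99, bellman_weight=1.0, discount_weight=1.0,
    batch_size=256, max_size=1000000, max_path_length=1000,
    num_warm_up_paths=10, num_exploration_paths=1, num_evaluation_paths=10,
    num_threads=4, num_steps=10000, num_trains_per_step=100)

sac(variant, SomeRoboticsEnv)    # SomeRoboticsEnv: placeholder environment class
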
Example 5: TRPO (trpo)
def trpo(variant,
         env_class,
         observation_key="proprio_observation",
         **env_kwargs):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    vf = Dense([variant["hidden_size"], variant["hidden_size"], 1],
               tau=variant["tau"],
               optimizer_class=tf.keras.optimizers.Adam,
               optimizer_kwargs=dict(lr=variant["learning_rate"]))

    old_policy = policy.clone()
    target_vf = vf.clone()

    policy = KLConstraint(LineSearch(NaturalGradient(policy, return_sAs=True),
                                     use_sAs=True),
                          old_policy,
                          delta=variant["delta"])

    tuner = EntropyTuner(policy,
                         optimizer_class=tf.keras.optimizers.Adam,
                         optimizer_kwargs=dict(lr=variant["learning_rate"]),
                         target=(-action_dim),
                         batch_size=variant["batch_size"],
                         monitor=monitor)

    critic = SoftValueNetwork(policy,
                              vf,
                              target_vf,
                              gamma=variant["gamma"],
                              log_alpha=tuner.get_tuning_variable(),
                              bellman_weight=variant["bellman_weight"],
                              discount_weight=variant["discount_weight"],
                              batch_size=variant["batch_size"],
                              monitor=monitor)

    critic = GAE(critic, gamma=variant["gamma"], lamb=variant["lamb"])

    actor = ImportanceSampling(policy,
                               old_policy,
                               critic,
                               gamma=variant["gamma"],
                               old_update_every=variant["num_trains_per_step"],
                               batch_size=variant["batch_size"],
                               monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"],
                       policy=policy,
                       old_policy=old_policy,
                       vf=vf,
                       target_vf=target_vf)

    trainer = LocalTrainer(sampler, [buffer, buffer, buffer],
                           [actor, critic, tuner],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
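
The TRPO launcher takes the same keys as sac plus delta (the KL-constraint radius passed to KLConstraint) and lamb (the GAE lambda). As before, the values and environment class below are placeholders, not values from the original project.

variant = dict(
    logging_dir="./trpo_experiment", reward_scale=1.0,
    hidden_size=256, tau=5e-3, learning_rate=3e-4,
    delta=0.01, lamb=0.95,
    gamma=0.99, bellman_weight=1.0, discount_weight=1.0,
    batch_size=256, max_size=1000000, max_path_length=1000,
    num_warm_up_paths=10, num_exploration_paths=1, num_evaluation_paths=10,
    num_threads=4, num_steps=10000, num_trains_per_step=100)

trpo(variant, SomeRoboticsEnv)    # SomeRoboticsEnv: placeholder environment class
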