Example #1
0
    def _create_networks(env, config):
        """ Creates all networks necessary for SAC.

        These networks have to be created before instantiating this class and
        used in the constructor.

        TODO: Maybe this should be reworked one day...

        Args:
            env: Environment whose observation/action space shapes size the
                networks (flattened via ``np.prod``).
            config: A configuration dictionary; reads
                ``rl_algorithm_config.net_size`` and
                ``rl_algorithm_config.network_depth``.

        Returns:
            A dictionary which contains the networks, keyed by
            'qf1', 'qf2', 'qf1_target', 'qf2_target' and 'policy'.
        """
        obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = int(np.prod(env.action_space.shape))
        net_size = config['rl_algorithm_config']['net_size']
        hidden_sizes = [net_size] * config['rl_algorithm_config']['network_depth']

        def make_qf():
            # All four critics share the same Q(s, a) -> scalar architecture.
            return FlattenMlp(
                hidden_sizes=hidden_sizes,
                input_size=obs_dim + action_dim,
                output_size=1,
            ).to(device=ptu.device)

        qf1 = make_qf()
        qf2 = make_qf()
        qf1_target = make_qf()
        qf2_target = make_qf()
        policy = TanhGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        ).to(device=ptu.device)

        # Clip gradients elementwise on the trainable nets; target nets
        # receive no gradients, so they get no hooks.
        clip_value = 1.0
        for net in (qf1, qf2, policy):
            for p in net.parameters():
                # Bind the bound as a default arg to make the closure explicit.
                p.register_hook(
                    lambda grad, c=clip_value: torch.clamp(grad, -c, c))

        return {'qf1' : qf1, 'qf2' : qf2, 'qf1_target' : qf1_target, 'qf2_target' : qf2_target, 'policy' : policy}
Example #2
0
def experiment(variant):
    """Build and train HER-SAC on the SawyerXYZ environment."""
    env = SawyerXYZEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim

    # Q and V networks are goal-conditioned: the goal is appended to inputs.
    qf = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    vf = ConcatMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs'],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    replay_buffer = SimpleHerReplayBuffer(
        env=env,
        **variant['replay_buffer_kwargs'],
    )
    algorithm = HerSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #3
0
File: sac.py Project: jcoreyes/erl
def experiment(variant):
    """Train vanilla SAC on a flattened multitask SawyerXYZ environment."""
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = ConcatMlp(input_size=obs_dim + action_dim,
                   output_size=1,
                   **variant['qf_kwargs'])
    vf = ConcatMlp(input_size=obs_dim,
                   output_size=1,
                   **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
0
def experiment(variant):
    """Run SAC on the normalized Pointmass-v1 gym environment."""
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # Two hidden layers of equal width for all networks.
    width = variant['net_size']
    qf = FlattenMlp(hidden_sizes=[width, width],
                    input_size=obs_dim + action_dim,
                    output_size=1)
    vf = FlattenMlp(hidden_sizes=[width, width],
                    input_size=obs_dim,
                    output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=[width, width],
                                obs_dim=obs_dim,
                                action_dim=action_dim)
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #5
0
def experiment(variant):
    """Train URL twin-SAC inside a discriminator-wrapped gym environment."""
    wrapped_env = gym.make(variant['env_name'])
    obs_dim = wrapped_env.observation_space.spaces['observation'].low.size

    width = variant['net_size']

    disc = Discriminator(input_size=obs_dim,
                         output_size=variant['disc_kwargs']['num_skills'],
                         hidden_sizes=[width, width],
                         **variant['disc_kwargs'])

    env = DiscriminatorWrappedEnv(wrapped_env=wrapped_env,
                                  disc=disc,
                                  **variant['env_kwargs'])

    context_dim = env.context_dim
    action_dim = wrapped_env.action_space.low.size

    # Critics condition on (obs, context, action); value net and policy on
    # (obs, context).
    qf1 = FlattenMlp(input_size=obs_dim + context_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[width, width])
    qf2 = FlattenMlp(input_size=obs_dim + context_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[width, width])
    vf = FlattenMlp(input_size=obs_dim + context_dim,
                    hidden_sizes=[width, width],
                    output_size=1)
    policy = TanhGaussianPolicy(obs_dim=obs_dim + context_dim,
                                action_dim=action_dim,
                                hidden_sizes=[width, width])
    replay_buffer = ObsDictPathReplayBuffer(
        env=env,
        max_path_length=variant['algo_kwargs']['max_path_length'],
        observation_key='observation',
        context_key='context',
        **variant['replay_buffer_kwargs'],
    )
    algorithm = UrlTwinSac(
        replay_buffer=replay_buffer,
        url_kwargs=dict(
            observation_key='observation',
            context_key='context',
            fitting_period=1,
            env_loss_key='discriminator loss',
        ),
        tsac_kwargs=dict(
            env=env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
        ),
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #6
0
def experiment(variant):
    """Train PEARL-style proto-SAC on a normalized multi-task point env."""
    env = NormalizedBoxEnv(PointEnv(**variant['task_params']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
    reward_dim = 1
    # With an information bottleneck the encoder emits mean and variance,
    # doubling its output dimension.
    if variant['algo_params']['use_information_bottleneck']:
        task_enc_output_dim = latent_dim * 2
    else:
        task_enc_output_dim = latent_dim

    net_size = variant['net_size']
    # Start with a linear task encoding; recurrent variant is optional.
    encoder_model = (RecurrentEncoder
                     if variant['algo_params']['recurrent']
                     else MlpEncoder)
    # Deeper net + higher-dim space generalize better for the encoder.
    task_enc = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=obs_dim + action_dim + reward_dim,
        output_size=task_enc_output_dim,
    )
    hidden = [net_size, net_size, net_size]
    qf1 = FlattenMlp(hidden_sizes=hidden,
                     input_size=obs_dim + action_dim + latent_dim,
                     output_size=1)
    qf2 = FlattenMlp(hidden_sizes=hidden,
                     input_size=obs_dim + action_dim + latent_dim,
                     output_size=1)
    vf = FlattenMlp(hidden_sizes=hidden,
                    input_size=obs_dim + latent_dim,
                    output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=hidden,
                                obs_dim=obs_dim + latent_dim,
                                latent_dim=latent_dim,
                                action_dim=action_dim)
    agent = ProtoAgent(
        latent_dim,
        [task_enc, policy, qf1, qf2, vf],
        **variant['algo_params'],
    )

    # The last 20 tasks are held out for evaluation.
    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-20]),
        eval_tasks=list(tasks[-20:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()
Example #7
0
def experiment(variant):
    """Meta-SAC over a combinatorial grid of environment specifications.

    Env-spec values that are lists are expanded combinatorially via
    VariantGenerator; scalar values are shared across all combinations.
    """
    env_specs = variant['env_specs']
    env_specs_vg = VariantGenerator()
    env_spec_constants = {}
    for key, value in env_specs.items():
        if isinstance(value, list):
            env_specs_vg.add(key, value)
        else:
            env_spec_constants[key] = value

    env_specs_list = []
    for spec in env_specs_vg.variants():
        del spec['_hidden_keys']
        spec.update(env_spec_constants)
        env_specs_list.append(spec)
    print(env_specs_list)

    print(env_specs_list[0])
    env_sampler = EnvSampler(env_specs_list)

    # Set up similar to the non-meta version, using one sampled env to
    # read the space shapes.
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    width = variant['net_size']
    qf = FlattenMlp(hidden_sizes=[width, width],
                    input_size=obs_dim + action_dim + meta_params_dim,
                    output_size=1)
    vf = FlattenMlp(hidden_sizes=[width, width],
                    input_size=obs_dim + meta_params_dim,
                    output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=[width, width],
                                obs_dim=obs_dim + meta_params_dim,
                                action_dim=action_dim)
    algorithm = MetaSoftActorCritic(env_sampler=env_sampler,
                                    policy=policy,
                                    qf=qf,
                                    vf=vf,
                                    **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #8
0
def experiment(variant):
    """Multi-agent SAC on a shared CartPole task (mode 3).

    Builds a policy and centralized twin critics (with targets) per agent,
    then trains with MASACTrainer inside a TorchBatchRLAlgorithm loop.

    Args:
        variant: Config dict with keys 'num_agent', 'policy_kwargs',
            'qf_kwargs', 'replay_buffer_size', 'trainer_kwargs' and
            'algorithm_kwargs'.
    """
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, eval_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], [], []
    # Centralized critics see every agent's observation and action.
    critic_input_size = obs_dim * num_agent + action_dim * num_agent
    for _ in range(num_agent):
        policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                    action_dim=action_dim,
                                    **variant['policy_kwargs'])
        eval_policy = MakeDeterministic(policy)
        qf1 = FlattenMlp(input_size=critic_input_size,
                         output_size=1,
                         **variant['qf_kwargs'])
        # Targets start as exact copies of their source critics.
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=critic_input_size,
                         output_size=1,
                         **variant['qf_kwargs'])
        # Bug fix: was copy.deepcopy(qf1). The second target must mirror qf2,
        # otherwise both targets track the same critic and the twin-Q minimum
        # degenerates.
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = MASACTrainer(env=expl_env,
                           qf1_n=qf1_n,
                           target_qf1_n=target_qf1_n,
                           qf2_n=qf2_n,
                           target_qf2_n=target_qf2_n,
                           policy_n=policy_n,
                           **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
0
def experiment(variant):
    """Twin-SAC on a flat-goal, normalized Point2D environment."""
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)

    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    def _critic():
        # All four critics share the same Q(s, a) -> scalar architecture.
        return ConcatMlp(input_size=obs_dim + action_dim,
                         output_size=1,
                         **variant['qf_kwargs'])

    qf1, qf2 = _critic(), _critic()
    target_qf1, target_qf2 = _critic(), _critic()
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TwinSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        data_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #10
0
def get_sac(evaluation_environment, parameters):
    """Construct SAC networks and trainer.

    :param evaluation_environment: environment used to read the observation
        and action space shapes; also passed to the trainer as its env.
    :param parameters: dict with keys 'hidden_sizes_qf',
        'hidden_sizes_policy' and 'trainer_params'.
    :return: (sac_policy, eval_policy, trainer)
    """
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size

    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']

    def _make_qf():
        # All four critics share the same Q(s, a) -> scalar architecture.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=hidden_sizes_qf,
        )

    qf1 = _make_qf()
    qf2 = _make_qf()
    target_qf1 = _make_qf()
    target_qf2 = _make_qf()

    sac_policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )

    eval_policy = MakeDeterministic(sac_policy)

    trainer = SACTrainer(env=evaluation_environment,
                         policy=sac_policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **parameters['trainer_params'])

    return sac_policy, eval_policy, trainer
Example #11
0
def her_twin_sac_experiment(variant):
    """HER + twin-SAC on a goal-conditioned env built from the variant."""
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs'],
    )
    # Dimensions are read from the raw (unnormalized) env.
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = ConcatMlp(input_size=obs_dim + goal_dim,
                   output_size=1,
                   **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = HerTwinSac(env,
                           qf1=qf1,
                           qf2=qf2,
                           vf=vf,
                           policy=policy,
                           replay_buffer=replay_buffer,
                           observation_key=observation_key,
                           desired_goal_key=desired_goal_key,
                           **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        for module in (qf1, qf2, vf, policy, algorithm):
            module.to(ptu.device)
    algorithm.train()
Example #12
0
def experiment(variant):
    """Expected-SAC on a normalized HalfCheetah environment."""
    env = NormalizedBoxEnv(HalfCheetahEnv())

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    width = variant['net_size']
    qf = ConcatMlp(hidden_sizes=[width, width],
                   input_size=obs_dim + action_dim,
                   output_size=1)
    vf = ConcatMlp(hidden_sizes=[width, width],
                   input_size=obs_dim,
                   output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=[width, width],
                                obs_dim=obs_dim,
                                action_dim=action_dim)
    # TODO(vitchyr): just creating the QFPolicyPlotter crashes EC2, so no
    # plotter / render_eval_paths are passed here.
    algorithm = ExpectedSAC(env=env,
                            policy=policy,
                            qf=qf,
                            vf=vf,
                            **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #13
0
def run_sac(base_expl_env, base_eval_env, variant):
    """Standard SAC training over flat-goal-wrapped environments."""
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    hidden = [variant["layer_size"]] * variant["num_hidden_layers"]

    def _critic():
        # Shared critic architecture: Q(s, a) -> scalar.
        return FlattenMlp(input_size=obs_dim + action_dim,
                          output_size=1,
                          hidden_sizes=hidden)

    qf1, qf2 = _critic(), _critic()
    target_qf1, target_qf2 = _critic(), _critic()
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=hidden)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.train()
Example #14
0
def experiment(variant):
    """Twin-Q SAC on sparse cartpole swing-up (skill machinery disabled).

    ``skill_dim`` is currently 0, which disables the DIAYN-style skill
    conditioning; a discriminator net is still constructed but is not
    passed to the algorithm.
    """
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())

    skill_dim = 0  # e.g. 50 to re-enable skill conditioning
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    width = variant['net_size']
    qf1 = FlattenMlp(hidden_sizes=[width, width],
                     input_size=obs_dim + skill_dim + action_dim,
                     output_size=1)
    qf2 = FlattenMlp(hidden_sizes=[width, width],
                     input_size=obs_dim + skill_dim + action_dim,
                     output_size=1)
    vf = FlattenMlp(hidden_sizes=[width, width],
                    input_size=obs_dim + skill_dim,
                    output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=[width, width],
                                obs_dim=obs_dim + skill_dim,
                                action_dim=action_dim)
    disc = FlattenMlp(hidden_sizes=[width, width],
                      input_size=obs_dim,
                      output_size=skill_dim if skill_dim > 0 else 1)
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf1=qf1,
                                qf2=qf2,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
0
def experiment(variant):
    """Run SAC on ReacherEnv with meta-variable-augmented network inputs.

    Args:
        variant: Config dict; reads 'net_size' and 'algo_params'.

    Returns:
        int: Always 1 (apparently used as a success flag by the launcher).
    """
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))

    env = ReacherEnv()
    training_env = ReacherEnv()
    
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # NOTE(review): `exp_specs` is not defined in this function and is not a
    # parameter — presumably a module-level global set by the launcher;
    # confirm it is populated before this runs (should this be `variant`?).
    total_meta_variable_dim = 0
    for dims in exp_specs['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    # Q, V and policy all take the observation concatenated with the
    # meta-variable vector.
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #16
0
def vanilla_nets(env, n_lay_nodes, n_depth, clip_val=1):
    """Build twin-Q SAC networks with targets and a tanh-Gaussian policy.

    Gradient-clipping hooks are registered on the trainable networks only;
    the target networks receive no gradients and get no hooks.
    """
    hidden = [n_lay_nodes] * n_depth

    obs_size = env.observation_space.shape[0]
    act_size = env.action_space.shape[0]

    def _q_net():
        # Shared critic architecture: Q(s, a) -> scalar.
        return FlattenMlp(
            hidden_sizes=hidden,
            input_size=obs_size + act_size,
            output_size=1,
        ).to(device=torch_util.device)

    q1_net = _q_net()
    q2_net = _q_net()
    policy_net = TanhGaussianPolicy(
        hidden_sizes=hidden,
        obs_dim=obs_size,
        action_dim=act_size,
    ).to(device=torch_util.device)
    target_q1_net = _q_net()
    target_q2_net = _q_net()

    for net in (q1_net, q2_net, policy_net):
        for param in net.parameters():
            param.register_hook(
                lambda grad: torch.clamp(grad, -clip_val, clip_val))

    return dict(policy_net=policy_net,
                q1_net=q1_net,
                q2_net=q2_net,
                target_q1_net=target_q1_net,
                target_q2_net=target_q2_net)
def experiment(variant):
    """TD3+BC on the Sawyer human-control env (joint-space impedance)."""
    # max_speed does not actually do anything; it is now handled inside
    # request_angle_action of sawyer_env_base.
    env = SawyerHumanControlEnv(action_mode='joint_space_impd',
                                position_action_scale=1,
                                max_speed=0.015)
    training_env = env

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    width = variant['net_size']

    qf1 = FlattenMlp(hidden_sizes=[width, width],
                     input_size=obs_dim + action_dim,
                     output_size=1)
    qf2 = FlattenMlp(hidden_sizes=[width, width],
                     input_size=obs_dim + action_dim,
                     output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=[width, width],
                                obs_dim=obs_dim,
                                action_dim=action_dim)
    # Gaussian exploration noise wraps the deterministic policy output.
    es = GaussianStrategy(action_space=env.action_space,
                          **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3BC(env=env,
                      policy=policy,
                      qf1=qf1,
                      qf2=qf2,
                      exploration_policy=exploration_policy,
                      **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
Example #18
0
def experiment(variant):
    """Plain SAC baseline on normalized HalfCheetah."""
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    width = variant["layer_size"]

    def _critic():
        # Q(s, a) -> scalar with two equal-width hidden layers.
        return FlattenMlp(input_size=obs_dim + action_dim,
                          output_size=1,
                          hidden_sizes=[width, width])

    qf1, qf2 = _critic(), _critic()
    target_qf1, target_qf2 = _critic(), _critic()
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[width, width])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #19
0
def experiment(variant):
    """SAC on the MultiGoal env; a Q/policy plotter is built but not
    wired into the algorithm."""
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = ConcatMlp(hidden_sizes=[100, 100],
                   input_size=obs_dim + action_dim,
                   output_size=1)
    vf = ConcatMlp(hidden_sizes=[100, 100],
                   input_size=obs_dim,
                   output_size=1)
    policy = TanhGaussianPolicy(hidden_sizes=[100, 100],
                                obs_dim=obs_dim,
                                action_dim=action_dim)
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                # plotter=plotter,
                                # render_eval_paths=True,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #20
0
def experiment(variant):
    """Train DIAYN skill discovery on HalfCheetah-v2.

    The observation fed to the critics/policy is the raw environment
    observation concatenated with a skill code, so their input width is
    env obs dim + num_skills. The discriminator receives only the raw
    observation (skill stripped off) and predicts the skill.

    Args:
        variant: Config dict with `num_skills`, `net_size`, and
            `algo_params` (forwarded to DIAYN).
    """
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))

    num_skills = variant['num_skills']
    # Observation dim includes the dimensionality of the latent skill.
    obs_dim = int(np.prod(env.observation_space.shape)) + num_skills
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden = [net_size, net_size]
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden,
    )

    # Discriminator maps the skill-free observation to per-skill scores
    # through a sigmoid output layer.
    discrim = FlattenMlp(
        input_size=obs_dim - num_skills,
        output_size=num_skills,
        hidden_sizes=hidden,
        output_activation=nn.Sigmoid(),
    )

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden,
    )
    algorithm = DIAYN(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        discrim=discrim,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #21
0
def get_sac_trainer(env, hidden_sizes=(256, 256), reward_scale=1):
    """Build a SACTrainer (twin critics + targets + policy) for `env`.

    Fixes: the original used a mutable list as the `hidden_sizes`
    default argument; a tuple default is safe and backward-compatible
    (callers may still pass lists). The four identical critic
    constructions are also factored into one local builder.

    Args:
        env: Environment with Box observation and action spaces; only
            its space sizes are read here.
        hidden_sizes: Hidden layer widths shared by all networks.
        reward_scale: Reward scaling passed through to SACTrainer.

    Returns:
        A configured SACTrainer instance.
    """
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    def _make_critic():
        # One state-action Q network: (obs, action) -> scalar value.
        return ConcatMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=hidden_sizes,
        )

    qf1 = _make_critic()
    qf2 = _make_critic()
    target_qf1 = _make_critic()
    target_qf2 = _make_critic()
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes,
    )

    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         discount=0.99,
                         soft_target_tau=5e-3,
                         target_update_period=1,
                         policy_lr=3E-4,
                         qf_lr=3E-4,
                         reward_scale=reward_scale,
                         use_automatic_entropy_tuning=True)
    return trainer
Example #22
0
def experiment(variant):
    """Train SoftActorCritic on the env class named in `variant`.

    All network and algorithm hyperparameters come from the
    `qf_kwargs`, `vf_kwargs`, `policy_kwargs`, and `algo_kwargs`
    sub-dictionaries of `variant`.
    """
    env = NormalizedBoxEnv(variant['env_class']())

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs'],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #23
0
    def __init__(self, variant, goal, candidate_size=10):
        """Set up the environment, context encoder, and PEARL agent.

        Args:
            variant: Config dict with `env_name`, `latent_size`,
                `net_size`, `num_evals`, `max_path_length`, and an
                `algo_params` sub-dict.
            goal: Goal passed to the environment producer.
            candidate_size: Unused here; kept for interface
                compatibility with callers.
        """
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

        # Clear the CLI args so downstream argument parsers see nothing.
        import sys
        sys.argv = ['']
        del sys

        self.env = env_producer(variant['env_name'], seed=0, goal=goal)
        obs_dim = int(np.prod(self.env.observation_space.shape))
        action_dim = int(np.prod(self.env.action_space.shape))
        reward_dim = 1

        # Instantiate networks.
        algo_params = variant['algo_params']
        latent_dim = variant['latent_size']
        # Context transitions are (obs, action, reward[, next_obs]).
        context_encoder_input_dim = obs_dim + action_dim + reward_dim
        if algo_params['use_next_obs_in_context']:
            context_encoder_input_dim += obs_dim
        # With an information bottleneck the encoder emits mean and
        # variance, hence twice the latent size.
        if algo_params['use_information_bottleneck']:
            context_encoder_output_dim = latent_dim * 2
        else:
            context_encoder_output_dim = latent_dim
        net_size = variant['net_size']
        encoder_model = (RecurrentEncoder if algo_params['recurrent']
                         else MlpEncoder)

        context_encoder = encoder_model(
            hidden_sizes=[200, 200, 200],
            input_size=context_encoder_input_dim,
            output_size=context_encoder_output_dim,
        )

        policy = TanhGaussianPolicy(
            hidden_sizes=[net_size] * 3,
            obs_dim=obs_dim + latent_dim,
            latent_dim=latent_dim,
            action_dim=action_dim,
        )
        self.agent = PEARLAgent(latent_dim, context_encoder, policy,
                                **algo_params)
        self.num_evals = variant['num_evals']
        self.max_path_length = variant['max_path_length']
Example #24
0
def experiment(variant):
    """Train SoftActorCritic against a remote, farmed environment.

    A Farmer hands out environments hosted on the machines listed in
    `farmlist_base`; the acquired remote environment is wrapped with
    NormalizedBoxEnv before training.
    """
    farmlist_base = [('123.123.123.123', 4)]

    farmer = Farmer(farmlist_base)
    env = NormalizedBoxEnv(acq_remote_env(farmer))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden = [variant['net_size']] * 2
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        environment_farming=True,
        farmlist_base=farmlist_base,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #25
0
def experiment(variant):
    """Train TwinSAC (twin Q critics + V critic) on the env class
    named in `variant`, with all network kwargs taken from `variant`.
    """
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    def _q_net():
        # State-action critic: (obs, action) -> scalar Q value.
        return ConcatMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            **variant['qf_kwargs'],
        )

    qf1 = _q_net()
    qf2 = _q_net()
    vf = ConcatMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    algorithm = TwinSAC(
        env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #26
0
def experiment(variant):
    """Train TwinSAC on HalfCheetah-v2.

    Builds two Q critics, a V critic, and a tanh-Gaussian policy, all
    two-layer MLPs of width `variant['net_size']`.
    """
    import gym
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden = [variant['net_size']] * 2
    qf1, qf2 = (
        FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=hidden,
        )
        for _ in range(2)
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden,
    )
    algorithm = TwinSAC(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC against a force-acquired remote (farmed) environment.

    Text/tabular logs and snapshots are written to the working
    directory before the remote environment is acquired and wrapped.
    """
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden = [variant['net_size']] * 2
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #28
0
def experiment(variant):
    """Train vanilla SAC on HalfCheetah-v1.

    Builds a Q critic, a V critic, and a tanh-Gaussian policy, all
    two-layer MLPs of width `variant['net_size']`.
    """
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden = [variant['net_size']] * 2
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #29
0
def experiment(variant):
    """Train SAC on the SawyerBinsCan manipulation task.

    NOTE(review): the policy receives `action_skip=ACTION_SKIP` —
    presumably this repeats each sampled action for that many env
    steps; confirm against TanhGaussianPolicy's implementation.
    """
    env = MujocoManipEnv("SawyerBinsCanEnv")  # wrap as a gym env
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden = [variant['net_size']] * 2
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        action_skip=ACTION_SKIP,
        hidden_sizes=hidden,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #30
0
def experiment(variant):
    """Profile a SAC training run on the MultiGoal point environment.

    Training executes under torch's autograd profiler; the collected
    trace is exported in Chrome trace format afterwards.
    """
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[100, 100],
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    # Viewable in chrome://tracing.
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")