def run_experiment(*_):

    env = normalize(VoltVarEnv())

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)

    sampler = SimpleSampler(max_path_length=168,
                            min_pool_size=100,
                            batch_size=256)

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=1000,
        n_epochs=1000,
        n_initial_exploration_steps=10000,
        n_train_repeat=1,
        #eval_render=False,
        eval_n_episodes=50,
        eval_deterministic=False)

    qf1 = NNQFunction2(env_spec=env.spec,
                       hidden_layer_sizes=[64, 32],
                       name='qf1')

    qf2 = NNQFunction2(env_spec=env.spec,
                       hidden_layer_sizes=[64, 32],
                       name='qf2')

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32])

    initial_exploration_policy = UniformPolicy2(env_spec=env.spec)

    policy = CategoricalPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 32),
    )

    algo = SACD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        #plotter=plotter,
        lr=1e-3,
        scale_reward=10.0,
        discount=0.99,
        tau=1e-4,
        target_update_interval=1,
        #reparameterize=False,
        save_full_state=False)

    algo.train()
Example #2
def run(*_):
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(
        max_replay_buffer_size=1e6,
        env_spec=env.spec,
    )

    base_kwargs = dict(min_pool_size=30,
                       epoch_length=1000,
                       n_epochs=1000,
                       max_path_length=30,
                       batch_size=64,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=True)

    M = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = GMMPolicy(env_spec=env.spec,
                       K=4,
                       hidden_layer_sizes=[M, M],
                       qf=qf,
                       reg=0.001)

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3E-4,
                    scale_reward=3,
                    discount=0.99,
                    tau=0.001,
                    save_full_state=True)
    algorithm.train()
Example #3
def run_experiment(variant):
    env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        save_full_state=False,
    )

    algorithm.train()
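For reference, the keys this runner reads from `variant` imply a flat configuration dictionary along the following lines; every value below is an illustrative placeholder, not a setting taken from the source.

# Hypothetical variant for the GymEnv runner above; keys mirror the lookups
# in run_experiment, values are examples only.
variant = dict(
    env_name='HalfCheetah-v1',  # any id accepted by GymEnv
    max_pool_size=int(1e6),
    max_path_length=1000,
    epoch_length=1000,
    n_epochs=1000,
    batch_size=128,
    n_train_repeat=1,
    layer_size=128,
    K=4,                        # number of GMM mixture components
    lr=3e-4,
    scale_reward=1.0,
    discount=0.99,
    tau=0.01,
)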
Example #4
def run(variant):
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)
    sampler = SimpleSampler(max_path_length=30,
                            min_pool_size=100,
                            batch_size=64)
    base_kwargs = dict(sampler=sampler,
                       epoch_length=1000,
                       n_epochs=1000,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=False)

    M = 128
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    if variant['policy_type'] == 'gmm':
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[M, M],
                           qf=qf,
                           reg=0.001)
    elif variant['policy_type'] == 'lsp':
        bijector_config = {
            "scale_regularization": 0.0,
            "num_coupling_layers": 2,
            "translation_hidden_sizes": (M, ),
            "scale_hidden_sizes": (M, ),
        }

        policy = LatentSpacePolicy(env_spec=env.spec,
                                   mode="train",
                                   squash=True,
                                   bijector_config=bijector_config,
                                   observations_preprocessor=None)

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3e-4,
                    scale_reward=3.0,
                    discount=0.99,
                    tau=1e-4,
                    save_full_state=True)
    algorithm.train()
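This runner reads only a single key from `variant`; an illustrative call follows (the chosen value is just one of the two branches handled above).

# Illustrative only: the one key this runner uses.
run(variant={'policy_type': 'gmm'})  # or {'policy_type': 'lsp'}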
Example #5
def run_experiment(variant):
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Session() as sess:
        data = joblib.load(variant['snapshot_filename'])
        policy = data['policy']
        env = data['env']

        num_skills = data['policy'].observation_space.flat_dim - data[
            'env'].spec.observation_space.flat_dim
        best_z = get_best_skill(policy, env, num_skills,
                                variant['max_path_length'])
        fixed_z_env = FixedOptionEnv(env, num_skills, best_z)

        tf.logging.info('Finetuning best skill...')

        pool = SimpleReplayBuffer(
            env_spec=fixed_z_env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=variant['max_path_length'],
            epoch_length=variant['epoch_length'],
            n_epochs=variant['n_epochs'],
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            eval_deterministic=True,
        )

        M = variant['layer_size']

        if variant['use_pretrained_values']:
            qf = data['qf']
            vf = data['vf']
        else:
            del data['qf']
            del data['vf']

            qf = NNQFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='qf-finetune',
            )

            vf = NNVFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='vf-finetune',
            )

        algorithm = SAC(
            base_kwargs=base_kwargs,
            env=fixed_z_env,
            policy=policy,
            pool=pool,
            qf=qf,
            vf=vf,
            lr=variant['lr'],
            scale_reward=variant['scale_reward'],
            discount=variant['discount'],
            tau=variant['tau'],
            save_full_state=False,
        )

        algorithm.train()
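`get_best_skill` is not shown in this excerpt. A minimal sketch of a plausible implementation follows, assuming it simply rolls each skill out once with the skill-conditioned policy and returns the index with the highest undiscounted return; this is an assumption for illustration, not the source's code.

import numpy as np

def get_best_skill(policy, env, num_skills, max_path_length):
    # Assumed behaviour: evaluate every fixed skill once and keep the best.
    returns = []
    for z in range(num_skills):
        fixed_env = FixedOptionEnv(env, num_skills, z)  # appends one-hot z to obs
        obs = fixed_env.reset()
        total_reward, done, steps = 0.0, False, 0
        while not done and steps < max_path_length:
            action, _ = policy.get_action(obs)
            obs, reward, done, _ = fixed_env.step(action)
            total_reward += reward
            steps += 1
        returns.append(total_reward)
    return int(np.argmax(returns))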
Example #6
def run_experiment(variant):
    sub_level_policies_paths = []
    # args = parse_args()
    args = arg()
    domain = ENVIRONMENTS[args.domain][args.task]
    if args.domain == 'sawyer-reach':
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        lower_goal_range = [-0.1, -0.1, -0.1]
        upper_goal_range = [0.1, 0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    domain(
                        # playable params
                        random_arm_init=random_arm_init,
                        lower_goal_range=lower_goal_range,
                        upper_goal_range=upper_goal_range,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    else:
        raise ValueError("Domain not available")

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # one step shorter than the environment horizon
        min_pool_size=1000,
        batch_size=256)
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #7
def run_experiment(param):
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250
    env = normalize(
        CRLWrapper(
            # IKWrapper(
                SawyerReach(
                    # playable params
                    random_arm_init=random_arm_init,
                    lower_goal_range=lower_goal_range,
                    upper_goal_range=upper_goal_range,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100, )
            # )
        )
    )
    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }

    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }


    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 2e3
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #8
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']
    # Register the MECS environment variants. Most entry points follow the
    # 'sac.envs.env_V_sweep_v{N}:MEC_v{N}' pattern; the irregular ones
    # (v1, v61, v7, v8) are listed explicitly.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
    }
    for n in range(2, 31):
        mecs_entry_points.setdefault(
            'MECS-v{}'.format(n), 'sac.envs.env_V_sweep_v{}:MEC_v{}'.format(n, n))
    for env_id, entry_point in mecs_entry_points.items():
        register(
            id=env_id,
            entry_point=entry_point,
            max_episode_steps=5000,
        )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M,M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale']*algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
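The nested `variant` this runner expects can be read off from the lookups above. The sketch below covers the 'gaussian' policy branch; the keys come from the code, while every value is an illustrative placeholder rather than a setting from the source.

variant = {
    'domain': 'mecs',          # looked up in ENVIRONMENTS[domain][task]
    'task': 'default',
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 256},
    'algorithm_params': {
        'cost_type': 'default',
        'base_kwargs': dict(epoch_length=1000, n_epochs=1000, n_train_repeat=1,
                            eval_render=False, eval_n_episodes=1,
                            eval_deterministic=True),
        'lr': 3e-4,
        'scale': 1.0,
        'scale_reward': 1.0,
        'discount': 0.99,
        'tau': 0.005,
        'reparameterize': True,
        'target_update_interval': 1,
    },
    'replay_buffer_params': {'max_replay_buffer_size': 1e6},
    'sampler_params': {'max_path_length': 1000, 'min_pool_size': 1000,
                       'batch_size': 256},
}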
Example #9
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(min_pool_size=variant['max_path_length'],
                       epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       max_path_length=variant['max_path_length'],
                       batch_size=variant['batch_size'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=SimpleSampler(
                           max_path_length=variant["max_path_length"],
                           min_pool_size=variant["max_path_length"],
                           batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )

    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(base_kwargs=base_kwargs,
                      env=env,
                      policy=policy,
                      discriminator=discriminator,
                      pool=pool,
                      qf=qf,
                      vf=vf,
                      lr=variant['lr'],
                      scale_entropy=variant['scale_entropy'],
                      discount=variant['discount'],
                      tau=variant['tau'],
                      num_skills=variant['num_skills'],
                      save_full_state=False,
                      include_actions=variant['include_actions'],
                      learn_p_z=variant['learn_p_z'],
                      add_p_z=variant['add_p_z'],
                      reparametrize=variant["reparametrize"])

    algorithm.train()
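The augmented observation space built above appends `num_skills` extra dimensions bounded in [0, 1] to the raw observation. A minimal sketch of forming such a skill-augmented observation with a one-hot skill code follows; the helper name is hypothetical and not taken from the source.

import numpy as np

def concat_obs_z(obs, z, num_skills):
    # Append a one-hot encoding of skill index z, matching the extra
    # [0, 1]-bounded dimensions of aug_obs_space above.
    one_hot = np.zeros(num_skills)
    one_hot[z] = 1.0
    return np.hstack([obs, one_hot])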
Example #10
def run_experiment(variant):
    # print('MuJoCo')
    # env = normalize(GymEnv('HalfCheetah-v1'))
    # -----------------------------------------------------
    print('Unity3D environment')
    env = UnityEnv('/home/recharrs/Apps/UnityEnvob3/RollerBall.x86_64', time_state=True, idx=args.idx, no_graphics=args.no_graphics)
    # -----------------------------------------------------
    obs_space = env.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low.flatten(), np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high.flatten(), np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=5000,
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=0,  # must be set to 0 here, otherwise evaluation raises an error
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
    )

    algorithm.train()
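This runner reads `args.idx` and `args.no_graphics` from a module-level `args` object that is not shown in the excerpt. A minimal argparse sketch that would provide those attributes follows (an assumption; the flag names are illustrative).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--idx', type=int, default=0)  # Unity worker index
parser.add_argument('--no-graphics', dest='no_graphics', action='store_true')
args = parser.parse_args()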
Example #11
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )


    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],

        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),

    )

    algorithm.train()
Example #12
def main(env, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode):

    tf.set_random_seed(seed=seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    if policy_mode == GMMPolicy:
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    else:
        _, mode = str(policy_mode).split('-')
        if _ != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration"
            )
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #13
def run_experiment(variant):
    sub_level_policies_paths = []
    args = arg()

    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # one step shorter than the environment horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #14
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']
    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']
    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
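The export above flattens every trainable variable in the selected scopes into a single comma-separated vector. A sketch of loading such a file back, assuming an identically rebuilt graph so that variable order and shapes match (the helper is hypothetical, not part of the source):

import numpy as np
import tensorflow as tf

def restore_flat_weights(sess, path, scope):
    # Assumes the graph was rebuilt identically, so variables come back in
    # the same order and with the same shapes as when they were exported.
    flat = np.loadtxt(path, delimiter=',')
    offset = 0
    for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope):
        size = int(np.prod(v.shape.as_list()))
        v.load(flat[offset:offset + size].reshape(v.shape.as_list()), sess)
        offset += size

# e.g. restore_flat_weights(algorithm._sess, variant['policypath'], 'gaussian_policy')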
Example #15
def run_experiment(*_):
    env = normalize(VoltVarEnv())

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)

    sampler = SimpleSampler(max_path_length=168,
                            min_pool_size=100,
                            batch_size=256)

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=1000,
        n_epochs=50,
        n_initial_exploration_steps=10000,
        n_train_repeat=1,
        # eval_render=False,
        eval_n_episodes=10,  #50,
        eval_deterministic=True)

    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=[64, 32],
                      name='qf1')

    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=[64, 32],
                      name='qf2')

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='vf')

    qfc1 = NNQFunction(env_spec=env.spec,
                       hidden_layer_sizes=[64, 32],
                       name='qfc1')

    qfc2 = NNQFunction(env_spec=env.spec,
                       hidden_layer_sizes=[64, 32],
                       name='qfc2')

    vfc = NNVFunction(env_spec=env.spec,
                      hidden_layer_sizes=[64, 32],
                      name='vfc')

    initial_exploration_policy = UniformPolicy2(env_spec=env.spec)

    # policy = GaussianPolicy(
    #     env_spec=env.spec,
    #     hidden_layer_sizes=[64, 32],
    #     reparameterize=True,
    #     reg=1e-3,
    # )
    policy = CategoricalPolicy(env_spec=env.spec, hidden_layer_sizes=[64, 32])

    algo = SACD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        qfc1=qfc1,
        qfc2=qfc2,
        vf=vf,
        vfc=vfc,
        # plotter=plotter,
        lr=1e-3,
        scale_reward=50,  #2.5,  ## 50 bus 4; 10 bus34
        scale_rewardc=50,  # 2.5,  ## 50 bus 4; 10 bus34
        alpha=1,
        constraint_lr=1e-5,  # 1e-5, #1e-6,#bus34 5e-6;
        # constraint_coeff=1,  # 0,
        # constraint_coeff_targ=1,
        discount=0.99,
        tau=5e-4,  #bus34 5e-4;bus123 2.5e-4,;
        target_update_interval=1,
        #reparameterize=True,
        save_full_state=False)

    algo.train()
Example #16
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment, as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: the number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            # algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

        # Set up the ensemble Q-function for action selection.
        self._Q_ensemble = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                value_func_layer_size for _ in range(value_func_layers_number)
            ]),
            name='ensqf')

        # ========================================================================
        # Set up the training target for the ensemble Q-function for action selection.
        # ========================================================================
        # Create the observation placeholder.
        self._observations_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='obv_ens',
        )

        # Create the next observation placeholder.
        self._observations_ens_next_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='next_obv_ens',
        )

        # Create a list of next action placeholders.
        self._acts_next_phs = []
        for i in range(len(q_param_list)):
            act_ens_ph = tf.placeholder(
                tf.float32,
                shape=(None, self._env.spec.action_space.flat_dim),
                name=str(i) + '_next_act_ens',
            )
            self._acts_next_phs.append(act_ens_ph)

        # Create the observed action placeholder.
        self._obv_act_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name='act_obv_ens',
        )

        # Create the reward placeholder.
        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='rew_ens',
        )

        # Create the terminal placeholder.
        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='ter_ens',
        )

        # Determine the target Q-value for next step.
        self._q_ens_targets = []
        for act_next_ph in self._acts_next_phs:
            qt = self._Q_ensemble.get_output_for(
                self._observations_ens_next_ph, act_next_ph, reuse=True)
            self._q_ens_targets.append(qt)

        for i, q_t in enumerate(self._q_ens_targets):
            if i == 0:
                self._q_ens_next = q_t
            else:
                self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
                # self._q_ens_next = self._q_ens_next + q_t
        # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

        # Determine the Q-loss.
        self._q_train = self._Q_ensemble.get_output_for(
            self._observations_ens_ph, self._obv_act_ph, reuse=True)
        self._q_ens_loss = 0.5 * tf.reduce_mean(
            (self._q_train -
             tf.stop_gradient(self._scale_reward * self._rewards_ph +
                              (1 - self._terminals_ph) * self._discount *
                              self._q_ens_next))**2)

        # Determine the Q-training operator.
        self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
            loss=self._q_ens_loss,
            var_list=self._Q_ensemble.get_params_internal())

        # Set up the tensor flow session.
        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())
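
    # NOTE: a minimal sketch (not part of the original example), assuming a replay batch
    # dict with the keys used below and one batch of next actions sampled per AC instance
    # in the ensemble; it shows how the placeholders and training op defined in this
    # constructor could be fed for a single update of the ensemble Q-function.
    def _train_q_ensemble_sketch(self, batch, next_actions_per_instance):
        feed_dict = {
            self._observations_ens_ph: batch['observations'],
            self._observations_ens_next_ph: batch['next_observations'],
            self._obv_act_ph: batch['actions'],
            self._rewards_ph: batch['rewards'],
            self._terminals_ph: batch['terminals'],
        }
        # One next-action batch per AC instance in the ensemble.
        for act_ph, acts in zip(self._acts_next_phs, next_actions_per_instance):
            feed_dict[act_ph] = acts
        _, ens_loss = self._sess.run(
            [self._q_ens_train_operator, self._q_ens_loss], feed_dict=feed_dict)
        return ens_loss
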
def run_experiment(variant):
    # NOTE: the domain is selected from the module-level `args`, not from `variant`.
    domain = None
    goal_size = None
    sub_level_policies_paths = []
    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/hc/fwd/fwd.pkl")
        sub_level_policies_paths.append(
            "primitive-policies/hc/jp-longz/jump.pkl")
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths.append(
            "primitive-policies/pusher/bottom/bottom.pkl")
        sub_level_policies_paths.append(
            "primitive-policies/pusher/left/left.pkl")

    env = normalize(domain())  #CrossMazeAntEnv())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Ejemplo n.º 18
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment, as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function update factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value networks, i.e. the V function and the Q functions.
        :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
        :param policy_func_layers_number: the number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble; each q value in the list corresponds to one AC instance in the ensemble.
        :param use_ucb: whether to use UCB for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation; two strategies are available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
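
    # NOTE: a minimal sketch (not part of the original example) of how the bookkeeping
    # stored in self._alg_instances ([algorithm, moving-average performance, use count,
    # UCB bound]) could drive UCB-based instance selection when use_ucb is True; the
    # exploration constant `c` and the method name are assumptions.
    def _select_instance_ucb_sketch(self, c=1.0):
        # Assumes numpy is imported as np, as elsewhere in these examples.
        total_uses = sum(entry[2] for entry in self._alg_instances) + 1.0
        for entry in self._alg_instances:
            avg_performance, n_uses = entry[1], entry[2]
            if n_uses == 0:
                entry[3] = np.inf  # try every instance at least once
            else:
                entry[3] = avg_performance + c * np.sqrt(
                    2.0 * np.log(total_uses) / n_uses)
        return max(self._alg_instances, key=lambda entry: entry[3])[0]
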
Ejemplo n.º 19
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))
    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size']
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )
    

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],

        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
Ejemplo n.º 20
def main(env_id, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode, eval_model, e, stochastic):
    tf.set_random_seed(seed=seed)

    env = GymEnv(env_id)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    print("here")

    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitationPolicy":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=e)

    else:
        prefix, mode = str(policy_mode).split('-')
        if prefix != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration"
            )
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                vf=vf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    # -------------- setting done ------------------------

    # -------------- main process ------------------------
    with algorithm._sess.as_default():
        algorithm._saver.restore(algorithm._sess, eval_model)

        if stochastic:
            knack_file = os.path.join(os.path.dirname(eval_model),
                                      "array/epoch0_2001.npz")
            final_knacks = np.load(knack_file)['knack_kurtosis'][-1]

        env = algorithm._env

        if hasattr(env, "env"):
            env = env.env

        # np.random.seed(seed)
        # env.seed(seed)
        num_data = 50  # num_data * nprocess == 1500
        steps_thresh = 1000
        data = {'acs': [], 'ep_rets': [], 'obs': [], 'rews': []}
        for i in range(num_data):
            obs = env.reset()
            done = False
            steps = 0
            ret = 0
            tmp_data = {'acs': [], 'obs': [], 'rews': []}
            if stochastic:
                _min = np.min(final_knacks)
                _max = np.max(final_knacks)
            print("start episode {}".format(i))
            while not done:
                steps += 1
                # env.render()
                if stochastic:
                    if hasattr(algorithm.pi, "knack_thresh"):
                        v, mean, var, kurtosis = algorithm._policy.calc_and_update_knack(
                            [obs])
                        knack_value = kurtosis[0]
                        # _min = min(knack_value, _min)
                        # _max = max(knack_value, _max)
                        knack_value = (knack_value - _min) / (_max - _min)
                        if knack_value > 0.8:  ## TODO hyper param
                            print("knack {}".format(knack_value))
                            was = algorithm._policy._is_deterministic
                            algorithm._policy._is_deterministic = True
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                            algorithm._policy._is_deterministic = was
                        else:
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                    else:
                        algorithm._policy._is_deterministic = False
                        action, _ = algorithm.policy.get_action(obs.flatten())
                else:
                    if hasattr(algorithm._policy, "_is_deterministic"):
                        algorithm._policy._is_deterministic = True
                    action, _ = algorithm.policy.get_action(obs.flatten())

                obs_next, rew, done, _ = env.step(action)
                tmp_data['obs'].append(obs)
                tmp_data['acs'].append(action)
                tmp_data['rews'].append(rew)
                ret += rew

                obs = obs_next
                if steps >= steps_thresh:
                    done = True

            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

    # np.savez_compressed("a.npz", **data)
    # print("return mean: {}".format(np.mean(data['ep_rets'])))
    return data
Ejemplo n.º 21
def run_experiment(variant):
    #low_level_policy = load_low_level_policy(
    #    policy_path='/home/rcorona/sac/data/humanoid-rllab/default-humanoid_base-00/itr_0.pkl')#variant['low_level_policy_path'])

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }
    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = base_env
    #env = HierarchyProxyEnv(wrapped_env=base_env,
    #                        low_level_policy=low_level_policy)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = variant['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf1,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf1=qf1,
        vf=vf,
        qf2=qf2,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        initial_exploration_policy=initial_exploration_policy,
        save_full_state=False,
    )

    algorithm.train()
Ejemplo n.º 22
def run_experiment(env, seed, scale_reward,
                   scale_entropy, tsallisQ, num_of_train):
    tf.set_random_seed(seed)

    environmentName = env
    # environmentName = "LunarLanderContinuous-v2"

    print("Experiment: {}".format(environmentName))

    # Set up the PyBullet environment.
    # env = normalize(gym.make(environmentName))
    env = GymEnv(environmentName)

    # Set up the replay buffer.
    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1000000)

    # Set up the sampler.
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    sampler = SimpleSampler(**sampler_params)

    # Set up the value function networks.
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    # Set up the policy network.
    # initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=False,
        reg=1e-3,
    )
    # policy = GMMPolicy(
    #     env_spec=env.spec,
    #     K=1,
    #     hidden_layer_sizes=(M, M),
    #     reparameterize=False,
    #     qf=qf1,
    #     reg=1.0e-3,
    # )

    initial_exploration_policy = policy

    base_kwargs = {
        'epoch_length': 1000,
        'n_train_repeat': num_of_train,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 3,
        'eval_deterministic': True,
    }
    base_kwargs = dict(base_kwargs, sampler=sampler)

    # Schedule functions for annealing the reward/entropy scaling factors (defined but not used in this example).
    def incrementor(itr):
        return (0.5 + (0.8 - 0.5) * tf.minimum(itr / 500000., 1.0))

    def decrementor(itr):
        return (0.8 - (0.8 - 0.6) * tf.minimum(itr / 500000., 1.0))

    algorithm = TAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3.0e-4,
        scale_reward=scale_reward,  # CG: default 1.0, 0.5 for the lunar lander problem, 3.0 for the pendulum problem.
        scale_entropy=scale_entropy,  # CG: default 1.0, 0.8 for the lunar lander problem.
        discount=0.99,
        tau=0.01,
        reparameterize=False,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        tsallisQ=tsallisQ,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Ejemplo n.º 23
def main(trial,
         use_optuna,
         env,
         seed,
         entropy_coeff,
         n_epochs,
         dynamic_coeff,
         clip_norm,
         normalize_obs,
         buffer_size,
         max_path_length,
         min_pool_size,
         batch_size,
         policy_mode,
         eval_model,
         eval_n_episodes,
         eval_n_frequency,
         exploitation_ratio,
         return_queue=None,
         scale_reward=1.):
    if use_optuna:
        logger.configure(logger.get_dir(),
                         log_suffix="_optune{}".format(trial.number),
                         enable_std_out=False)
        logger.set_level(logger.DISABLED)
    tf.set_random_seed(seed=seed)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitation":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=exploitation_ratio)

    else:
        if policy_mode == "Knack-exploration" or policy_mode == "kurtosis":
            metric = "kurtosis"
        elif policy_mode in [
                "signed_variance", "negative_signed_variance",
                "small_variance", "large_variance"
        ]:
            metric = policy_mode
        elif "kurtosis-" in policy_mode:
            metric = policy_mode
        else:
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-exploration or Knack-exploration or signed_variance or variance"
            )

        policy = KnackBasedPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            vf=vf,
            reg=1e-3,
            squash=True,
            metric=metric,
            exploitation_ratio=exploitation_ratio,
            optuna_trial=trial,
        )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=eval_n_episodes,
        eval_deterministic=True,
        eval_n_frequency=eval_n_frequency)

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=scale_reward,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    if eval_model is None:
        avg_return = algorithm.train()
        if return_queue is not None:
            return_queue.put(avg_return)
        tf.reset_default_graph()
        algorithm._sess.close()
        del algorithm
        return avg_return

    else:
        return algorithm
Ejemplo n.º 24
def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):

    tf.set_random_seed(seed=seed)
    env = GymEnv('MountainCarContinuous-v0')
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]

    env.env.seed(seed)
    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)

    # env_id = 'ContinuousSpaceMaze{}_{}_RB{}_entropy_{}__Normalize'.format(goal[0], goal[1], max_replay_buffer_size, entropy_coeff)
    env_id = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(entropy_coeff, n_epochs)
    env_id = env_id + '_dynamicCoeff' if dynamic_coeff else env_id

    os.makedirs(root_dir, exist_ok=True)
    env_dir = os.path.join(root_dir, env_id)
    os.makedirs(env_dir, exist_ok=True)
    current_log_dir = os.path.join(env_dir, 'seed{}'.format(seed))
    mylogger.make_log_dir(current_log_dir)

    # env_id = 'Test'

    print(env_id)
    print('environment set done')

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff,
        clip_norm=clip_norm,
    )

    # name = env_id + datetime.now().strftime("-%m%d-%Hh-%Mm-%ss")
    # mylogger.make_log_dir(name)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Ejemplo n.º 25
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']  # TODO: clarify how task and domain relate
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))  # TODO: define the Baxter environment

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)  # TODO: need to have a HER pool
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M,M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Ejemplo n.º 26
def main(root_dir):
    # tf.set_random_seed(seed=seed)
    # env = GymEnv('MountainCarContinuous-v0')
    env = GymEnv('MountainCarContinuousColor-v0')

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}

    # TODO Normalize or not
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = 0.
    dynamic_coeff = True

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=10,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff
    )


    algorithm._sess.run(tf.global_variables_initializer())


    # TODO Normalize or not
    # Currently only MountainCar is available
    with algorithm._sess.as_default():
        model_file = os.path.join(root_dir, 'model')
        algorithm._saver.restore(algorithm._sess, model_file)

        for i in range(1):
            obs = env.reset()
            env.env.render()
            sleep(4.0)
            traj = [obs]
            done = False

            while not done:
                env.env.render()
                action, _ = algorithm.policy.get_action(obs.flatten())
                obs, rew, done, _ = env.step(action)
                traj.append(obs.flatten())

            knack, knack_kurtosis = sub_goal_detect(algorithm, traj)
            idxs = np.argsort(knack_kurtosis)
            # idxs = np.argsort(knack)
            print(idxs[::-1])

            COL = MplColorHelper('Blues', np.min(knack_kurtosis), np.max(knack_kurtosis))
            for j, s in enumerate(traj):
                env.env.state = np.array(traj[j])
                rgba = COL.get_rgb(knack_kurtosis[j])
                env.env.render(car_rgba=rgba)
            sleep(1.0)

            for idx in idxs[::-1]:
                obs = env.reset()
                env.env.state = np.array(traj[0])
                rgba = COL.get_rgb(knack_kurtosis[0])
                env.env.render(car_rgba=rgba)
                for j in range(idx+1):
                    env.env.state = np.array(traj[j])
                    rgba = COL.get_rgb(knack_kurtosis[j])

                    # env.env.viewer.geoms[1].set_color(*(0.0, 0.0, 1.0))
                    env.env.render(car_rgba=rgba)
                sleep(0.5)