Example #1
0
def run(*_):
    """Build and train a SAC agent with a GMM policy on ``MultiGoalEnv``.

    Any positional arguments are accepted and ignored.  Nothing is
    returned; the function finishes when ``algorithm.train()`` does.
    """
    multigoal_env = MultiGoalEnv(
        actuation_cost_coeff=1,
        distance_cost_coeff=0.1,
        goal_reward=1,
        init_sigma=0.1,
    )
    env = normalize(multigoal_env)

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)

    # Settings handed to SAC as ``base_kwargs``.
    base_kwargs = {
        'min_pool_size': 30,
        'epoch_length': 1000,
        'n_epochs': 1000,
        'max_path_length': 30,
        'batch_size': 64,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10,
        'eval_deterministic': True,
    }

    # Two hidden layers of equal width for every network.
    width = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[width] * 2)
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[width] * 2)
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[width] * 2,
        qf=qf,
        reg=0.001,
    )

    # Observations handed to the plotter as ``obs_lst``.
    plot_observations = np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]])
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=plot_observations,
        default_action=[np.nan, np.nan],
        n_samples=100,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        plotter=plotter,
        lr=3e-4,
        scale_reward=3,
        discount=0.99,
        tau=0.001,
        save_full_state=True,
    )
    algorithm.train()
Example #2
0
def run_experiment(variant):
    """Train SAC with a GMM policy on a normalized Gym environment.

    Args:
        variant: Mapping of hyperparameters.  Keys read here: 'env_name',
            'max_pool_size', 'max_path_length', 'epoch_length', 'n_epochs',
            'batch_size', 'n_train_repeat', 'layer_size', 'K', 'lr',
            'scale_reward', 'discount' and 'tau'.

    Raises:
        KeyError: If any of the keys above is missing from ``variant``.
    """
    env = normalize(GymEnv(variant['env_name']))

    # The buffer takes the environment *spec*, not the environment itself,
    # exactly like the Q-function, V-function and policy built below.
    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        # NOTE: min_pool_size is taken from 'max_path_length'; there is no
        # separate variant key for it.
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    # Every network uses two hidden layers of M units.
    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        save_full_state=False,
    )

    algorithm.train()
Example #3
0
def run_experiment(variant):
    """Assemble a twin-Q SAC learner from ``variant`` and train it.

    ``variant`` must provide 'env_params', 'policy_params',
    'value_fn_params', 'algorithm_params', 'replay_buffer_params',
    'sampler_params', 'task' and 'domain'.  ``policy_params['type']``
    selects the policy class: 'gaussian', 'lsp' or 'gmm'.

    Raises:
        NotImplementedError: For any other policy type.
    """
    (env_params, policy_params, value_fn_params, algorithm_params,
     replay_buffer_params, sampler_params) = [
        variant[key] for key in (
            'env_params', 'policy_params', 'value_fn_params',
            'algorithm_params', 'replay_buffer_params', 'sampler_params')
    ]

    # 'domain' and 'task' are the two lookup levels of ENVIRONMENTS.
    task = variant['task']
    domain = variant['domain']

    # TODO: define the Baxter environment.
    env_factory = ENVIRONMENTS[domain][task]
    env = normalize(env_factory(**env_params))

    # TODO: a HER pool is needed here.
    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    width = value_fn_params['layer_size']
    hidden = (width, width)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=hidden,
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        # Resolve the configured name to a TF op; unknown names raise
        # KeyError.
        nonlinearity_by_name = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }
        nonlinearity = nonlinearity_by_name[
            policy_params['preprocessing_output_nonlinearity']]

        # The preprocessor is optional: it is built only when hidden sizes
        # are configured.
        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        observations_preprocessor = None
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)

        n_layers = policy_params['s_t_layers']
        n_units = policy_params['s_t_units']
        s_t_hidden_sizes = [n_units] * n_layers

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config={
                'num_coupling_layers': policy_params['coupling_layers'],
                'translation_hidden_sizes': s_t_hidden_sizes,
                'scale_hidden_sizes': s_t_hidden_sizes,
            },
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False when using a GMMPolicy.
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=hidden,
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    # Initialise all TF variables in the algorithm's session, then train.
    session = algorithm._sess
    session.run(tf.global_variables_initializer())

    algorithm.train()
Example #4
0
def run_experiment(variant):
    """Train SAC with a ``GaussianPtrPolicy`` on the 'sawyer-reach' domain.

    Domain and task come from ``arg()``; ``variant`` is not read.

    Raises:
        ValueError: If the selected domain is not 'sawyer-reach'.
    """
    args = arg()
    domain = ENVIRONMENTS[args.domain][args.task]
    if args.domain != 'sawyer-reach':
        raise ValueError("Domain not available")

    goal_size = 0
    sub_level_policies_paths = ["ikx", "iky", "ikz"]
    horizon = 250

    env_kwargs = dict(
        # playable params
        random_arm_init=[-0.1, 0.1],
        lower_goal_range=[-0.1, -0.1, -0.1],
        upper_goal_range=[0.1, 0.1, 0.1],
        has_renderer=False,
        reward_shaping=True,
        horizon=horizon,
        # constant params
        has_offscreen_renderer=False,
        use_camera_obs=False,
        use_object_obs=True,
        control_freq=100,
    )
    env = normalize(CRLWrapper(IKWrapper(domain(**env_kwargs))))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    # NOTE(review): the original remark here said this "should be same as
    # horizon", yet the value passed is horizon - 1 -- confirm which is
    # intended.
    sampler = SimpleSampler(
        max_path_length=horizon - 1,
        min_pool_size=1000,
        batch_size=256,
    )

    base_kwargs = {
        'epoch_length': 1000,
        'n_epochs': 2e3,
        'n_train_repeat': 1,
        'eval_render': False,
        'eval_n_episodes': 1,
        'eval_deterministic': True,
        'sampler': sampler,
    }

    width = 128
    hidden = (width, width)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=hidden,
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    # Initialise all TF variables in the algorithm's session, then train.
    session = algorithm._sess
    session.run(tf.global_variables_initializer())
    algorithm.train()
Example #5
0
def main(trial,
         use_optuna,
         env,
         seed,
         entropy_coeff,
         n_epochs,
         dynamic_coeff,
         clip_norm,
         normalize_obs,
         buffer_size,
         max_path_length,
         min_pool_size,
         batch_size,
         policy_mode,
         eval_model,
         eval_n_episodes,
         eval_n_frequency,
         exploitation_ratio,
         return_queue=None,
         scale_reward=1.):
    """Build a SAC learner for ``env`` and either train it or hand it back.

    Args:
        trial: Optuna trial.  ``trial.number`` is read when ``use_optuna``
            is true, and the object is forwarded to ``KnackBasedPolicy``.
        use_optuna: If true, reconfigure the logger with a per-trial suffix
            and disable logging.
        env: Environment; must expose ``action_space``, ``spec`` and either
            ``seed`` or ``env.seed``.  ``min_action`` / ``max_action`` are
            set on it from the first action dimension's bounds.
        seed: Seed applied to TensorFlow and to the environment.
        entropy_coeff, dynamic_coeff, clip_norm, scale_reward: Forwarded
            to ``SAC``.
        n_epochs, eval_n_episodes, eval_n_frequency: Forwarded to ``SAC``
            through ``base_kwargs``.
        normalize_obs: Use ``NormalizeSampler`` when true, otherwise
            ``SimpleSampler``.
        buffer_size: Replay buffer capacity.
        max_path_length, min_pool_size, batch_size: Sampler settings.
        policy_mode: "GMMPolicy", "EExploitation", or a knack-based mode:
            "Knack-exploration", "kurtosis", "signed_variance",
            "negative_signed_variance", "small_variance",
            "large_variance", or any string containing "kurtosis-".
        eval_model: When ``None`` the learner is trained; otherwise the
            initialised (untrained) ``SAC`` instance is returned.
        exploitation_ratio: Forwarded to the exploitation-aware policies.
        return_queue: Optional queue; receives the training result.

    Returns:
        The value returned by ``algorithm.train()`` when ``eval_model`` is
        ``None``, otherwise the ``SAC`` instance.

    Raises:
        AssertionError: If ``policy_mode`` is not a recognised mode.
    """
    if use_optuna:
        # Give each Optuna trial its own log suffix and disable logging.
        logger.configure(logger.get_dir(),
                         log_suffix="_optune{}".format(trial.number),
                         enable_std_out=False)
        logger.set_level(logger.DISABLED)
    tf.set_random_seed(seed=seed)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    # Wrapped environments may only expose ``seed`` on the inner env.
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # Value functions: two hidden layers of ``layer_size`` units each.
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    if policy_mode == "GMMPolicy":
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitation":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=exploitation_ratio)
    else:
        # Knack-based policies: map the mode onto the metric name.
        if policy_mode in ("Knack-exploration", "kurtosis"):
            metric = "kurtosis"
        elif policy_mode in ("signed_variance", "negative_signed_variance",
                             "small_variance", "large_variance"):
            metric = policy_mode
        elif "kurtosis-" in policy_mode:
            metric = policy_mode
        else:
            # AssertionError is kept so existing callers that catch it
            # continue to work; the message now lists every valid mode.
            raise AssertionError(
                "policy_mode should be GMMPolicy, EExploitation, "
                "Knack-exploration, kurtosis, signed_variance, "
                "negative_signed_variance, small_variance, large_variance "
                "or contain 'kurtosis-'; got {!r}".format(policy_mode))

        policy = KnackBasedPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            vf=vf,
            reg=1e-3,
            squash=True,
            metric=metric,
            exploitation_ratio=exploitation_ratio,
            optuna_trial=trial,
        )

    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=eval_n_episodes,
        eval_deterministic=True,
        eval_n_frequency=eval_n_frequency,
        sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=scale_reward,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    if eval_model is not None:
        # Evaluation mode: the caller drives the initialised learner.
        return algorithm

    try:
        avg_return = algorithm.train()
        if return_queue is not None:
            return_queue.put(avg_return)
    finally:
        # Close the session first: resetting the default graph while a
        # session is still active is documented as undefined behaviour.
        # Doing this in ``finally`` also releases the session when
        # training raises.
        algorithm._sess.close()
        tf.reset_default_graph()
    return avg_return
Example #6
0
def run_experiment(param):
    """Train twin-Q SAC with a Gaussian policy on ``SawyerReach``.

    All settings are hard-coded below; ``param`` is not read.  The
    environment is wrapped in ``CRLWrapper`` and ``normalize`` only (no
    ``IKWrapper``).
    """
    horizon = 250
    reach_env = SawyerReach(
        # playable params
        random_arm_init=[-0.1, 0.1],
        lower_goal_range=[-0.1, -0.1, -0.1],
        upper_goal_range=[0.1, 0.1, 0.1],
        has_renderer=False,
        reward_shaping=True,
        horizon=horizon,
        # constant params
        has_offscreen_renderer=False,
        use_camera_obs=False,
        use_object_obs=True,
        control_freq=100,
    )
    env = normalize(CRLWrapper(reach_env))

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=horizon - 1,
        min_pool_size=1000,
        batch_size=256,
    )

    base_kwargs = dict(
        epoch_length=1500,
        n_train_repeat=1,
        n_initial_exploration_steps=5000,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        n_epochs=2e3,
        sampler=sampler,
    )

    # Every network uses two hidden layers of 64 units.
    hidden = (64, 64)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=hidden,
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    # Initialise all TF variables in the algorithm's session, then train.
    session = algorithm._sess
    session.run(tf.global_variables_initializer())

    algorithm.train()
Example #7
0
def run_experiment(variant):
    """Train twin-Q SAC with a ``LatentSpacePolicy`` on a named environment.

    The environment class is looked up in ``RANDOM_GOAL_ENVS`` when
    'random-goal' occurs in ``variant['env_name']``, or in ``RLLAB_ENVS``
    when 'rllab' does.  Every other ``variant`` entry whose key starts
    with 'env_' is passed to the environment constructor with that prefix
    stripped.

    Raises:
        NotImplementedError: If the environment name matches neither
            family.
    """
    # Hierarchical wrapping (HierarchyProxyEnv around a loaded low-level
    # policy) is currently disabled; the normalized env is used directly.
    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    prefix = 'env_'
    env_args = {}
    for key, value in variant.items():
        if key.startswith(prefix) and key != 'env_name':
            env_args[key[len(prefix):]] = value

    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in env_name:
        EnvClass = RLLAB_ENVS[env_name]
    else:
        raise NotImplementedError

    env = normalize(EnvClass(**env_args))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    # min_pool_size is taken from 'max_path_length' as well.
    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'],
    )

    base_kwargs = {
        'epoch_length': variant['epoch_length'],
        'n_epochs': variant['n_epochs'],
        'n_train_repeat': variant['n_train_repeat'],
        'eval_render': False,
        'eval_n_episodes': 1,
        'eval_deterministic': True,
        'sampler': sampler,
    }

    width = variant['layer_size']
    hidden = (width, width)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    # The observation preprocessor is optional.
    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    if preprocessing_hidden_sizes is not None:
        observations_preprocessor = MLPPreprocessor(
            env_spec=env.spec,
            layer_sizes=preprocessing_hidden_sizes,
            name='high_level_observations_preprocessor')
    else:
        observations_preprocessor = None

    n_layers = variant['policy_s_t_layers']
    n_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [n_units] * n_layers

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config={
            "scale_regularization": 0.0,
            "num_coupling_layers": variant['policy_coupling_layers'],
            "translation_hidden_sizes": s_t_hidden_sizes,
            "scale_hidden_sizes": s_t_hidden_sizes,
        },
        q_function=qf1,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy",
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf1=qf1,
        vf=vf,
        qf2=qf2,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        initial_exploration_policy=initial_exploration_policy,
        save_full_state=False,
    )

    algorithm.train()
Example #8
0
def _register_mecs_environments():
    """Register the MECS-v* environment ids (v1-v30 plus v61)."""
    default_entry_point = 'sac.envs.env_V_sweep_v{0}:MEC_v{0}'
    # Versions whose module name does not follow the default pattern.
    irregular_entry_points = {
        '1': 'sac.envs.environment_V_sweep:MEC_v1',
        '61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        '7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        '8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
    }
    # Registration order: v1..v6, v61, v7..v30.
    versions = (
        [str(number) for number in range(1, 7)]
        + ['61']
        + [str(number) for number in range(7, 31)]
    )
    for version in versions:
        register(
            id='MECS-v' + version,
            entry_point=irregular_entry_points.get(
                version, default_entry_point.format(version)),
            max_episode_steps=5000,
        )


def run_experiment(variant):
    """Register the MECS environments, then build and train twin-Q SAC.

    Args:
        variant: Mapping that must provide 'env_params', 'policy_params',
            'value_fn_params', 'algorithm_params', 'replay_buffer_params',
            'sampler_params', 'task' and 'domain'.
            ``algorithm_params['cost_type']`` is written to
            ``constants.COST_TYPE``; the reward scale handed to SAC is
            ``algorithm_params['scale'] * algorithm_params['scale_reward']``.
            ``policy_params['type']`` selects 'gaussian', 'lsp' or 'gmm'.

    Raises:
        NotImplementedError: For any other policy type.
    """
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    # Set the module-level cost type before any environment is created.
    constants.COST_TYPE = algorithm_params['cost_type']

    # NOTE(review): this registers on every call; some Gym versions reject
    # re-registering an existing id -- confirm this function is only run
    # once per process.
    _register_mecs_environments()

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    # Every network uses two hidden layers of M units.
    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        # Resolve the configured name to a TF op; unknown names raise
        # KeyError.
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        # The preprocessor is optional: it is built only when hidden sizes
        # are configured.
        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False when using a GMMPolicy.
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=(algorithm_params['scale']
                      * algorithm_params['scale_reward']),
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    # Initialise all TF variables in the algorithm's session, then train.
    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
def run_experiment(variant):
    """Train SAC with a GMM policy on a delayed env using a remote sampler.

    ``variant['env_name']`` selects the rllab humanoid or swimmer for
    'humanoid-rllab' / 'swimmer-rllab'; any other name is passed to
    ``GymEnv``.  The normalized environment is wrapped in ``DelayedEnv``
    with ``delay=0.01``.  The remaining hyperparameters are read from
    ``variant``.
    """
    env_name = variant['env_name']
    # The rllab environment classes are imported only when selected.
    if env_name == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        inner_env = normalize(HumanoidEnv())
    elif env_name == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        inner_env = normalize(SwimmerEnv())
    else:
        inner_env = normalize(GymEnv(env_name))
    env = DelayedEnv(inner_env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'])

    # min_pool_size is taken from 'max_path_length' as well.
    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': variant['epoch_length'],
        'n_epochs': variant['n_epochs'],
        'n_train_repeat': variant['n_train_repeat'],
        'eval_render': False,
        'eval_n_episodes': 1,
        'eval_deterministic': True,
    }

    width = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[width] * 2)
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[width] * 2)

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[width] * 2,
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        reparameterize=variant['reparameterize'],
        save_full_state=False)

    algorithm.train()
def run_experiment(variant):
    """Train SAC with a ``GaussianPtrPolicy`` over pre-trained primitives.

    The domain is read from ``args.domain`` (``args`` comes from the
    enclosing scope) and determines the environment class, the goal size
    and the primitive-policy pickle paths.  Supported domains:
    'ant-cross-maze', 'ant-random-goal', 'cheetah-hurdle' and 'pusher'.

    Args:
        variant: Not read by this function.

    Raises:
        ValueError: If ``args.domain`` is not a supported domain.
    """
    # Both ant domains share the same four primitives.
    ant_primitive_paths = [
        "primitive-policies/ant/fwrd/fwrd.pkl",
        "primitive-policies/ant/bwrd/bwrd.pkl",
        "primitive-policies/ant/uwrd/uwrd.pkl",
        "primitive-policies/ant/dwrd/dwrd.pkl",
    ]

    # An if/elif chain keeps each environment class name lazily
    # referenced: only the selected class has to be importable.
    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths = list(ant_primitive_paths)
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths = list(ant_primitive_paths)
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths = [
            "primitive-policies/hc/fwd/fwd.pkl",
            "primitive-policies/hc/jp-longz/jump.pkl",
        ]
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths = [
            "primitive-policies/pusher/bottom/bottom.pkl",
            "primitive-policies/pusher/left/left.pkl",
        ]
    else:
        # Fail here with a clear message instead of later calling ``None``
        # as if it were an environment class.
        raise ValueError(
            "Domain not available: {!r}".format(args.domain))

    env = normalize(domain())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        # One sequence slot per primitive policy.
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    # Every network uses two hidden layers of M units.
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    # Initialise all TF variables in the algorithm's session, then train.
    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #11
0
def run(variant):
    """Train SAC on ``MultiGoalEnv`` with a GMM or latent-space policy.

    Builds the normalized environment, replay buffer, sampler, Q/V networks,
    the policy selected by ``variant['policy_type']``, a ``QFPolicyPlotter``
    and finally the ``SAC`` algorithm, then calls ``algorithm.train()``.

    Args:
        variant: mapping read for the key ``'policy_type'``; ``'gmm'`` builds
            a ``GMMPolicy`` and ``'lsp'`` builds a ``LatentSpacePolicy``.

    Raises:
        NotImplementedError: if ``variant['policy_type']`` is neither
            ``'gmm'`` nor ``'lsp'``.
    """
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)
    sampler = SimpleSampler(max_path_length=30,
                            min_pool_size=100,
                            batch_size=64)
    base_kwargs = dict(sampler=sampler,
                       epoch_length=1000,
                       n_epochs=1000,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=False)

    # Width shared by every hidden layer built in this function.
    M = 128
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy_type = variant['policy_type']
    if policy_type == 'gmm':
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[M, M],
                           qf=qf,
                           reg=0.001)
    elif policy_type == 'lsp':
        bijector_config = {
            "scale_regularization": 0.0,
            "num_coupling_layers": 2,
            "translation_hidden_sizes": (M, ),
            "scale_hidden_sizes": (M, ),
        }

        policy = LatentSpacePolicy(env_spec=env.spec,
                                   mode="train",
                                   squash=True,
                                   bijector_config=bijector_config,
                                   observations_preprocessor=None)
    else:
        # Without this branch an unknown type left `policy` unbound and the
        # failure only surfaced below as an UnboundLocalError.
        raise NotImplementedError(policy_type)

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3e-4,
                    scale_reward=3.0,
                    discount=0.99,
                    tau=1e-4,
                    save_full_state=True)
    algorithm.train()
Example #12
0
def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):
    """Train SAC with a GMM policy on ``MountainCarContinuous-v0``.

    Seeds TensorFlow and the wrapped gym environment, creates the log
    directory ``<root_dir>/<run name>/seed<seed>``, builds the networks,
    replay buffer and sampler, and runs ``SAC.train()``.

    Args:
        root_dir: directory under which the per-run directory is created.
        seed: seed passed to TensorFlow and to the inner environment.
        entropy_coeff: forwarded to ``SAC`` and embedded in the run name.
        n_epochs: number of epochs; also embedded in the run name.
        dynamic_coeff: forwarded to ``SAC``; when truthy the run name gets a
            ``_dynamicCoeff`` suffix.
        clip_norm: forwarded to ``SAC``.
        regularize: accepted but not read by this function.
    """
    tf.set_random_seed(seed=seed)

    env = GymEnv('MountainCarContinuous-v0')
    # Expose the first action dimension's bounds directly on the env.
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    env.env.seed(seed)

    replay_capacity = int(1e6)
    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=128)

    # The run name doubles as the directory name for this configuration.
    run_name = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(
        entropy_coeff, n_epochs)
    if dynamic_coeff:
        run_name += '_dynamicCoeff'

    os.makedirs(root_dir, exist_ok=True)
    run_dir = os.path.join(root_dir, run_name)
    os.makedirs(run_dir, exist_ok=True)
    mylogger.make_log_dir(os.path.join(run_dir, 'seed{}'.format(seed)))

    print(run_name)
    print('environment set done')

    # Value functions: two hidden layers of equal width.
    width = 100
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(width, width))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(width, width))

    # Gaussian-mixture policy with four components.
    policy = GMMPolicy(env_spec=env.spec,
                       K=4,
                       hidden_layer_sizes=[width, width],
                       qf=qf,
                       reg=1e-3,
                       squash=True)

    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=replay_capacity)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=n_epochs,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=20,
                       eval_deterministic=True,
                       sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    # Variables are initialized explicitly on the algorithm's session
    # before training starts.
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def main(env, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode):
    """Train SAC on a caller-supplied environment with a GMM or Knack policy.

    Args:
        env: environment object; its ``spec`` and ``action_space`` are read.
        seed: seed passed to ``tf.set_random_seed``.
        entropy_coeff: forwarded to ``SAC``.
        n_epochs: number of training epochs.
        dynamic_coeff: forwarded to ``SAC``.
        clip_norm: forwarded to ``SAC``.
        normalize_obs: when truthy a ``NormalizeSampler`` is used, otherwise
            a ``SimpleSampler``.
        buffer_size: capacity handed to ``SimpleReplayBuffer``.
        max_path_length: sampler setting.
        min_pool_size: sampler setting.
        batch_size: sampler setting.
        policy_mode: either ``GMMPolicy`` or a value whose string form is
            ``"Knack-<mode>"``; ``<mode>`` is forwarded to
            ``KnackBasedPolicy``.

    Raises:
        AssertionError: if ``policy_mode`` is neither ``GMMPolicy`` nor of
            the form ``"Knack-<mode>"``.
    """
    tf.set_random_seed(seed=seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    # NOTE(review): this compares against the GMMPolicy class object, while
    # the error message below lists "GMMPolicy" next to string modes --
    # confirm that callers pass the class rather than its name.
    if policy_mode == GMMPolicy:
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    else:
        # Validate the shape before unpacking so that a malformed mode
        # (no '-' or more than one) reports the message below instead of an
        # opaque tuple-unpacking ValueError.
        mode_parts = str(policy_mode).split('-')
        if len(mode_parts) != 2 or mode_parts[0] != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration"
            )
        mode = mode_parts[1]
        policy = KnackBasedPolicy(
            a_lim_lows=env.action_space.low,
            a_lim_highs=env.action_space.high,
            mode=mode,
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    # Variables are initialized explicitly on the algorithm's session
    # before training starts.
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    """Train a compositional SAC agent on one of the Sawyer domains.

    The domain is taken from the parsed command-line arguments
    (``arg().domain``) and must be one of ``'sawyer-reach'``,
    ``'sawyer-reach-pick'`` or ``'sawyer-reach-pick-simple'``. Each domain
    fixes the environment, the goal size, the list of sub-level policy
    identifiers/paths, the horizon and the epoch length. ``arg().demo``
    selects ``DemoReplayBuffer`` over ``SimpleReplayBuffer`` and is forwarded
    as ``use_demos``.

    Args:
        variant: accepted but not read by this function.

    Raises:
        ValueError: if the requested domain is not one of the three above.
    """
    args = arg()
    domain = args.domain

    # Simulator settings that are identical for every domain.
    constant_params = dict(
        has_offscreen_renderer=False,
        use_camera_obs=False,
        use_object_obs=True,
        control_freq=100,
    )

    if domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths = ["ikx", "iky", "ikz"]
        horizon = 250
        task = SawyerReach(random_arm_init=[-0.1, 0.1],
                           has_renderer=False,
                           reward_shaping=True,
                           horizon=horizon,
                           **constant_params)
        env = normalize(CRLWrapper(IKWrapper(task)))
        ep_length = 1500

    elif domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths = [
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl",
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl",
        ]
        horizon = 1000
        task = SawyerReachPick(random_arm_init=[-0.0001, 0.0001],
                               has_renderer=False,
                               reward_shaping=False,
                               horizon=horizon,
                               **constant_params)
        env = normalize(CRLWrapper(task))
        ep_length = 1500

    elif domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths = [
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl",
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl",
        ]
        horizon = 500
        task = SawyerReachPick(random_arm_init=[-0.0001, 0.0001],
                               has_renderer=False,
                               reward_shaping=False,
                               horizon=horizon,
                               placement_initializer=UniformRandomSampler(
                                   x_range=[-0.01, 0.01],
                                   y_range=[-0.01, 0.01],
                                   ensure_object_boundary_in_range=False,
                                   z_rotation=None,
                               ),
                               **constant_params)
        env = normalize(CRLWrapper(task))
        ep_length = 3000

    else:
        raise ValueError("Domain not available")

    # Both buffer types take the same constructor arguments here.
    buffer_cls = DemoReplayBuffer if args.demo else SimpleReplayBuffer
    pool = buffer_cls(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    # Original author's note on max_path_length: "should be same as horizon".
    sampler = SimpleSampler(max_path_length=horizon - 1,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )

    # Two hidden layers of width M for every network.
    M = 128
    hidden = (M, M)
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=hidden, name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=hidden)

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=hidden,
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    # Variables are initialized explicitly on the algorithm's session
    # before training starts.
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def _export_trainable_weights(sess, scopes, path):
    """Write the flattened trainable variables under ``scopes`` to ``path``.

    Variables are evaluated with ``sess`` in scope order, concatenated into a
    single flat array and saved as text with ``np.savetxt``.

    Raises:
        ValueError: if none of ``scopes`` contains a trainable variable.
    """
    values = [
        sess.run(var)
        for scope in scopes
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=scope)
    ]
    if not values:
        # np.concatenate would raise ValueError here as well, but without
        # saying which scopes came back empty.
        raise ValueError(
            "no trainable variables found under scopes {!r}".format(
                list(scopes)))
    np.savetxt(os.path.expanduser(path),
               np.concatenate(values, axis=None),
               delimiter=',')


def run_experiment(variant):
    """Build and train SAC from a ``variant`` dictionary, then export weights.

    Keys read from ``variant``: ``env_params``, ``policy_params``,
    ``value_fn_params``, ``algorithm_params``, ``replay_buffer_params``,
    ``sampler_params``, ``task``, ``domain``, ``num_hidden``, ``reward_scale``,
    ``lr``, ``tau``, ``ent_coef``, the regularisation switches
    (``l1regpi``, ``l2regpi``, ``l1regvf``, ``l2regvf``, ``wclippi``,
    ``wclipvf``, ``dropoutpi``, ``dropoutvf``, ``batchnormpi``,
    ``batchnormvf``) and the output paths ``policypath`` / ``valuepath``.

    After training, if ``policypath`` is non-empty the trainable variables
    under the ``gaussian_policy`` scope are written there; if ``valuepath``
    is non-empty those under ``qf1``, ``qf2`` and ``vf`` are written there.

    Raises:
        NotImplementedError: if ``policy_params['type']`` is not one of
            ``'gaussian'``, ``'lsp'`` or ``'gmm'``.
        ValueError: if a requested export finds no trainable variables.
    """
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    # Any `num_hidden` other than 256 overrides the configured layer size.
    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']
    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    # A negative variant value means "fall back to the algorithm default";
    # the same convention is applied to `lr` below.
    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']
    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    # Optional post-training weight dumps; an empty path disables the export.
    if variant['policypath'] != '':
        _export_trainable_weights(algorithm._sess, ['gaussian_policy'],
                                  variant['policypath'])
    if variant['valuepath'] != '':
        _export_trainable_weights(algorithm._sess, ['qf1', 'qf2', 'vf'],
                                  variant['valuepath'])
Example #16
0
def run_experiment(variant):
    """Finetune the best skill of a previously saved policy with SAC.

    Loads a snapshot from ``variant['snapshot_filename']``, picks a skill via
    ``get_best_skill``, wraps the snapshot's environment in a
    ``FixedOptionEnv`` for that skill and trains SAC on the wrapped
    environment.

    Keys read from ``variant``: ``snapshot_filename``, ``max_path_length``,
    ``max_pool_size``, ``epoch_length``, ``n_epochs``, ``batch_size``,
    ``n_train_repeat``, ``layer_size``, ``use_pretrained_values``, ``lr``,
    ``scale_reward``, ``discount`` and ``tau``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Session() as sess:
        # NOTE(review): the snapshot is loaded with joblib, which unpickles
        # arbitrary objects -- only load snapshot files from trusted sources.
        data = joblib.load(variant['snapshot_filename'])
        policy = data['policy']
        env = data['env']

        # The policy's observation space is wider than the environment's;
        # the difference in flat dimension is taken as the number of skills.
        num_skills = data['policy'].observation_space.flat_dim - data[
            'env'].spec.observation_space.flat_dim
        best_z = get_best_skill(policy, env, num_skills,
                                variant['max_path_length'])
        fixed_z_env = FixedOptionEnv(env, num_skills, best_z)

        tf.logging.info('Finetuning best skill...')

        pool = SimpleReplayBuffer(
            env_spec=fixed_z_env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        # min_pool_size is deliberately set to max_path_length here.
        base_kwargs = dict(
            min_pool_size=variant['max_path_length'],
            epoch_length=variant['epoch_length'],
            n_epochs=variant['n_epochs'],
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            eval_deterministic=True,
        )

        M = variant['layer_size']

        if variant['use_pretrained_values']:
            # Reuse the value functions stored in the snapshot.
            qf = data['qf']
            vf = data['vf']
        else:
            # Drop the snapshot's value functions and build fresh ones under
            # separate variable scopes.
            del data['qf']
            del data['vf']

            qf = NNQFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='qf-finetune',
            )

            vf = NNVFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='vf-finetune',
            )

        algorithm = SAC(
            base_kwargs=base_kwargs,
            env=fixed_z_env,
            policy=policy,
            pool=pool,
            qf=qf,
            vf=vf,
            lr=variant['lr'],
            scale_reward=variant['scale_reward'],
            discount=variant['discount'],
            tau=variant['tau'],
            save_full_state=False,
        )

        algorithm.train()