Example #1
def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):

    tf.set_random_seed(seed=seed)
    env = GymEnv('MountainCarContinuous-v0')
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]

    env.env.seed(seed)
    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = entropy_coeff
    dynamic_coeff = dynamic_coeff
    # env_id = 'ContinuousSpaceMaze{}_{}_RB{}_entropy_{}__Normalize'.format(goal[0], goal[1], max_replay_buffer_size, entropy_coeff)
    env_id = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(entropy_coeff, n_epochs)
    env_id = env_id + '_dynamicCoeff' if dynamic_coeff else env_id

    os.makedirs(root_dir, exist_ok=True)
    env_dir = os.path.join(root_dir, env_id)
    os.makedirs(env_dir, exist_ok=True)
    current_log_dir = os.path.join(env_dir, 'seed{}'.format(seed))
    mylogger.make_log_dir(current_log_dir)

    # env_id = 'Test'

    print(env_id)
    print('environment set done')

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff,
        clip_norm=clip_norm,
    )

    # name = env_id + datetime.now().strftime("-%m%d-%Hh-%Mm-%ss")
    # mylogger.make_log_dir(name)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #2
def run_experiment(param):
    # instructive = 0.5
    # decay = 3e-6
    decay = 5e-4
    instructive = 0.5

    random_arm_init = [-0.05, 0.05]
    render = False
    reward_shaping = False
    horizon = 250

    env = normalize(
        CRL4DOFWrapper(
            # IKWrapper(
            SawyerPrimitivePick(
                instructive=instructive,
                decay=decay,
                random_arm_init=random_arm_init,
                has_renderer=render,
                reward_shaping=reward_shaping,
                horizon=horizon,
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            ),
            use_gripper=True))
    # )
    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }

    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 3e3
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #3
def run_experiment(variant):
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Session() as sess:
        data = joblib.load(variant['snapshot_filename'])
        policy = data['policy']
        env = data['env']

        num_skills = data['policy'].observation_space.flat_dim - data[
            'env'].spec.observation_space.flat_dim
        best_z = get_best_skill(policy, env, num_skills,
                                variant['max_path_length'])
        fixed_z_env = FixedOptionEnv(env, num_skills, best_z)

        tf.logging.info('Finetuning best skill...')

        pool = SimpleReplayBuffer(
            env_spec=fixed_z_env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=variant['max_path_length'],
            epoch_length=variant['epoch_length'],
            n_epochs=variant['n_epochs'],
            max_path_length=variant['max_path_length'],
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            eval_deterministic=True,
        )

        M = variant['layer_size']

        if variant['use_pretrained_values']:
            qf = data['qf']
            vf = data['vf']
        else:
            del data['qf']
            del data['vf']

            qf = NNQFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='qf-finetune',
            )

            vf = NNVFunction(
                env_spec=fixed_z_env.spec,
                hidden_layer_sizes=[M, M],
                var_scope='vf-finetune',
            )

        algorithm = SAC(
            base_kwargs=base_kwargs,
            env=fixed_z_env,
            policy=policy,
            pool=pool,
            qf=qf,
            vf=vf,
            lr=variant['lr'],
            scale_reward=variant['scale_reward'],
            discount=variant['discount'],
            tau=variant['tau'],
            save_full_state=False,
        )

        algorithm.train()
Example #4
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M,M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #5
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(min_pool_size=variant['max_path_length'],
                       epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       max_path_length=variant['max_path_length'],
                       batch_size=variant['batch_size'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=SimpleSampler(
                           max_path_length=variant["max_path_length"],
                           min_pool_size=variant["max_path_length"],
                           batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )

    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(base_kwargs=base_kwargs,
                      env=env,
                      policy=policy,
                      discriminator=discriminator,
                      pool=pool,
                      qf=qf,
                      vf=vf,
                      lr=variant['lr'],
                      scale_entropy=variant['scale_entropy'],
                      discount=variant['discount'],
                      tau=variant['tau'],
                      num_skills=variant['num_skills'],
                      save_full_state=False,
                      include_actions=variant['include_actions'],
                      learn_p_z=variant['learn_p_z'],
                      add_p_z=variant['add_p_z'],
                      reparametrize=variant["reparametrize"])

    algorithm.train()
Example #6
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))
    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size']
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )
    

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],

        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
Example #7
class EAC(Serializable):
    """
    CG: the class that implements the EAC algorithm.
    """
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment, as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: the number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: whether to use UCB for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            # algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

        # Set up the ensemble Q-function for action selection.
        self._Q_ensemble = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                value_func_layer_size for _ in range(value_func_layers_number)
            ]),
            name='ensqf')

        # ========================================================================
        # Set up the training target for the ensemble Q-function for action selection.
        # ========================================================================
        # Create the observation placeholder.
        self._observations_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='obv_ens',
        )

        # Create the next observation placeholder.
        self._observations_ens_next_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='next_obv_ens',
        )

        # Create a list of next action placeholders.
        self._acts_next_phs = []
        for i in range(len(q_param_list)):
            act_ens_ph = tf.placeholder(
                tf.float32,
                shape=(None, self._env.spec.action_space.flat_dim),
                name=str(i) + '_next_act_ens',
            )
            self._acts_next_phs.append(act_ens_ph)

        # Create the observed action placeholder.
        self._obv_act_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name='act_obv_ens',
        )

        # Create the reward placeholder.
        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='rew_ens',
        )

        # Create the terminal placeholder.
        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='ter_ens',
        )

        # Determine the target Q-value for next step.
        self._q_ens_targets = []
        for act_next_ph in self._acts_next_phs:
            qt = self._Q_ensemble.get_output_for(
                self._observations_ens_next_ph, act_next_ph, reuse=True)
            self._q_ens_targets.append(qt)

        for i, q_t in enumerate(self._q_ens_targets):
            if i == 0:
                self._q_ens_next = q_t
            else:
                self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
                # self._q_ens_next = self._q_ens_next + q_t
        # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

        # Determine the Q-loss.
        self._q_train = self._Q_ensemble.get_output_for(
            self._observations_ens_ph, self._obv_act_ph, reuse=True)
        self._q_ens_loss = 0.5 * tf.reduce_mean(
            (self._q_train -
             tf.stop_gradient(self._scale_reward * self._rewards_ph +
                              (1 - self._terminals_ph) * self._discount *
                              self._q_ens_next))**2)

        # Determine the Q-training operator.
        self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
            loss=self._q_ens_loss,
            var_list=self._Q_ensemble.get_params_internal())

        # Set up the tensor flow session.
        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())

    def train(self):
        """
        CG: the function that conducts ensemble training.
        :return: 
        """
        # Set up parameters for the training process.
        self._n_epochs = self._base_ac_params['n_epochs']
        self._epoch_length = self._base_ac_params['epoch_length']
        self._n_train_repeat = self._base_ac_params['n_train_repeat']
        self._n_initial_exploration_steps = self._base_ac_params[
            'n_initial_exploration_steps']
        self._eval_render = self._base_ac_params['eval_render']
        self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
        self._eval_deterministic = self._base_ac_params['eval_deterministic']

        # Set up the evaluation environment.
        if self._eval_n_episodes > 0:
            with tf.variable_scope("low_level_policy", reuse=True):
                self._eval_env = deep_clone(self._env)

        # Import required libraries for training.
        import random
        import math
        import operator
        import numpy as np

        # Initialize the sampler.
        alg_ins = random.choice(self._alg_instances)
        self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

        # Perform the training/evaluation process.
        num_episode = 0.
        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.log('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    isEpisodeEnd = self._sampler.sample()

                    # If an episode is ended, we need to update performance statistics for each AC instance and
                    # pick randomly another AC instance for next episode of exploration.
                    if isEpisodeEnd:
                        num_episode = num_episode + 1.
                        alg_ins[1] = 0.9 * alg_ins[
                            1] + 0.1 * self._sampler._last_path_return
                        alg_ins[2] = alg_ins[2] + 1.

                        if self._use_ucb:
                            # Select an algorithm instance based on UCB.
                            selected = False
                            for ains in self._alg_instances:
                                if ains[2] < 1.:
                                    alg_ins = ains
                                    selected = True
                                    break
                                else:
                                    ains[3] = ains[1] + math.sqrt(
                                        2.0 * math.log(num_episode) / ains[2])

                            if not selected:
                                alg_ins = max(self._alg_instances,
                                              key=operator.itemgetter(3))

                        else:
                            # Select an algorithm instance uniformly at random.
                            alg_ins = random.choice(self._alg_instances)
                            self._sampler.set_policy(alg_ins[0].policy)

                    if not self._sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    # ================
                    # Perform training.
                    # ================
                    for i in range(self._n_train_repeat):
                        batch = self._sampler.random_batch()

                        # ====================================
                        # Perform training over all AC instances.
                        # ====================================
                        for ains in self._alg_instances:
                            ains[0]._do_training(iteration=t +
                                                 epoch * self._epoch_length,
                                                 batch=batch)

                        # =================================================
                        # Perform training of the action-selection Q-function.
                        # =================================================
                        # Set up the feed dictionary.
                        feed_dict = {
                            self._observations_ens_ph:
                            batch['observations'],
                            self._obv_act_ph:
                            batch['actions'],
                            self._observations_ens_next_ph:
                            batch['next_observations'],
                            self._rewards_ph:
                            batch['rewards'],
                            self._terminals_ph:
                            batch['terminals'],
                        }
                        for i, ains in enumerate(self._alg_instances):
                            with ains[0].policy.deterministic(
                                    self._eval_deterministic):
                                feed_dict[self._acts_next_phs[i]] = ains[
                                    0].policy.get_actions(
                                        batch['next_observations'])

                        # Perform training on the action-selection Q-function.
                        self._sess.run(self._q_ens_train_operator, feed_dict)

                    gt.stamp('train')

                # ============================================================
                # Perform evaluation after one full epoch of training is completed.
                # ============================================================
                if self._eval_n_episodes < 1:
                    continue

                if self._evaluation_strategy == 'ensemble':
                    # Use a whole ensemble of AC instances for evaluation.
                    paths = rollouts(self._eval_env, self,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)

                elif self._evaluation_strategy == 'best-policy':
                    # Choose the AC instance with the highest observed performance so far for evaluation.
                    eval_alg_ins = max(self._alg_instances,
                                       key=operator.itemgetter(1))
                    with eval_alg_ins[0].policy.deterministic(
                            self._eval_deterministic):
                        paths = rollouts(self._eval_env,
                                         eval_alg_ins[0].policy,
                                         self._sampler._max_path_length,
                                         self._eval_n_episodes)

                else:
                    paths = None

                if paths is not None:
                    total_returns = [path['rewards'].sum() for path in paths]
                    episode_lengths = [len(p['rewards']) for p in paths]
                    logger.record_tabular('return-average',
                                          np.mean(total_returns))
                    logger.record_tabular('return-min', np.min(total_returns))
                    logger.record_tabular('return-max', np.max(total_returns))
                    logger.record_tabular('return-std', np.std(total_returns))
                    logger.record_tabular('episode-length-avg',
                                          np.mean(episode_lengths))
                    logger.record_tabular('episode-length-min',
                                          np.min(episode_lengths))
                    logger.record_tabular('episode-length-max',
                                          np.max(episode_lengths))
                    logger.record_tabular('episode-length-std',
                                          np.std(episode_lengths))

                    self._eval_env.log_diagnostics(paths)
                    if self._eval_render:
                        self._eval_env.render(paths)

                # Produce log info after each episode of training and evaluation.
                times_itrs = gt.get_times().stamps.itrs
                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self._sampler.log_diagnostics()

                logger.dump_tabular()
                # logger.pop_prefix()

                gt.stamp('eval')

            # Terminate the sampler after the training process is completed.
            self._sampler.terminate()

    def reset(self, dones=None):
        """
        CG: the function required in order to support evaluation (or rollout) by using EAC as an ensemble of AC instances.
        :param dones: 
        :return: 
        """
        pass

    def get_action(self, observation):
        """
        CG: the function required in order to support evaluation (or rollout) by using EAC as an ensemble of AC instances.
        :param observation: 
        :return: 
        """
        import numpy as np

        # Collect all recommended actions.
        recommend_actions = []
        for ains in self._alg_instances:
            with ains[0].policy.deterministic(self._eval_deterministic):
                recommend_actions.append(
                    ains[0].policy.get_action(observation))

        # Determine the value of performing each recommended action.
        # Option 1: select actions based on average Q-value.
        # action_values = np.zeros(len(recommend_actions))
        # for i, ract in enumerate(recommend_actions):
        #
        #     for ains in self._alg_instances:
        #         action_values[i] = action_values[i] + ains[0]._qf1.eval([observation], [ract[0]])[0]
        #     action_values[i] = action_values[i] / len(self._alg_instances)

        # Option 2: select actions based on average rank.
        # from scipy.stats import rankdata
        # action_ranks = np.zeros((len(self._alg_instances), len(self._alg_instances)))
        # for i, ract in enumerate(recommend_actions):
        #     for j, ains in enumerate(self._alg_instances):
        #         action_ranks[j,i] = ains[0]._qf1.eval([observation], [ract[0]])[0]
        #
        # action_ranks = np.array([rankdata(x, method='dense') for x in action_ranks])
        # action_values = np.sum(action_ranks, axis=0)

        # Option 3: select actions based on the ensemble action-selection Q-function.
        action_values = np.zeros(len(recommend_actions))
        for i, ract in enumerate(recommend_actions):
            action_values[i] = self._Q_ensemble.eval([observation],
                                                     [ract[0]])[0]

        # Choose the recommended action with the highest value.
        act_ind = np.argmax(action_values)

        return recommend_actions[act_ind]
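
Below is a minimal usage sketch for the EAC class defined above. It is not part of the original example: the import path, the environment name, and every hyperparameter value are illustrative assumptions chosen to match the constructor signature shown in Example #7, not values taken from the source.

# from eac import EAC  # hypothetical import path for the class above

def build_eac_example():
    """Construct an EAC ensemble with placeholder hyperparameters (a sketch, not the original configuration)."""
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    # These are the keys EAC.train() reads out of base_ac_alg_params.
    base_ac_alg_params = {
        'n_epochs': 500,
        'epoch_length': 1000,
        'n_train_repeat': 1,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 10,
        'eval_deterministic': True,
    }
    return EAC(
        environment_name='HalfCheetah-v1',   # assumed Gym environment id
        algorithm_name='sac',                # one of 'sac', 'tac', 'rac'
        lr=3e-4,
        scale_reward=5.0,
        scale_entropy=1.0,
        discount=0.99,
        tau=0.005,
        max_replay_buffer_size=int(1e6),
        sampler_params=sampler_params,
        value_func_layers_number=2,
        value_func_layer_size=256,
        policy_func_layers_number=2,
        policy_func_layer_size=256,
        base_ac_alg_params=base_ac_alg_params,
        q_param_list=[1.0, 1.5, 2.0],        # one ensemble member per entry
        use_ucb=True,
        evaluation_strategy='ensemble',      # or 'best-policy'
    )

# algorithm = build_eac_example()
# algorithm.train()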
Example #8
def run_experiment(*_):
    env = normalize(VoltVarEnv())

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)

    sampler = SimpleSampler(max_path_length=168,
                            min_pool_size=100,
                            batch_size=256)

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=1000,
        n_epochs=50,
        n_initial_exploration_steps=10000,
        n_train_repeat=1,
        # eval_render=False,
        eval_n_episodes=10,  #50,
        eval_deterministic=True)

    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=[64, 32],
                      name='qf1')

    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=[64, 32],
                      name='qf2')

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[64, 32], name='vf')

    qfc1 = NNQFunction(env_spec=env.spec,
                       hidden_layer_sizes=[64, 32],
                       name='qfc1')

    qfc2 = NNQFunction(env_spec=env.spec,
                       hidden_layer_sizes=[64, 32],
                       name='qfc2')

    vfc = NNVFunction(env_spec=env.spec,
                      hidden_layer_sizes=[64, 32],
                      name='vfc')

    initial_exploration_policy = UniformPolicy2(env_spec=env.spec)

    # policy = GaussianPolicy(
    #     env_spec=env.spec,
    #     hidden_layer_sizes=[64, 32],
    #     reparameterize=True,
    #     reg=1e-3,
    # )
    policy = CategoricalPolicy(env_spec=env.spec, hidden_layer_sizes=[64, 32])

    algo = SACD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        qfc1=qfc1,
        qfc2=qfc2,
        vf=vf,
        vfc=vfc,
        # plotter=plotter,
        lr=1e-3,
        scale_reward=50,  #2.5,  ## 50 bus 4; 10 bus34
        scale_rewardc=50,  # 2.5,  ## 50 bus 4; 10 bus34
        alpha=1,
        constraint_lr=1e-5,  # 1e-5, #1e-6,#bus34 5e-6;
        # constraint_coeff=1,  # 0,
        # constraint_coeff_targ=1,
        discount=0.99,
        tau=5e-4,  #bus34 5e-4;bus123 2.5e-4,;
        target_update_interval=1,
        #reparameterize=True,
        save_full_state=False)

    algo.train()
Example #9
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )


    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],

        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),

    )

    algorithm.train()
Example #10
def run_experiment(variant):
    # print('MuJoCo')
    # env = normalize(GymEnv('HalfCheetah-v1'))
    # -----------------------------------------------------
    print('Unity3D environment')
    env = UnityEnv('/home/recharrs/Apps/UnityEnvob3/RollerBall.x86_64', time_state=True, idx=args.idx, no_graphics=args.no_graphics)
    # -----------------------------------------------------
    obs_space = env.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low.flatten(), np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high.flatten(), np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=5000,
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=0,       # must be set to 0, otherwise an error is raised
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
    )

    algorithm.train()
Example #11
def main(env, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode):

    tf.set_random_seed(seed=seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    if policy_mode == GMMPolicy:
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    else:
        _, mode = str(policy_mode).split('-')
        if _ != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration"
            )
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #12
def run_experiment(variant):
    sub_level_policies_paths = []
    args = arg()

    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # one step less than the environment horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #13
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']
    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']
    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
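
The run_experiment above reads a large number of keys from the variant dict. As orientation only, a minimal variant could be shaped like the sketch below; every value is an illustrative placeholder rather than a setting taken from the original experiments, and the 'domain'/'task' entries are assumed to exist in the ENVIRONMENTS mapping that this excerpt does not show.

# Illustrative variant skeleton for the run_experiment above (placeholder values only).
example_variant = {
    'domain': 'swimmer',                     # assumed key of ENVIRONMENTS
    'task': 'default',                       # assumed key of ENVIRONMENTS[domain]
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',                  # 'gaussian', 'lsp', or 'gmm'
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 256},
    'algorithm_params': {
        'base_kwargs': {'epoch_length': 1000, 'n_epochs': 1000, 'n_train_repeat': 1,
                        'eval_render': False, 'eval_n_episodes': 1,
                        'eval_deterministic': True},
        'lr': 3e-4,
        'scale_reward': 1.0,
        'discount': 0.99,
        'reparameterize': True,
        'target_update_interval': 1,
    },
    'replay_buffer_params': {'max_replay_buffer_size': 1e6},
    'sampler_params': {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 256},
    'num_hidden': 256,        # 256 keeps value_fn_params['layer_size'] unchanged
    'lr': -1,                 # negative -> fall back to algorithm_params['lr']
    'reward_scale': -1,       # negative -> fall back to algorithm_params['scale_reward']
    'tau': 0.005,
    'batchnormpi': False, 'batchnormvf': False,
    'dropoutpi': 1.0, 'dropoutvf': 1.0,      # keep probabilities; 1.0 disables dropout
    'l1regpi': 0.0, 'l2regpi': 0.0, 'l1regvf': 0.0, 'l2regvf': 0.0,
    'ent_coef': 0.0,          # placeholder; semantics depend on this fork's SAC
    'wclippi': 0.0, 'wclipvf': 0.0,          # placeholders
    'policypath': '', 'valuepath': '',       # empty string skips the weight export
}
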
Ejemplo n.º 14
0
def run_experiment(variant):
    sub_level_policies_paths = []
    # args = parse_args()
    args = arg()

    if args.domain == 'sawyer-reach':
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    else:
        raise ValueError("Domain not available")

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
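
run_experiment above calls arg() and only reads args.domain; the helper itself is not part of this excerpt. A minimal stand-in with the same interface might look like this (an assumption, not the original definition):

# Assumed stand-in for the arg() helper used above; only 'domain' is read.
from argparse import Namespace

def arg():
    return Namespace(domain='sawyer-reach')
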
Ejemplo n.º 15
0
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: the number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            # algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

        # Set up the ensemble Q-function for action selection.
        self._Q_ensemble = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                value_func_layer_size for _ in range(value_func_layers_number)
            ]),
            name='ensqf')

        # ========================================================================
        # Set up the training target for the ensemble Q-function for action selection.
        # ========================================================================
        # Create the observation placeholder.
        self._observations_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='obv_ens',
        )

        # Create the next observation placeholder.
        self._observations_ens_next_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='next_obv_ens',
        )

        # Create a list of next action placeholders.
        self._acts_next_phs = []
        for i in range(len(q_param_list)):
            act_ens_ph = tf.placeholder(
                tf.float32,
                shape=(None, self._env.spec.action_space.flat_dim),
                name=str(i) + '_next_act_ens',
            )
            self._acts_next_phs.append(act_ens_ph)

        # Create the observed action placeholder.
        self._obv_act_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name='act_obv_ens',
        )

        # Create the reward placeholder.
        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='rew_ens',
        )

        # Create the terminal placeholder.
        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='ter_ens',
        )

        # Determine the target Q-value for next step.
        self._q_ens_targets = []
        for act_next_ph in self._acts_next_phs:
            qt = self._Q_ensemble.get_output_for(
                self._observations_ens_next_ph, act_next_ph, reuse=True)
            self._q_ens_targets.append(qt)

        for i, q_t in enumerate(self._q_ens_targets):
            if i == 0:
                self._q_ens_next = q_t
            else:
                self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
                # self._q_ens_next = self._q_ens_next + q_t
        # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

        # Determine the Q-loss.
        self._q_train = self._Q_ensemble.get_output_for(
            self._observations_ens_ph, self._obv_act_ph, reuse=True)
        self._q_ens_loss = 0.5 * tf.reduce_mean(
            (self._q_train -
             tf.stop_gradient(self._scale_reward * self._rewards_ph +
                              (1 - self._terminals_ph) * self._discount *
                              self._q_ens_next))**2)

        # Determine the Q-training operator.
        self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
            loss=self._q_ens_loss,
            var_list=self._Q_ensemble.get_params_internal())

        # Set up the TensorFlow session.
        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())
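
The constructor above takes a long parameter list. For orientation, a plausible set of keyword arguments is sketched below; the values are illustrative placeholders (not taken from the original experiments), and the enclosing class name is omitted because it does not appear in this excerpt.

# Illustrative keyword arguments for the ensemble constructor above (placeholder values only).
ensemble_kwargs = dict(
    environment_name='HalfCheetah-v2',       # any GymEnv-compatible id
    algorithm_name='sac',                    # 'sac', 'tac', or 'rac'
    lr=3e-4,
    scale_reward=5.0,
    scale_entropy=1.0,
    discount=0.99,
    tau=0.01,
    max_replay_buffer_size=int(1e6),
    sampler_params={'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 256},
    value_func_layers_number=2,
    value_func_layer_size=128,
    policy_func_layers_number=2,
    policy_func_layer_size=128,
    base_ac_alg_params={'epoch_length': 1000, 'n_epochs': 1000, 'n_train_repeat': 1,
                        'eval_render': False, 'eval_n_episodes': 1,
                        'eval_deterministic': True},
    q_param_list=[1.0, 1.5, 2.0],            # one AC instance per entry
    use_ucb=False,
    evaluation_strategy='ensemble',          # or 'best-policy'
)
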
Ejemplo n.º 16
0
def run(variant):
    env = normalize(
        MultiGoalEnv(
            actuation_cost_coeff=1,
            distance_cost_coeff=0.1,
            goal_reward=1,
            init_sigma=0.1,
        ))

    pool = SimpleReplayBuffer(max_replay_buffer_size=1e6, env_spec=env.spec)
    sampler = SimpleSampler(max_path_length=30,
                            min_pool_size=100,
                            batch_size=64)
    base_kwargs = dict(sampler=sampler,
                       epoch_length=1000,
                       n_epochs=1000,
                       n_train_repeat=1,
                       eval_render=True,
                       eval_n_episodes=10,
                       eval_deterministic=False)

    M = 128
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    if variant['policy_type'] == 'gmm':
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[M, M],
                           qf=qf,
                           reg=0.001)
    elif variant['policy_type'] == 'lsp':
        bijector_config = {
            "scale_regularization": 0.0,
            "num_coupling_layers": 2,
            "translation_hidden_sizes": (M, ),
            "scale_hidden_sizes": (M, ),
        }

        policy = LatentSpacePolicy(env_spec=env.spec,
                                   mode="train",
                                   squash=True,
                                   bijector_config=bijector_config,
                                   observations_preprocessor=None)
    else:
        raise NotImplementedError(variant['policy_type'])

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    plotter=plotter,
                    lr=3e-4,
                    scale_reward=3.0,
                    discount=0.99,
                    tau=1e-4,
                    save_full_state=True)
    algorithm.train()
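
run() above only looks at variant['policy_type'], which selects between the GMM and latent-space policies. A minimal launch sketch, assuming run is importable from the original script:

# Minimal sketch: the only key run() reads from its argument.
variant = {'policy_type': 'gmm'}   # or 'lsp'
# run(variant)  # left commented; requires the imports from the original script
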
Ejemplo n.º 17
0
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: the number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
Ejemplo n.º 18
0
def run_experiment(variant):
    domain = None
    goal_size = None
    sub_level_policies_paths = []
    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/hc/fwd/fwd.pkl")
        sub_level_policies_paths.append(
            "primitive-policies/hc/jp-longz/jump.pkl")
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths.append(
            "primitive-policies/pusher/bottom/bottom.pkl")
        sub_level_policies_paths.append(
            "primitive-policies/pusher/left/left.pkl")
    else:
        raise ValueError("Domain not available")

    env = normalize(domain())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
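
This run_experiment references a module-level args object that is not defined in the excerpt. A minimal argument parser covering the four domains handled above would be along these lines (an assumed reconstruction, not the original parser):

# Assumed stand-in for the missing argument parsing in the script above.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--domain', type=str, default='ant-cross-maze',
                        choices=['ant-cross-maze', 'ant-random-goal',
                                 'cheetah-hurdle', 'pusher'])
    return parser.parse_args()

# args = parse_args()  # executed at module level in the original script
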
Ejemplo n.º 19
0
def main(env_id, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode, eval_model, e, stochastic):
    tf.set_random_seed(seed=seed)

    env = GymEnv(env_id)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    print("here")

    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitationPolicy":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=e)

    else:
        prefix, mode = str(policy_mode).split('-')
        if prefix != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration"
            )
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                vf=vf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    # -------------- setting done ------------------------

    # -------------- main process ------------------------
    with algorithm._sess.as_default():
        algorithm._saver.restore(algorithm._sess, eval_model)

        if stochastic:
            knack_file = os.path.join(os.path.dirname(eval_model),
                                      "array/epoch0_2001.npz")
            final_knacks = np.load(knack_file)['knack_kurtosis'][-1]

        env = algorithm._env

        if hasattr(env, "env"):
            env = env.env

        # np.random.seed(seed)
        # env.seed(seed)
        num_data = 50  # num_data * nprocess == 1500
        steps_thresh = 1000
        data = {'acs': [], 'ep_rets': [], 'obs': [], 'rews': []}
        for i in range(num_data):
            obs = env.reset()
            done = False
            steps = 0
            ret = 0
            tmp_data = {'acs': [], 'obs': [], 'rews': []}
            if stochastic:
                _min = np.min(final_knacks)
                _max = np.max(final_knacks)
            print("start episode {}".format(i))
            while not done:
                steps += 1
                # env.render()
                if stochastic:
                    if hasattr(algorithm.pi, "knack_thresh"):
                        v, mean, var, kurtosis = algorithm._policy.calc_and_update_knack(
                            [obs])
                        knack_value = kurtosis[0]
                        # _min = min(knack_value, _min)
                        # _max = max(knack_value, _max)
                        knack_value = (knack_value - _min) / (_max - _min)
                        if knack_value > 0.8:  ## TODO hyper param
                            print("knack {}".format(knack_value))
                            was = algorithm._policy._is_deterministic
                            algorithm._policy._is_deterministic = True
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                            algorithm._policy._is_deterministic = was
                        else:
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                    else:
                        algorithm._policy._is_deterministic = False
                        action, _ = algorithm.policy.get_action(obs.flatten())
                else:
                    if hasattr(algorithm._policy, "_is_deterministic"):
                        algorithm._policy._is_deterministic = True
                    action, _ = algorithm.policy.get_action(obs.flatten())

                obs_next, rew, done, _ = env.step(action)
                tmp_data['obs'].append(obs)
                tmp_data['acs'].append(action)
                tmp_data['rews'].append(rew)
                ret += rew

                obs = obs_next
                if steps >= steps_thresh:
                    done = True

            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

    # np.savez_compressed("a.npz", **data)
    # print("return mean: {}".format(np.mean(data['ep_rets'])))
    return data
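
main() above returns a dict of per-episode observations, actions, rewards, and returns. A small post-processing sketch, assuming data is the dict returned by main() and echoing the commented-out summary lines:

# Post-processing sketch for the dict returned by main() above.
import numpy as np

def summarize_rollouts(data):
    returns = np.asarray(data['ep_rets'], dtype=np.float64)
    total_steps = sum(len(ep_rewards) for ep_rewards in data['rews'])
    return {
        'episodes': len(returns),
        'mean_return': float(returns.mean()),
        'std_return': float(returns.std()),
        'total_steps': int(total_steps),
    }
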
Ejemplo n.º 20
0
def run_experiment(variant):
    low_level_policy = load_low_level_policy(
        policy_path=variant['low_level_policy_path'])

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }
    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = HierarchyProxyEnv(wrapped_env=base_env,
                            low_level_policy=low_level_policy)
    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        save_full_state=False,
    )

    algorithm.train()
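
As with the earlier variants, the hierarchy experiment above expects a flat dict of settings. A placeholder sketch of the keys it reads is shown below; the values are illustrative only, and the 'env_name' is assumed to resolve through the RANDOM_GOAL_ENVS / RLLAB_ENVS mappings that this excerpt does not show.

# Illustrative variant for the hierarchical run_experiment above (placeholder values only).
example_variant = {
    'low_level_policy_path': 'path/to/low_level_policy.pkl',  # placeholder path
    'env_name': 'random-goal-ant',       # assumed to resolve via RANDOM_GOAL_ENVS
    # any additional 'env_*' keys are forwarded to the environment constructor
    'max_pool_size': 1e6,
    'max_path_length': 1000,
    'batch_size': 256,
    'epoch_length': 1000,
    'n_epochs': 1000,
    'n_train_repeat': 1,
    'layer_size': 128,
    'preprocessing_hidden_sizes': None,
    'policy_s_t_layers': 1,
    'policy_s_t_units': 128,
    'policy_coupling_layers': 2,
    'policy_fix_h_on_reset': False,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
    'target_update_interval': 1,
    'action_prior': 'uniform',
}
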
Ejemplo n.º 21
0
def run_experiment(env, seed, scale_reward,
                   scale_entropy, tsallisQ, num_of_train):
    tf.set_random_seed(seed)

    environmentName = env
    # environmentName = "LunarLanderContinuous-v2"

    print("Experiment: {}".format(environmentName))

    # Set up the PyBullet environment.
    # env = normalize(gym.make(environmentName))
    env = GymEnv(environmentName)

    # Set up the replay buffer.
    pool = SimpleReplayBuffer(env_spec = env.spec, max_replay_buffer_size = 1000000)

    # Set up the sampler.
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    sampler = SimpleSampler(**sampler_params)

    # Set up the value function networks.
    M = 128
    qf1 = NNQFunction(env_spec = env.spec, hidden_layer_sizes = (M, M), name = 'qf1')
    qf2 = NNQFunction(env_spec = env.spec, hidden_layer_sizes = (M, M), name = 'qf2')
    vf = NNVFunction(env_spec = env.spec, hidden_layer_sizes = (M, M))

    # Set up the policy network.
    # initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec = env.spec,
        hidden_layer_sizes = (M, M),
        reparameterize = False,
        reg = 1e-3,
    )
    # policy = GMMPolicy(
    #     env_spec=env.spec,
    #     K=1,
    #     hidden_layer_sizes=(M, M),
    #     reparameterize=False,
    #     qf=qf1,
    #     reg=1.0e-3,
    # )

    initial_exploration_policy = policy

    base_kwargs = {
        'epoch_length': 1000,
        'n_train_repeat': num_of_train,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 3,
        'eval_deterministic': True,
    }
    base_kwargs = dict(base_kwargs, sampler = sampler)

    # Schedule helpers for reward scaling (defined here but not used below).
    def incrementor(itr):
        return (0.5 + (0.8 - 0.5) * tf.minimum(itr / 500000., 1.0))

    def decrementor(itr):
        return (0.8 - (0.8 - 0.6) * tf.minimum(itr / 500000., 1.0))

    algorithm = TAC(
        base_kwargs = base_kwargs,
        env = env,
        policy = policy,
        initial_exploration_policy = initial_exploration_policy,
        pool = pool,
        qf1 = qf1,
        qf2 = qf2,
        vf = vf,
        lr = 3.0e-4,
        scale_reward = scale_reward,  # CG: default 1.0, 0.5 for the lunar lander problem, 3.0 for the pendulum problem.
        scale_entropy = scale_entropy,  # CG: default 1.0, 0.8 for the lunar lander problem.
        discount = 0.99,
        tau = 0.01,
        reparameterize = False,
        target_update_interval = 1,
        action_prior = 'uniform',
        save_full_state = False,
        tsallisQ = tsallisQ,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
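
A launch sketch for the TAC experiment above; the reward and entropy scales follow the hints in the inline comments (3.0 for the pendulum problem, entropy scale 1.0 by default), while the tsallisQ value is a placeholder rather than a recommendation.

# Illustrative launch settings for the TAC run_experiment above.
tac_kwargs = dict(
    env='Pendulum-v0',      # any GymEnv-compatible id
    seed=0,
    scale_reward=3.0,       # 3.0 suggested above for the pendulum problem
    scale_entropy=1.0,
    tsallisQ=2.0,           # placeholder entropic index, not a recommendation
    num_of_train=1,
)
# run_experiment(**tac_kwargs)  # requires the original script's imports
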
Ejemplo n.º 22
0
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']
    # Register all MECS environment variants. Most ids follow the regular
    # 'sac.envs.env_V_sweep_v{n}:MEC_v{n}' pattern; the exceptions are listed
    # explicitly in the dict below.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
    }
    for n in list(range(2, 7)) + list(range(9, 31)):
        mecs_entry_points['MECS-v{}'.format(n)] = \
            'sac.envs.env_V_sweep_v{}:MEC_v{}'.format(n, n)
    for env_id, entry_point in mecs_entry_points.items():
        register(
            id=env_id,
            entry_point=entry_point,
            max_episode_steps=5000,
        )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale']*algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Ejemplo n.º 23
0
def main(root_dir):
    # tf.set_random_seed(seed=seed)
    # env = GymEnv('MountainCarContinuous-v0')
    env = GymEnv('MountainCarContinuousColor-v0')

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}

    # TODO Normalize or not
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = 0.
    dynamic_coeff = True

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=10,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff
    )

    algorithm._sess.run(tf.global_variables_initializer())

    # TODO Normalize or not
    # Currently only MountainCar is available
    with algorithm._sess.as_default():
        model_file = os.path.join(root_dir, 'model')
        algorithm._saver.restore(algorithm._sess, model_file)

        for i in range(1):
            obs = env.reset()
            env.env.render()
            sleep(4.0)
            traj = [obs]
            done = False

            while not done:
                env.env.render()
                action, _ = algorithm.policy.get_action(obs.flatten())
                obs, rew, done, _ = env.step(action)
                traj.append(obs.flatten())

            knack, knack_kurtosis = sub_goal_detect(algorithm, traj)
            idxs = np.argsort(knack_kurtosis)
            # idxs = np.argsort(knack)
            print(idxs[::-1])

            COL = MplColorHelper('Blues', np.min(knack_kurtosis), np.max(knack_kurtosis))
            for j, s in enumerate(traj):
                env.env.state = np.array(traj[j])
                rgba = COL.get_rgb(knack_kurtosis[j])
                env.env.render(car_rgba=rgba)
            sleep(1.0)

            for idx in idxs[::-1]:
                obs = env.reset()
                env.env.state = np.array(traj[0])
                rgba = COL.get_rgb(knack_kurtosis[0])
                env.env.render(car_rgba=rgba)
                for j in range(idx+1):
                    env.env.state = np.array(traj[j])
                    rgba = COL.get_rgb(knack_kurtosis[j])

                    # env.env.viewer.geoms[1].set_color(*(0.0, 0.0, 1.0))
                    env.env.render(car_rgba=rgba)
                sleep(0.5)
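
MplColorHelper is used above to colour the rendered car by knack kurtosis but is not defined in this excerpt. A typical implementation would wrap a matplotlib colormap roughly as follows (an assumed reconstruction, not the original class):

# Assumed reconstruction of the MplColorHelper used above: maps a scalar in
# [vmin, vmax] to an RGBA tuple via a named matplotlib colormap.
import matplotlib.cm as cm
import matplotlib.colors as mcolors


class MplColorHelper:
    def __init__(self, cmap_name, vmin, vmax):
        self.scalar_map = cm.ScalarMappable(
            norm=mcolors.Normalize(vmin=vmin, vmax=vmax),
            cmap=cmap_name)

    def get_rgb(self, val):
        return self.scalar_map.to_rgba(val)
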