Ejemplo n.º 1
0
    def __init__(self, root_dir):
        self.map_files = glob(os.path.join(root_dir, 'maps/*.npz'))
        data = log_reader(os.path.join(root_dir, 'log.json'))
        self.total_steps = data['total_step']
        self.average_return = data['eval_average_return']
        self.eval_terminal_states = data['eval_terminal_states']
        self.train_terminal_states = data['train_terminal_states']

        from sac.envs import GymEnv
        env = GymEnv('MountainCarContinuousColor-v0')
        low_state = env.env.low_state
        high_state = env.env.high_state
        extent = [low_state[0], high_state[0], high_state[1], low_state[1]]
        aspect = (high_state[0] - low_state[0]) / (high_state[1] -
                                                   low_state[1])

        # figure configuration
        plt.style.use('mystyle3')
        self.fig, self.axes = plt.subplots(2, 2, sharex='col', sharey='row')
        title = [
            'average relative Q(s,a)', 'relative V(s)', 'relative knack map',
            'relative knack map kurtosis'
        ]
        for t, ax in zip(title, self.axes.flatten()):
            ax.set_title(t)

        # prepare to draw updatable map
        # !!!!!!!! set vmin and vmax is important!!!!!!!!!
        # we normalize array in (0., 1.) to visualize
        # extent set tick label range
        tmp = np.zeros([25, 25])
        self.im = np.array([
            ax.imshow(tmp,
                      cmap='Blues',
                      animated=True,
                      vmin=0.,
                      vmax=1.,
                      extent=extent) for ax in self.axes.flatten()
        ]).reshape(2, 2)
        for ax in self.axes.flatten():
            ax.set_aspect(aspect)
            # ax.set_aspect('equal')

        self.frame_skip = 2
Ejemplo n.º 2
0
    def init_figure(self, env_id):
        from sac.envs import GymEnv
        env = GymEnv(env_id)
        low_state = env.env.low_state
        high_state = env.env.high_state
        self.range = [
            [low_state[0], high_state[0]], [low_state[1], high_state[1]]
        ]  # range of x, y coordinate. y range is reversed for visualization
        extent = [low_state[0], high_state[0], high_state[1], low_state[1]]
        aspect = (high_state[0] - low_state[0]) / (high_state[1] -
                                                   low_state[1])

        # figure configuration
        plt.style.use('mystyle3')
        self.fig, self.axes = plt.subplots(2, 2, sharex='col', sharey='row')
        title = [
            'average relative Q(s,a)', 'relative V(s)', 'relative knack map',
            'relative knack map kurtosis'
        ]
        for t, ax in zip(title, self.axes.flatten()):
            ax.set_title(t)

        # prepare to draw updatable map
        # !!!!!!!! set vmin and vmax is important!!!!!!!!!
        # we normalize array in (0., 1.) to visualize
        # extent set tick label range
        tmp = np.zeros([25, 25])
        self.im = np.array([
            ax.imshow(tmp,
                      cmap='Blues',
                      animated=True,
                      vmin=0.,
                      vmax=1.,
                      extent=extent) for ax in self.axes.flatten()
        ]).reshape(2, 2)
        for ax in self.axes.flatten():
            ax.set_aspect(aspect)
Ejemplo n.º 3
0
            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

    np.savez_compressed("a.npz", **data)
    print("return mean: {}".format(np.mean(data['ep_rets'])))


if __name__ == '__main__':
    args = parse_args()

    # set environment
    seed = args['seed']
    env_id = args.pop('env_id')
    env = GymEnv(env_id)

    # set log directory
    root_dir = args.pop('root_dir')
    opt_log_name = args.pop('opt_log_name')
    logger2 = mylogger.get_logger()
    if args['eval_model'] is None:
        env_id = env.env_id
        # set log
        current_log_dir = root_dir
        logger2.set_log_dir(current_log_dir, exist_ok=True)
        logger2.set_save_array_flag(args.pop("save_array_flag"))
        if args["use_optuna"]:
            logger.set_level(logger.DISABLED)
        else:
            logger.configure(dir=current_log_dir, enable_std_out=False)
Ejemplo n.º 4
0
from sac.misc.instrument import run_sac_experiment
from sac.misc.utils import timestamp, unflatten
from sac.policies import GaussianPolicy, LatentSpacePolicy, GMMPolicy, UniformPolicy
from sac.misc.sampler import SimpleSampler
from sac.replay_buffers import SimpleReplayBuffer
from sac.value_functions import NNQFunction, NNVFunction
from sac.preprocessors import MLPPreprocessor
from examples.variants_re_1 import parse_domain_and_task, get_variants, get_variants_delayed
from sac.envs import constants
from gym.envs.registration import register


DELAY_CONST = 20
ENVIRONMENTS = {
    'swimmer-gym': {
        'default': lambda: GymEnv('Swimmer-v1'),
        'delayed': lambda: GymEnvDelayed('Swimmer-v1', delay = DELAY_CONST),
    },
    'swimmer-rllab': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': lambda: GymEnv('Ant-v1'),
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv,
        'delayed': lambda: GymEnvDelayed('Ant-v1', delay = DELAY_CONST),
    },
    'humanoid-gym': {
        'default': lambda: GymEnv('Humanoid-v1'),
        'delayed': lambda: GymEnvDelayed('Humanoid-v1', delay = DELAY_CONST),
Ejemplo n.º 5
0
def main(env_id, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode, eval_model, e, stochastic):
    tf.set_random_seed(seed=seed)

    env = GymEnv(env_id)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    print("here")

    # use GMM policy
    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitationPolicy":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=e)

    else:
        _, mode = str(policy_mode).split('-')
        if _ != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy or Knack-p_control or Knack-exploitation or Knack-exploration"
            )
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                vf=vf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    # -------------- setting done ------------------------

    # -------------- main process ------------------------
    with algorithm._sess.as_default():
        algorithm._saver.restore(algorithm._sess, eval_model)

        if stochastic:
            knack_file = os.path.join(os.path.dirname(eval_model),
                                      "array/epoch0_2001.npz")
            final_knacks = np.load(knack_file)['knack_kurtosis'][-1]

        env = algorithm._env

        if hasattr(env, "env"):
            env = env.env

        # np.random.seed(seed)
        # env.seed(seed)
        num_data = 50  # num_data * nprocess == 1500
        steps_thresh = 1000
        data = {'acs': [], 'ep_rets': [], 'obs': [], 'rews': []}
        for i in range(num_data):
            obs = env.reset()
            done = False
            steps = 0
            ret = 0
            tmp_data = {'acs': [], 'obs': [], 'rews': []}
            if stochastic:
                _min = np.min(final_knacks)
                _max = np.max(final_knacks)
            print("start episode {}".format(i))
            while not done:
                steps += 1
                # env.render()
                if stochastic:
                    if hasattr(algorithm.pi, "knack_thresh"):
                        v, mean, var, kurtosis = algorithm._policy.calc_and_update_knack(
                            [obs])
                        knack_value = kurtosis[0]
                        # _min = min(knack_value, _min)
                        # _max = max(knack_value, _max)
                        knack_value = (knack_value - _min) / (_max - _min)
                        if knack_value > 0.8:  ## TODO hyper param
                            print("knack {}".format(knack_value))
                            was = algorithm._policy._is_deterministic
                            algorithm._policy._is_deterministic = True
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                            algorithm._policy._is_deterministic = was
                        else:
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                    else:
                        algorithm._policy._is_deterministic = False
                        action, _ = algorithm.policy.get_action(obs.flatten())
                else:
                    if hasattr(algorithm._policy, "_is_deterministic"):
                        algorithm._policy._is_deterministic = True
                    action, _ = algorithm.policy.get_action(obs.flatten())

                obs_next, rew, done, _ = env.step(action)
                tmp_data['obs'].append(obs)
                tmp_data['acs'].append(action)
                tmp_data['rews'].append(rew)
                ret += rew

                obs = obs_next
                if steps >= steps_thresh:
                    done = True

            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

    # np.savez_compressed("a.npz", **data)
    # print("return mean: {}".format(np.mean(data['ep_rets'])))
    return data
Ejemplo n.º 6
0
def main(root_dir, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm, regularize):

    tf.set_random_seed(seed=seed)
    env = GymEnv('MountainCarContinuous-v0')
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]

    env.env.seed(seed)
    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = entropy_coeff
    dynamic_coeff = dynamic_coeff
    # env_id = 'ContinuousSpaceMaze{}_{}_RB{}_entropy_{}__Normalize'.format(goal[0], goal[1], max_replay_buffer_size, entropy_coeff)
    env_id = 'MountainCarContinuous_RB1e6_entropy{}_epoch{}__Normalize_uniform'.format(entropy_coeff, n_epochs)
    env_id = env_id + '_dynamicCoeff' if dynamic_coeff else env_id

    os.makedirs(root_dir, exist_ok=True)
    env_dir = os.path.join(root_dir, env_id)
    os.makedirs(env_dir, exist_ok=True)
    current_log_dir = os.path.join(env_dir, 'seed{}'.format(seed))
    mylogger.make_log_dir(current_log_dir)

    # env_id = 'Test'

    print(env_id)
    print('environment set done')

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff,
        clip_norm=clip_norm,
    )

    # name = env_id + datetime.now().strftime("-%m%d-%Hh-%Mm-%ss")
    # mylogger.make_log_dir(name)

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Ejemplo n.º 7
0
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment in string. 
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: th number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            # algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

        # Set up the ensemble Q-function for action selection.
        self._Q_ensemble = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                value_func_layer_size for _ in range(value_func_layers_number)
            ]),
            name='ensqf')

        # ========================================================================
        # Set up the training target for the ensemble Q-function for action selection.
        # ========================================================================
        # Create the observation placeholder.
        self._observations_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='obv_ens',
        )

        # Create the next observation placeholder.
        self._observations_ens_next_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='next_obv_ens',
        )

        # Create a list of next action placeholders.
        self._acts_next_phs = []
        for i in range(len(q_param_list)):
            act_ens_ph = tf.placeholder(
                tf.float32,
                shape=(None, self._env.spec.action_space.flat_dim),
                name=str(i) + '_next_act_ens',
            )
            self._acts_next_phs.append(act_ens_ph)

        # Create the observed action placeholder.
        self._obv_act_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name='act_obv_ens',
        )

        # Create the reward placeholder.
        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='rew_ens',
        )

        # Create the terminal placeholder.
        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='ter_ens',
        )

        # Determine the target Q-value for next step.
        self._q_ens_targets = []
        for act_next_ph in self._acts_next_phs:
            qt = self._Q_ensemble.get_output_for(
                self._observations_ens_next_ph, act_next_ph, reuse=True)
            self._q_ens_targets.append(qt)

        for i, q_t in enumerate(self._q_ens_targets):
            if i == 0:
                self._q_ens_next = q_t
            else:
                self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
                # self._q_ens_next = self._q_ens_next + q_t
        # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

        # Determine the Q-loss.
        self._q_train = self._Q_ensemble.get_output_for(
            self._observations_ens_ph, self._obv_act_ph, reuse=True)
        self._q_ens_loss = 0.5 * tf.reduce_mean(
            (self._q_train -
             tf.stop_gradient(self._scale_reward * self._rewards_ph +
                              (1 - self._terminals_ph) * self._discount *
                              self._q_ens_next))**2)

        # Determine the Q-training operator.
        self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
            loss=self._q_ens_loss,
            var_list=self._Q_ensemble.get_params_internal())

        # Set up the tensor flow session.
        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())
Ejemplo n.º 8
0
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment in string. 
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: th number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
Ejemplo n.º 9
0
ENVIRONMENTS = {
    'swimmer': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': AntEnv,
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv
    },
    'humanoid': {
        'default': HumanoidEnv,
        'multi-direction': MultiDirectionHumanoidEnv,
    },
    'hopper': {
        'default': lambda: normalize(GymEnv('Hopper-v1'))
    },
    'half-cheetah': {
        'default': lambda: normalize(GymEnv('HalfCheetah-v2'))
    },
    'walker': {
        'default': lambda: normalize(GymEnv('Walker2d-v1'))
    },
}

DEFAULT_DOMAIN = DEFAULT_ENV = 'swimmer'
AVAILABLE_DOMAINS = set(ENVIRONMENTS.keys())
AVAILABLE_TASKS = set(y for x in ENVIRONMENTS.values() for y in x.keys())

def parse_args():
    parser = argparse.ArgumentParser()
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))

    # define the data between 0 and 20
    NUM_VALS = 20
    x = np.random.uniform(0, NUM_VALS, size=NUM_VALS)
    y = np.random.uniform(0, NUM_VALS, size=NUM_VALS)

    # define the color chart between 2 and 10 using the 'autumn_r' colormap, so
    #   y <= 2  is yellow
    #   y >= 10 is red
    #   2 < y < 10 is between from yellow to red, according to its value
    COL = MplColorHelper('autumn_r', 2, 10)

    scat = ax.scatter(x, y, s=300, c=COL.get_rgb(y))
    ax.set_title('Well defined discrete colors')
    plt.show()


if __name__ == '__main__':
    import gym
    import environments
    from sac.envs import GymEnv
    from time import sleep
    env = GymEnv('MountainCarContinuousColor-v0')

    env.reset()
    env.env.render()
    env.step(0.5)
    env.env.render()
    sleep(3)
Ejemplo n.º 11
0
    MultiDirectionHumanoidEnv,
    CrossMazeAntEnv,
)

from sac.misc.instrument import run_sac_experiment
from sac.misc.utils import timestamp, unflatten
from sac.policies import GaussianPolicy, LatentSpacePolicy, GMMPolicy, UniformPolicy
from sac.misc.sampler import SimpleSampler
from sac.replay_buffers import SimpleReplayBuffer
from sac.value_functions import NNQFunction, NNVFunction
from sac.preprocessors import MLPPreprocessor
from examples.variants import parse_domain_and_task, get_variants

ENVIRONMENTS = {
    'swimmer-gym': {
        'default': lambda: GymEnv('Swimmer-v1'),
    },
    'swimmer-rllab': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': lambda: GymEnv('Ant-v1'),
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv
    },
    'humanoid-gym': {
        'default': lambda: GymEnv('Humanoid-v2')
    },
    'humanoid-rllab': {
        'default': HumanoidEnv,
Ejemplo n.º 12
0
    CrossMazeAntEnv,
)

from sac.misc.instrument import run_sac_experiment
from sac.misc.utils import timestamp, unflatten
from sac.policies import GaussianPolicy, LatentSpacePolicy, GMMPolicy, UniformPolicy
from sac.misc.sampler import SimpleSampler
from sac.replay_buffers import SimpleReplayBuffer
from sac.value_functions import NNQFunction, NNVFunction
from sac.preprocessors import MLPPreprocessor
from examples.variants import parse_domain_and_task, get_variants
import copy

ENVIRONMENTS = {
    'swimmer-gym': {
        'default': lambda: GymEnv('Swimmer-v2'),
    },
    'swimmer-rllab': {
        'default': SwimmerEnv,
        'multi-direction': MultiDirectionSwimmerEnv,
    },
    'ant': {
        'default': lambda: GymEnv('Ant-v2'),
        'multi-direction': MultiDirectionAntEnv,
        'cross-maze': CrossMazeAntEnv
    },
    'humanoid-gym': {
        'default': lambda: GymEnv('Humanoid-v2')
    },
    'humanoid-rllab': {
        'default': HumanoidEnv,
Ejemplo n.º 13
0
def main(root_dir):
    # tf.set_random_seed(seed=seed)
    # env = GymEnv('MountainCarContinuous-v0')
    env = GymEnv('MountainCarContinuousColor-v0')

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}

    # TODO Normalize or not
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = 0.
    dynamic_coeff = True

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=10,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff
    )


    algorithm._sess.run(tf.global_variables_initializer())


    # TODO Normalize or not
    # Currently only MountainCar is available
    with algorithm._sess.as_default():
        model_file = os.path.join(root_dir, 'model')
        algorithm._saver.restore(algorithm._sess, model_file)

        for i in range(1):
            obs = env.reset()
            env.env.render()
            sleep(4.0)
            traj = [obs]
            done = False

            while not done:
                env.env.render()
                action = algorithm.policy.get_action(obs.flatten())
                obs, rew, done, _ = env.step(action)
                traj.append(obs.flatten())

            knack, knack_kurtosis = sub_goal_detect(algorithm, traj)
            idxs = np.argsort(knack_kurtosis)
            # idxs = np.argsort(knack)
            print(idxs[::-1])

            COL = MplColorHelper('Blues', np.min(knack_kurtosis), np.max(knack_kurtosis))
            for j, s in enumerate(traj):
                env.env.state = np.array(traj[j])
                rgba = COL.get_rgb(knack_kurtosis[j])
                env.env.render(car_rgba=rgba)
            sleep(1.0)

            for idx in idxs[::-1]:
                obs = env.reset()
                env.env.state = np.array(traj[0])
                rgba = COL.get_rgb(knack_kurtosis[0])
                env.env.render(car_rgba=rgba)
                for j in range(idx+1):
                    env.env.state = np.array(traj[j])
                    rgba = COL.get_rgb(knack_kurtosis[j])

                    # env.env.viewer.geoms[1].set_color(*(0.0, 0.0, 1.0))
                    env.env.render(car_rgba=rgba)
                sleep(0.5)