Example #1
    def init_rank(self, rank):
        self.rank = rank
        if self.set_cpu_affinity:
            self._set_affinity(rank)
        self.baseline.init_rank(rank)
        self.optimizer.init_rank(rank)
        if self.exemplar is not None:
            self.exemplar.init_rank(rank)
        # offset the shared base seed by rank so each worker gets a distinct seed
        seed = ext.get_seed()
        if seed is None:
            # NOTE: Not sure if this is a good source for seed?
            seed = int(1e6 * np.random.rand())
        ext.set_seed(seed + rank)
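The pattern worth noting here is deriving a distinct random seed for each parallel worker by offsetting a shared base seed with the worker's rank. Below is a minimal, self-contained sketch of that pattern using only numpy; seed_for_rank is an illustrative name, not part of the original API.

import numpy as np

def seed_for_rank(base_seed, rank):
    # Derive a per-rank seed so parallel workers draw different random streams.
    if base_seed is None:
        # Fall back to a random base seed, mirroring the example above.
        base_seed = int(1e6 * np.random.rand())
    return base_seed + rank

# Each worker seeds its own RNG with its rank offset.
rng = np.random.RandomState(seed_for_rank(12345, rank=3))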
Example #2
    def __init__(self,
                 policy,
                 env,
                 n_envs,
                 replay_pool_size,
                 max_path_length,
                 sampling_method,
                 save_rollouts=False,
                 save_rollouts_observations=True,
                 save_env_infos=False,
                 env_str=None,
                 replay_pool_params={}):
        self._policy = policy
        self._n_envs = n_envs

        assert (self._n_envs == 1)  # only one env supported, because of policy reset

        self._replay_pools = [
            RNNCriticReplayPool(
                env.spec,
                env.horizon,
                policy.N,
                policy.gamma,
                replay_pool_size // n_envs,
                obs_history_len=policy.obs_history_len,
                sampling_method=sampling_method,
                save_rollouts=save_rollouts,
                save_rollouts_observations=save_rollouts_observations,
                save_env_infos=save_env_infos,
                replay_pool_params=replay_pool_params) for _ in range(n_envs)
        ]

        try:
            # Deep-copy the env via a pickle round trip when running multiple envs.
            envs = [pickle.loads(pickle.dumps(env))
                    for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
        except Exception:
            # Fall back to constructing fresh envs if the env cannot be pickled.
            envs = [create_env(env_str)
                    for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
        ### need to seed each environment if it is a GymEnv
        seed = get_seed()
        if seed is not None and isinstance(utils.inner_env(env), GymEnv):
            for i, env_i in enumerate(envs):
                utils.inner_env(env_i).env.seed(seed + i)
        self._vec_env = VecEnvExecutor(envs=envs,
                                       max_path_length=max_path_length)
        self._curr_observations = self._vec_env.reset()
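The try/except above duplicates the environment through a pickle round trip and falls back to constructing fresh environments when pickling fails. A minimal sketch of that copy-with-fallback pattern follows; duplicate and fallback_factory are hypothetical names used only for illustration.

import pickle

def duplicate(obj, n, fallback_factory=None):
    # Deep-copy an object n times via a pickle round trip; if it is not
    # picklable, rebuild fresh instances with the supplied factory instead.
    try:
        return [pickle.loads(pickle.dumps(obj)) for _ in range(n)]
    except Exception:
        if fallback_factory is None:
            raise
        return [fallback_factory() for _ in range(n)]

# Example: three independent copies of a config dict.
copies = duplicate({'horizon': 100}, n=3)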
Example #3
    def _graph_setup(self):
        ### create session and graph
        tf_sess = tf.get_default_session()
        if tf_sess is None:
            tf_sess, _ = MACPolicy.create_session_and_graph(
                gpu_device=self._gpu_device, gpu_frac=self._gpu_frac)
        tf_graph = tf_sess.graph

        with tf_sess.as_default(), tf_graph.as_default():
            seed = ext.get_seed()
            if seed is not None:
                ext.set_seed(seed)

            ### create input output placeholders
            tf_obs_ph, tf_actions_ph, tf_dones_ph, tf_rewards_ph, tf_obs_target_ph, \
                tf_test_es_ph_dict, tf_episode_timesteps_ph = self._graph_input_output_placeholders()
            self.global_step = tf.Variable(0,
                                           trainable=False,
                                           name='global_step')

            ### policy
            policy_scope = 'policy'
            with tf.variable_scope(policy_scope):
                ### create preprocess placeholders
                tf_preprocess = self._graph_preprocess_placeholders()
                ### process obs to lowd
                tf_obs_lowd = self._graph_obs_to_lowd(tf_obs_ph,
                                                      tf_preprocess,
                                                      is_training=True)
                ### create training policy
                tf_train_values, tf_train_values_softmax, _, _ = \
                    self._graph_inference(tf_obs_lowd, tf_actions_ph[:, :self._H, :],
                                          self._values_softmax, tf_preprocess, is_training=True)

            with tf.variable_scope(policy_scope, reuse=True):
                tf_train_values_test, tf_train_values_softmax_test, _, _ = \
                    self._graph_inference(tf_obs_lowd, tf_actions_ph[:, :self._get_action_test['H'], :],
                                          self._values_softmax, tf_preprocess, is_training=False)
                tf_get_value = tf.reduce_sum(tf_train_values_softmax_test *
                                             tf_train_values_test,
                                             axis=1)

            ### action selection
            tf_get_action, tf_get_action_value, tf_get_action_reset_ops = \
                self._graph_get_action(tf_obs_ph, self._get_action_test,
                                       policy_scope, True, policy_scope, True,
                                       tf_episode_timesteps_ph)
            ### exploration strategy and logprob
            tf_get_action_explore = self._graph_get_action_explore(
                tf_get_action, tf_test_es_ph_dict)

            ### get policy variables
            tf_policy_vars = sorted(tf.get_collection(
                xplatform.global_variables_collection_name(),
                scope=policy_scope),
                                    key=lambda v: v.name)
            tf_trainable_policy_vars = sorted(tf.get_collection(
                xplatform.trainable_variables_collection_name(),
                scope=policy_scope),
                                              key=lambda v: v.name)

            ### create target network
            if self._use_target:
                target_scope = 'target' if self._separate_target_params else 'policy'
                ### action selection
                tf_obs_target_ph_packed = xplatform.concat([
                    tf_obs_target_ph[:, h - self._obs_history_len:h, :]
                    for h in range(self._obs_history_len,
                                   self._obs_history_len + self._N + 1)
                ], 0)
                tf_target_get_action, tf_target_get_action_values, _ = self._graph_get_action(
                    tf_obs_target_ph_packed,
                    self._get_action_target,
                    scope_select=policy_scope,
                    reuse_select=True,
                    scope_eval=target_scope,
                    reuse_eval=(target_scope == policy_scope),
                    tf_episode_timesteps_ph=None)  # TODO would need to fill in

                tf_target_get_action_values = tf.transpose(
                    tf.reshape(tf_target_get_action_values,
                               (self._N + 1, -1)))[:, 1:]
            else:
                tf_target_get_action_values = tf.zeros(
                    [tf.shape(tf_train_values)[0], self._N])

            ### update target network
            if self._use_target and self._separate_target_params:
                tf_policy_vars_nobatchnorm = list(
                    filter(
                        lambda v: 'biased' not in v.name and 'local_step'
                        not in v.name, tf_policy_vars))
                tf_target_vars = sorted(tf.get_collection(
                    xplatform.global_variables_collection_name(),
                    scope=target_scope),
                                        key=lambda v: v.name)
                assert (len(tf_policy_vars_nobatchnorm) == len(tf_target_vars))
                tf_update_target_fn = []
                for var, var_target in zip(tf_policy_vars_nobatchnorm,
                                           tf_target_vars):
                    assert (var.name.replace(policy_scope,
                                             '') == var_target.name.replace(
                                                 target_scope, ''))
                    tf_update_target_fn.append(var_target.assign(var))
                tf_update_target_fn = tf.group(*tf_update_target_fn)
            else:
                tf_target_vars = None
                tf_update_target_fn = None

            ### optimization
            tf_cost, tf_mse = self._graph_cost(tf_train_values,
                                               tf_train_values_softmax,
                                               tf_rewards_ph, tf_dones_ph,
                                               tf_target_get_action_values)
            tf_opt, tf_lr_ph = self._graph_optimize(tf_cost,
                                                    tf_trainable_policy_vars)

            ### initialize
            self._graph_init_vars(tf_sess)

        ### what to return
        return {
            'sess': tf_sess,
            'graph': tf_graph,
            'obs_ph': tf_obs_ph,
            'actions_ph': tf_actions_ph,
            'dones_ph': tf_dones_ph,
            'rewards_ph': tf_rewards_ph,
            'obs_target_ph': tf_obs_target_ph,
            'test_es_ph_dict': tf_test_es_ph_dict,
            'episode_timesteps_ph': tf_episode_timesteps_ph,
            'preprocess': tf_preprocess,
            'get_value': tf_get_value,
            'get_action': tf_get_action,
            'get_action_explore': tf_get_action_explore,
            'get_action_value': tf_get_action_value,
            'get_action_reset_ops': tf_get_action_reset_ops,
            'update_target_fn': tf_update_target_fn,
            'cost': tf_cost,
            'mse': tf_mse,
            'opt': tf_opt,
            'lr_ph': tf_lr_ph,
            'policy_vars': tf_policy_vars,
            'target_vars': tf_target_vars
        }
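The target-network section above pairs policy and target variables by sorted name and groups one assign op per pair into a single update op. Below is a minimal sketch of that hard-copy update written against tf.compat.v1 (an assumption; the original routes these calls through an xplatform compatibility shim, and additionally filters out batch-norm bookkeeping variables). make_update_target_op is an illustrative name.

import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

def make_update_target_op(policy_scope='policy', target_scope='target'):
    # Collect both variable sets, sort by name so the lists line up, and
    # build a single op that copies every policy variable into its target.
    policy_vars = sorted(tf1.get_collection(tf1.GraphKeys.GLOBAL_VARIABLES,
                                            scope=policy_scope),
                         key=lambda v: v.name)
    target_vars = sorted(tf1.get_collection(tf1.GraphKeys.GLOBAL_VARIABLES,
                                            scope=target_scope),
                         key=lambda v: v.name)
    assert len(policy_vars) == len(target_vars)
    updates = []
    for v, v_target in zip(policy_vars, target_vars):
        # Names must match once the scope prefix is stripped.
        assert v.name.replace(policy_scope, '') == v_target.name.replace(target_scope, '')
        updates.append(v_target.assign(v))
    return tf1.group(*updates)

Running the returned op in a session copies the policy weights into the target network in one step; a soft update would interpolate the two variable sets instead of copying them outright.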