def init_rank(self, rank):
    self.rank = rank

    # optionally pin this worker to a CPU
    if self.set_cpu_affinity:
        self._set_affinity(rank)

    # propagate the rank to the sub-components that track it
    self.baseline.init_rank(rank)
    self.optimizer.init_rank(rank)
    if self.exemplar is not None:
        self.exemplar.init_rank(rank)

    # offset the base seed by the rank so each worker gets a distinct random stream
    seed = ext.get_seed()
    if seed is None:
        # NOTE: Not sure if this is a good source for seed?
        seed = int(1e6 * np.random.rand())
    ext.set_seed(seed + rank)
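# Hypothetical usage sketch (not part of this file): in a multi-worker launch, each
# worker would call init_rank with its own rank before training, e.g.
#
#     algo.init_rank(rank=worker_id)   # worker_id in [0, n_parallel); `algo` and
#                                      # `worker_id` are illustrative names only
#
# so that ext.set_seed(seed + rank) gives every worker a distinct but reproducible
# random stream derived from a single base seed.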
def __init__(self, policy, env, n_envs, replay_pool_size, max_path_length, sampling_method,
             save_rollouts=False, save_rollouts_observations=True, save_env_infos=False,
             env_str=None, replay_pool_params={}):
    self._policy = policy
    self._n_envs = n_envs

    assert self._n_envs == 1  # b/c the policy is reset per environment

    # one replay pool per environment, splitting the total capacity evenly
    self._replay_pools = [
        RNNCriticReplayPool(env.spec,
                            env.horizon,
                            policy.N,
                            policy.gamma,
                            replay_pool_size // n_envs,
                            obs_history_len=policy.obs_history_len,
                            sampling_method=sampling_method,
                            save_rollouts=save_rollouts,
                            save_rollouts_observations=save_rollouts_observations,
                            save_env_infos=save_env_infos,
                            replay_pool_params=replay_pool_params)
        for _ in range(n_envs)
    ]

    try:
        # deep-copy the environment via pickle when running more than one env
        envs = [pickle.loads(pickle.dumps(env)) for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
    except Exception:
        # fall back to re-creating the environment from its string description
        envs = [create_env(env_str) for _ in range(self._n_envs)] if self._n_envs > 1 else [env]

    ### need to seed each environment if it is a GymEnv
    seed = get_seed()
    if seed is not None and isinstance(utils.inner_env(env), GymEnv):
        for i, env_i in enumerate(envs):  # env_i avoids shadowing the env argument
            utils.inner_env(env_i).env.seed(seed + i)

    self._vec_env = VecEnvExecutor(envs=envs, max_path_length=max_path_length)
    self._curr_observations = self._vec_env.reset()
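# Hypothetical construction sketch (illustrative only; the sampler class name,
# `my_env`, `my_policy`, and the 'uniform' sampling method are assumptions, not
# names confirmed by this repo): the replay capacity is split evenly across
# environments (replay_pool_size // n_envs), and the assert above means only
# n_envs == 1 is currently supported.
#
#     sampler = Sampler(policy=my_policy,
#                       env=my_env,
#                       n_envs=1,
#                       replay_pool_size=int(1e5),
#                       max_path_length=1000,
#                       sampling_method='uniform',   # assumed value
#                       env_str=None)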
def _graph_setup(self):
    ### create session and graph
    tf_sess = tf.get_default_session()
    if tf_sess is None:
        tf_sess, tf_graph = MACPolicy.create_session_and_graph(gpu_device=self._gpu_device,
                                                               gpu_frac=self._gpu_frac)
    tf_graph = tf_sess.graph

    with tf_sess.as_default(), tf_graph.as_default():
        if ext.get_seed() is not None:
            ext.set_seed(ext.get_seed())

        ### create input output placeholders
        tf_obs_ph, tf_actions_ph, tf_dones_ph, tf_rewards_ph, tf_obs_target_ph, \
            tf_test_es_ph_dict, tf_episode_timesteps_ph = self._graph_input_output_placeholders()
        self.global_step = tf.Variable(0, trainable=False, name='global_step')

        ### policy
        policy_scope = 'policy'
        with tf.variable_scope(policy_scope):
            ### create preprocess placeholders
            tf_preprocess = self._graph_preprocess_placeholders()
            ### process obs to lowd
            tf_obs_lowd = self._graph_obs_to_lowd(tf_obs_ph, tf_preprocess, is_training=True)
            ### create training policy
            tf_train_values, tf_train_values_softmax, _, _ = \
                self._graph_inference(tf_obs_lowd, tf_actions_ph[:, :self._H, :],
                                      self._values_softmax, tf_preprocess, is_training=True)

        with tf.variable_scope(policy_scope, reuse=True):
            tf_train_values_test, tf_train_values_softmax_test, _, _ = \
                self._graph_inference(tf_obs_lowd, tf_actions_ph[:, :self._get_action_test['H'], :],
                                      self._values_softmax, tf_preprocess, is_training=False)
            tf_get_value = tf.reduce_sum(tf_train_values_softmax_test * tf_train_values_test,
                                         reduction_indices=1)

        ### action selection
        tf_get_action, tf_get_action_value, tf_get_action_reset_ops = \
            self._graph_get_action(tf_obs_ph, self._get_action_test,
                                   policy_scope, True, policy_scope, True,
                                   tf_episode_timesteps_ph)
        ### exploration strategy and logprob
        tf_get_action_explore = self._graph_get_action_explore(tf_get_action, tf_test_es_ph_dict)

        ### get policy variables
        tf_policy_vars = sorted(tf.get_collection(xplatform.global_variables_collection_name(),
                                                  scope=policy_scope),
                                key=lambda v: v.name)
        tf_trainable_policy_vars = sorted(tf.get_collection(xplatform.trainable_variables_collection_name(),
                                                            scope=policy_scope),
                                          key=lambda v: v.name)

        ### create target network
        if self._use_target:
            target_scope = 'target' if self._separate_target_params else 'policy'
            ### action selection
            tf_obs_target_ph_packed = xplatform.concat(
                [tf_obs_target_ph[:, h - self._obs_history_len:h, :]
                 for h in range(self._obs_history_len, self._obs_history_len + self._N + 1)],
                0)
            tf_target_get_action, tf_target_get_action_values, _ = self._graph_get_action(
                tf_obs_target_ph_packed, self._get_action_target,
                scope_select=policy_scope, reuse_select=True,
                scope_eval=target_scope, reuse_eval=(target_scope == policy_scope),
                tf_episode_timesteps_ph=None)  # TODO: would need to fill in

            tf_target_get_action_values = tf.transpose(
                tf.reshape(tf_target_get_action_values, (self._N + 1, -1)))[:, 1:]
        else:
            tf_target_get_action_values = tf.zeros([tf.shape(tf_train_values)[0], self._N])

        ### update target network
        if self._use_target and self._separate_target_params:
            tf_policy_vars_nobatchnorm = list(
                filter(lambda v: 'biased' not in v.name and 'local_step' not in v.name,
                       tf_policy_vars))
            tf_target_vars = sorted(tf.get_collection(xplatform.global_variables_collection_name(),
                                                      scope=target_scope),
                                    key=lambda v: v.name)
            assert len(tf_policy_vars_nobatchnorm) == len(tf_target_vars)
            tf_update_target_fn = []
            for var, var_target in zip(tf_policy_vars_nobatchnorm, tf_target_vars):
                assert var.name.replace(policy_scope, '') == var_target.name.replace(target_scope, '')
                tf_update_target_fn.append(var_target.assign(var))
            tf_update_target_fn = tf.group(*tf_update_target_fn)
        else:
            tf_target_vars = None
            tf_update_target_fn = None

        ### optimization
        tf_cost, tf_mse = self._graph_cost(tf_train_values, tf_train_values_softmax,
                                           tf_rewards_ph, tf_dones_ph,
                                           tf_target_get_action_values)
        tf_opt, tf_lr_ph = self._graph_optimize(tf_cost, tf_trainable_policy_vars)

        ### initialize
        self._graph_init_vars(tf_sess)

    ### what to return
    return {
        'sess': tf_sess,
        'graph': tf_graph,
        'obs_ph': tf_obs_ph,
        'actions_ph': tf_actions_ph,
        'dones_ph': tf_dones_ph,
        'rewards_ph': tf_rewards_ph,
        'obs_target_ph': tf_obs_target_ph,
        'test_es_ph_dict': tf_test_es_ph_dict,
        'episode_timesteps_ph': tf_episode_timesteps_ph,
        'preprocess': tf_preprocess,
        'get_value': tf_get_value,
        'get_action': tf_get_action,
        'get_action_explore': tf_get_action_explore,
        'get_action_value': tf_get_action_value,
        'get_action_reset_ops': tf_get_action_reset_ops,
        'update_target_fn': tf_update_target_fn,
        'cost': tf_cost,
        'mse': tf_mse,
        'opt': tf_opt,
        'lr_ph': tf_lr_ph,
        'policy_vars': tf_policy_vars,
        'target_vars': tf_target_vars,
    }
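# Hypothetical training-step sketch (illustrative; the batch arrays and
# `learning_rate` are assumptions): callers would consume the returned dict
# roughly as
#
#     tf_dict = self._graph_setup()
#     feed = {tf_dict['obs_ph']: observations,
#             tf_dict['actions_ph']: actions,
#             tf_dict['dones_ph']: dones,
#             tf_dict['rewards_ph']: rewards,
#             tf_dict['obs_target_ph']: target_observations,
#             tf_dict['lr_ph']: learning_rate}
#     cost, mse, _ = tf_dict['sess'].run(
#         [tf_dict['cost'], tf_dict['mse'], tf_dict['opt']], feed_dict=feed)
#     if tf_dict['update_target_fn'] is not None:
#         tf_dict['sess'].run(tf_dict['update_target_fn'])  # sync the target network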