def __init__(
    self,
    seed: int,
    behavior_spec: BehaviorSpec,
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool = False,
):
    """
    Initializes the policy.
    :param seed: Random seed to use for TensorFlow.
    :param behavior_spec: The corresponding BehaviorSpec for this policy.
    :param trainer_settings: The trainer parameters.
    :param model_path: Where to load/save the model.
    :param load: If True, load model from model_path. Otherwise, create new model.
    """
    self.m_size = 0
    self.trainer_settings = trainer_settings
    self.network_settings: NetworkSettings = trainer_settings.network_settings
    # for ghost trainer save/load snapshots
    self.assign_phs: List[tf.Tensor] = []
    self.assign_ops: List[tf.Operation] = []
    self.inference_dict: Dict[str, tf.Tensor] = {}
    self.update_dict: Dict[str, tf.Tensor] = {}
    self.sequence_length = 1
    self.seed = seed
    self.behavior_spec = behavior_spec
    self.act_size = (
        list(behavior_spec.discrete_action_branches)
        if behavior_spec.is_action_discrete()
        else [behavior_spec.action_size]
    )
    self.vec_obs_size = sum(
        shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
    )
    self.vis_obs_size = sum(
        1 for shape in behavior_spec.observation_shapes if len(shape) == 3
    )
    self.use_recurrent = self.network_settings.memory is not None
    self.memory_dict: Dict[str, np.ndarray] = {}
    self.num_branches = self.behavior_spec.action_size
    self.previous_action_dict: Dict[str, np.ndarray] = {}
    self.normalize = self.network_settings.normalize
    self.use_continuous_act = behavior_spec.is_action_continuous()
    self.model_path = model_path
    self.initialize_path = self.trainer_settings.init_path
    self.keep_checkpoints = self.trainer_settings.keep_checkpoints
    self.graph = tf.Graph()
    self.sess = tf.Session(
        config=tf_utils.generate_session_config(), graph=self.graph
    )
    self.saver: Optional[tf.Operation] = None
    if self.network_settings.memory is not None:
        self.m_size = self.network_settings.memory.memory_size
        self.sequence_length = self.network_settings.memory.sequence_length
    self._initialize_tensorflow_references()
    self.load = load
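
# A minimal sketch (not part of this module) of how the size fields above are
# derived, assuming the BehaviorSpec/ActionType API from mlagents_envs used
# elsewhere in this codebase. The spec values are illustrative only.
from mlagents_envs.base_env import ActionType, BehaviorSpec

spec = BehaviorSpec(
    observation_shapes=[(8,), (84, 84, 3)],  # one vector obs, one visual obs
    action_type=ActionType.DISCRETE,
    action_shape=(3, 2),  # two branches with 3 and 2 actions
)
# Mirrors the expressions in __init__ above:
assert sum(s[0] for s in spec.observation_shapes if len(s) == 1) == 8  # vec_obs_size
assert sum(1 for s in spec.observation_shapes if len(s) == 3) == 1     # vis_obs_size
assert list(spec.discrete_action_branches) == [3, 2]                   # act_size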
def create_steps_from_behavior_spec(
    behavior_spec: BehaviorSpec, num_agents: int = 1
) -> Tuple[DecisionSteps, TerminalSteps]:
    return create_mock_steps(
        num_agents=num_agents,
        observation_shapes=behavior_spec.observation_shapes,
        action_shape=behavior_spec.action_shape,
        discrete=behavior_spec.is_action_discrete(),
    )
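
# A minimal usage sketch for the helper above, assuming the BehaviorSpec /
# ActionType API from mlagents_envs. The spec values are illustrative only.
from mlagents_envs.base_env import ActionType, BehaviorSpec

spec = BehaviorSpec(
    observation_shapes=[(4,)],
    action_type=ActionType.CONTINUOUS,
    action_shape=2,
)
decision_steps, terminal_steps = create_steps_from_behavior_spec(spec, num_agents=2)
# decision_steps/terminal_steps mimic what a real environment step would return.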
def __init__(
    self,
    seed: int,
    behavior_spec: BehaviorSpec,
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool = False,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
):
    self.behavior_spec = behavior_spec
    self.trainer_settings = trainer_settings
    self.network_settings: NetworkSettings = trainer_settings.network_settings
    self.seed = seed
    self.act_size = (
        list(behavior_spec.discrete_action_branches)
        if behavior_spec.is_action_discrete()
        else [behavior_spec.action_size]
    )
    self.vec_obs_size = sum(
        shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
    )
    self.vis_obs_size = sum(
        1 for shape in behavior_spec.observation_shapes if len(shape) == 3
    )
    self.model_path = model_path
    self.initialize_path = self.trainer_settings.init_path
    self._keep_checkpoints = self.trainer_settings.keep_checkpoints
    self.use_continuous_act = behavior_spec.is_action_continuous()
    self.num_branches = self.behavior_spec.action_size
    self.previous_action_dict: Dict[str, np.ndarray] = {}
    self.memory_dict: Dict[str, np.ndarray] = {}
    self.normalize = trainer_settings.network_settings.normalize
    self.use_recurrent = self.network_settings.memory is not None
    self.load = load
    self.h_size = self.network_settings.hidden_units
    # Clamp the number of layers to at least 1.
    self.num_layers = max(1, self.network_settings.num_layers)
    self.vis_encode_type = self.network_settings.vis_encode_type
    self.tanh_squash = tanh_squash
    self.reparameterize = reparameterize
    self.condition_sigma_on_obs = condition_sigma_on_obs
    self.m_size = 0
    self.sequence_length = 1
    if self.network_settings.memory is not None:
        self.m_size = self.network_settings.memory.memory_size
        self.sequence_length = self.network_settings.memory.sequence_length
    # Non-exposed parameters; these aren't exposed because they don't have a
    # good explanation and usually shouldn't be touched.
    self.log_std_min = -20
    self.log_std_max = 2
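
# A minimal sketch (assuming TrainerSettings/NetworkSettings are the
# attrs-based classes from mlagents.trainers.settings) showing how the
# recurrent fields above fall out once memory settings are present.
from mlagents.trainers.settings import NetworkSettings, TrainerSettings

settings = TrainerSettings()
settings.network_settings.memory = NetworkSettings.MemorySettings(
    memory_size=128, sequence_length=64
)
# A policy constructed from these settings would then have:
#   use_recurrent == True, m_size == 128, sequence_length == 64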
def behavior_spec_to_brain_parameters(
    name: str, behavior_spec: BehaviorSpec
) -> BrainParameters:
    vec_size = np.sum(
        [shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1]
    )
    vis_sizes = [
        shape for shape in behavior_spec.observation_shapes if len(shape) == 3
    ]
    cam_res = [CameraResolution(s[0], s[1], s[2]) for s in vis_sizes]
    a_size: List[int] = []
    if behavior_spec.is_action_discrete():
        a_size += list(behavior_spec.discrete_action_branches)
        vector_action_space_type = 0
    else:
        a_size += [behavior_spec.action_size]
        vector_action_space_type = 1
    return BrainParameters(
        name, int(vec_size), cam_res, a_size, [], vector_action_space_type
    )
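
# A hypothetical conversion sketch, assuming the BehaviorSpec/ActionType API
# from mlagents_envs; "TestBrain" and the spec values are made up.
from mlagents_envs.base_env import ActionType, BehaviorSpec

spec = BehaviorSpec(
    observation_shapes=[(6,), (84, 84, 3)],
    action_type=ActionType.DISCRETE,
    action_shape=(3, 2),
)
brain_params = behavior_spec_to_brain_parameters("TestBrain", spec)
# Expected: vec_size == 6, one 84x84x3 CameraResolution, a_size == [3, 2],
# and vector_action_space_type == 0 (discrete).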
def simulate_rollout(
    length: int,
    behavior_spec: BehaviorSpec,
    memory_size: int = 10,
    exclude_key_list: Optional[List[str]] = None,
) -> AgentBuffer:
    action_space = behavior_spec.action_shape
    is_discrete = behavior_spec.is_action_discrete()

    trajectory = make_fake_trajectory(
        length,
        behavior_spec.observation_shapes,
        action_space=action_space,
        memory_size=memory_size,
        is_discrete=is_discrete,
    )
    buffer = trajectory.to_agentbuffer()
    # If an exclude_key_list was given, remove those keys from the buffer.
    if exclude_key_list:
        for key in exclude_key_list:
            if key in buffer:
                buffer.pop(key)
    return buffer
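
# A hypothetical usage sketch; "memory" as a buffer key is an assumption about
# what make_fake_trajectory writes, used here only to exercise exclude_key_list.
from mlagents_envs.base_env import ActionType, BehaviorSpec

spec = BehaviorSpec(
    observation_shapes=[(4,)],
    action_type=ActionType.CONTINUOUS,
    action_shape=2,
)
buffer = simulate_rollout(16, spec, memory_size=8, exclude_key_list=["memory"])
assert "memory" not in buffer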
def steps_from_proto(
    agent_info_list: Collection[
        AgentInfoProto
    ],  # pylint: disable=unsubscriptable-object
    behavior_spec: BehaviorSpec,
) -> Tuple[DecisionSteps, TerminalSteps]:
    decision_agent_info_list = [
        agent_info for agent_info in agent_info_list if not agent_info.done
    ]
    terminal_agent_info_list = [
        agent_info for agent_info in agent_info_list if agent_info.done
    ]
    decision_obs_list: List[np.ndarray] = []
    terminal_obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(behavior_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            decision_obs_list.append(
                _process_visual_observation(
                    obs_index, obs_shape, decision_agent_info_list
                )
            )
            terminal_obs_list.append(
                _process_visual_observation(
                    obs_index, obs_shape, terminal_agent_info_list
                )
            )
        else:
            decision_obs_list.append(
                _process_vector_observation(
                    obs_index, obs_shape, decision_agent_info_list
                )
            )
            terminal_obs_list.append(
                _process_vector_observation(
                    obs_index, obs_shape, terminal_agent_info_list
                )
            )
    decision_rewards = np.array(
        [agent_info.reward for agent_info in decision_agent_info_list],
        dtype=np.float32,
    )
    terminal_rewards = np.array(
        [agent_info.reward for agent_info in terminal_agent_info_list],
        dtype=np.float32,
    )
    _raise_on_nan_and_inf(decision_rewards, "rewards")
    _raise_on_nan_and_inf(terminal_rewards, "rewards")
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in terminal_agent_info_list],
        dtype=bool,
    )
    decision_agent_id = np.array(
        [agent_info.id for agent_info in decision_agent_info_list], dtype=np.int32
    )
    terminal_agent_id = np.array(
        [agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32
    )
    action_mask = None
    if behavior_spec.is_action_discrete():
        # Only build a mask if at least one agent actually provided one.
        if any(
            agent_info.action_mask is not None
            for agent_info in decision_agent_info_list
        ):
            n_agents = len(decision_agent_info_list)
            a_size = np.sum(behavior_spec.discrete_action_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=bool)
            for agent_index, agent_info in enumerate(decision_agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        # The proto marks masked actions True; flip so that
                        # mask_matrix is True for *allowed* actions.
                        mask_matrix[agent_index, :] = [
                            not masked for masked in agent_info.action_mask
                        ]
            action_mask = (1 - mask_matrix).astype(bool)
            indices = _generate_split_indices(
                behavior_spec.discrete_action_branches
            )
            action_mask = np.split(action_mask, indices, axis=1)
    return (
        DecisionSteps(
            decision_obs_list, decision_rewards, decision_agent_id, action_mask
        ),
        TerminalSteps(
            terminal_obs_list, terminal_rewards, max_step, terminal_agent_id
        ),
    )
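
# An isolated numpy sketch (not from this module) of the mask transformation
# above: the proto sends True for *masked* actions, and the result is one
# boolean array per discrete branch with the same True-means-masked convention.
import numpy as np

branches = (3, 2)                               # two discrete branches
proto_mask = [True, False, False, False, True]  # flattened, True = masked
mask_matrix = np.array([[not m for m in proto_mask]])  # True = allowed
action_mask = (1 - mask_matrix).astype(bool)           # True = masked again
per_branch = np.split(action_mask, np.cumsum(branches)[:-1], axis=1)
# per_branch[0] == [[ True, False, False]]; per_branch[1] == [[False,  True]]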