Example 1
    def __init__(
        self,
        seed: int,
        behavior_spec: BehaviorSpec,
        trainer_settings: TrainerSettings,
        model_path: str,
        load: bool = False,
    ):
        """
        Initializes the policy.
        :param seed: Random seed to use for TensorFlow.
        :param behavior_spec: The BehaviorSpec for this policy.
        :param trainer_settings: The trainer parameters.
        :param model_path: Where to load/save the model.
        :param load: If True, load model from model_path. Otherwise, create new model.
        """

        self.m_size = 0
        self.trainer_settings = trainer_settings
        self.network_settings: NetworkSettings = trainer_settings.network_settings
        # for ghost trainer save/load snapshots
        self.assign_phs: List[tf.Tensor] = []
        self.assign_ops: List[tf.Operation] = []

        self.inference_dict: Dict[str, tf.Tensor] = {}
        self.update_dict: Dict[str, tf.Tensor] = {}
        self.sequence_length = 1
        self.seed = seed
        self.behavior_spec = behavior_spec

        self.act_size = (list(behavior_spec.discrete_action_branches)
                         if behavior_spec.is_action_discrete() else
                         [behavior_spec.action_size])
        self.vec_obs_size = sum(shape[0]
                                for shape in behavior_spec.observation_shapes
                                if len(shape) == 1)
        self.vis_obs_size = sum(1 for shape in behavior_spec.observation_shapes
                                if len(shape) == 3)

        self.use_recurrent = self.network_settings.memory is not None
        self.memory_dict: Dict[str, np.ndarray] = {}
        self.num_branches = self.behavior_spec.action_size
        self.previous_action_dict: Dict[str, np.ndarray] = {}
        self.normalize = self.network_settings.normalize
        self.use_continuous_act = behavior_spec.is_action_continuous()
        self.model_path = model_path
        self.initialize_path = self.trainer_settings.init_path
        self.keep_checkpoints = self.trainer_settings.keep_checkpoints
        self.graph = tf.Graph()
        self.sess = tf.Session(config=tf_utils.generate_session_config(),
                               graph=self.graph)
        self.saver: Optional[tf.Operation] = None
        if self.network_settings.memory is not None:
            self.m_size = self.network_settings.memory.memory_size
            self.sequence_length = self.network_settings.memory.sequence_length
        self._initialize_tensorflow_references()
        self.load = load
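
The observation and action bookkeeping in this constructor is derived purely from the shapes stored in the BehaviorSpec: 1-D shapes are vector observations, 3-D shapes are visual observations, and discrete branches become a list of branch sizes. A minimal stand-alone sketch of that derivation, using hypothetical shapes (an 8-dimensional vector observation, one 84x84 RGB camera, and two discrete branches):

# Hypothetical values; in the examples above they come from the BehaviorSpec.
observation_shapes = [(8,), (84, 84, 3)]
discrete_action_branches = (3, 2)

# Vector observations are the 1-D shapes; their lengths are summed into one vector.
vec_obs_size = sum(shape[0] for shape in observation_shapes if len(shape) == 1)  # 8

# Visual observations are the 3-D shapes; only their count is needed here.
vis_obs_size = sum(1 for shape in observation_shapes if len(shape) == 3)  # 1

# For a discrete action space, act_size lists the size of each branch.
act_size = list(discrete_action_branches)  # [3, 2]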
Example 2
def create_steps_from_behavior_spec(
        behavior_spec: BehaviorSpec,
        num_agents: int = 1) -> Tuple[DecisionSteps, TerminalSteps]:
    return create_mock_steps(
        num_agents=num_agents,
        observation_shapes=behavior_spec.observation_shapes,
        action_shape=behavior_spec.action_shape,
        discrete=behavior_spec.is_action_discrete(),
    )
Example 3
    def __init__(
        self,
        seed: int,
        behavior_spec: BehaviorSpec,
        trainer_settings: TrainerSettings,
        model_path: str,
        load: bool = False,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ):
        self.behavior_spec = behavior_spec
        self.trainer_settings = trainer_settings
        self.network_settings: NetworkSettings = trainer_settings.network_settings
        self.seed = seed
        self.act_size = (
            list(behavior_spec.discrete_action_branches)
            if behavior_spec.is_action_discrete()
            else [behavior_spec.action_size]
        )
        self.vec_obs_size = sum(
            shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
        )
        self.vis_obs_size = sum(
            1 for shape in behavior_spec.observation_shapes if len(shape) == 3
        )
        self.model_path = model_path
        self.initialize_path = self.trainer_settings.init_path
        self._keep_checkpoints = self.trainer_settings.keep_checkpoints
        self.use_continuous_act = behavior_spec.is_action_continuous()
        self.num_branches = self.behavior_spec.action_size
        self.previous_action_dict: Dict[str, np.ndarray] = {}
        self.memory_dict: Dict[str, np.ndarray] = {}
        self.normalize = trainer_settings.network_settings.normalize
        self.use_recurrent = self.network_settings.memory is not None
        self.load = load
        self.h_size = self.network_settings.hidden_units
        num_layers = self.network_settings.num_layers
        if num_layers < 1:
            num_layers = 1
        self.num_layers = num_layers

        self.vis_encode_type = self.network_settings.vis_encode_type
        self.tanh_squash = tanh_squash
        self.reparameterize = reparameterize
        self.condition_sigma_on_obs = condition_sigma_on_obs

        self.m_size = 0
        self.sequence_length = 1
        if self.network_settings.memory is not None:
            self.m_size = self.network_settings.memory.memory_size
            self.sequence_length = self.network_settings.memory.sequence_length

        # Non-exposed parameters; these aren't exposed because they don't have a
        # good explanation and usually shouldn't be touched.
        self.log_std_min = -20
        self.log_std_max = 2
Example 4
def behavior_spec_to_brain_parameters(
        name: str, behavior_spec: BehaviorSpec) -> BrainParameters:
    vec_size = np.sum([
        shape[0] for shape in behavior_spec.observation_shapes
        if len(shape) == 1
    ])
    vis_sizes = [
        shape for shape in behavior_spec.observation_shapes if len(shape) == 3
    ]
    cam_res = [CameraResolution(s[0], s[1], s[2]) for s in vis_sizes]
    a_size: List[int] = []
    if behavior_spec.is_action_discrete():
        a_size += list(behavior_spec.discrete_action_branches)
        vector_action_space_type = 0
    else:
        a_size += [behavior_spec.action_size]
        vector_action_space_type = 1
    return BrainParameters(name, int(vec_size), cam_res, a_size, [],
                           vector_action_space_type)
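
The branch above encodes the action space in the legacy BrainParameters convention: for a discrete space a_size holds every branch size and the space type is 0, while for a continuous space a_size is a single-element list with the action dimension and the space type is 1. A small sketch of that mapping with hypothetical values and an illustrative helper name:

def legacy_action_encoding(is_discrete, branches=(3, 2), continuous_size=4):
    # Illustrative helper only; mirrors the if/else above.
    if is_discrete:
        return list(branches), 0       # discrete: one entry per branch
    return [continuous_size], 1        # continuous: single action dimension

print(legacy_action_encoding(True))    # ([3, 2], 0)
print(legacy_action_encoding(False))   # ([4], 1)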
Example 5
def simulate_rollout(
    length: int,
    behavior_spec: BehaviorSpec,
    memory_size: int = 10,
    exclude_key_list: Optional[List[str]] = None,
) -> AgentBuffer:
    action_space = behavior_spec.action_shape
    is_discrete = behavior_spec.is_action_discrete()

    trajectory = make_fake_trajectory(
        length,
        behavior_spec.observation_shapes,
        action_space=action_space,
        memory_size=memory_size,
        is_discrete=is_discrete,
    )
    buffer = trajectory.to_agentbuffer()
    # If a key_list was given, remove those keys
    if exclude_key_list:
        for key in exclude_key_list:
            if key in buffer:
                buffer.pop(key)
    return buffer
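
The exclusion step at the end only pops keys that are actually present, so unknown names in exclude_key_list are ignored. A tiny sketch of that behaviour with a plain dict standing in for the agent buffer (keys are hypothetical):

buffer = {"vector_obs": [], "actions": [], "memory": []}
exclude_key_list = ["memory", "not_a_key"]

for key in exclude_key_list:
    if key in buffer:      # keys that are not present are skipped silently
        buffer.pop(key)

print(sorted(buffer))      # ['actions', 'vector_obs']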
Example 6
def steps_from_proto(
    agent_info_list: Collection[
        AgentInfoProto
    ],  # pylint: disable=unsubscriptable-object
    behavior_spec: BehaviorSpec,
) -> Tuple[DecisionSteps, TerminalSteps]:
    decision_agent_info_list = [
        agent_info for agent_info in agent_info_list if not agent_info.done
    ]
    terminal_agent_info_list = [
        agent_info for agent_info in agent_info_list if agent_info.done
    ]
    decision_obs_list: List[np.ndarray] = []
    terminal_obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(behavior_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            decision_obs_list.append(
                _process_visual_observation(
                    obs_index, obs_shape, decision_agent_info_list
                )
            )
            terminal_obs_list.append(
                _process_visual_observation(
                    obs_index, obs_shape, terminal_agent_info_list
                )
            )
        else:
            decision_obs_list.append(
                _process_vector_observation(
                    obs_index, obs_shape, decision_agent_info_list
                )
            )
            terminal_obs_list.append(
                _process_vector_observation(
                    obs_index, obs_shape, terminal_agent_info_list
                )
            )
    decision_rewards = np.array(
        [agent_info.reward for agent_info in decision_agent_info_list], dtype=np.float32
    )
    terminal_rewards = np.array(
        [agent_info.reward for agent_info in terminal_agent_info_list], dtype=np.float32
    )

    _raise_on_nan_and_inf(decision_rewards, "rewards")
    _raise_on_nan_and_inf(terminal_rewards, "rewards")

    max_step = np.array(
        [agent_info.max_step_reached for agent_info in terminal_agent_info_list],
        dtype=bool,
    )
    decision_agent_id = np.array(
        [agent_info.id for agent_info in decision_agent_info_list], dtype=np.int32
    )
    terminal_agent_id = np.array(
        [agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32
    )
    action_mask = None
    if behavior_spec.is_action_discrete():
        # Only build a mask when at least one agent actually supplied an action mask.
        if any(
            agent_info.action_mask is not None
            for agent_info in decision_agent_info_list
        ):
            n_agents = len(decision_agent_info_list)
            a_size = np.sum(behavior_spec.discrete_action_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=bool)
            for agent_index, agent_info in enumerate(decision_agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            False if agent_info.action_mask[k] else True
                            for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(bool)
            indices = _generate_split_indices(behavior_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    return (
        DecisionSteps(
            decision_obs_list, decision_rewards, decision_agent_id, action_mask
        ),
        TerminalSteps(terminal_obs_list, terminal_rewards, max_step, terminal_agent_id),
    )