def __init__(
    self,
    seed: int,
    behavior_spec: BehaviorSpec,
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool = False,
):
    """
    Initializes the policy.
    :param seed: Random seed to use for TensorFlow.
    :param behavior_spec: The corresponding BehaviorSpec for this policy.
    :param trainer_settings: The trainer parameters.
    :param model_path: Where to load/save the model.
    :param load: If True, load model from model_path. Otherwise, create new model.
    """
    self.m_size = 0
    self.trainer_settings = trainer_settings
    self.network_settings: NetworkSettings = trainer_settings.network_settings
    # for ghost trainer save/load snapshots
    self.assign_phs: List[tf.Tensor] = []
    self.assign_ops: List[tf.Operation] = []
    self.inference_dict: Dict[str, tf.Tensor] = {}
    self.update_dict: Dict[str, tf.Tensor] = {}
    self.sequence_length = 1
    self.seed = seed
    self.behavior_spec = behavior_spec
    # Discrete specs expose one size per branch; continuous specs expose a
    # single action dimension.
    self.act_size = (
        list(behavior_spec.discrete_action_branches)
        if behavior_spec.is_action_discrete()
        else [behavior_spec.action_size]
    )
    # Rank-1 observation shapes are vector obs; rank-3 shapes are visual obs.
    self.vec_obs_size = sum(
        shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
    )
    self.vis_obs_size = sum(
        1 for shape in behavior_spec.observation_shapes if len(shape) == 3
    )
    self.use_recurrent = self.network_settings.memory is not None
    self.memory_dict: Dict[str, np.ndarray] = {}
    self.num_branches = self.behavior_spec.action_size
    self.previous_action_dict: Dict[str, np.array] = {}
    self.normalize = self.network_settings.normalize
    self.use_continuous_act = behavior_spec.is_action_continuous()
    self.model_path = model_path
    self.initialize_path = self.trainer_settings.init_path
    self.keep_checkpoints = self.trainer_settings.keep_checkpoints
    self.graph = tf.Graph()
    self.sess = tf.Session(
        config=tf_utils.generate_session_config(), graph=self.graph
    )
    self.saver: Optional[tf.Operation] = None
    # NOTE(review): a redundant second `self.seed = seed` assignment was
    # removed here; `self.seed` is already set above.
    if self.network_settings.memory is not None:
        self.m_size = self.network_settings.memory.memory_size
        self.sequence_length = self.network_settings.memory.sequence_length
    self._initialize_tensorflow_references()
    self.load = load
def __init__( self, seed: int, behavior_spec: BehaviorSpec, trainer_settings: TrainerSettings, model_path: str, load: bool = False, tanh_squash: bool = False, reparameterize: bool = False, condition_sigma_on_obs: bool = True, ): self.behavior_spec = behavior_spec self.trainer_settings = trainer_settings self.network_settings: NetworkSettings = trainer_settings.network_settings self.seed = seed self.act_size = ( list(behavior_spec.discrete_action_branches) if behavior_spec.is_action_discrete() else [behavior_spec.action_size] ) self.vec_obs_size = sum( shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1 ) self.vis_obs_size = sum( 1 for shape in behavior_spec.observation_shapes if len(shape) == 3 ) self.model_path = model_path self.initialize_path = self.trainer_settings.init_path self._keep_checkpoints = self.trainer_settings.keep_checkpoints self.use_continuous_act = behavior_spec.is_action_continuous() self.num_branches = self.behavior_spec.action_size self.previous_action_dict: Dict[str, np.array] = {} self.memory_dict: Dict[str, np.ndarray] = {} self.normalize = trainer_settings.network_settings.normalize self.use_recurrent = self.network_settings.memory is not None self.load = load self.h_size = self.network_settings.hidden_units num_layers = self.network_settings.num_layers if num_layers < 1: num_layers = 1 self.num_layers = num_layers self.vis_encode_type = self.network_settings.vis_encode_type self.tanh_squash = tanh_squash self.reparameterize = reparameterize self.condition_sigma_on_obs = condition_sigma_on_obs self.m_size = 0 self.sequence_length = 1 if self.network_settings.memory is not None: self.m_size = self.network_settings.memory.memory_size self.sequence_length = self.network_settings.memory.sequence_length # Non-exposed parameters; these aren't exposed because they don't have a # good explanation and usually shouldn't be touched. self.log_std_min = -20 self.log_std_max = 2
def test_action_generator(): # Continuous action_len = 30 specs = BehaviorSpec( observation_shapes=[(5, )], action_type=ActionType.CONTINUOUS, action_shape=action_len, ) zero_action = specs.create_empty_action(4) assert np.array_equal(zero_action, np.zeros((4, action_len), dtype=np.float32)) random_action = specs.create_random_action(4) assert random_action.dtype == np.float32 assert random_action.shape == (4, action_len) assert np.min(random_action) >= -1 assert np.max(random_action) <= 1 # Discrete action_shape = (10, 20, 30) specs = BehaviorSpec( observation_shapes=[(5, )], action_type=ActionType.DISCRETE, action_shape=action_shape, ) zero_action = specs.create_empty_action(4) assert np.array_equal(zero_action, np.zeros((4, len(action_shape)), dtype=np.int32)) random_action = specs.create_random_action(4) assert random_action.dtype == np.int32 assert random_action.shape == (4, len(action_shape)) assert np.min(random_action) >= 0 for index, branch_size in enumerate(action_shape): assert np.max(random_action[:, index]) < branch_size
def create_behavior_spec(num_visual, num_vector, vector_size): behavior_spec = BehaviorSpec( [(84, 84, 3)] * int(num_visual) + [(vector_size, )] * int(num_vector), ActionType.DISCRETE, (1, ), ) return behavior_spec
def test_empty_terminal_steps(): specs = BehaviorSpec(observation_shapes=[(3, 2), (5, )], action_spec=ActionSpec.create_continuous(3)) ts = TerminalSteps.empty(specs) assert len(ts.obs) == 2 assert ts.obs[0].shape == (0, 3, 2) assert ts.obs[1].shape == (0, 5)
def test_empty_decision_steps(): specs = BehaviorSpec(observation_shapes=[(3, 2), (5, )], action_spec=ActionSpec.create_continuous(3)) ds = DecisionSteps.empty(specs) assert len(ds.obs) == 2 assert ds.obs[0].shape == (0, 3, 2) assert ds.obs[1].shape == (0, 5)
def behavior_spec_from_proto(brain_param_proto: BrainParametersProto, agent_info: AgentInfoProto) -> BehaviorSpec: """ Converts brain parameter and agent info proto to BehaviorSpec object. :param brain_param_proto: protobuf object. :param agent_info: protobuf object. :return: BehaviorSpec object. """ observation_shape = [tuple(obs.shape) for obs in agent_info.observations] dim_props = [ tuple(DimensionProperty(dim) for dim in obs.dimension_properties) for obs in agent_info.observations ] sensor_specs = [ SensorSpec(obs_shape, dim_p) for obs_shape, dim_p in zip(observation_shape, dim_props) ] # proto from communicator < v1.3 does not set action spec, use deprecated fields instead if (brain_param_proto.action_spec.num_continuous_actions == 0 and brain_param_proto.action_spec.num_discrete_actions == 0): if brain_param_proto.vector_action_space_type_deprecated == 1: action_spec = ActionSpec( brain_param_proto.vector_action_size_deprecated[0], ()) else: action_spec = ActionSpec( 0, tuple(brain_param_proto.vector_action_size_deprecated)) else: action_spec_proto = brain_param_proto.action_spec action_spec = ActionSpec( action_spec_proto.num_continuous_actions, tuple(branch for branch in action_spec_proto.discrete_branch_sizes), ) return BehaviorSpec(sensor_specs, action_spec)
def create_mock_group_spec( number_visual_observations=0, vector_action_space_type="continuous", vector_observation_space_size=3, vector_action_space_size=None, ): """ Creates a mock BrainParameters object with parameters. """ # Avoid using mutable object as default param act_type = ActionType.DISCRETE if vector_action_space_type == "continuous": act_type = ActionType.CONTINUOUS if vector_action_space_size is None: vector_action_space_size = 2 else: vector_action_space_size = vector_action_space_size[0] else: if vector_action_space_size is None: vector_action_space_size = (2, ) else: vector_action_space_size = tuple(vector_action_space_size) obs_shapes = [(vector_observation_space_size, )] for _ in range(number_visual_observations): obs_shapes += [(8, 8, 3)] return BehaviorSpec(obs_shapes, act_type, vector_action_space_size)
def create_mock_group_spec( number_visual_observations=0, vector_action_space_type="continuous", vector_observation_space_size=3, vector_action_space_size=None, ): """ Creates a mock BrainParameters object with parameters. """ # Avoid using mutable object as default param if vector_action_space_type == "continuous": if vector_action_space_size is None: vector_action_space_size = 2 else: vector_action_space_size = vector_action_space_size[0] action_spec = ActionSpec.create_continuous(vector_action_space_size) else: if vector_action_space_size is None: vector_action_space_size = (2, ) else: vector_action_space_size = tuple(vector_action_space_size) action_spec = ActionSpec.create_discrete(vector_action_space_size) obs_shapes = [(vector_observation_space_size, )] for _ in range(number_visual_observations): obs_shapes += [(8, 8, 3)] obs_spec = create_observation_specs_with_shapes(obs_shapes) return BehaviorSpec(obs_spec, action_spec)
def test_batched_step_result_from_proto(): n_agents = 10 shapes = [(3, ), (4, )] spec = BehaviorSpec(create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)) ap_list = generate_list_agent_proto(n_agents, shapes) decision_steps, terminal_steps = steps_from_proto(ap_list, spec) for agent_id in range(n_agents): if agent_id in decision_steps: # we set the reward equal to the agent id in generate_list_agent_proto assert decision_steps[agent_id].reward == agent_id elif agent_id in terminal_steps: assert terminal_steps[agent_id].reward == agent_id else: raise Exception("Missing agent from the steps") # We sort the AgentId since they are split between DecisionSteps and TerminalSteps combined_agent_id = list(decision_steps.agent_id) + list( terminal_steps.agent_id) combined_agent_id.sort() assert combined_agent_id == list(range(n_agents)) for agent_id in range(n_agents): assert (agent_id in terminal_steps) == (agent_id % 2 == 0) if agent_id in terminal_steps: assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0) assert decision_steps.obs[0].shape[1] == shapes[0][0] assert decision_steps.obs[1].shape[1] == shapes[1][0] assert terminal_steps.obs[0].shape[1] == shapes[0][0] assert terminal_steps.obs[1].shape[1] == shapes[1][0]
def create_agent_buffer(behavior_spec: BehaviorSpec, number: int, reward: float = 0.0) -> AgentBuffer: buffer = AgentBuffer() curr_observations = [ np.random.normal(size=shape) for shape in behavior_spec.observation_shapes ] next_observations = [ np.random.normal(size=shape) for shape in behavior_spec.observation_shapes ] action = behavior_spec.create_random_action(1)[0, :] for _ in range(number): curr_split_obs = SplitObservations.from_observations(curr_observations) next_split_obs = SplitObservations.from_observations(next_observations) for i, _ in enumerate(curr_split_obs.visual_observations): buffer["visual_obs%d" % i].append( curr_split_obs.visual_observations[i]) buffer["next_visual_obs%d" % i].append( next_split_obs.visual_observations[i]) buffer["vector_obs"].append(curr_split_obs.vector_observations) buffer["next_vector_in"].append(next_split_obs.vector_observations) buffer["actions"].append(action) buffer["done"].append(np.zeros(1, dtype=np.float32)) buffer["reward"].append(np.ones(1, dtype=np.float32) * reward) buffer["masks"].append(np.ones(1, dtype=np.float32)) return buffer
def test_take_action_returns_empty_with_no_agents(): test_seed = 3 policy = FakePolicy(test_seed, basic_mock_brain(), basic_params()) # Doesn't really matter what this is dummy_groupspec = BehaviorSpec([(1, )], "continuous", 1) no_agent_step = DecisionSteps.empty(dummy_groupspec) result = policy.get_action(no_agent_step) assert result == ActionInfo.empty()
def test_action_masking_continuous(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 10) ap_list = generate_list_agent_proto(n_agents, shapes) decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec) masks = decision_steps.action_mask assert masks is None
def test_empty_terminal_steps(): specs = BehaviorSpec( sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]), action_spec=ActionSpec.create_continuous(3), ) ts = TerminalSteps.empty(specs) assert len(ts.obs) == 2 assert ts.obs[0].shape == (0, 3, 2) assert ts.obs[1].shape == (0, 5)
def test_action_masking_continuous(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(10)) ap_list = generate_list_agent_proto(n_agents, shapes) decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec) masks = decision_steps.action_mask assert masks is None
def test_batched_step_result_from_proto_raises_on_infinite(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(shapes, ActionSpec.create_continuous(3)) ap_list = generate_list_agent_proto(n_agents, shapes, infinite_rewards=True) with pytest.raises(RuntimeError): steps_from_proto(ap_list, behavior_spec)
def setup_test_behavior_specs( use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8 ): behavior_spec = BehaviorSpec( [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)], ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS, tuple(vector_action_space) if use_discrete else vector_action_space, ) return behavior_spec
def create_steps_from_behavior_spec( behavior_spec: BehaviorSpec, num_agents: int = 1) -> Tuple[DecisionSteps, TerminalSteps]: return create_mock_steps( num_agents=num_agents, observation_shapes=behavior_spec.observation_shapes, action_shape=behavior_spec.action_shape, discrete=behavior_spec.is_action_discrete(), )
def test_empty_decision_steps(): specs = BehaviorSpec( sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]), action_spec=ActionSpec.create_continuous(3), ) ds = DecisionSteps.empty(specs) assert len(ds.obs) == 2 assert ds.obs[0].shape == (0, 3, 2) assert ds.obs[1].shape == (0, 5)
def test_batched_step_result_from_proto_raises_on_nan(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3) ap_list = generate_list_agent_proto(n_agents, shapes, nan_observations=True) with pytest.raises(RuntimeError): steps_from_proto(ap_list, behavior_spec)
def test_empty_decision_steps(): specs = BehaviorSpec( observation_shapes=[(3, 2), (5, )], action_type=ActionType.CONTINUOUS, action_shape=3, ) ds = DecisionSteps.empty(specs) assert len(ds.obs) == 2 assert ds.obs[0].shape == (0, 3, 2) assert ds.obs[1].shape == (0, 5)
def test_batched_step_result_from_proto_raises_on_nan(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)) ap_list = generate_list_agent_proto(n_agents, shapes, nan_observations=True) with pytest.raises(RuntimeError): steps_from_proto(ap_list, behavior_spec)
def test_empty_terminal_steps(): specs = BehaviorSpec( observation_shapes=[(3, 2), (5, )], action_type=ActionType.CONTINUOUS, action_shape=3, ) ts = TerminalSteps.empty(specs) assert len(ts.obs) == 2 assert ts.obs[0].shape == (0, 3, 2) assert ts.obs[1].shape == (0, 5)
def create_mock_steps( num_agents: int = 1, num_vector_observations: int = 0, num_vis_observations: int = 0, action_shape: List[int] = None, discrete: bool = False, done: bool = False, ) -> Tuple[DecisionSteps, TerminalSteps]: """ Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations. Imitates constant vector/visual observations, rewards, dones, and agents. :int num_agents: Number of "agents" to imitate. :int num_vector_observations: Number of "observations" in your observation space :int num_vis_observations: Number of "observations" in your observation space :int num_vector_acts: Number of actions in your action space :bool discrete: Whether or not action space is discrete :bool done: Whether all the agents in the batch are done """ if action_shape is None: action_shape = [2] obs_list = [] for _ in range(num_vis_observations): obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32)) if num_vector_observations > 1: obs_list.append( np.array(num_agents * [num_vector_observations * [1]], dtype=np.float32)) action_mask = None if discrete: action_mask = [ np.array(num_agents * [action_size * [False]]) for action_size in action_shape ] reward = np.array(num_agents * [1.0], dtype=np.float32) interrupted = np.array(num_agents * [False], dtype=np.bool) agent_id = np.arange(num_agents, dtype=np.int32) behavior_spec = BehaviorSpec( [(84, 84, 3)] * num_vis_observations + [(num_vector_observations, 0, 0)], ActionType.DISCRETE if discrete else ActionType.CONTINUOUS, action_shape if discrete else action_shape[0], ) if done: return ( DecisionSteps.empty(behavior_spec), TerminalSteps(obs_list, reward, interrupted, agent_id), ) else: return ( DecisionSteps(obs_list, reward, agent_id, action_mask), TerminalSteps.empty(behavior_spec), )
def setup_test_behavior_specs( use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8 ): if use_discrete: action_spec = ActionSpec.create_discrete(tuple(vector_action_space)) else: action_spec = ActionSpec.create_continuous(vector_action_space) observation_shapes = [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)] obs_spec = create_observation_specs_with_shapes(observation_shapes) behavior_spec = BehaviorSpec(obs_spec, action_spec) return behavior_spec
def test_mismatch_observations_raise_in_step_result_from_proto(): n_agents = 10 shapes = [(3, ), (4, )] spec = BehaviorSpec(create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)) ap_list = generate_list_agent_proto(n_agents, shapes) # Hack an observation to be larger, we should get an exception ap_list[0].observations[0].shape[0] += 1 ap_list[0].observations[0].float_data.data.append(0.42) with pytest.raises(UnityObservationException): steps_from_proto(ap_list, spec)
def test_action_masking_discrete_1(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (10, )) ap_list = generate_list_agent_proto(n_agents, shapes) decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec) masks = decision_steps.action_mask assert isinstance(masks, list) assert len(masks) == 1 assert masks[0].shape == (n_agents / 2, 10) assert masks[0][0, 0]
def test_action_masking_discrete_2(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(shapes, ActionSpec.create_discrete((2, 2, 6))) ap_list = generate_list_agent_proto(n_agents, shapes) decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec) masks = decision_steps.action_mask assert isinstance(masks, list) assert len(masks) == 3 assert masks[0].shape == (n_agents / 2, 2) assert masks[1].shape == (n_agents / 2, 2) assert masks[2].shape == (n_agents / 2, 6) assert masks[0][0, 0]
def __init__( self, brain_names, step_size=STEP_SIZE, num_visual=0, num_vector=1, num_var_len=0, vis_obs_size=VIS_OBS_SIZE, vec_obs_size=OBS_SIZE, var_len_obs_size=VAR_LEN_SIZE, action_sizes=(1, 0), ): super().__init__() self.num_visual = num_visual self.num_vector = num_vector self.num_var_len = num_var_len self.vis_obs_size = vis_obs_size self.vec_obs_size = vec_obs_size self.var_len_obs_size = var_len_obs_size continuous_action_size, discrete_action_size = action_sizes discrete_tuple = tuple(2 for _ in range(discrete_action_size)) action_spec = ActionSpec(continuous_action_size, discrete_tuple) self.total_action_size = (continuous_action_size + discrete_action_size ) # to set the goals/positions self.action_spec = action_spec self.behavior_spec = BehaviorSpec(self._make_observation_specs(), action_spec) self.action_spec = action_spec self.names = brain_names self.positions: Dict[str, List[float]] = {} self.step_count: Dict[str, float] = {} self.random = random.Random(str(self.behavior_spec)) self.goal: Dict[str, int] = {} self.action = {} self.rewards: Dict[str, float] = {} self.final_rewards: Dict[str, List[float]] = {} self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {} self.agent_id: Dict[str, int] = {} self.step_size = step_size # defines the difficulty of the test # Allow to be used as a UnityEnvironment during tests self.academy_capabilities = None for name in self.names: self.agent_id[name] = 0 self.goal[name] = self.random.choice([-1, 1]) self.rewards[name] = 0 self.final_rewards[name] = [] self._reset_agent(name) self.action[name] = None self.step_result[name] = None
def test_action_masking_discrete(): n_agents = 10 shapes = [(3, ), (4, )] behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes), ActionSpec.create_discrete((7, 3))) ap_list = generate_list_agent_proto(n_agents, shapes) decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec) masks = decision_steps.action_mask assert isinstance(masks, list) assert len(masks) == 2 assert masks[0].shape == (n_agents / 2, 7) # half agents are done assert masks[1].shape == (n_agents / 2, 3) # half agents are done assert masks[0][0, 0] assert not masks[1][0, 0] assert masks[1][0, 1]