def read_variable_values(self, variables):
    # For test compatibility.
    if isinstance(variables, dict):
        ret = {}
        for name, var in variables.items():
            ret[name] = Component.read_variable(var)
        return ret
    elif isinstance(variables, list):
        return [Component.read_variable(var) for var in variables]
    else:
        # Attempt to read as single var.
        return Component.read_variable(variables)
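# Hedged usage sketch (not part of the original file): the three input forms `read_variable_values`
# accepts and what each returns. `my_component` and its variables are hypothetical and assume this
# method lives on a test class that already built the component.
#
#     variables = my_component.get_variables()                            # dict: name -> variable
#     by_name = self.read_variable_values(variables)                      # dict -> dict of values
#     ordered = self.read_variable_values(list(variables.values()))       # list -> list of values
#     single = self.read_variable_values(next(iter(variables.values())))  # single var -> single value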
def test_copying_a_component(self):
    # Flatten a simple 2x2 FloatBox to (4,).
    space = FloatBox(shape=(2, 2), add_batch_rank=False)
    flatten_orig = ReShape(flatten=True, scope="A")
    flatten_copy = flatten_orig.copy(scope="B")
    container = Component(flatten_orig, flatten_copy)

    @rlgraph_api(component=container)
    def flatten1(self, input_):
        return self.sub_components["A"].apply(input_)

    @rlgraph_api(component=container)
    def flatten2(self, input_):
        return self.sub_components["B"].apply(input_)

    test = ComponentTest(component=container, input_spaces=dict(input_=space))

    input_ = dict(
        input1=np.array([[0.5, 2.0], [1.0, 2.0]]),
        input2=np.array([[1.0, 2.0], [3.0, 4.0]])
    )
    expected = dict(
        output1=np.array([0.5, 2.0, 1.0, 2.0]),
        output2=np.array([1.0, 2.0, 3.0, 4.0])
    )
    for i in range_(1, 3):
        test.test(("flatten" + str(i), input_["input" + str(i)]), expected_outputs=expected["output" + str(i)])
def test_sync_functionality(self):
    # Two Components, one with Synchronizable dropped in:
    # A: Can only push out values.
    # B: To be synced by A's values.
    sync_from = MyCompWithVars(scope="sync-from")
    sync_to = MyCompWithVars(initializer1=8.0, initializer2=7.0, scope="sync-to", synchronizable=True)

    # Create a dummy test component that contains our two Synchronizables.
    container = Component(name="container")
    container.add_components(sync_from, sync_to)

    @rlgraph_api(component=container)
    def execute_sync(self):
        values_ = sync_from._variables()
        return sync_to.sync(values_)

    test = ComponentTest(component=container)

    # Test syncing the variables from->to and check them before and after the sync.
    # Before the sync.
    test.variable_test(
        sync_to.get_variables(VARIABLE_NAMES),
        {
            "sync-to/" + VARIABLE_NAMES[0]: np.full(shape=sync_from.space.shape, fill_value=8.0),
            "sync-to/" + VARIABLE_NAMES[1]: np.full(shape=sync_from.space.shape, fill_value=7.0)
        }
    )

    # Now sync and re-check.
    test.test("execute_sync", expected_outputs=None)

    # After the sync.
    test.variable_test(
        sync_to.get_variables(VARIABLE_NAMES),
        {
            "sync-to/" + VARIABLE_NAMES[0]: np.zeros(shape=sync_from.space.shape),
            "sync-to/" + VARIABLE_NAMES[1]: np.ones(shape=sync_from.space.shape)
        }
    )
def test_sync_socket_between_2_identical_comps_that_have_vars_only_in_their_sub_comps(self):
    """
    Similar to the Policy scenario, where the Policy Component owns a NeuralNetwork (which has vars)
    and has to be synced with other Policies.
    """
    # Create 2x: A custom Component (with vars) that holds another Component (with vars).
    # Then sync between them.
    comp1 = MyCompWithVars(scope="A")
    comp1.add_components(MyCompWithVars(scope="sub-of-A-with-vars"))
    comp2_writable = MyCompWithVars(scope="B", initializer1=3.0, initializer2=4.2, synchronizable=True)
    comp2_writable.add_components(
        MyCompWithVars(scope="sub-of-B-with-vars", initializer1=5.0, initializer2=6.2)
    )
    container = Component(comp1, comp2_writable, scope="container")

    @rlgraph_api(component=container)
    def execute_sync(self):
        values_ = comp1._variables()
        return comp2_writable.sync(values_)

    test = ComponentTest(component=container)

    # Before the sync.
    test.variable_test(
        comp2_writable.get_variables([
            "container/B/variable_to_sync1",
            "container/B/variable_to_sync2",
            "container/B/sub-of-B-with-vars/variable_to_sync1",
            "container/B/sub-of-B-with-vars/variable_to_sync2"
        ]),
        {
            "container/B/variable_to_sync1": np.full(shape=comp1.space.shape, fill_value=3.0, dtype=np.float32),
            "container/B/variable_to_sync2": np.full(shape=comp1.space.shape, fill_value=4.2, dtype=np.float32),
            "container/B/sub-of-B-with-vars/variable_to_sync1": np.full(shape=comp1.space.shape, fill_value=5.0, dtype=np.float32),
            "container/B/sub-of-B-with-vars/variable_to_sync2": np.full(shape=comp1.space.shape, fill_value=6.2, dtype=np.float32)
        }
    )

    # Now sync and re-check.
    test.test(("execute_sync", None), expected_outputs=None)

    # After the sync.
    test.variable_test(
        comp2_writable.get_variables([
            "container/B/variable_to_sync1",
            "container/B/variable_to_sync2",
            "container/B/sub-of-B-with-vars/variable_to_sync1",
            "container/B/sub-of-B-with-vars/variable_to_sync2"
        ]),
        {
            "container/B/variable_to_sync1": np.zeros(shape=comp1.space.shape, dtype=np.float32),
            "container/B/variable_to_sync2": np.ones(shape=comp1.space.shape, dtype=np.float32),
            "container/B/sub-of-B-with-vars/variable_to_sync1": np.zeros(shape=comp1.space.shape, dtype=np.float32),
            "container/B/sub-of-B-with-vars/variable_to_sync2": np.ones(shape=comp1.space.shape, dtype=np.float32)
        }
    )
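# Note on the sync pattern exercised by the two tests above: the source component only exposes its
# current values via `_variables()`, while the target component (built with `synchronizable=True`)
# gains a `sync` API method that assigns the passed-in values to its own and its sub-components'
# variables, as verified by the before/after `variable_test` checks.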
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, exploration_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None,
             update_spec=None, summary_spec=None, saver_spec=None, auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list): Neural network specification for baseline.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.flat_state_space = self.state_space.flatten() if isinstance(self.state_space, ContainerSpace) else None
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.flat_action_space = self.action_space.flatten() if isinstance(self.action_space, ContainerSpace) else None
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount
    self.build_options = {}

    # The agent's root-Component.
    self.root_component = Component(name=self.name, nesting_level=0)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    policy_spec = policy_spec or dict()
    if "network_spec" not in policy_spec:
        policy_spec["network_spec"] = network_spec
    if "action_space" not in policy_spec:
        policy_spec["action_space"] = self.action_space
    self.policy_spec = policy_spec
    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy.from_spec(self.policy_spec)
    # Done by default.
    self.policy.add_components(Synchronizable(), expose_apis="sync")

    # Create non-shared baseline network.
    self.value_function = None
    if value_function_spec is not None:
        self.value_function = ValueFunction(network_spec=value_function_spec)
        self.value_function.add_components(Synchronizable(), expose_apis="sync")
        self.vars_merger = ContainerMerger("policy", "vf", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", "vf", scope="variable-container-splitter")
    else:
        self.vars_merger = ContainerMerger("policy", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", scope="variable-container-splitter")

    self.internal_states_space = Space.from_spec(internal_states_space)

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"

    def factory_(i):
        if i < 2:
            return []
        return tuple([[] for _ in range(i)])

    self.states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space)))
    self.actions_buffer = defaultdict(partial(factory_, len(self.flat_action_space or [])))
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space)))
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        # Save spec in case agent needs to create more optimizers e.g. for baseline.
        self.optimizer_spec = optimizer_spec
        self.optimizer = Optimizer.from_spec(optimizer_spec)

    self.value_function_optimizer = None
    if self.value_function is not None:
        if value_function_optimizer_spec is None:
            vf_optimizer_spec = self.optimizer_spec
        else:
            vf_optimizer_spec = value_function_optimizer_spec
        vf_optimizer_spec["scope"] = "value-function-optimizer"
        self.value_function_optimizer = Optimizer.from_spec(vf_optimizer_spec)

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor
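# Hedged usage sketch (hypothetical subclass and spec values): a concrete Agent subclass would
# typically be constructed with the keyword arguments parsed in the constructor above, e.g.:
#
#     agent = MyAgentSubclass(
#         state_space=FloatBox(shape=(4,)),
#         action_space=IntBox(2),
#         network_spec=[{"type": "dense", "units": 32}],
#         optimizer_spec={"type": "adam", "learning_rate": 0.001},
#         update_spec={"batch_size": 64},
#         auto_build=True
#     )
#
# `MyAgentSubclass` and the concrete spec contents are illustrative assumptions, not part of this file.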
def __init__(
    self,
    state_space,
    action_space,
    discount=0.98,
    preprocessing_spec=None,
    network_spec=None,
    internal_states_space=None,
    action_adapter_spec=None,
    exploration_spec=None,
    execution_spec=None,
    optimizer_spec=None,
    observe_spec=None,
    update_spec=None,
    summary_spec=None,
    saver_spec=None,
    auto_build=True,
    name="agent"
):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        action_adapter_spec (Optional[dict,ActionAdapter]): The spec-dict for the ActionAdapter Component or the
            ActionAdapter object itself.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()
    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount

    # The agent's root-Component.
    self.root_component = Component(name=self.name)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_policy_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank(),
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    self.neural_network = None
    if network_spec is not None:
        self.neural_network = NeuralNetwork.from_spec(network_spec)
    self.action_adapter_spec = action_adapter_spec
    self.internal_states_space = internal_states_space

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    # The action adapter mapping raw NN output to (shaped) actions.
    action_adapter_dict = dict(action_space=self.action_space)
    if self.action_adapter_spec is None:
        self.action_adapter_spec = action_adapter_dict
    else:
        self.action_adapter_spec.update(action_adapter_dict)

    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy(
        network_spec=self.neural_network,
        action_adapter_spec=self.action_adapter_spec
    )

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"
    self.states_buffer = defaultdict(list)
    self.actions_buffer = defaultdict(list)
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)
    self.terminals_buffer = defaultdict(list)
    self.observe_spec = parse_observe_spec(observe_spec)
    if self.observe_spec["buffer_enabled"]:
        self.reset_env_buffers()

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        self.optimizer = Optimizer.from_spec(optimizer_spec)
        # get_optimizer_from_device_strategy(
        #     optimizer_spec, self.execution_spec.get("device_strategy", 'default')

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor