def test_update_online(self):
        """
        Tests if joint updates from demo and online memory work.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)
        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(
            agent_config,
            state_space=env.state_space,
            action_space=env.action_space
        )
        terminals = BoolBox(add_batch_rank=True)

        # Observe a batch of demos.
        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.sample(32),
            actions=env.action_space.sample(32),
            rewards=FloatBox().sample(32),
            terminals=terminals.sample(32),
            next_states=agent.preprocessed_state_space.sample(32)
        )

        # Observe a batch of online data.
        agent._observe_graph(
            preprocessed_states=agent.preprocessed_state_space.sample(32),
            actions=env.action_space.sample(32),
            rewards=FloatBox().sample(32),
            internals=[],
            terminals=terminals.sample(32),
            next_states=agent.preprocessed_state_space.sample(32)
        )
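
        # A single update samples jointly from both memories; the demo fraction of the
        # combined batch is governed by the agent's `demo_sample_ratio` (see the DQFDAgent
        # constructor further below).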
        # Call update.
        agent.update()

    def test_insert_demos(self):
        """
        Tests inserting into the demo memory.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)

        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(
            agent_config,
            state_space=env.state_space,
            action_space=env.action_space
        )
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        # Observe a single data point.
        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            actions=env.action_space.with_batch_rank().sample(1),
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )

        # Observe a batch of demos.
        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.sample(10),
            actions=env.action_space.sample(10),
            rewards=FloatBox().sample(10),
            terminals=terminals.sample(10),
            next_states=agent.preprocessed_state_space.sample(10)
        )
# Example 3
    def test_demos_with_container_actions(self):
        # Tests whether DQfD can fit a set of states to a set of actions.
        vocab_size = 100
        embed_dim = 128
        # ID/state space.
        state_space = IntBox(vocab_size, shape=(10, ))
        # Container action space.
        actions_space = {}
        num_outputs = 3
        for i in range(3):
            actions_space['action_{}'.format(i)] = IntBox(low=0,
                                                          high=num_outputs)
        actions_space = Dict(actions_space)

        agent_config = config_from_path("configs/dqfd_container.json")
        agent_config["network_spec"] = [
            dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
            dict(type="reshape", flatten=True),
            dict(type="dense",
                 units=embed_dim,
                 activation="relu",
                 scope="dense_1")
        ]
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=state_space,
                                    action_space=actions_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        # Create a set of demos.
        demo_states = agent.preprocessed_state_space.with_batch_rank().sample(20)
        demo_actions = actions_space.with_batch_rank().sample(20)
        demo_rewards = rewards.sample(20, fill_value=1.0)
        demo_next_states = agent.preprocessed_state_space.with_batch_rank().sample(20)
        demo_terminals = terminals.sample(20, fill_value=False)

        # Insert.
        agent.observe_demos(
            preprocessed_states=demo_states,
            actions=demo_actions,
            rewards=demo_rewards,
            next_states=demo_next_states,
            terminals=demo_terminals,
        )

        # Fit demos.
        agent.update_from_demos(num_updates=5000, batch_size=20)

        # Evaluate demos:
        agent_actions = agent.get_action(demo_states,
                                         apply_preprocessing=False,
                                         use_exploration=False)
        recursive_assert_almost_equal(agent_actions, demo_actions)

    def test_update_from_demos(self):
        """
        Tests the separate API method to update from demos.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)
        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=env.state_space,
                                    action_space=env.action_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)
        state_1 = agent.preprocessed_state_space.with_batch_rank().sample(1)
        action_1 = [1]
        state_2 = agent.preprocessed_state_space.with_batch_rank().sample(1)
        action_2 = [0]

        # Insert two states with fixed actions and a few random examples.
        for _ in range(10):
            # State with correct action
            agent.observe_demos(
                preprocessed_states=state_1,
                actions=action_1,
                rewards=rewards.sample(1),
                next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
                terminals=terminals.sample(1),
            )
            agent.observe_demos(
                preprocessed_states=state_2,
                actions=action_2,
                rewards=rewards.sample(1),
                next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
                terminals=terminals.sample(1),
            )

        # Update.
        agent.update_from_demos(num_updates=100, batch_size=8)

        # Test if fixed states and actions map.
        action = agent.get_action(states=state_1,
                                  apply_preprocessing=False,
                                  use_exploration=False)
        self.assertEqual(action, action_1)

        action = agent.get_action(states=state_2,
                                  apply_preprocessing=False,
                                  use_exploration=False)
        self.assertEqual(action, action_2)
# Example 5
    def test_memory_compilation(self):
        # Builds a memory and returns build stats.
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)

        record_space = Dict(states=env.state_space,
                            actions=env.action_space,
                            rewards=float,
                            terminals=BoolBox(),
                            add_batch_rank=True)
        input_spaces = dict(
            # insert: records
            records=record_space,
            # get_records: num_records
            num_records=int,
            # update_records: indices, update
            indices=IntBox(add_batch_rank=True),
            update=FloatBox(add_batch_rank=True))

        input_spaces.pop("num_records")
        memory = MemPrioritizedReplay(capacity=20000)
        test = ComponentTest(component=memory,
                             input_spaces=input_spaces,
                             auto_build=False)
        return test.build()
# Example 6
    def __init__(self, clip_ratio, memory_spec=None, **kwargs):
        """
        Args:
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the PPO algorithm.
        """
        super(PPOAgent, self).__init__(name=kwargs.pop("name", "ppo-agent"),
                                       **kwargs)

        self.train_time_steps = 0

        # PPO uses a ring buffer.
        self.memory = Memory.from_spec(memory_spec)
        self.record_space = Dict(states=self.state_space,
                                 actions=self.action_space,
                                 rewards=float,
                                 terminals=BoolBox(),
                                 add_batch_rank=False)

        self.policy = Policy(network_spec=self.neural_network,
                             action_adapter_spec=None)

        self.merger = DictMerger(output_space=self.record_space)
        splitter_input_space = copy.deepcopy(self.record_space)
        self.splitter = ContainerSplitter(input_space=splitter_input_space)
        self.loss_function = PPOLossFunction(clip_ratio=clip_ratio,
                                             discount=self.discount)

        self.define_graph_api()
        if self.auto_build:
            self._build_graph()
            self.graph_built = True
# Example 7
    def test_replay(self):
        """
        Tests individual and chunked insert and sampling performance of replay memory.
        """
        record_space = Dict(states=self.env.state_space,
                            actions=self.env.action_space,
                            reward=float,
                            terminals=BoolBox(),
                            add_batch_rank=True)
        input_spaces = dict(insert_records=record_space, get_records=int)

        memory = ReplayMemory(capacity=self.capacity, next_states=True)
        test = ComponentTest(component=memory,
                             input_spaces=input_spaces,
                             enable_profiler=self.enable_profiler)

        records = [record_space.sample(size=1) for _ in range(self.inserts)]
        start = time.monotonic()
        for record in records:
            test.test(("insert_records", record), expected_outputs=None)
        end = time.monotonic() - start

        tp = len(records) / end
        print('#### Testing Replay memory ####')
        print('Testing insert performance:')
        print(
            'Inserted {} separate records, throughput: {} records/s, total time: {} s'
            .format(len(records), tp, end))

        record_chunks = [
            record_space.sample(size=self.chunk_size)
            for _ in range(self.inserts)
        ]
        start = time.monotonic()
        for chunk in record_chunks:
            test.test(("insert_records", chunk), expected_outputs=None)
        end = time.monotonic() - start

        tp = len(record_chunks) * self.chunk_size / end
        print(
            'Inserted {} record chunks of size {}, throughput: {} records/s, total time: {} s'
            .format(len(record_chunks), self.chunk_size, tp, end))

        print('Testing sample performance:')
        start = time.monotonic()
        for _ in range(self.samples):
            test.test(("get_records", self.sample_batch_size),
                      expected_outputs=None)
        end = time.monotonic() - start
        tp = self.samples / end

        print(
            'Sampled {} batches of size {}, throughput: {} sample-ops/s, total time: {} s'
            .format(self.samples, self.sample_batch_size, tp, end))

    def test_post_processing(self):
        """
        Tests external batch post-processing for the PPO agent.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/ppo_agent_for_pong.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)
        num_samples = 200
        states = agent.preprocessed_state_space.sample(num_samples)
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        sequence_indices_space = BoolBox(add_batch_rank=True)

        # GAE is separately tested, just testing if this API method returns results.
        pg_advantages = agent.post_process(
            dict(states=states,
                 rewards=reward_space.sample(num_samples),
                 terminals=terminal_space.sample(num_samples, fill_value=0),
                 sequence_indices=sequence_indices_space.sample(num_samples,
                                                                fill_value=0)))
# Example 9
    def test_container_actions(self):
        # Test container actions with embedding.

        vocab_size = 100
        embed_dim = 128
        # ID/state space.
        state_space = IntBox(vocab_size, shape=(10, ))
        # Container action space.
        actions_space = {}
        num_outputs = 3
        for i in range(3):
            actions_space['action_{}'.format(i)] = IntBox(low=0,
                                                          high=num_outputs)
        actions_space = Dict(actions_space)

        agent_config = config_from_path("configs/dqfd_container.json")
        agent_config["network_spec"] = [
            dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
            dict(type="reshape", flatten=True),
            dict(type="dense",
                 units=embed_dim,
                 activation="relu",
                 scope="dense_1")
        ]
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=state_space,
                                    action_space=actions_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            actions=actions_space.with_batch_rank().sample(1),
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )
# Example 10
    def get_preprocessed_space(self, space):
        # TODO: Build a map of allowed conversions in utils?
        float_dtypes = ("float", "float32", "np.float", "tf.float32", "torch.float32")
        int_dtypes = ("int", "int32", "np.int32", "tf.int32", "torch.int32")
        if isinstance(space, IntBox):
            if self.to_dtype in float_dtypes:
                return FloatBox(shape=space.shape,
                                low=space.low,
                                high=space.high,
                                add_batch_rank=space.has_batch_rank,
                                add_time_rank=space.has_time_rank)
            elif self.to_dtype == "bool":
                if space.low == 0 and space.high == 1:
                    return BoolBox(shape=space.shape,
                                   add_batch_rank=space.has_batch_rank,
                                   add_time_rank=space.has_time_rank)
                else:
                    raise RLGraphError(
                        "ERROR: Conversion from IntBox to BoolBox is only allowed if low is 0 "
                        "and high is 1.")
        elif isinstance(space, BoolBox):
            if self.to_dtype in float_dtypes:
                return FloatBox(shape=space.shape,
                                low=0.0,
                                high=1.0,
                                add_batch_rank=space.has_batch_rank,
                                add_time_rank=space.has_time_rank)
            elif self.to_dtype in int_dtypes:
                return IntBox(shape=space.shape,
                              low=0,
                              high=1,
                              add_batch_rank=space.has_batch_rank,
                              add_time_rank=space.has_time_rank)
        elif isinstance(space, FloatBox):
            if self.to_dtype in int_dtypes:
                return IntBox(shape=space.shape,
                              low=space.low,
                              high=space.high,
                              add_batch_rank=space.has_batch_rank,
                              add_time_rank=space.has_time_rank)

        # Unsupported input Space type.
        else:
            raise RLGraphError(
                "ERROR: Space conversion from: {} to type {} not supported".
                format(space, self.to_dtype))

        # No conversion necessary.
        return space
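
    # A minimal usage sketch (`converter` is a hypothetical instance of this component with
    # to_dtype="float32"): converting an IntBox yields a FloatBox with the same shape and
    # bounds, e.g.
    #
    #   space = IntBox(255, shape=(84, 84, 3), add_batch_rank=True)
    #   float_space = converter.get_preprocessed_space(space)
    #   assert isinstance(float_space, FloatBox) and float_space.has_batch_rank
    #
    # while an unsupported source Space raises an RLGraphError.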
# Example 11
    def __init__(self,
                 agent,
                 preprocessing_spec=None,
                 exploration_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="mbpo-wrapper",
                 memory_spec=None):
        self.agent = agent
        super().__init__(state_space=agent.state_space,
                         action_space=agent.action_space,
                         discount=agent.discount,
                         preprocessing_spec=preprocessing_spec,
                         network_spec=None,
                         internal_states_space=None,
                         policy_spec=None,
                         value_function_spec=None,
                         exploration_spec=exploration_spec,
                         execution_spec=execution_spec,
                         optimizer_spec=optimizer_spec,
                         value_function_optimizer_spec=None,
                         observe_spec=observe_spec,
                         update_spec=update_spec,
                         summary_spec=summary_spec,
                         saver_spec=saver_spec,
                         auto_build=auto_build,
                         name=name)

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        float_action_space = self.action_space.with_batch_rank().map(
            mapping=lambda flat_key, space: space.as_one_hot_float_space()
            if isinstance(space, IntBox) else space)
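
        # The `map` call converts every discrete sub-space to its one-hot float equivalent,
        # e.g. an IntBox(3) action presumably becomes a FloatBox(shape=(3,)) one-hot vector,
        # so the wrapped model can treat all actions as continuous inputs.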

        self.memory = Memory.from_spec(memory_spec)

    def test_sampler_component(self):
        input_space = Dict(states=dict(state1=float, state2=float),
                           actions=dict(action1=float),
                           reward=float,
                           terminals=BoolBox(),
                           add_batch_rank=True)

        sampler = Sampler()
        test = ComponentTest(component=sampler,
                             input_spaces=dict(sample_size=int,
                                               inputs=input_space))

        samples = input_space.sample(size=100)
        sample = test.test(("sample", [10, samples]), expected_outputs=None)

        self.assertEqual(len(sample["actions"]["action1"]), 10)
        self.assertEqual(len(sample["states"]["state1"]), 10)
        self.assertEqual(len(sample["terminals"]), 10)

        print(sample)
    def test_ppo_on_container_state_and_action_spaces_and_very_large_rewards(
            self):
        """
        Tests stability of PPO on an extreme env producing strange container states and large rewards and requiring
        container actions.
        """
        env = RandomEnv(
            state_space=Dict(
                {"F_position": FloatBox(shape=(2, ), low=0.01, high=0.02)}),
            action_space=Dict({
                "F_direction_low-1.0_high1.0":
                FloatBox(shape=(), low=-1.0, high=1.0),
                "F_forward_direction_low-1.0_high1.0":
                FloatBox(shape=(), low=-1.0, high=1.0),
                "B_jump":
                BoolBox()
            }),
            reward_space=FloatBox(low=-100000.0,
                                  high=-1000.0),  # hugely negative rewards
            terminal_prob=0.0000001)

        agent_config = config_from_path(
            "configs/ppo_agent_for_random_env_with_container_spaces.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            preprocessing_spec=None,
            worker_executes_preprocessing=True,
            #episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
            #print("episode return {}; steps={}".format(episode_return, timesteps))
        )
        results = worker.execute_timesteps(num_timesteps=int(1e6),
                                           use_exploration=True)

        print(results)
# Example 14
def _prepare_loss_function_test(loss_function):
    test = ComponentTest(
        component=loss_function,
        input_spaces=dict(
            alpha=float,
            log_probs_next_sampled=FloatBox(shape=(1, ), add_batch_rank=True),
            q_values_next_sampled=Tuple(FloatBox(shape=(1, )),
                                        FloatBox(shape=(1, )),
                                        add_batch_rank=True),
            q_values=Tuple(FloatBox(shape=(1, )),
                           FloatBox(shape=(1, )),
                           add_batch_rank=True),
            log_probs_sampled=FloatBox(shape=(1, ), add_batch_rank=True),
            q_values_sampled=Tuple(FloatBox(shape=(1, )),
                                   FloatBox(shape=(1, )),
                                   add_batch_rank=True),
            rewards=FloatBox(add_batch_rank=True),
            terminals=BoolBox(add_batch_rank=True),
            loss_per_item=FloatBox(add_batch_rank=True)),
        action_space=IntBox(2, shape=(), add_batch_rank=True))
    return test
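
# A minimal usage sketch, assuming a SAC-style loss-function component whose API matches
# the input spaces above (the constructor arguments shown are hypothetical):
#
#   loss_function = SACLossFunction(target_entropy=0.1, discount=0.99)
#   test = _prepare_loss_function_test(loss_function)
#
# The returned ComponentTest can then be fed batches of alpha, log-probs, q-values,
# rewards and terminals matching the declared spaces.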
# Example 15
class TestPythonPrioritizedReplay(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the mem_prioritized_replay module.
    """
    record_space = Dict(states=dict(state1=float, state2=float),
                        actions=dict(action1=float),
                        reward=float,
                        terminals=BoolBox(),
                        add_batch_rank=True)
    apex_space = Dict(states=FloatBox(shape=(4, )),
                      actions=FloatBox(shape=(2, )),
                      reward=float,
                      terminals=BoolBox(),
                      weights=FloatBox(),
                      add_batch_rank=True)

    memory_variables = ["size", "index", "max-priority"]

    capacity = 10
    alpha = 1.0
    beta = 1.0

    max_priority = 1.0

    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True))

    # TODO These methods are all graph fns now -> unify backend tests.
    def test_insert(self):
        """
        Simply tests insert op without checking internal logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity,
                                      next_states=True,
                                      alpha=self.alpha,
                                      beta=self.beta)
        memory.create_variables(self.input_spaces)

        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Test chunked insert
        observation = memory.record_space_flat.sample(size=5)
        memory.insert_records(observation)

        # Also test Apex version
        memory = ApexMemory(capacity=self.capacity,
                            alpha=self.alpha,
                            beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records(
                (observation['states'][i], observation['actions'][i],
                 observation['reward'][i], observation['terminals'][i],
                 observation['states'][i], observation["weights"][i]))

    def test_update_records(self):
        """
        Tests update records logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True)
        memory.create_variables(self.input_spaces)

        # Insert a few Elements.
        observation = memory.record_space_flat.sample(size=2)
        memory.insert_records(observation)

        # Fetch elements and their indices.
        num_records = 2
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything.
        memory.update_records(indices, np.asarray([0.1, 0.2]))

        # Test apex memory.
        memory = ApexMemory(capacity=self.capacity,
                            alpha=self.alpha,
                            beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records(
                (ray_compress(observation["states"][i]),
                 observation["actions"][i], observation["reward"][i],
                 observation["terminals"][i], observation["weights"][i]))

        # Fetch elements and their indices.
        num_records = 5
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything. The number of priority updates must match the
        # number of fetched indices.
        memory.update_records(indices, np.random.uniform(size=num_records))

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity,
                                      next_states=True,
                                      alpha=self.alpha,
                                      beta=self.beta)
        memory.create_variables(self.input_spaces)

        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2
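
        # The segment trees require a power-of-two number of leaves, so a capacity of 10 is
        # padded up to priority_capacity = 16; a binary tree over 16 leaves occupies
        # 2 * 16 = 32 slots, which the two length assertions below verify.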

        sum_segment_values = memory.merged_segment_tree.sum_segment_tree.values
        min_segment_values = memory.merged_segment_tree.min_segment_tree.values

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float('inf'))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)

        # Insert 1 Element.
        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Check insert positions.
        # The initial insert lands at leaf index `priority_capacity`.
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity

        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

        # Insert another Element.
        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Index shifted by 1.
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start = int(start / 2)
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

    def test_tree_insert(self):
        """
        Tests inserting into the segment tree and querying segments.
        """
        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
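
        # Leaf priorities are now [0, 0, 1, 3] for indices 0-3. `get_sum(a, b)` reduces over
        # the half-open interval [a, b); a negative end index appears to wrap around, so
        # get_sum(2, -1) covers index 2 only.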
        assert np.isclose(tree.get_sum(), 4.0)
        assert np.isclose(tree.get_sum(0, 2), 0.0)
        assert np.isclose(tree.get_sum(0, 3), 1.0)
        assert np.isclose(tree.get_sum(2, 3), 1.0)
        assert np.isclose(tree.get_sum(2, -1), 1.0)
        assert np.isclose(tree.get_sum(2, 4), 4.0)

    def test_prefixsum_idx(self):
        """
        Tests fetching the index corresponding to a prefix sum.
        """
        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
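
        # With leaf priorities [0, 0, 1, 3], `index_of_prefixsum(p)` walks down the tree,
        # going right (and subtracting the left subtree's sum) whenever the left sum does not
        # exceed the remaining prefix: any prefix below 1.0 lands on index 2, larger prefixes
        # on index 3.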

        self.assertEqual(tree.index_of_prefixsum(0.0), 2)
        self.assertEqual(tree.index_of_prefixsum(0.5), 2)
        self.assertEqual(tree.index_of_prefixsum(0.99), 2)
        self.assertEqual(tree.index_of_prefixsum(1.01), 3)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(4.0), 3)

        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(0, 0.5)
        tree.insert(1, 1.0)
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertEqual(tree.index_of_prefixsum(0.0), 0)
        self.assertEqual(tree.index_of_prefixsum(0.55), 1)
        self.assertEqual(tree.index_of_prefixsum(0.99), 1)
        self.assertEqual(tree.index_of_prefixsum(1.51), 2)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(5.50), 3)

    def test_custom_margin_demos_with_container_actions(self):
        # Tests whether using a different margin per sample works.
        # Same state, but different margins per action.
        vocab_size = 100
        embed_dim = 8
        # ID/state space.
        state_space = IntBox(vocab_size, shape=(10,))
        # Container action space.
        actions_space = {}
        num_outputs = 3
        for i in range(3):
            actions_space['action_{}'.format(i)] = IntBox(
                low=0,
                high=num_outputs
            )
        actions_space = Dict(actions_space)

        agent_config = config_from_path("configs/dqfd_container.json")
        agent_config["network_spec"] = [
            dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
            dict(type="reshape", flatten=True),
            dict(type="dense", units=embed_dim, activation="relu", scope="dense_1")
        ]
        agent = DQFDAgent.from_spec(
            agent_config,
            state_space=state_space,
            action_space=actions_space
        )
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        # Create a set of demos.
        demo_states = agent.preprocessed_state_space.with_batch_rank().sample(2)
        # Same state.
        demo_states[1] = demo_states[0]
        demo_actions = actions_space.with_batch_rank().sample(2)

        for name, action in actions_space.items():
            demo_actions[name][0] = 0
            demo_actions[name][1] = 1

        demo_rewards = rewards.sample(2, fill_value=0.0)
        # Both samples get the same (zero) reward; the expert margins below decide which
        # action is encouraged.
        demo_rewards[0] = 0
        demo_rewards[1] = 0

        # One action is encouraged, one is discouraged.
        margins = np.asarray([0.5, -0.5])
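
        # Following the DQfD large-margin loss, the supervised term is roughly
        #   max_a [Q(s, a) + margin(a_E, a)] - Q(s, a_E),
        # so a positive margin pushes Q(s, a_E) above all other actions (encouraging the
        # demo action), while a negative margin permits other actions to dominate.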

        demo_next_states = agent.preprocessed_state_space.with_batch_rank().sample(2)
        demo_terminals = terminals.sample(2, fill_value=False)

        # When using custom margins, the update must use an external batch.
        batch = dict(
            states=demo_states,
            actions=demo_actions,
            rewards=demo_rewards,
            next_states=demo_next_states,
            importance_weights=np.ones_like(demo_rewards),
            terminals=demo_terminals,
        )
        # Fit demos with custom margins.
        for _ in range(10000):
            agent.update(batch=batch, update_from_demos=False, apply_demo_loss_to_batch=True, expert_margins=margins)

        # Evaluate the demo state: the agent should prefer the action with the positive margin.
        agent_actions = agent.get_action(np.array([demo_states[0]]), apply_preprocessing=False, use_exploration=False)
        print("learned action = ", agent_actions)
# Example 17
    def __init__(
        self,
        state_space,
        action_space,
        discount=0.98,
        preprocessing_spec=None,
        network_spec=None,
        internal_states_space=None,
        policy_spec=None,
        exploration_spec=None,
        execution_spec=None,
        optimizer_spec=None,
        observe_spec=None,
        update_spec=None,
        summary_spec=None,
        saver_spec=None,
        auto_build=True,
        name="dqfd-agent",
        expert_margin=0.5,
        supervised_weight=1.0,
        double_q=True,
        dueling_q=True,
        huber_loss=False,
        n_step=1,
        shared_container_action_target=False,
        memory_spec=None,
        demo_memory_spec=None,
        demo_sample_ratio=0.2,
    ):

        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            expert_margin (float): The expert margin enforces a distance in Q-values between expert action and
                all other actions.
            supervised_weight (float): Indicates weight of the expert loss.
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
            demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
            demo_sample_ratio (float): The fraction of each combined update batch that is drawn
                from the demo memory.
        """
        # Fix action-adapter before passing it to the super constructor.
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            if policy_spec is None:
                policy_spec = {}
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128
        super(DQFDAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            exploration_spec=exploration_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            auto_build=auto_build,
            name=name
        )
        # Assert that the sync interval is a multiple of the update_interval.
        if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
                self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
            )

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.expert_margin = expert_margin

        self.batch_size = self.update_spec["batch_size"]
        self.default_margins = np.asarray([self.expert_margin] * self.batch_size)

        self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
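        # This solves demo / (demo + batch_size) = demo_sample_ratio for demo; e.g. with
        # batch_size=32 and demo_sample_ratio=0.2: int(0.2 * 32 / 0.8) = 8, so a combined
        # update batch of 32 online + 8 demo transitions is exactly 20% demos.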
        self.demo_margins = np.asarray([self.expert_margin] * self.demo_batch_size)
        self.shared_container_action_target = shared_container_action_target

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            policy_weights="variables:{}".format(self.policy.scope),
            time_step=int,
            use_exploration=bool,
            demo_batch_size=int,
            apply_demo_loss=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            expert_margins=FloatBox(add_batch_rank=True),
            next_states=preprocessed_state_space,
            preprocessed_next_states=preprocessed_state_space,
            importance_weights=weight_space
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")

        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # Cannot have same default name.
        demo_memory_spec["scope"] = "demo-memory"
        self.demo_memory = Memory.from_spec(demo_memory_spec)

        # The splitter for splitting up the records from the memories.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
        # Number of steps since the last target-net sync from the main policy.
        self.steps_since_target_net_sync = 0

        self.use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQFDLossFunction(
            supervised_weight=supervised_weight,
            discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=self.use_importance_weights, n_step=n_step
        )

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
            self.target_policy, self.exploration, self.loss_function, self.optimizer
        )

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
# Example 18
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="sac-agent",
                 double_q=True,
                 initial_alpha=1.0,
                 gumbel_softmax_temperature=1.0,
                 target_entropy=None,
                 memory_spec=None,
                 value_function_sync_spec=None):
        """
        This is an implementation of the Soft-Actor Critic algorithm.

        Paper: http://arxiv.org/abs/1801.01290

        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            double_q (bool): Whether to train two q networks independently.
            initial_alpha (float): "The temperature parameter α determines the
                relative importance of the entropy term against the reward".
            gumbel_softmax_temperature (float): Temperature parameter for the Gumbel-Softmax distribution used
                for discrete actions.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
            update_spec (dict): May additionally contain `sync_interval` or `sync_tau` (for the
                value-network target update).
        """
        # If VF spec is a network spec, wrap with SAC vf type. The VF must concatenate actions and states,
        # which can require splitting the network in the case of e.g. conv-inputs.
        if isinstance(value_function_spec, list):
            value_function_spec = dict(type="sac_value_function",
                                       network_spec=value_function_spec)
            self.logger.info("Using default SAC value function.")
        elif isinstance(value_function_spec, ValueFunction):
            self.logger.info(
                "Using value function object {}".format(value_function_spec))

        if policy_spec is None:
            # Continuous action space: Use squashed normal.
            # Discrete: Gumbel-softmax.
            policy_spec = dict(
                deterministic=False,
                distributions_spec=dict(
                    bounded_distribution_type="squashed",
                    discrete_distribution_type="gumbel_softmax",
                    gumbel_softmax_temperature=gumbel_softmax_temperature))

        super(SACAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            auto_build=auto_build,
            name=name)

        self.double_q = double_q
        self.target_entropy = target_entropy
        self.initial_alpha = initial_alpha

        # Assert that the sync interval is a multiple of the update_interval.
        if "sync_interval" in self.update_spec:
            if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
                    self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
                raise RLGraphError(
                    "ERROR: sync_interval ({}) must be multiple of update_interval "
                    "({})!".format(self.update_spec["sync_interval"],
                                   self.update_spec["update_interval"]))
        elif "sync_tau" in self.update_spec:
            if self.update_spec["sync_tau"] <= 0 or self.update_spec[
                    "sync_tau"] > 1.0:
                raise RLGraphError(
                    "sync_tau ({}) must be in interval (0.0, 1.0]!".format(
                        self.update_spec["sync_tau"]))
        else:
            self.update_spec["sync_tau"] = 0.005  # The value mentioned in the paper.

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        #self.iterations = self.update_spec["num_iterations"]
        self.batch_size = self.update_spec["batch_size"]

        float_action_space = self.action_space.with_batch_rank().map(
            mapping=lambda flat_key, space: space.as_one_hot_float_space()
            if isinstance(space, IntBox) else space)

        self.input_spaces.update(
            dict(env_actions=self.action_space.with_batch_rank(),
                 actions=float_action_space,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 next_states=preprocessed_state_space,
                 states=self.state_space.with_batch_rank(add_batch_rank=True),
                 batch_size=int,
                 importance_weights=FloatBox(add_batch_rank=True),
                 deterministic=bool,
                 weights="variables:{}".format(self.policy.scope)))

        if value_function_sync_spec is None:
            value_function_sync_spec = SyncSpecification(
                sync_interval=self.update_spec["sync_interval"] // self.update_spec["update_interval"],
                sync_tau=self.update_spec["sync_tau"] if "sync_tau" in self.update_spec else 5e-3)

        self.memory = Memory.from_spec(memory_spec)
        self.alpha_optimizer = self.optimizer.copy(scope="alpha-" + self.optimizer.scope) \
            if self.target_entropy is not None else None

        self.root_component = SACAgentComponent(
            agent=self,
            policy=self.policy,
            q_function=self.value_function,
            preprocessor=self.preprocessor,
            memory=self.memory,
            discount=self.discount,
            initial_alpha=self.initial_alpha,
            target_entropy=target_entropy,
            optimizer=self.optimizer,
            vf_optimizer=self.value_function_optimizer,
            alpha_optimizer=self.alpha_optimizer,
            q_sync_spec=value_function_sync_spec,
            num_q_functions=2 if self.double_q is True else 1)

        extra_optimizers = [self.value_function_optimizer]
        if self.alpha_optimizer is not None:
            extra_optimizers.append(self.alpha_optimizer)
        self.build_options = dict(optimizers=extra_optimizers)

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"],
                              build_options=self.build_options)
            self.graph_built = True
# Example 19
    def test_sac_agent_component_functionality(self):
        config = config_from_path(
            "configs/sac_component_for_fake_env_test.json")

        # Arbitrary state space, state should not be used in this example.
        state_space = FloatBox(shape=(8, ))
        continuous_action_space = FloatBox(shape=(1, ), low=-2.0, high=2.0)
        terminal_space = BoolBox(add_batch_rank=True)
        rewards_space = FloatBox(add_batch_rank=True)
        policy = Policy.from_spec(config["policy"],
                                  action_space=continuous_action_space)
        policy.add_components(Synchronizable(), expose_apis="sync")
        q_function = ValueFunction.from_spec(config["value_function"])

        agent_component = SACAgentComponent(
            agent=None,
            policy=policy,
            q_function=q_function,
            preprocessor=PreprocessorStack.from_spec([]),
            memory=ReplayMemory.from_spec(config["memory"]),
            discount=config["discount"],
            initial_alpha=config["initial_alpha"],
            target_entropy=None,
            optimizer=AdamOptimizer.from_spec(config["optimizer"]),
            vf_optimizer=AdamOptimizer.from_spec(
                config["value_function_optimizer"], scope="vf-optimizer"),
            alpha_optimizer=None,
            q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
            num_q_functions=2)

        test = ComponentTest(
            component=agent_component,
            input_spaces=dict(
                states=state_space.with_batch_rank(),
                preprocessed_states=state_space.with_batch_rank(),
                env_actions=continuous_action_space.with_batch_rank(),
                actions=continuous_action_space.with_batch_rank(),
                rewards=rewards_space,
                next_states=state_space.with_batch_rank(),
                terminals=terminal_space,
                batch_size=int,
                preprocessed_s_prime=state_space.with_batch_rank(),
                importance_weights=FloatBox(add_batch_rank=True),
                preprocessed_next_states=state_space.with_batch_rank(),
                deterministic=bool,
                weights="variables:{}".format(policy.scope),
                # TODO: how to provide the space for multiple component variables?
                #q_weights=Dict(
                #    q_0="variables:{}".format(q_function.scope),
                #    q_1="variables:{}".format(agent_component._q_functions[1].scope),
                #)
            ),
            action_space=continuous_action_space,
            build_kwargs=dict(
                optimizer=agent_component._optimizer,
                build_options=dict(
                    vf_optimizer=agent_component.vf_optimizer, ),
            ))

        batch_size = 10
        action_sample = continuous_action_space.with_batch_rank().sample(
            batch_size)
        rewards = rewards_space.sample(batch_size)
        # Check, whether an update runs ok.
        result = test.test((
            "update_from_external_batch",
            [
                state_space.sample(batch_size),
                action_sample,
                rewards,
                [True] * batch_size,
                state_space.sample(batch_size),
                [1.0] * batch_size  # importance
            ]))
        self.assertTrue(result["actor_loss"].dtype == np.float32)
        self.assertTrue(result["critic_loss"].dtype == np.float32)

        action_sample = np.linspace(-1, 1, batch_size).reshape((batch_size, 1))
        q_values = test.test(
            ("get_q_values", [state_space.sample(batch_size), action_sample]))
        for q_val in q_values:
            self.assertTrue(q_val.dtype == np.float32)
            self.assertTrue(q_val.shape == (batch_size, 1))

        action_sample, _ = test.test(("action_from_preprocessed_state",
                                      [state_space.sample(batch_size), False]))
        self.assertTrue(action_sample.dtype == np.float32)
        self.assertTrue(action_sample.shape == (batch_size, 1))
# Example 20
    def __init__(self,
                 double_q=True,
                 dueling_q=True,
                 huber_loss=False,
                 n_step=1,
                 shared_container_action_target=True,
                 memory_spec=None,
                 store_last_memory_batch=False,
                 store_last_q_table=False,
                 **kwargs):
        """
        Args:
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
            store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
                `self.last_memory_batch` for debugging purposes.
                Default: False.
            store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
                (memory or external) in `self.last_q_table` for debugging purposes.
                Default: False.
        """
        # Fix action-adapter before passing it to the super constructor.
        policy_spec = kwargs.pop("policy_spec", dict())
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128

        super(DQNAgent, self).__init__(policy_spec=policy_spec,
                                       name=kwargs.pop("name", "dqn-agent"),
                                       **kwargs)

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        #self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Assert that the sync interval is a multiple of the update interval.
        if self.update_spec["sync_interval"] % self.update_spec["update_interval"] != 0:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be a multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"],
                               self.update_spec["update_interval"]))

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.shared_container_action_target = shared_container_action_target

        # Debugging tools.
        self.store_last_memory_batch = store_last_memory_batch
        self.last_memory_batch = None
        self.store_last_q_table = store_last_q_table
        self.last_q_table = None

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(
                actions=self.action_space.with_batch_rank(),
                # weights will have a Space derived from the vars of policy.
                policy_weights="variables:{}".format(self.policy.scope),
                time_step=int,
                use_exploration=bool,
                preprocessed_states=preprocessed_state_space,
                rewards=reward_space,
                terminals=terminal_space,
                next_states=preprocessed_state_space,
                preprocessed_next_states=preprocessed_state_space,
                importance_weights=weight_space,
            ))
        if self.value_function is not None:
            self.input_spaces["value_function_weights"] = "variables:{}".format(
                self.value_function.scope)

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = DictMerger("states", "actions", "rewards", "next_states",
                                 "terminals")
        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals", "next_states")

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity,\
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!".\
            format(self.observe_spec["buffer_size"], self.memory.capacity)

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy",
                                              trainable=False)
        # Number of steps since the last target-net sync from the main policy.
        self.steps_since_target_net_sync = 0

        use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQNLossFunction(
            discount=self.discount,
            double_q=self.double_q,
            huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=use_importance_weights,
            n_step=n_step)
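        # With double_q=True, the loss uses the standard double-DQN target
        #   y = r + gamma^n * Q_target(s', argmax_a Q(s', a)),
        # i.e. the online network selects the action and the target network
        # evaluates it, reducing vanilla DQN's overestimation bias.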

        self.root_component.add_components(
            self.preprocessor,
            self.merger,
            self.memory,
            self.splitter,
            self.policy,
            self.target_policy,
            self.value_function,
            self.value_function_optimizer,  # <- should both be None for DQN
            self.exploration,
            self.loss_function,
            self.optimizer,
            self.vars_merger,
            self.vars_splitter)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        # markup = get_graph_markup(self.graph_builder.root_component)
        # print(markup)
        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
Example #21
    def test_sac_agent_component_on_fake_env(self):
        config = config_from_path("configs/sac_component_for_fake_env_test.json")

        # Arbitrary state space; states are not used in this example.
        state_space = FloatBox(shape=(2,))
        continuous_action_space = FloatBox(low=-1.0, high=1.0)
        terminal_space = BoolBox(add_batch_rank=True)
        policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
        policy.add_components(Synchronizable(), expose_apis="sync")
        q_function = ValueFunction.from_spec(config["value_function"])

        agent_component = SACAgentComponent(
            agent=None,
            policy=policy,
            q_function=q_function,
            preprocessor=PreprocessorStack.from_spec([]),
            memory=ReplayMemory.from_spec(config["memory"]),
            discount=config["discount"],
            initial_alpha=config["initial_alpha"],
            target_entropy=None,
            optimizer=AdamOptimizer.from_spec(config["optimizer"]),
            vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
            alpha_optimizer=None,
            q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
            num_q_functions=2
        )

        test = ComponentTest(
            component=agent_component,
            input_spaces=dict(
                states=state_space.with_batch_rank(),
                preprocessed_states=state_space.with_batch_rank(),
                actions=continuous_action_space.with_batch_rank(),
                rewards=FloatBox(add_batch_rank=True),
                next_states=state_space.with_batch_rank(),
                terminals=terminal_space,
                batch_size=int,
                preprocessed_s_prime=state_space.with_batch_rank(),
                importance_weights=FloatBox(add_batch_rank=True),
                preprocessed_next_states=state_space.with_batch_rank(),
                deterministic=bool,
                weights="variables:{}".format(policy.scope),
                # TODO: how to provide the space for multiple component variables?
                # q_weights=Dict(
                #    q_0="variables:{}".format(q_function.scope),
                #    q_1="variables:{}".format(agent_component._q_functions[1].scope),
                # )
            ),
            action_space=continuous_action_space,
            build_kwargs=dict(
                optimizer=agent_component._optimizer,
                build_options=dict(
                    vf_optimizer=agent_component.vf_optimizer,
                ),
            )
        )

        policy_loss = []
        vf_loss = []

        # This test simulates an env that rewards actions according to the pdf of a
        # normal distribution with loc=0.5 and scale=0.2, regardless of any state inputs.
        # The component should learn to produce actions near the mode (0.5).
        true_mean = 0.5
        target_dist = stats.norm(loc=true_mean, scale=0.2)
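        # The pdf of N(0.5, 0.2) peaks at 0.5 with value 1 / (0.2 * sqrt(2 * pi)) ~= 1.99,
        # so rewards are maximal for actions close to 0.5.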
        batch_size = 100
        for _ in range(5000):
            action_sample = continuous_action_space.sample(batch_size)
            rewards = target_dist.pdf(action_sample)
            result = test.test(("update_from_external_batch", [
                state_space.sample(batch_size),
                action_sample,
                rewards,
                [True] * batch_size,
                state_space.sample(batch_size),
                [1.0] * batch_size  # importance
            ]))
            policy_loss.append(result["actor_loss"])
            vf_loss.append(result["critic_loss"])

        self.assertTrue(np.mean(policy_loss[:100]) > np.mean(policy_loss[-100:]))
        self.assertTrue(np.mean(vf_loss[:100]) > np.mean(vf_loss[-100:]))

        action_sample = np.linspace(-1, 1, batch_size)
        q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
        for q_val in q_values:
            q_val = q_val.flatten()
            np.testing.assert_allclose(q_val, target_dist.pdf(action_sample), atol=0.2)

        action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
        action_sample = action_sample.flatten()
        np.testing.assert_allclose(np.mean(action_sample), true_mean, atol=0.1)
Example #22
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="ppo-agent",
                 clip_ratio=0.2,
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 value_function_clipping=None,
                 standardize_advantages=False,
                 sample_episodes=True,
                 weight_entropy=None,
                 memory_spec=None):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.

            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.

            discount (float): The discount factor (gamma).

            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.

            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.

            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.

            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.

            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.

            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).

            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.

            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.

            name (str): Some name for this Agent object.
            clip_ratio (float): Clipping parameter for likelihood ratio.
            gae_lambda (float): Lambda for generalized advantage estimation.

            clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.

            value_function_clipping (Optional[float]): If not None, uses clipped value function objective. If None,
                uses simple value function objective.

            standardize_advantages (bool): If true, standardize advantage values in update.

            sample_episodes (bool): If True, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.

            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).

            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        if policy_spec is not None:
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(PPOAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            name=name,
            auto_build=auto_build)
        self.sample_episodes = sample_episodes

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:policy",
                 value_function_weights="variables:value-function",
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True),
                 apply_postprocessing=bool))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(
            self.memory, RingBuffer
        ), "ERROR: PPO memory must be ring-buffer for episode-handling!"

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
            format(self.observe_spec["buffer_size"], self.memory.capacity)

        self.standardize_advantages = standardize_advantages
        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
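        # Standard GAE, as computed by this component: the advantage estimate is
        #   A_t = sum_k (gamma * lambda)^k * delta_{t+k},
        # with TD residual delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).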
        self.loss_function = PPOLossFunction(
            clip_ratio=clip_ratio,
            value_function_clipping=value_function_clipping,
            weight_entropy=weight_entropy)
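        # The standard PPO clipped-surrogate objective parameterized here:
        #   L = E[min(rho_t * A_t, clip(rho_t, 1 - clip_ratio, 1 + clip_ratio) * A_t)],
        # where rho_t is the new/old policy probability ratio, optionally plus an
        # entropy bonus weighted by `weight_entropy`.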

        self.iterations = self.update_spec["num_iterations"]
        self.sample_size = self.update_spec["sample_size"]
        self.batch_size = self.update_spec["batch_size"]

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.policy,
            self.exploration, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.vars_merger, self.vars_splitter, self.gae_function)
        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph(
                [self.root_component],
                self.input_spaces,
                optimizer=self.optimizer,
                # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
                # multi-gpu-split.
                batch_size=self.update_spec["sample_size"],
                build_options=self.build_options)
            self.graph_built = True
Example #23
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="actor-critic-agent",
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 sample_episodes=False,
                 weight_entropy=None,
                 memory_spec=None):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a
                +/- `clip_rewards` range.
            sample_episodes (bool): If True, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        # Set policy to stochastic.
        if policy_spec is not None:
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(ActorCriticAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            name=name,
            auto_build=auto_build)
        self.sample_episodes = sample_episodes

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:{}".format(self.policy.scope),
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True)))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer), \
            "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals")

        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = ActorCriticLossFunction(
            weight_entropy=weight_entropy)

        # Add all our sub-components to the core.
        sub_components = [
            self.preprocessor, self.merger, self.memory, self.splitter,
            self.policy, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.gae_function
        ]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"],
                              build_options=self.build_options)

            self.graph_built = True
Example #24
    def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1, memory_spec=None,
                 store_last_memory_batch=False, store_last_q_table=False, **kwargs):
        """
        Args:
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
            store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
                `self.last_memory_batch` for debugging purposes.
                Default: False.
            store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
                (memory or external) in `self.last_q_table` for debugging purposes.
                Default: False.
        """
        # Fix action-adapter before passing it to the super constructor.
        action_adapter_spec = kwargs.pop("action_adapter_spec", dict())
        # Use a DuelingActionAdapter (instead of a basic ActionAdapter) if option is set.
        if dueling_q is True:
            action_adapter_spec["type"] = "dueling-action-adapter"
            assert "units_state_value_stream" in action_adapter_spec
            assert "units_advantage_stream" in action_adapter_spec
        super(DQNAgent, self).__init__(
            action_adapter_spec=action_adapter_spec, name=kwargs.pop("name", "dqn-agent"), **kwargs
        )

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss

        # Debugging tools.
        self.store_last_memory_batch = store_last_memory_batch
        self.last_memory_batch = None
        self.store_last_q_table = store_last_q_table
        self.last_q_table = None

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            weights="variables:policy",
            time_step=int,
            use_exploration=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            next_states=preprocessed_state_space,
            preprocessed_next_states=preprocessed_state_space,
            importance_weights=weight_space,
            # TODO: This is currently necessary for multi-GPU handling (as the update_from_external_batch
            # TODO: gets overridden by a generic function with args=*inputs)
            #inputs=[preprocessed_state_space, self.action_space.with_batch_rank(), reward_space, terminal_space,
            #        preprocessed_state_space, weight_space]
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
        self.target_policy.add_components(Synchronizable(), expose_apis="sync")
        # Number of steps since the last target-net sync from the main policy.
        self.steps_since_target_net_sync = 0

        use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQNLossFunction(
            discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
            importance_weights=use_importance_weights, n_step=n_step
        )

        # Add all our sub-components to the core.
        sub_components = [self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
                          self.target_policy, self.exploration, self.loss_function, self.optimizer]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api("policy", "preprocessor-stack", self.optimizer.scope, *sub_components)

        # markup = get_graph_markup(self.graph_builder.root_component)
        # print(markup)
        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
Example #25
class TestPythonMemoryPerformance(unittest.TestCase):
    record_space = Dict(
        states=FloatBox(shape=(4,)),
        actions=FloatBox(shape=(2,)),
        reward=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )

    # Apex params
    capacity = 2000000
    chunksize = 64
    inserts = 1000000

    # Samples.
    samples = 10000
    sample_batch_size = 50

    alpha = 0.6
    beta = 0.4
    max_priority = 1.0

    def test_ray_prioritized_replay_insert(self):
        """
        Tests Ray's prioritized-replay insert performance.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        # Test individual inserts.
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]

        start = time.monotonic()
        for record in records:
            memory.add(
                obs_t=record['states'],
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=record['states'],
                done=record['terminals'],
                weight=None
            )
        end = time.monotonic() - start
        tp = len(records) / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing insert performance:')
        print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
            len(records), tp, end
        ))

        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )

        # Test chunked inserts -> done via external for loop in Ray.
        chunks = int(self.inserts / self.chunksize)
        records = [self.record_space.sample(size=self.chunksize) for _ in range_(chunks)]
        start = time.monotonic()
        for chunk in records:
            for i in range_(self.chunksize):
                memory.add(
                    obs_t=chunk['states'][i],
                    action=chunk['actions'][i],
                    reward=chunk['reward'][i],
                    obs_tp1=chunk['states'][i],
                    done=chunk['terminals'][i],
                    weight=None
                )
        end = time.monotonic() - start
        tp = len(records) * self.chunksize / end
        print('Testing chunked insert performance:')
        print('Inserted {} chunks, throughput: {} records/s, total time: {} s'.format(
            len(records), tp, end
        ))

    def test_ray_sampling(self):
        """
        Tests Ray's prioritized-replay sampling performance.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.add(
                obs_t=ray_compress(record['states']),
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=ray_compress(record['states']),
                done=record['terminals'],
                weight=None
            )
        start = time.monotonic()
        for _ in range_(self.samples):
            batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
        end = time.monotonic() - start
        tp = self.samples / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing sampling performance:')
        print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
            self.samples, tp, end
        ))

    def test_ray_updating(self):
        """
        Tests Ray's prioritized-replay priority-update performance.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.add(
                obs_t=record['states'],
                action=record['actions'],
                reward=record['reward'],
                obs_tp1=record['states'],
                done=record['terminals'],
                weight=None
            )
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(self.samples)]
        indices = [np.random.randint(low=0, high=self.inserts, size=self.sample_batch_size) for _
                   in range_(self.samples)]

        start = time.monotonic()
        for index, loss in zip(indices, loss_values):
            memory.update_priorities(index, loss)
        end = time.monotonic() - start
        tp = len(indices) / end
        print('#### Testing Ray Prioritized Replay memory ####')
        print('Testing updating performance:')
        print('Updated {} loss batches, throughput: {} updates/s, total time: {} s'.format(
            len(indices), tp, end
        ))

    def test_rlgraph_apex_insert(self):
        """
        Tests RLgraph's python memory insert performance.
        """
        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )
        # Testing insert performance
        records = [self.record_space.sample(size=1) for _ in range(self.inserts)]

        start = time.monotonic()
        for record in records:
            memory.insert_records((
                 record['states'],
                 record['actions'],
                 record['reward'],
                 record['terminals'],
                 None
            ))
        end = time.monotonic() - start
        tp = len(records) / end

        print('#### Testing RLGraph python prioritized replay ####')
        print('Testing insert performance:')
        print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
            len(records), tp, end
        ))

        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )
        chunks = int(self.inserts / self.chunksize)
        records = [self.record_space.sample(size=self.chunksize) for _ in range_(chunks)]
        start = time.monotonic()
        for chunk in records:
            for i in range_(self.chunksize):
                memory.insert_records((
                    chunk['states'][i],
                    chunk['actions'][i],
                    chunk['reward'][i],
                    chunk['terminals'][i],
                    None
                ))

        end = time.monotonic() - start
        tp = len(records) * self.chunksize / end
        print('Testing chunked insert performance:')
        print('Inserted {} chunks, throughput: {} records/s, total time: {} s'.format(
            len(records), tp, end
        ))

    def test_rlgraph_sampling(self):
        """
        Tests RLgraph's sampling performance.
        """
        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )

        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.insert_records((
                 ray_compress(record['states']),
                 record['actions'],
                 record['reward'],
                 record['terminals'],
                 None
            ))
        start = time.monotonic()
        for _ in range_(self.samples):
            batch_tuple = memory.get_records(self.sample_batch_size)
        end = time.monotonic() - start
        tp = self.samples / end
        print('#### Testing RLGraph Prioritized Replay memory ####')
        print('Testing sampling performance:')
        print('Sampled {} batches, throughput: {} batches/s, total time: {} s'.format(
            self.samples, tp, end
        ))

    def test_rlgraph_updating(self):
        """
        Tests RLGraph's priority-update performance.
        """
        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )

        records = [self.record_space.sample(size=1) for _ in range_(self.inserts)]
        for record in records:
            memory.insert_records((
                 record['states'],
                 record['actions'],
                 record['reward'],
                 record['terminals'],
                 None
            ))
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(self.samples)]
        indices = [np.random.randint(low=0, high=self.inserts, size=self.sample_batch_size) for _
                   in range_(self.samples)]

        start = time.monotonic()
        for index, loss in zip(indices, loss_values):
            memory.update_records(index, loss)
        end = time.monotonic() - start
        tp = len(indices) / end
        print('#### Testing RLGraph Prioritized Replay memory ####')
        print('Testing updating performance:')
        print('Updated {} loss batches, throughput: {} updates/s, total time: {} s'.format(
            len(indices), tp, end
        ))

    def test_ray_combined_ops(self):
        """
        Tests a combined workflow of insert, sample, update on the prioritized replay memory.
        """
        assert get_distributed_backend() == "ray"
        memory = PrioritizedReplayBuffer(
            size=self.capacity,
            alpha=1.0,
            clip_rewards=True
        )
        chunksize = 32

        # Test chunked inserts -> done via external for loop in Ray.
        chunks = int(self.inserts / chunksize)
        records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]
        start = time.monotonic()

        for chunk, loss in zip(records, loss_values):  # avoid shadowing the full loss_values list
            # Insert.
            for i in range_(chunksize):
                memory.add(
                    obs_t=ray_compress(chunk['states'][i]),
                    action=chunk['actions'][i],
                    reward=chunk['reward'][i],
                    obs_tp1=ray_compress(chunk['states'][i]),
                    done=chunk['terminals'][i],
                    weight=None
                )
            # Sample.
            batch_tuple = memory.sample(self.sample_batch_size, beta=1.0)
            indices = batch_tuple[-1]
            # Update
            memory.update_priorities(indices, loss)

        end = time.monotonic() - start
        tp = len(records) / end
        print('Ray: testing combined insert/sample/update performance:')
        print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
            len(records), tp, end
        ))

    def test_rlgraph_combined_ops(self):
        """
        Tests a combined workflow of insert, sample, update on the prioritized replay memory.
        """
        memory = ApexMemory(
            capacity=self.capacity,
            alpha=1.0
        )

        chunksize = 32
        chunks = int(self.inserts / chunksize)
        records = [self.record_space.sample(size=chunksize) for _ in range_(chunks)]
        loss_values = [np.random.random(size=self.sample_batch_size) for _ in range_(chunks)]

        start = time.monotonic()
        for chunk, loss in zip(records, loss_values):  # avoid shadowing the full loss_values list
            # Each record now is a chunk.
            for i in range_(chunksize):
                memory.insert_records((
                    ray_compress(chunk['states'][i]),
                    chunk['actions'][i],
                    chunk['reward'][i],
                    chunk['terminals'][i],
                    None
                ))
            batch, indices, weights = memory.get_records(self.sample_batch_size)
            memory.update_records(indices, loss)

        end = time.monotonic() - start
        tp = len(records) / end
        print('RLGraph: Testing combined op performance:')
        print('Ran {} combined ops, throughput: {} combined ops/s, total time: {} s'.format(
            len(records), tp, end
        ))
Example #26
    def __init__(self, expert_margin=0.5, supervised_weight=1.0, double_q=True, dueling_q=True,
                 huber_loss=False, n_step=1, shared_container_action_target=True,
                 memory_spec=None, demo_memory_spec=None,
                 demo_sample_ratio=0.2, store_last_memory_batch=False, store_last_q_table=False, **kwargs):
        # TODO: Most of this duplicates DQNAgent, but due to the way the loss function
        # is instantiated, inheriting from DQNAgent does not work well.
        """
        Args:
            expert_margin (float): The expert margin enforces a distance in Q-values between expert action and
                all other actions.
            supervised_weight (float): Indicates weight of the expert loss.
            double_q (bool): Whether to use the double DQN loss function (see [2]).
            dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
            huber_loss (bool): Whether to apply a Huber loss (see [4]).
            n_step (Optional[int]): n-step adjustment to discounting.
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
            demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
            store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in
                `self.last_memory_batch` for debugging purposes.
                Default: False.
            store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch
                (memory or external) in `self.last_q_table` for debugging purposes.
                Default: False.
        """
        # Fix the policy spec before passing it to the super constructor.
        policy_spec = kwargs.pop("policy_spec", dict())
        # Use a DuelingPolicy (instead of a basic Policy) if option is set.
        if dueling_q is True:
            policy_spec["type"] = "dueling-policy"
            # Give us some default state-value nodes.
            if "units_state_value_stream" not in policy_spec:
                policy_spec["units_state_value_stream"] = 128
        super(DQFDAgent, self).__init__(
            policy_spec=policy_spec, name=kwargs.pop("name", "dqfd-agent"), **kwargs
        )
        # Assert that the sync interval is a multiple of the update interval.
        if self.update_spec["sync_interval"] % self.update_spec["update_interval"] != 0:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be a multiple of update_interval "
                "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
            )

        self.double_q = double_q
        self.dueling_q = dueling_q
        self.huber_loss = huber_loss
        self.demo_batch_size = int(demo_sample_ratio * self.update_spec['batch_size'] / (1.0 - demo_sample_ratio))
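        # Worked example (illustrative numbers): with demo_sample_ratio=0.2 and an
        # update batch_size of 64, demo_batch_size = int(0.2 * 64 / 0.8) = 16, so each
        # update combines 64 online records with 16 demo records, and demos make up
        # 16 / 80 = 20% of the combined batch.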
        self.shared_container_action_target = shared_container_action_target

        # Debugging tools.
        self.store_last_memory_batch = store_last_memory_batch
        self.last_memory_batch = None
        self.store_last_q_table = store_last_q_table
        self.last_q_table = None

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        weight_space = FloatBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            policy_weights="variables:{}".format(self.policy.scope),
            time_step=int,
            use_exploration=bool,
            demo_batch_size=int,
            apply_demo_loss=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            next_states=preprocessed_state_space,
            preprocessed_next_states=preprocessed_state_space,
            importance_weights=weight_space
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")

        # The replay memory.
        self.memory = Memory.from_spec(memory_spec)
        # The demo memory cannot share the replay memory's default scope name.
        demo_memory_spec["scope"] = "demo-memory"
        self.demo_memory = Memory.from_spec(demo_memory_spec)

        # The splitter for splitting up the records from the memories.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

        # Copy our Policy (target-net), make target-net synchronizable.
        self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
        # Number of steps since the last target-net sync from the main policy.
        self.steps_since_target_net_sync = 0

        use_importance_weights = isinstance(self.memory, PrioritizedReplay)
        self.loss_function = DQFDLossFunction(
            expert_margin=expert_margin, supervised_weight=supervised_weight,
            discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
            shared_container_action_target=shared_container_action_target,
            importance_weights=use_importance_weights, n_step=n_step
        )
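        # The supervised part of the DQfD loss is the large-margin classification term
        #   J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E),
        # where the margin l(a_E, a) is `expert_margin` for a != a_E and 0 otherwise;
        # it is weighted against the TD loss by `supervised_weight`.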

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
            self.target_policy, self.exploration, self.loss_function, self.optimizer
        )

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"])
            self.graph_built = True
Example #27
class TestRingBufferMemory(unittest.TestCase):
    """
    Tests the ring buffer. The ring buffer has very similar tests to
    the replay memory as it supports similar insertion and retrieval semantics,
    but needs additional tests on episode indexing and its latest semantics.
    """

    record_space = Dict(
        states=dict(state1=float, state2=float),
        actions=dict(action1=float),
        rewards=float,
        terminals=BoolBox(),
        sequence_indices=BoolBox(),
        add_batch_rank=True
    )
    # Generic memory variables.
    memory_variables = ["size", "index"]

    # Ring buffer variables
    ring_buffer_variables = ["size", "index", "num-episodes", "episode-indices"]
    capacity = 10

    input_spaces = dict(
        records=record_space,
        num_records=int,
        num_episodes=int
    )
    input_spaces_no_episodes = dict(
        records=record_space,
        num_records=int,
    )

    def test_capacity_with_episodes(self):
        """
        Tests if inserts of non-terminals work.

        Note that this does not test episode semantics themselves; those are tested below.
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)
            # Internal memory variables.
            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            size_value = ring_buffer_variables["size"]
            index_value = ring_buffer_variables["index"]
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # Assert indices 0 before insert.
            self.assertEqual(size_value, 0)
            self.assertEqual(index_value, 0)
            self.assertEqual(num_episodes_value, 0)
            self.assertEqual(np.sum(episode_index_values), 0)

            # Insert one more element than capacity. Note: this differs from the
            # replay-memory test because, due to episode semantics, it matters
            # whether the records are terminal or not. This tests if episode-index
            # updating causes problems if none of the inserted elements are terminal.
            observation = non_terminal_records(self.record_space, self.capacity + 1)
            test.test(("insert_records", observation), expected_outputs=None)

            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            size_value = ring_buffer_variables["size"]
            index_value = ring_buffer_variables["index"]
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # Size should be equivalent to capacity when full.
            self.assertEqual(size_value, self.capacity)

            # Index should have wrapped around: (capacity + 1) % capacity == 1.
            self.assertEqual(index_value, 1)
            self.assertEqual(num_episodes_value, 0)
            self.assertEqual(np.sum(episode_index_values), 0)

            # If we fetch n elements, we expect to see exactly the last n.
            for last_n in range(1, 6):
                batch = test.test(("get_records", last_n), expected_outputs=None)
                recursive_assert_almost_equal(batch["actions"]["action1"], observation["actions"]["action1"][-last_n:])
                recursive_assert_almost_equal(batch["states"]["state2"], observation["states"]["state2"][-last_n:])
                recursive_assert_almost_equal(batch["terminals"], observation["terminals"][-last_n:])

    def test_episode_indices_when_inserting(self):
        """
        Tests if episodes indices and counts are set correctly when inserting
        terminals.
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

            # First, we insert a single terminal record.
            observation = terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)

            # Internal memory variables.
            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # One episode should be present.
            self.assertEqual(num_episodes_value, 1)
            # However, the index of that episode is 0, so we cannot fetch it.
            self.assertEqual(sum(episode_index_values), 0)

            # Next, we insert 1 non-terminal, then 1 terminal element.
            observation = non_terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)
            observation = terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)

            # Now, we expect to have 2 episodes with episode indices at 0 and 2.
            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            print('Episode indices after = {}'.format(episode_index_values))
            self.assertEqual(num_episodes_value, 2)
            self.assertEqual(episode_index_values[1], 2)

    def test_only_terminal_with_episodes(self):
        """
        Edge case: What if only terminals are inserted when episode
        semantics are enabled?
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)
            observation = terminal_records(self.record_space, self.capacity)
            test.test(("insert_records", observation), expected_outputs=None)

            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            self.assertEqual(num_episodes_value, self.capacity)
            # Every episode index should correspond to its position.
            for i in range(self.capacity):
                self.assertEqual(episode_index_values[i], i)

    def test_episode_fetching(self):
        """
        Test if we can accurately fetch most recent episodes.
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

            # Insert 2 non-terminals, 1 terminal
            observation = non_terminal_records(self.record_space, 2)
            test.test(("insert_records", observation), expected_outputs=None)
            observation = terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)

            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # One episode.
            self.assertEqual(num_episodes_value, 1)
            expected_indices = [0] * self.capacity
            expected_indices[0] = 2
            recursive_assert_almost_equal(episode_index_values, expected_indices)

            # We should now be able to retrieve one episode of length 3.
            episode = test.test(("get_episodes", 1), expected_outputs=None)
            expected_terminals = [0, 0, 1]
            recursive_assert_almost_equal(episode["terminals"], expected_terminals)

            # Requesting 2 episodes should still return only the single episode available.
            episode = test.test(("get_episodes", 2), expected_outputs=None)
            expected_terminals = [0, 0, 1]
            recursive_assert_almost_equal(episode["terminals"], expected_terminals)

            # Insert 7 non-terminals.
            observation = non_terminal_records(self.record_space, 7)
            test.test(("insert_records", observation), expected_outputs=None)

            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            index_value = ring_buffer_variables["index"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # Episode indices should not have changed.
            expected_indices[0] = 2
            recursive_assert_almost_equal(episode_index_values, expected_indices)
            # Inserted 2 non-terminal, 1 terminal, 7 non-terminal at capacity 10 -> should be at 0 again.
            self.assertEqual(index_value, 0)
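            # Worked wrap-around arithmetic (illustrative only, not part of the
            # test logic): (2 + 1 + 7) % 10 == 0, so the next insert overwrites slot 0.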

            # Now insert 1 terminal so that the terminal buffer has layout [1 0 1 0 0 0 0 0 0 0].
            observation = terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)

            # Episode indices:
            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            recursive_assert_almost_equal(num_episodes_value, 2)

            # Check if we can fetch 2 episodes:
            episodes = test.test(("get_episodes", 2), expected_outputs=None)

            # We now expect to have retrieved:
            # - 10 time steps
            # - 2 terminal values of 1, at indices 0 and 2 (separated by one
            #   non-terminal record due to the insertion order).
            self.assertEqual(len(episodes['terminals']), self.capacity)
            self.assertEqual(episodes['terminals'][0], True)
            self.assertEqual(episodes['terminals'][2], True)

    def test_latest_batch(self):
        """
        Tests if we can fetch latest steps.
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

            # Insert 5 random elements.
            observation = non_terminal_records(self.record_space, 5)
            test.test(("insert_records", observation), expected_outputs=None)

            # First, test if the basic computation works.
            batch = test.test(("get_records", 5), expected_outputs=None)
            recursive_assert_almost_equal(batch, observation)

            # Next, insert capacity more elements:
            observation = non_terminal_records(self.record_space, self.capacity)
            test.test(("insert_records", observation), expected_outputs=None)

            # If we now fetch capacity elements, we expect to see exactly the last 10.
            batch = test.test(("get_records", self.capacity), expected_outputs=None)
            recursive_assert_almost_equal(batch, observation)

            # If we fetch n elements, we expect to see exactly the last n.
            for last_n in range(1, 6):
                batch = test.test(("get_records", last_n), expected_outputs=None)
                recursive_assert_almost_equal(batch["actions"]["action1"], observation["actions"]["action1"][-last_n:])
                recursive_assert_almost_equal(batch["states"]["state2"], observation["states"]["state2"][-last_n:])
                recursive_assert_almost_equal(batch["terminals"], observation["terminals"][-last_n:])
Example No. 28
    def __init__(self,
                 gae_lambda=1.0,
                 sample_episodes=False,
                 weight_entropy=None,
                 memory_spec=None,
                 **kwargs):
        """
        Args:
            gae_lambda (float): Lambda for generalized advantage estimation.
            sample_episodes (bool): If true, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If false, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should
                typically be a ring-buffer.
        """
        super(ActorCriticAgent, self).__init__(
            policy_spec=dict(deterministic=False),  # Set policy to stochastic.
            name=kwargs.pop("name", "actor-critic-agent"),
            **kwargs)
        self.sample_episodes = sample_episodes

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:{}".format(self.policy.scope),
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True)))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = DictMerger("states", "actions", "rewards", "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer),\
            "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals")

        self.loss_function = ActorCriticLossFunction(
            discount=self.discount,
            gae_lambda=gae_lambda,
            weight_entropy=weight_entropy)

        # Add all our sub-components to the core.
        sub_components = [
            self.preprocessor, self.merger, self.memory, self.splitter,
            self.policy, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer
        ]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph(
                [self.root_component],
                self.input_spaces,
                optimizer=self.optimizer,
                batch_size=self.update_spec["batch_size"],
                build_options=dict(vf_optimizer=self.value_function_optimizer))

            self.graph_built = True
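
The `gae_lambda` argument above weights the temporal-difference residuals used in generalized
advantage estimation (Schulman et al., 2015). A generic numpy sketch of that computation, assuming
the standard GAE definition (illustrative only; this is not the agent's internal loss code):

import numpy as np

def generalized_advantage_estimation(rewards, values, next_values, terminals,
                                     discount=0.99, gae_lambda=1.0):
    # TD residual: delta_t = r_t + discount * V(s_{t+1}) * (1 - terminal_t) - V(s_t).
    deltas = rewards + discount * next_values * (1.0 - terminals) - values
    advantages = np.zeros_like(deltas)
    gae = 0.0
    # Accumulate (discount * gae_lambda)-weighted residuals from the back of the batch.
    for t in reversed(range(len(deltas))):
        gae = deltas[t] + discount * gae_lambda * (1.0 - terminals[t]) * gae
        advantages[t] = gae
    return advantages

With gae_lambda=1.0 this reduces to discounted Monte Carlo advantages; with gae_lambda=0.0, to
one-step TD residuals.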
Example No. 29
    def __init__(self,
                 world="4x4",
                 save_mode=False,
                 action_type="udlr",
                 reward_function="sparse",
                 state_representation="discrete"):
        """
        Args:
            world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the rows
                of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).

            save_mode (bool): Whether to replace holes (H) with safe frozen fields (F). Default: False.

            action_type (str): Which action space to use. Choose between "udlr" (up, down, left, right), which is a
                discrete action space, and "ftj" (forward + turn + jump), which is a container multi-discrete
                action space. "ftjb" is the same as "ftj", except that the sub-action "jump" is a boolean.

            reward_function (str): One of
                sparse: hole=-5, fire=-3, goal=1, all other steps=-0.1
                rich: hole=-100, fire=-10, goal=50, all other steps=-0.1

            state_representation (str):
                - "discrete": An int representing the field on the grid, 0 meaning the upper-left field, 1 the one
                    below it, etc.
                - "xy": The x and y grid position tuple.
                - "xy+orientation": The x and y grid position tuple plus the actor's orientation (if any), encoded
                    as two further values.
                - "camera": A 3-channel image where each field in the grid-world is one pixel and the 3 channels are
                    used to indicate different items in the scene (walls, holes, the actor, etc.).
        """
        # Build our map.
        if isinstance(world, str):
            self.description = world
            world = self.MAPS[world]
        else:
            self.description = "custom-map"

        world = np.array(list(map(list, world)))
        # Apply safety switch.
        world[world == 'H'] = ("H" if not save_mode else "F")

        # `world` is a 2D numpy array that is indexed with y/x pairs (first row, then column).
        self.world = world
        self.n_row, self.n_col = self.world.shape
        (start_y, ), (start_x, ) = np.nonzero(self.world == "S")

        # Init pygame (if installed) for visualizations.
        if pygame is not None:
            self.pygame_field_size = 30
            pygame.init()
            self.pygame_agent = pygame.image.load(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "images/agent.png"))
            # Create a basic grid Surface for reuse.
            self.pygame_basic_surface = self.grid_to_surface()
            self.pygame_display_set = False

        # Figure out our state space.
        assert state_representation in [
            "discrete", "xy", "xy+orientation", "camera"
        ]
        self.state_representation = state_representation
        # Discrete states (single int from 0 to n).
        if self.state_representation == "discrete":
            state_space = IntBox(self.n_row * self.n_col)
        # x/y position (2 ints).
        elif self.state_representation == "xy":
            state_space = IntBox(low=(0, 0),
                                 high=(self.n_col, self.n_row),
                                 shape=(2, ))
        # x/y position + orientation (4 ints).
        elif self.state_representation == "xy+orientation":
            state_space = IntBox(low=(0, 0, 0, 0),
                                 high=(self.n_col, self.n_row, 1, 1))
        # Camera outputting a 2D color image of the world.
        else:
            state_space = IntBox(0, 255, shape=(self.n_row, self.n_col, 3))

        self.default_start_pos = self.get_discrete_pos(start_x, start_y)
        self.discrete_pos = self.default_start_pos

        assert reward_function in ["sparse",
                                   "rich"]  # TODO: "potential"-based reward
        self.reward_function = reward_function

        # Store the goal position for proximity calculations (for "potential" reward function).
        (self.goal_y, ), (self.goal_x, ) = np.nonzero(self.world == "G")

        # Specify the actual action spaces.
        self.action_type = action_type
        action_space = IntBox(4) if self.action_type == "udlr" else Dict(
            dict(forward=IntBox(3),
                 turn=IntBox(3),
                 jump=(IntBox(2) if self.action_type == "ftj" else BoolBox())))

        # Call the super's constructor.
        super(GridWorld, self).__init__(state_space=state_space,
                                        action_space=action_space)

        # Reset ourselves.
        self.state = None
        self.orientation = None  # int: 0, 90, 180, 270
        self.camera_pixels = None  # Only used if state_representation == "camera".
        self.reward = None
        self.is_terminal = None
        self.reset(randomize=False)
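
A minimal usage sketch based on the constructor documented above (assuming `GridWorld` is
importable from this module):

# 4x4 world with x/y-tuple states and the rich reward function.
env = GridWorld(world="4x4", state_representation="xy", reward_function="rich")
env.reset(randomize=False)

# Container action space: "ftj" yields Dict(forward=IntBox(3), turn=IntBox(3), jump=IntBox(2)).
env_ftj = GridWorld(world="4x4", action_type="ftj", state_representation="camera")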
Example No. 30
class TestPrioritizedReplay(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the prioritized_replay module.
    """
    record_space = Dict(states=dict(state1=float, state2=float),
                        actions=dict(action1=float),
                        reward=float,
                        terminals=BoolBox(),
                        add_batch_rank=True)
    memory_variables = ["size", "index", "max-priority"]

    capacity = 10
    alpha = 1.0
    beta = 1.0

    max_priority = 1.0

    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True))

    def test_insert(self):
        """
        Simply tests insert op without checking internal logic.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        observation = self.record_space.sample(size=1)
        test.test(("insert_records", observation), expected_outputs=None)

    def test_capacity(self):
        """
        Tests if insert correctly manages capacity.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        # Internal state variables.
        memory_variables = memory.get_variables(self.memory_variables,
                                                global_scope=False)
        buffer_size = memory_variables['size']
        buffer_index = memory_variables['index']
        max_priority = memory_variables['max-priority']

        size_value, index_value, max_priority_value = test.read_variable_values(
            buffer_size, buffer_index, max_priority)

        # Assert size and index are 0 and max-priority is 1.0 before any insert.
        self.assertEqual(size_value, 0)
        self.assertEqual(index_value, 0)
        self.assertEqual(max_priority_value, 1.0)

        # Insert one more element than capacity
        observation = self.record_space.sample(size=self.capacity + 1)
        test.test(("insert_records", observation), expected_outputs=None)

        size_value, index_value = test.read_variable_values(
            buffer_size, buffer_index)
        # Size should be equivalent to capacity when full.
        self.assertEqual(size_value, self.capacity)

        # Index wraps around via modulo: (capacity + 1) % capacity == 1.
        self.assertEqual(index_value, 1)

    def test_batch_retrieve(self):
        """
        Tests if retrieval correctly manages capacity.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        # Insert 2 Elements.
        observation = non_terminal_records(self.record_space, 2)
        test.test(("insert_records", observation), expected_outputs=None)

        # Assert we can now fetch 2 elements.
        num_records = 2
        batch = test.test(("get_records", num_records), expected_outputs=None)
        records = batch[0]
        print('Result batch = {}'.format(records))
        self.assertEqual(2, len(records['terminals']))

        # Sampling may repeat indices, so we can fetch more records than are currently stored.
        num_records = 5
        batch = test.test(("get_records", num_records), expected_outputs=None)
        records = batch[0]
        self.assertEqual(5, len(records['terminals']))

        # Now insert over capacity, note all elements here are non-terminal.
        observation = non_terminal_records(self.record_space, self.capacity)
        test.test(("insert_records", observation), expected_outputs=None)

        # Assert we can fetch exactly capacity elements.
        num_records = self.capacity
        batch = test.test(("get_records", num_records), expected_outputs=None)
        records = batch[0]
        self.assertEqual(self.capacity, len(records['terminals']))

    def test_update_records(self):
        """
        Tests update records logic.
        """
        memory = PrioritizedReplay(capacity=self.capacity)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)

        # Insert a few Elements.
        observation = non_terminal_records(self.record_space, 2)
        test.test(("insert_records", observation), expected_outputs=None)

        # Fetch elements and their indices.
        num_records = 2
        batch = test.test(("get_records", num_records), expected_outputs=None)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))
        # Update the priorities of the sampled indices; the op returns nothing.
        input_params = [indices, np.asarray([0.1, 0.2])]
        test.test(("update_records", input_params), expected_outputs=None)

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = PrioritizedReplay(capacity=self.capacity,
                                   alpha=self.alpha,
                                   beta=self.beta)
        test = ComponentTest(component=memory, input_spaces=self.input_spaces)
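        # The segment trees are stored as full binary trees, so round the
        # record capacity up to the next power of 2.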
        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        memory_variables = memory.get_variables(
            ["sum-segment-tree", "min-segment-tree"], global_scope=False)
        sum_segment_tree = memory_variables['sum-segment-tree']
        min_segment_tree = memory_variables['min-segment-tree']
        sum_segment_values, min_segment_values = test.read_variable_values(
            sum_segment_tree, min_segment_tree)

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float('inf'))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)
        # Insert 1 Element.
        observation = non_terminal_records(self.record_space, 1)
        test.test(("insert_records", observation), expected_outputs=None)

        # Fetch segment tree.
        sum_segment_values, min_segment_values = test.read_variable_values(
            sum_segment_tree, min_segment_tree)

        # Check insert positions.
        # The first insert lands at leaf index `priority_capacity`; its priority
        # propagates up the parent chain to the root at index 1.
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity

        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

        # Insert another Element.
        observation = non_terminal_records(self.record_space, 1)
        test.test(("insert_records", observation), expected_outputs=None)

        # Fetch segment tree.
        sum_segment_values, min_segment_values = test.read_variable_values(
            sum_segment_tree, min_segment_tree)
        print(sum_segment_values)
        print(min_segment_values)

        # The second leaf sits one index further right.
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start = int(start / 2)
        while start >= 1:
            # The parent sums both leaves: 1 + 1 = 2.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)
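
The parent chain walked by the assertions above mirrors how a sum segment tree propagates a leaf
write up to the root. A generic sketch of that propagation (illustrative only; not RLgraph's
implementation):

def segment_tree_insert(values, leaf_index, priority, capacity):
    """Write `priority` at leaf `capacity + leaf_index` of a flat binary tree
    stored in `values` (length 2 * capacity) and re-sum parents up to root 1."""
    index = capacity + leaf_index
    values[index] = priority
    index //= 2
    while index >= 1:
        values[index] = values[2 * index] + values[2 * index + 1]
        index //= 2

# E.g. with capacity 16: tree = [0.0] * 32; segment_tree_insert(tree, 0, 1.0, 16)
# leaves tree[16] == 1.0 and propagates the sum so that tree[1] == 1.0.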