Example #1
0
 def __init__(self, config, name=None):
     """
     Sets up all components of this Q-learning algorithm: preprocessor, (dueling) Q-network and its
     non-trainable target copy, prioritized replay memory, n-step component, loss, optimizer and epsilon decay.

     Args:
         config: The config object; its attributes (preprocessor, state_space, action_space, q_network,
             dueling_a_network, dueling_v_network, memory_*, n_step, gamma, optimizer, epsilon) are read here.
         name: Optional name, forwarded to the parent constructor.
     """
     super().__init__(config, name)

     # Preprocessor (Phi) and the preprocessed, batched state space (x).
     self.Phi = Preprocessor.make(config.preprocessor)
     batched_states = Space.make(config.state_space).with_batch()
     self.x = self.Phi(batched_states)

     # Batched action space (a).
     self.a = Space.make(config.action_space).with_batch()

     # Q-network with dueling outputs: advantages (A) and state value (V), each with its own pre-network.
     dueling_outputs = Dict(A=self.a, V=Float().with_batch())
     dueling_adapters = dict(
         A=dict(pre_network=config.dueling_a_network),
         V=dict(pre_network=config.dueling_v_network),
     )
     self.Q = Network.make(
         network=config.q_network,
         input_space=self.x,
         output_space=dueling_outputs,
         adapters=dueling_adapters,
     )
     # Non-trainable copy of the Q-network.
     self.Qt = self.Q.copy(trainable=False)

     # Prioritized replay memory; next-states ("s_") are set up via `next_record_setup` using n-step.
     memory_records = Dict(
         dict(s=self.x, a=self.a, r=float, t=bool, n=int), main_axes="B"
     )
     self.memory = PrioritizedReplayBuffer.make(
         record_space=memory_records,
         capacity=config.memory_capacity,
         alpha=config.memory_alpha,
         beta=config.memory_beta,
         next_record_setup=dict(s="s_", n_step=config.n_step),
     )

     # N-step component.
     self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)
     # Double/dueling/n-step Q-loss.
     self.L = DDDQNLoss()
     self.optimizer = Optimizer.make(self.config.optimizer)
     # Decaying epsilon for epsilon-greedy learning.
     self.epsilon = Decay.make(self.config.epsilon)

     # Make sure the preprocessor starts out clean.
     self.Phi.reset()
Example #2
0
    def __init__(self, config, name=None):
        """
        Builds the spaces, networks, episode buffer, low-level SAC and supervised-learning pieces.

        Args:
            config: The config object; provides preprocessor, state/action spaces, episode horizon, skill
                settings, network specs, buffer capacity, the SAC sub-config and the supervised optimizer.
            name: Optional name, forwarded to the parent constructor.
        """
        super().__init__(config, name)

        # True=planning mode. False="supervised+intrinsic-reward+model-learning" mode.
        self.inference = False
        # Current step within He (total episode horizon).
        self.he = 0
        # Current step within Hz (repeat horizon for one selected skill).
        self.hz = 0

        self.preprocessor = Preprocessor.make(config.preprocessor)
        # Preprocessed, batched states.
        batched_states = config.state_space.with_batch()
        self.s = self.preprocessor(batched_states)
        # Batched actions (a).
        self.a = config.action_space.with_batch()

        # Intrinsic rewards in He.
        horizon_axis = ("Episode Horizon", config.episode_horizon)
        self.ri = Float(main_axes=[horizon_axis])

        # Skill space: continuous vectors in [-1, 1] unless discrete skills are configured.
        if config.discrete_skills is False:
            self.z = Float(-1.0, 1.0, shape=(config.dim_skill_vectors,), main_axes="B")
        else:
            self.z = Int(config.dim_skill_vectors, main_axes="B")

        # Joint (state, skill) input space shared by both networks below.
        self.s_and_z = Dict(dict(s=self.s, z=self.z), main_axes="B")

        self.pi = Network.make(
            input_space=self.s_and_z, output_space=self.a, **config.policy_network
        )

        # Network mapping (s, z) to the state space, using a mixture distribution.
        mixture_spec = dict(type="mixture", num_experts=config.num_q_experts)
        self.q = Network.make(
            input_space=self.s_and_z,
            output_space=self.s,
            distributions=mixture_spec,
            **config.q_network
        )

        # Episode buffer; `self.event_buffer_full` is passed as the `when_full` callback.
        buffer_records = Dict(dict(s=self.s, z=self.z, a=self.a, t=bool))
        self.B = FIFOBuffer(
            buffer_records,
            config.episode_buffer_capacity,
            when_full=self.event_buffer_full,
            next_record_setup=dict(s="s_"),
        )

        # Low-level SAC.
        self.SAC = SAC(config=config.sac_config, name="SAC-level0")

        # Supervised model optimizer and its negative-log-likelihood loss over the mixture.
        self.q_optimizer = Optimizer.make(config.supervised_optimizer)
        mixture = MixtureDistribution(num_experts=config.num_q_experts)
        self.Lsup = NegLogLikelihoodLoss(distribution=mixture)

        self.preprocessor.reset()
Example #3
0
    def __init__(self, config, name=None):
        """
        Builds all SAC components: preprocessor, spaces, policy, Q-networks with target copies, memory,
        temperature variable, entropy target, n-step component, loss and optimizers.

        Args:
            config: The config object; provides preprocessor, spaces, network/memory specs, optimizer
                specs, n-step/gamma settings and the initial alpha.
            name: Optional name, forwarded to the parent constructor.
        """
        super().__init__(config, name)

        self.preprocessor = Preprocessor.make(config.preprocessor)
        # Preprocessed, batched states.
        batched_states = Space.make(config.state_space).with_batch()
        self.s = self.preprocessor(batched_states)
        # Batched actions (a).
        self.a = Space.make(config.action_space).with_batch()
        # Soft-one-hot actions (if Int elements in action space).
        self.a_soft = self.a.as_one_hot_float_space()

        # Policy (π).
        policy_distributions = dict(
            bounded_distribution_type=config.bounded_distribution_type,
            discrete_distribution_type="gumbel-softmax",
            gumbel_softmax_temperature=config.gumbel_softmax_temperature,
        )
        self.pi = Network.make(
            distributions=policy_distributions, input_space=self.s, output_space=self.a,
            **config.policy_network
        )

        # The Q-networks: each maps a (state, action) dict to a single float.
        self.Q = [
            Network.make(input_space=Dict(s=self.s, a=self.a), output_space=float, **config.q_network)
            for _ in range(config.num_q_networks)
        ]
        # Target Q-network(s): one non-trainable copy per Q-network.
        self.Qt = [q_net.copy(trainable=False) for q_net in self.Q]

        # Memory records; the "n" field is only added when n-step > 1.
        n_step_field = {"n": int} if config.n_step > 1 else {}
        base_fields = dict(s=self.s, a=self.a_soft, r=float, t=bool)
        record_space = Dict(default_dict(base_fields, n_step_field), main_axes="B")
        self.memory = Memory.make(record_space=record_space, **config.memory_spec)

        # The temperature parameter α.
        self.alpha = tf.Variable(config.initial_alpha, name="alpha", dtype=tf.float32)
        self.entropy_target = Decay.make(config.entropy_target)
        self.n_step = NStep(config.gamma, n_step=config.n_step, n_step_only=True)

        # SAC loss function and placeholders for its most recent values.
        self.L = SACLoss()
        self.Ls_critic = [0, 0]
        self.L_actor = 0
        self.L_alpha = 0

        # TEST
        self.log_pi = 0
        self.entropy_error_term = 0
        self.log_alpha = 0
        # END: TEST

        q_optimizer = Optimizer.make(self.config.q_optimizer)
        pi_optimizer = Optimizer.make(self.config.policy_optimizer)
        alpha_optimizer = Optimizer.make(self.config.alpha_optimizer)
        self.optimizers = dict(q=q_optimizer, pi=pi_optimizer, alpha=alpha_optimizer)

        # Make sure the preprocessor starts out clean.
        self.preprocessor.reset()
Example #4
0
class TestFIFOBufferMemory(unittest.TestCase):
    """
    Tests the FIFOBuffer Component.
    """
    record_space = Dict(states=dict(state1=float, state2=float),
                        actions=dict(action1=float),
                        rewards=float,
                        terminals=bool,
                        main_axes="B")
    capacity = 10

    class _BufferFullSignal(Exception):
        """Raised by `when_full` so the test can tell the callback apart from any other error."""

    def when_full(self, buffer):
        """
        Callback handed to the FIFOBuffer as `when_full`; signals its own invocation by raising.

        Args:
            buffer: The buffer object passed in by the caller of this callback.

        Raises:
            _BufferFullSignal: Always (a subclass of Exception).
        """
        print("Executing `when_full` on buffer={}".format(buffer))
        raise self._BufferFullSignal  # to catch

    def test_fifo_buffer(self):
        """
        Fills the buffer below capacity, then over capacity, and checks size/index bookkeeping and `flush`.
        """
        fifo_buffer = FIFOBuffer(record_space=self.record_space,
                                 capacity=self.capacity,
                                 when_full=self.when_full)

        # Not full.
        data = self.record_space.sample(self.capacity - 1)
        fifo_buffer.add_records(data)
        self.assertEqual(fifo_buffer.size, self.capacity - 1)

        # Full: expect `when_full` to be called.
        # NOTE: The previous `try: ...; raise AssertionError; except Exception: pass` construct swallowed
        # its own AssertionError (a subclass of Exception) and hence could never fail.
        data = self.record_space.sample(2)
        with self.assertRaises(self._BufferFullSignal):
            fifo_buffer.add_records(data)

        self.assertEqual(fifo_buffer.size, self.capacity)
        all_data = fifo_buffer.flush()
        self.assertEqual(fifo_buffer.size, 0)
        self.assertEqual(fifo_buffer.index, 0)

        self.assertEqual(len(all_data["states"]["state1"]), self.capacity)
        self.assertEqual(len(all_data["states"]["state2"]), self.capacity)
        self.assertEqual(len(all_data["rewards"]), self.capacity)
        self.assertEqual(all_data["rewards"].dtype, np.float32)
Example #5
0
class TestReplayBuffer(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the replay_memory module.
    """
    record_space = Dict(states=dict(state1=float, state2=float),
                        actions=dict(action1=Int(3, shape=(3, ))),
                        reward=float,
                        terminals=Bool(),
                        next_states=dict(state1=float, state2=float),
                        main_axes="B")

    def test_insert(self):
        """
        Simply tests insert op without checking internal logic.
        """
        memory = ReplayBuffer(record_space=self.record_space, capacity=4)
        # Assert indices 0 before insert.
        self.assertEqual(memory.size, 0)
        self.assertEqual(memory.index, 0)

        # Insert one single record (no batch rank) and check again.
        data = self.record_space.sample()
        memory.add_records(data)
        self.assertEqual(memory.size, 1)
        self.assertEqual(memory.index, 1)

        # Insert one single record (with batch rank) and check again.
        data = self.record_space.sample(1)
        memory.add_records(data)
        self.assertEqual(memory.size, 2)
        self.assertEqual(memory.index, 2)

        # Insert two records (batched). Capacity (4) is reached -> index wraps around to 0.
        data = self.record_space.sample(2)
        memory.add_records(data)
        self.assertEqual(memory.size, 4)
        self.assertEqual(memory.index, 0)

        # Insert one single record (no batch rank, BUT with `single` indicator set for performance reasons)
        # and check again.
        data = self.record_space.sample()
        memory.add_records(data, single=True)
        self.assertEqual(memory.size, 4)
        self.assertEqual(memory.index, 1)

    def test_insert_over_capacity(self):
        """
        Tests if insert correctly manages capacity.
        """
        capacity = 10
        memory = ReplayBuffer(record_space=self.record_space,
                              capacity=capacity)
        # Assert indices 0 before insert.
        self.assertEqual(memory.size, 0)
        self.assertEqual(memory.index, 0)

        # Insert one more element than capacity.
        data = self.record_space.sample(size=capacity + 1)
        memory.add_records(data)

        # Size should be equivalent to capacity when full.
        self.assertEqual(memory.size, capacity)
        # Index should be 1: (capacity + 1) modulo capacity.
        self.assertEqual(memory.index, 1)

    def test_get_records(self):
        """
        Tests if retrieval correctly manages capacity.
        """
        capacity = 10
        memory = ReplayBuffer(record_space=self.record_space,
                              capacity=capacity)

        # Insert 1 record.
        data = self.record_space.sample(1)
        memory.add_records(data)

        # Assert we can now fetch 1 element and that it is the one we inserted.
        retrieved_data = memory.get_records(num_records=1)
        self.assertEqual(1, len(retrieved_data["terminals"]))
        check(data, retrieved_data)

        # Test duplicate sampling.
        retrieved_data = memory.get_records(num_records=5)
        self.assertEqual(5, len(retrieved_data["terminals"]))
        # Only one record in the memory -> returned samples should all be the exact same.
        first_reward = retrieved_data["reward"][0]
        for i in range(1, 5):
            check(first_reward, retrieved_data["reward"][i])

        # Now insert another one.
        data = self.record_space.sample()  # w/o batch rank
        memory.add_records(data)
        # Pull exactly two records and make sure they are NOT(!) the same.
        retrieved_data = memory.get_records(num_records=2)
        self.assertEqual(2, len(retrieved_data["terminals"]))
        self.assertNotEqual(retrieved_data["reward"][0],
                            retrieved_data["reward"][1])

        # Now insert over capacity.
        data = self.record_space.sample(capacity)
        memory.add_records(data)

        # Assert we can fetch exactly capacity elements.
        retrieved_data = memory.get_records(num_records=capacity)
        self.assertEqual(capacity, len(retrieved_data["terminals"]))
class TestMemoriesGenerically(unittest.TestCase):
    """
    Tests different generic functionalities of Memories.
    """
    record_space = Dict(
        states=dict(state1=float, state2=Float(shape=(2,))),
        actions=dict(action1=int),
        reward=float,
        terminals=bool,
        main_axes="B"
    )
    record_space_no_next_state = Dict(s=dict(s1=float, s2=float), a=dict(a1=Int(10)), r=float, t=Bool(), main_axes="B")

    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    @staticmethod
    def _make_batch(s1, s2, a, r, t, next_s1, next_s2):
        """
        Builds one batched record dict (keys: s, a, r, t, s_) from plain lists.

        Args:
            s1, s2: Values for the two state components.
            a, r, t: Actions, rewards and terminal flags.
            next_s1, next_s2: Values for the two next-state ("s_") components.

        Returns:
            dict: The record batch with every leaf converted via `np.array`.
        """
        return dict(
            s=dict(s1=np.array(s1), s2=np.array(s2)),
            a=np.array(a), r=np.array(r), t=np.array(t),
            s_=dict(s1=np.array(next_s1), s2=np.array(next_s2))
        )

    def _assert_mismatched_batch_rejected(self, memory, batch_size):
        """
        Asserts that adding records whose "s_" batch size differs from the other fields raises SurrealError.
        """
        data = self.record_space_no_next_state.sample(batch_size + 1)
        data["s_"] = self.record_space_no_next_state["s"].sample(batch_size)
        # `assertRaises` (unlike `assert False` in a try-block) still fails the test under `python -O`.
        with self.assertRaises(SurrealError):
            memory.add_records(data)

    def _check_next_state_offset(self, memory, num_records, offset, repeats=20):
        """
        Repeatedly pulls `num_records` records and checks that each next-state component is `offset`
        larger than the corresponding state component.
        """
        for _ in range(repeats):
            retrieved_data = memory.get_records(num_records=num_records)
            self.assertEqual(num_records, len(retrieved_data["t"]))
            for i in range(num_records):
                check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - offset)
                check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - offset)

    def test_next_state_handling(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory.

        NOTE: The memory does not care about terminal signals, it will always return the n-next-in-memory state
        regardless of whether this is a useful state (terminal=False) or not (terminal=True). In case of a
        terminal=True, the next state (whether it be the true terminal state, the reset state, or any other random
        state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2

        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state, capacity=capacity,
                            next_record_setup=dict(s="s_"))

            # Insert n records (inserts must always be batch-size).
            memory.add_records(self._make_batch(
                [0.0, 1.0], [2.0, 3.0], [0, 1], [-0.0, -1.0], [False, True], [0.1, 1.1], [2.1, 3.1]
            ))

            # Check, whether inserting the wrong batch size raises Exception.
            self._assert_mismatched_batch_rejected(memory, batch_size)

            # Assert we can now fetch n elements.
            retrieved_data = memory.get_records(num_records=1)
            self.assertEqual(1, len(retrieved_data["t"]))

            # Check the next state.
            if retrieved_data["s"]["s1"][0] == 0.0:
                self.assertTrue(retrieved_data["s_"]["s1"] == 0.1 and retrieved_data["s_"]["s2"] == 2.1)
            else:
                self.assertTrue(retrieved_data["s"]["s1"] == 1.0)
                self.assertTrue(retrieved_data["s_"]["s1"] == 1.1 and retrieved_data["s_"]["s2"] == 3.1)

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            memory.add_records(self._make_batch(
                [0.1, 1.1], [2.1, 3.1], [2, 3], [-2.0, -3.0], [False, False], [0.2, 1.2], [2.2, 3.2]
            ))
            memory.add_records(self._make_batch(
                [0.2, 1.2], [2.2, 3.2], [4, 5], [-4.0, -5.0], [True, True], [0.3, 1.3], [2.3, 3.3]
            ))
            # Next states are always 0.1 larger than the state.
            self._check_next_state_offset(memory, num_records=2, offset=0.1)
            self.assertEqual(memory.size, 6)

            # Insert up to capacity and check again.
            memory.add_records(self._make_batch(
                [0.3, 1.3], [2.3, 3.3], [6, 7], [-6.0, -7.0], [True, False], [0.4, 1.4], [2.4, 3.4]
            ))
            memory.add_records(self._make_batch(
                [0.4, 1.4], [2.4, 3.4], [8, 9], [-8.0, -9.0], [False, False], [0.5, 1.5], [2.5, 3.5]
            ))
            self._check_next_state_offset(memory, num_records=3, offset=0.1)
            self.assertEqual(memory.size, 10)

            # Go a little bit (one batch) over capacity and check again.
            memory.add_records(self._make_batch(
                [0.5, 1.5], [2.5, 3.5], [10, 11], [-10.0, -11.0], [True, True], [0.6, 1.6], [2.6, 3.6]
            ))
            self._check_next_state_offset(memory, num_records=4, offset=0.1)
            self.assertEqual(memory.size, 10)

    def test_next_state_handling_with_n_step(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory using an n-step memory.

        NOTE: The memory does not care about terminal signals, it will always return the n-next-in-memory state
        regardless of whether this is a useful state (terminal=False) or not (terminal=True). In case of a
        terminal=True, the next state (whether it be the true terminal state, the reset state, or any other random
        state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2
        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state, capacity=capacity,
                            next_record_setup=dict(s="s_", n_step=3))

            # Insert n records (inserts must always be batch-size). s' is now the n-step s'.
            memory.add_records(self._make_batch(
                [0.0, 1.0], [2.0, 3.0], [0, 1], [-0.0, -1.0], [False, True], [0.3, 1.3], [2.3, 3.3]
            ))

            # Check, whether inserting the wrong batch size raises Exception.
            self._assert_mismatched_batch_rejected(memory, batch_size)

            # Assert we cannot pull samples yet. n-step is 3, so we need at least 3 elements in memory.
            with self.assertRaises(SurrealError):
                memory.get_records(num_records=1)

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            memory.add_records(self._make_batch(
                [0.1, 1.1], [2.1, 3.1], [2, 3], [-2.0, -3.0], [False, False], [0.4, 1.4], [2.4, 3.4]
            ))
            memory.add_records(self._make_batch(
                [0.2, 1.2], [2.2, 3.2], [4, 5], [-4.0, -5.0], [True, True], [0.5, 1.5], [2.5, 3.5]
            ))
            # Next states are always 0.3 (3 steps of 0.1) larger than the state.
            self._check_next_state_offset(memory, num_records=2, offset=0.3)
            self.assertEqual(memory.size, 6)

            # Insert up to capacity and check again.
            memory.add_records(self._make_batch(
                [0.3, 1.3], [2.3, 3.3], [6, 7], [-6.0, -7.0], [True, False], [0.6, 1.6], [2.6, 3.6]
            ))
            memory.add_records(self._make_batch(
                [0.4, 1.4], [2.4, 3.4], [8, 9], [-8.0, -9.0], [False, False], [0.7, 1.7], [2.7, 3.7]
            ))
            self._check_next_state_offset(memory, num_records=3, offset=0.3)
            self.assertEqual(memory.size, 10)

            # Go a little bit (two batches) over capacity and check again.
            memory.add_records(self._make_batch(
                [0.5, 1.5], [2.5, 3.5], [10, 11], [-10.0, -11.0], [True, True], [0.8, 1.8], [2.8, 3.8]
            ))
            memory.add_records(self._make_batch(
                [0.6, 1.6], [2.6, 3.6], [10, 11], [-10.0, -11.0], [False, False], [0.9, 1.9], [2.9, 3.9]
            ))
            self._check_next_state_offset(memory, num_records=4, offset=0.3)
            self.assertEqual(memory.size, 10)
Example #7
0
class TestPrioritizedReplayBuffer(unittest.TestCase):
    """
    Tests insertion and (weighted) sampling of the PrioritizedReplayBuffer Component.
    """
    record_space = Dict(
        states=dict(state1=float, state2=Float(shape=(2,))),
        actions=dict(action1=int),
        reward=float,
        terminals=bool,
        main_axes="B"
    )

    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    def _assert_ratio_sampling(self, memory, ratio, scale, lower, upper):
        """
        Repeats 100 times: sets the priorities of indices 0 and 1 to `rand` and `rand * ratio` (with `rand`
        drawn from [0, scale)), samples 1000 records and asserts that the sum of the sampled indices lies
        within [lower, upper].

        NOTE: This is only meaningful while the memory holds exactly the two records 0 and 1; the index sum
        then equals the number of times index 1 was drawn.
        """
        for _ in range(100):
            rand = np.random.random() * scale
            memory.update_records(np.array([0, 1]), np.array([rand, rand * ratio]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), lower)
            self.assertLessEqual(np.sum(indices), upper)

    def test_insert(self):
        """
        Tests the size/index bookkeeping for single, batched and over-capacity inserts.
        """
        memory = PrioritizedReplayBuffer(
            record_space=self.record_space,
            capacity=self.capacity,
            alpha=self.alpha,
            beta=self.beta
        )

        # Assert indices 0 before insert.
        self.assertEqual(memory.size, 0)
        self.assertEqual(memory.index, 0)

        # Insert single record (no batch rank).
        data = self.record_space.sample()
        memory.add_records(data)
        self.assertEqual(memory.size, 1)
        self.assertEqual(memory.index, 1)

        # Insert single record (w/ batch rank).
        data = self.record_space.sample(1)
        memory.add_records(data)
        self.assertEqual(memory.size, 2)
        self.assertEqual(memory.index, 2)

        # Insert batched records.
        data = self.record_space.sample(5)
        memory.add_records(data)
        self.assertEqual(memory.size, 7)
        self.assertEqual(memory.index, 7)

        # Insert over capacity: (7 + 100) modulo 10 == 7.
        data = self.record_space.sample(100)
        memory.add_records(data)
        self.assertEqual(memory.size, 10)
        self.assertEqual(memory.index, 7)

    def test_update_records(self):
        """
        Tests that updated priorities are reflected in the sampling frequencies of the records.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=self.capacity)

        # Insert record samples.
        num_records = 2
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertEqual(memory.size, num_records)
        self.assertEqual(memory.index, num_records)

        # Fetch records, their indices and weights.
        batch, indices, weights = memory.get_records_with_indices_and_weights(num_records)
        check(weights, np.ones(shape=(num_records,)))
        self.assertEqual(num_records, len(indices))
        self.assertEqual(memory.size, num_records)
        self.assertEqual(memory.index, num_records)

        # Update weight of index 0 to very small.
        memory.update_records(np.array([0]), np.array([0.01]))
        # Expect to sample almost only index 1 (which still has a weight of 1.0).
        for _ in range(100):
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 970)

        # Update both weights to the same (random) value.
        # Expect to sample equally.
        self._assert_ratio_sampling(memory, ratio=1, scale=1, lower=400, upper=600)

        # Update weights to be 1:2.
        # Expect to sample double as often index 1 over index 0 (1.0 = 2* 0.5).
        self._assert_ratio_sampling(memory, ratio=2, scale=10, lower=600, upper=750)

        # Update weights to be 1:4.
        # Expect to sample quadruple as often index 1 over index 0.
        self._assert_ratio_sampling(memory, ratio=4, scale=10, lower=750, upper=850)

        # Update weights to be 1:9.
        # Expect to sample 9 times as often index 1 over index 0.
        self._assert_ratio_sampling(memory, ratio=9, scale=10, lower=850, upper=950)

        # Insert more record samples.
        num_records = 10
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertEqual(memory.size, self.capacity)
        self.assertEqual(memory.index, 2)

        # Update weights to be strictly increasing (0.1 to 512.0) and sample batches of random size < 6.
        memory.update_records(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                              np.array([0.1, 1., 3., 8., 16., 32., 64., 128., 256., 512.]))
        counts = Counter()
        for _ in range(1000):
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=np.random.randint(1, 6))
            for i in indices:
                counts[i] += 1
        print(counts)
        # Higher priority -> sampled at least as often.
        self.assertTrue(
            counts[9] >= counts[8] >= counts[7] >= counts[6] >= counts[5] >=
            counts[4] >= counts[3] >= counts[2] >= counts[1] >= counts[0]
        )

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = PrioritizedReplayBuffer(
            record_space=self.record_space,
            capacity=self.capacity,
            alpha=self.alpha,
            beta=self.beta
        )

        # Smallest power of 2 that is >= capacity.
        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        sum_segment_values = memory.merged_segment_tree.sum_segment_tree.values
        min_segment_values = memory.merged_segment_tree.min_segment_tree.values

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float("inf"))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)

        # Insert 1 Element.
        observation = self.record_space.sample(size=1)
        memory.add_records(observation)

        # Check insert positions
        # Initial insert is at priority capacity
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity

        # Walk from the leaf up to the root (position 1), halving the position at each level.
        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start //= 2

        # Insert another Element.
        observation = self.record_space.sample(size=1)
        memory.add_records(observation)

        # Index shifted 1
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start //= 2
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start //= 2

    def test_tree_insert(self):
        """
        Tests inserting into the segment tree and querying segments.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertTrue(np.isclose(tree.get_sum(), 4.0))
        self.assertTrue(np.isclose(tree.get_sum(0, 2), 0.0))
        self.assertTrue(np.isclose(tree.get_sum(0, 3), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, 3), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, -1), 1.0))
        self.assertTrue(np.isclose(tree.get_sum(2, 4), 4.0))

    def test_prefixsum_idx(self):
        """
        Tests fetching the index corresponding to a prefix sum.
        """
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)

        self.assertEqual(tree.index_of_prefixsum(0.0), 2)
        self.assertEqual(tree.index_of_prefixsum(0.5), 2)
        self.assertEqual(tree.index_of_prefixsum(0.99), 2)
        self.assertEqual(tree.index_of_prefixsum(1.01), 3)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(4.0), 3)

        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(0, 0.5)
        tree.insert(1, 1.0)
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertEqual(tree.index_of_prefixsum(0.0), 0)
        self.assertEqual(tree.index_of_prefixsum(0.55), 1)
        self.assertEqual(tree.index_of_prefixsum(0.99), 1)
        self.assertEqual(tree.index_of_prefixsum(1.51), 2)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(5.50), 3)
Example #8
0
    def __init__(self,
                 world="2x2",
                 actors=1,
                 num_cores=1,
                 save_mode=False,
                 action_type="udlr",
                 reward_function="sparse",
                 state_representation="discrete"):
        """
        Builds the grid map, derives state- and action-spaces from it and hands everything to the parent c'tor.

        Args:
            world (Union[str,List[str]]): Either a key into `MAPS` or a list of strings, one per row of the
                world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).

            actors (int): Number of actors. Forwarded to the parent c'tor; also the leading size of the
                `self.state` buffer.

            num_cores (int): Forwarded to the parent c'tor.

            save_mode (bool): Whether to replace all hole fields ("H") in the map by "F" fields
                (presumably fire, going by the reward table below). Default: False.

            action_type (str): Which action space to use. "udlr" (up, down, left, right) is a single discrete
                action space. "ftj" (forward + turn + jump) is a container of discrete sub-actions. Any other
                value (documented: "ftjb") yields the same container as "ftj", except that sub-action "jump"
                is a boolean.

            reward_function (str): One of
                sparse: hole=-5, fire=-3, goal=1, all other steps=-0.1
                rich: hole=-100, fire=-10, goal=50, all other steps=-0.1
                Not interpreted here; passed on to the env process.

            state_representation (str):
                - "discrete": A single int representing the field on the grid, 0 meaning the upper left field,
                    1 the one below, etc..
                - "xy": The x and y grid position (2 ints).
                - "xy+orientation": The x and y grid position plus the orientation of the actor as 2 further
                    values (4 ints in total).
                - "camera": A 3-channel image where each field in the grid-world is one pixel and the
                    3 channels are used to indicate different items in the scene (walls, holes, the actor,
                    etc..).
        """
        # Named maps are looked up in `MAPS`; everything else is taken as a custom row description.
        if not isinstance(world, str):
            self.description = "custom-map"
        else:
            self.description = world
            world = self.MAPS[world]

        # One character per field.
        world = np.array([list(row) for row in world])
        # Safety switch: holes only change (to "F") when `save_mode` is on.
        hole_marker = "F" if save_mode else "H"
        world[world == "H"] = hole_marker

        # `world` must be indexed y-first (row, then column).
        n_row, n_col = world.shape

        # Derive the state space from the requested representation.
        assert state_representation in (
            "discrete", "xy", "xy+orientation", "camera")
        if state_representation == "xy":
            # x/y position (2 ints).
            state_space = Int(low=(0, 0), high=(n_col, n_row), shape=(2, ))
        elif state_representation == "xy+orientation":
            # x/y position + 2 orientation values (4 ints).
            state_space = Int(low=(0, 0, 0, 0), high=(n_col, n_row, 1, 1))
        elif state_representation == "discrete":
            # Single int from 0 to n.
            state_space = Int(n_row * n_col)
        else:
            # Camera: One pixel per field, 3 channels.
            state_space = Int(0, 255, shape=(n_row, n_col, 3))

        # Derive the action space from the action type.
        if action_type == "udlr":
            action_space = Int(4)
        else:
            forward_space = Int(3)
            turn_space = Int(3)
            jump_space = Int(2) if action_type == "ftj" else Bool()
            action_space = Dict(
                dict(forward=forward_space, turn=turn_space, jump=jump_space))

        # Everything in here is passed on to the process ctor.
        process_kwargs = dict(
            world=world,
            n_col=n_col,
            n_row=n_row,
            state_representation=state_representation,
            action_type=action_type,
            reward_function=reward_function,
        )
        super().__init__(actors=actors,
                         num_cores=num_cores,
                         state_space=state_space,
                         action_space=action_space,
                         process_class=GridWorldEnvProcess,
                         **process_kwargs)

        # Buffer for the states returned by the processes (one zero-state per actor).
        zero_state = state_space.zeros()
        self.state = np.array([zero_state] * actors)
        # Start from a clean slate.
        self.reset_all()
Example #9
0
    def test_joint_cumulative_distribution(self):
        """
        Tests deterministic sampling, stochastic sampling and log-prob of a nested JointCumulativeDistribution.
        """
        # Value range of the (scaled) Beta component.
        low, high = -1.0, 1.0

        # Parameters for each sub-distribution.
        param_space = Dict(
            {
                "a": Float(shape=(4, )),  # 4-discrete
                "b": Dict({
                    # 3-variate normal
                    "ba": Tuple([Float(shape=(3, )), Float(0.1, 1.0, shape=(3, ))]),
                    # beta -1 to 1
                    "bb": Tuple([Float(shape=(2, )), Float(shape=(2, ))]),
                    # normal (dim=4)
                    "bc": Tuple([Float(shape=(4, )), Float(0.1, 1.0, shape=(4, ))]),
                }),
            },
            main_axes="B")

        # Values (samples) matching the structure above.
        values_space = Dict(
            {
                "a": Int(4),
                "b": Dict({
                    "ba": Float(shape=(3, )),
                    "bb": Float(shape=(2, )),
                    "bc": Float(shape=(4, )),
                }),
            },
            main_axes="B")

        cumulative_distribution = JointCumulativeDistribution(
            distributions={
                "a": Categorical(),
                "b": {
                    "ba": MultivariateNormal(),
                    "bb": Beta(low=low, high=high),
                    "bc": Normal(),
                },
            })

        def scaled_beta_mean(alpha_and_beta):
            # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
            alpha_param, beta_param = alpha_and_beta[0], alpha_and_beta[1]
            return (1.0 / (1.0 + beta_param / alpha_param)) * (high - low) + low

        def expected_means(params, categorical_mean):
            # Same nesting as the sampled outputs; [0]=Mean for the normals.
            return {
                "a": categorical_mean,
                "b": {
                    "ba": params["b"]["ba"][0],
                    "bb": scaled_beta_mean(params["b"]["bb"]),
                    "bc": params["b"]["bc"][0],
                },
            }

        # Batch of size=2 and deterministic (True).
        input_ = param_space.sample(2)
        input_["a"] = softmax(input_["a"])
        expected_mean = expected_means(input_, np.argmax(input_["a"], axis=-1))
        # Sample n times, expect always mean value (deterministic draw).
        for _ in range(20):
            check(cumulative_distribution.sample(input_, deterministic=True),
                  expected_mean)
            check(cumulative_distribution.sample_deterministic(input_),
                  expected_mean)

        # Batch of size=1 and non-deterministic -> expect roughly the mean.
        input_ = param_space.sample(1)
        input_["a"] = softmax(input_["a"])
        expected_mean = expected_means(
            input_, np.sum(input_["a"] * np.array([0, 1, 2, 3])))

        # Two draws per iteration: generic `sample`, then explicit `sample_stochastic`.
        outs = []
        for _ in range(500):
            outs.append(cumulative_distribution.sample(input_))
            outs.append(cumulative_distribution.sample_stochastic(input_))

        categorical_draws = np.stack([o["a"][0] for o in outs], axis=0)
        check(np.mean(categorical_draws, axis=0), expected_mean["a"], atol=0.3)
        for key in ("ba", "bb", "bc"):
            draws = np.stack([o["b"][key][0] for o in outs], axis=0)
            check(np.mean(draws, axis=0), expected_mean["b"][key][0], decimals=1)

        # Test log-likelihood outputs.
        params = param_space.sample(1)
        params["a"] = softmax(params["a"])
        # Make sure beta-values are within 0.0 and 1.0 for the numpy calculation (which doesn't have scaling).
        values = values_space.sample(1)
        beta_alpha, beta_beta = params["b"]["bb"][0], params["b"]["bb"][1]
        log_prob_beta = np.log(beta.pdf(values["b"]["bb"], beta_alpha, beta_beta))
        # Now do the scaling for b/bb (beta values).
        values["b"]["bb"] = values["b"]["bb"] * (high - low) + low

        def summed_normal_log_prob(key):
            # Sum of per-dimension log-densities; params are (mean, scale).
            mean, scale = params["b"][key][0], params["b"][key][1]
            return np.sum(np.log(norm.pdf(values["b"][key][0], mean, scale)))

        log_prob_a = np.log(params["a"][0][values["a"][0]])
        log_prob_ba = summed_normal_log_prob("ba")
        log_prob_bb = np.sum(log_prob_beta)
        log_prob_bc = summed_normal_log_prob("bc")
        expected_log_llh = log_prob_a + log_prob_ba + log_prob_bb + log_prob_bc

        out = cumulative_distribution.log_prob(params, values)
        check(out, expected_log_llh, decimals=0)
Example #10
0
    def test_mixture(self):
        """
        Tests sampling from a mixture of 3 bivariate normals weighted by an internal categorical distribution.

        Deterministic draws are expected to equal the probability-weighted mean of the component means;
        the average over many stochastic draws is expected to be roughly that same value.
        prob/log-prob checks are not active yet (see `_check_mixture_prob_and_log_prob`).
        """
        num_distributions = 3
        num_events_per_multivariate = 2  # 2=bivariate

        def component_params():
            # (mean, diag) parameter space of one bivariate normal component.
            return Tuple(
                Float(shape=(num_events_per_multivariate, )),  # mean
                Float(shape=(num_events_per_multivariate, ),
                      low=0.5,
                      high=1.0),  # diag
            )

        param_spec = {
            "categorical":
            Float(shape=(num_distributions, ), low=-1.5, high=2.3)
        }
        for i in range(num_distributions):
            param_spec["parameters{}".format(i)] = component_params()
        param_space = Dict(param_spec, main_axes="B")

        # The Component to test.
        mixture = MixtureDistribution(
            # Try different spec types.
            MultivariateNormal(),
            "multi-variate-normal",
            "multivariate_normal")

        def expected_mean(params):
            # The mean value is a 2D vector (bivariate distribution): Component means weighted by the
            # softmaxed categorical parameters.
            # NOTE: A max-likelihood draw (mean of the argmax component) may be the more usual meaning of
            # "deterministic"; this test pins the weighted mean instead.
            categorical_probs = softmax(params["categorical"])
            return sum(categorical_probs[:, i:i + 1] *
                       params["parameters{}".format(i)][0]
                       for i in range(num_distributions))

        # Batch of size=1 and deterministic (True).
        input_ = param_space.sample(1)
        expected = expected_mean(input_)
        for _ in range(20):
            out = mixture.sample(input_, deterministic=True)
            check(out, expected)
            out = mixture.sample_deterministic(input_)
            check(out, expected)

        # Batch of size=1 and non-deterministic -> expect roughly the mean.
        input_ = param_space.sample(1)
        expected = expected_mean(input_)
        outs = []
        for _ in range(500):
            outs.append(mixture.sample(input_, deterministic=False))
            outs.append(mixture.sample_stochastic(input_))
        check(np.mean(np.array(outs), axis=0), expected, decimals=1)

        # TODO: prob/log-prob tests for Mixture. Once finished, call
        # `self._check_mixture_prob_and_log_prob(mixture, param_space, values_space)` with
        # `values_space = Float(shape=(num_events_per_multivariate, ), main_axes="B")`.

    def _check_mixture_prob_and_log_prob(self, mixture, param_space,
                                         values_space):
        """
        Unfinished draft of the prob/log-prob checks (against scipy) for the 3-component mixture.

        Not called by any test yet. This code used to sit behind a bare `return` inside `test_mixture`, where
        it was unreachable. The leading underscore keeps test runners from collecting it.

        Args:
            mixture: The mixture distribution under test.
            param_space: Space to sample the mixture parameters from ("categorical", "parameters0..2").
            values_space: Space to sample the values from, whose prob/log-prob is checked.
        """
        # Test log-likelihood outputs (against scipy).
        for i in range(20):
            params = param_space.sample(1)
            # Make sure categorical params are softmaxed.
            category_probs = softmax(params["categorical"][0])
            values = values_space.sample(1)
            expected = 0.0
            v = []
            for j in range(3):
                v.append(
                    multivariate_normal.pdf(
                        values[0],
                        mean=params["parameters{}".format(j)][0][0],
                        cov=params["parameters{}".format(j)][1][0]))
                expected += category_probs[j] * v[-1]
            out = mixture.prob(params, values)
            check(out[0], expected, atol=0.1)

            expected = np.zeros(shape=(3, ))
            for j in range(3):
                expected[j] = np.log(category_probs[j]) + np.log(
                    multivariate_normal.pdf(
                        values[0],
                        mean=params["parameters{}".format(j)][0][0],
                        cov=params["parameters{}".format(j)][1][0]))
            expected = np.log(np.sum(np.exp(expected)))
            out = mixture.log_prob(params, values)
            print("{}: out={} expected={}".format(i, out, expected))
            check(out, np.array([expected]), atol=0.25)
Example #11
0
    def __init__(
            self,
            *,
            policy_network,
            q_network,
            state_space,
            action_space,
            sac_config,
            num_q_experts=4,  # 4 used in paper.
            q_predicts_states_diff=False,
            num_denominator_samples_for_ri=250,  # 50-500 used in paper
            dim_skill_vectors=10,
            discrete_skills=False,
            episode_horizon=200,
            skill_horizon=None,
            preprocessor=None,
            supervised_optimizer=None,
            num_steps_per_supervised_update=1,
            episode_buffer_capacity=200,
            summaries=None):
        """
        Normalizes the given settings (network specs, spaces, internal SAC config, skill horizon) and stores
        all c'tor variables in this config. All arguments are keyword-only.

        Args:
            policy_network (Network): The policy-network (pi) to use as a function approximator for the learnt policy.

            q_network (Network): The dynamics-network (q) to use as a function approximator for the learnt env
                dynamics. NOTE: Not to be confused with a Q-learning Q-net! In the paper, the dynamics function is
                called `q`, hence the same nomenclature here.

            state_space (Space): The state/observation Space.
            action_space (Space): The action Space.
            sac_config (SACConfig): The config for the internal SAC-Algo used to learn the skills using intrinsic rewards.

            num_q_experts (int): The number of experts used in the Mixture distribution output by the q-network to
                predict the next state (s') given s (state) and z (skill vector).

            q_predicts_states_diff (bool): Whether the q-network predicts the difference between s and s' rather than
                s' directly. Default: False.

            num_denominator_samples_for_ri (int): The number of samples to calculate for the denominator of the
                intrinsic reward function (`L` in the paper).

            dim_skill_vectors (int): The number of dimensions of the learnt skill vectors.
            discrete_skills (bool): Whether skill vectors are discrete (one-hot).
            episode_horizon (int): The episode horizon (He) to move within, when gathering episode samples.

            skill_horizon (Optional[int]): The horizon for which to use one skill vector (before sampling a new one).
                Default: Use value of `episode_horizon`.

            preprocessor (Preprocessor): The preprocessor (if any) to use.
            supervised_optimizer (Optimizer): The optimizer to use for the supervised (q) model learning task.

            num_steps_per_supervised_update (int): The number of gradient descent iterations per update
                (each iteration uses the same environment samples).

            episode_buffer_capacity (int): The capacity of the episode (experience) FIFOBuffer.

            summaries (List[any]): A list of summaries to produce if `UseTfSummaries` in debug.json is true.
                In the simplest case, this is a list of `self.[...]`-property names of the SAC object that should
                be tracked after each tick.
        """
        # Clean up network configs to be passable as **kwargs to `make`.
        # Networks are given as sequential config or directly as Keras objects -> prepend "network" key to spec.
        # Anything else (e.g. an already complete dict spec) is left untouched.
        if isinstance(
                policy_network,
            (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            policy_network = dict(network=policy_network)
        if isinstance(
                q_network,
            (list, tuple, tf.keras.models.Model, tf.keras.layers.Layer)):
            q_network = dict(network=q_network)

        # Make state/action space.
        state_space = Space.make(state_space)
        action_space = Space.make(action_space)

        # Fix SAC config, add correct state- and action-spaces.
        # The SAC state space is a Dict of the env state (s) plus a skill vector (z) of size
        # `dim_skill_vectors` with values between -1.0 and 1.0.
        sac_config = SACConfig.make(
            sac_config,
            state_space=Dict(s=state_space,
                             z=Float(-1.0, 1.0, shape=(dim_skill_vectors, ))),
            action_space=action_space,
            # Use no memory. Updates are done from DADS' own buffer.
            memory_capacity=1,
            memory_batch_size=1,
            # Share policy network between DADS and underlying learning SAC.
            policy_network=policy_network)

        # Without an explicit skill horizon, one skill vector is used for the entire episode horizon.
        if skill_horizon is None:
            skill_horizon = episode_horizon

        # NOTE: `locals()` captures the (possibly re-assigned) local variables from above, so all
        # normalization must happen before this call.
        super().__init__(
            locals())  # Config will store all c'tor variables automatically.

        # Keep track of which time-step stuff happened. Only important for by-time-step frequencies.
        self.last_update = 0