Code Example #1
File: test_decays.py Project: rosea-tf/surreal
    def test_linear_parameter_using_global_time_step(self):
        max_time_steps = 100
        linear_decay = Decay.make("linear-decay", from_=2.0, to_=0.5, max_time_steps=max_time_steps)
        # Call without any parameters -> force component to use its internal (global) time step,
        # which starts at 0 and increments with every call.
        for time_step in range(30):
            out = linear_decay()
            check(out, 2.0 - (time_step / max_time_steps) * (2.0 - 0.5))
Code Example #2
    def test_dqn2015_loss_function(self):
        # Batch of size=2.
        input_ = {
            "x": np.random.random(size=(2, 2)),  # states don't matter for this test as Q-funcs are faked.
            "a": np.array([0, 1]),
            "r": np.array([9.4, -1.23]),
            "t": np.array([False, False]),
            "x_": np.random.random(size=(2, 2))  # states don't matter for this test as Q-funcs are faked.
        }
        # Fake q-nets. Just have to be callables, returning some q-values.
        q_net = lambda s, a: np.array([10.0, -90.6])
        target_q_net = lambda s_: np.array([[12.0, -8.0], [22.3, 10.5]])

        """
        Calculation:
        batch of 2, gamma=1.0
        Qt(s',.) = [12 -8] [22.3 10.5] -> max(a') Qt(s'a') = [12] [22.3]
        Q(s,a)  = [10.0] [-90.6]
        L = E(batch)| 0.5((r + gamma max(a')Qt(s'a') ) - Q(s,a))^2 |
        L = (0.5(9.4 + 1.0*12 - 10.0)^2 + 0.5(-1.23 + 1.0*22.3 - -90.6)^2) / 2
        L = (0.5(129.96) + 0.5(12470.1889)) / 2
        L = (64.98 + 6235.09445) / 2
        L = 3150.037225
        """
        # Batch size=2 -> Expect 2 values returned by `loss_per_item`.
        expected_loss_per_item = np.array([64.979996, 6235.09445], dtype=np.float32)
        # Expect the mean over the batch.
        expected_loss = expected_loss_per_item.mean()
        out = DQN2015Loss()(input_, q_net, target_q_net, namedtuple("FakeDQN2015Config", ["gamma"])(gamma=1.0))
        check(out.numpy(), expected_loss, decimals=2)
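
The hand calculation in the docstring above can be re-derived with plain NumPy. This is only an illustrative sketch of the expected numbers (assuming gamma=1.0 and the faked Q-values), not the library's DQN2015Loss implementation:

import numpy as np

r = np.array([9.4, -1.23])
q_s_a = np.array([10.0, -90.6])                 # Q(s,a) from the faked q_net
qt_s_ = np.array([[12.0, -8.0], [22.3, 10.5]])  # Qt(s',.) from the faked target net
gamma = 1.0

td_target = r + gamma * np.max(qt_s_, axis=-1)  # [21.4, 21.07]
loss_per_item = 0.5 * (td_target - q_s_a) ** 2  # [64.98, 6235.09445]
loss = loss_per_item.mean()                     # 3150.037225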
Code Example #3
File: model.py Project: rosea-tf/surreal
    def sync_from(self, other_model, tau=1.0):
        """
        Syncs all of this Model's weights from the `other_model` using the formula:
        new_weights = old_weights sync_tau

        [new weights] = tau * [other_model's weights] + (1.0 - tau) * [old weights]

        Args:
            other_model (Model): The other Model to sync from.
            tau (float): Teh tau parameter used for soft-syncing (see formula above).
        """
        other_values = other_model.get_weights(as_ref=False)

        if AssertModelSync is True:
            try:
                check(self.get_weights(), other_values)
                print("WARNING: model weights were equal.")
            except AssertionError:
                pass

        if tau == 1.0:
            self.set_weights(other_values)
        else:
            our_vars = self.get_weights(as_ref=True)
            for our_var, other_var in zip(our_vars, other_values):
                tf.keras.backend.set_value(
                    our_var, tau * other_var + (1.0 - tau) * our_var)

        if AssertModelSync is True:
            check(self.get_weights(), other_values)
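
As a quick illustration of the soft-sync formula in the docstring (a standalone NumPy sketch, not part of the Model class):

import numpy as np

old = np.array([1.0, 2.0, 3.0])    # current weights
other = np.array([5.0, 6.0, 7.0])  # other model's weights
tau = 0.1
new = tau * other + (1.0 - tau) * old  # -> [1.4, 2.4, 3.4]; tau=1.0 copies `other` verbatim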
Code Example #4
    def test_flatten_alongside(self):
        space = Dict(
            {
                "a": Float(shape=(4,)),
                "b": Dict({
                    "ba": Tuple([Float(shape=(3,)), Float(0.1, 1.0, shape=(3,))]),
                    "bb": Tuple([Float(shape=(2,)), Float(shape=(2,))]),
                    "bc": Tuple([Float(shape=(4,)), Float(0.1, 1.0, shape=(4,))]),
                })
            },
            main_axes="B")
        # Flatten alongside this structure (only its key structure matters, not its values).
        alongside = dict(a=True, b=dict(ba=False, bc=None, bb="foo"))

        input_ = space.sample(2)
        # Expect flattening only down to ["b"]["ba"], ["b"]["bb"] and ["b"]["bc"], not into the Tuples,
        # as `alongside` does not contain them.
        out = flatten_alongside(input_, alongside)
        expected = [
            input_["a"], input_["b"]["ba"], input_["b"]["bb"], input_["b"]["bc"]
        ]

        check(out, expected)
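
The idea behind `flatten_alongside` is to flatten a nested structure only as deep as a reference structure goes. A minimal standalone sketch of that idea for plain dicts (a hypothetical helper, not the library's implementation):

def flatten_to_depth_of(data, reference):
    # Recurse only where `reference` itself is a dict; otherwise keep the subtree whole.
    out = []
    for key in sorted(reference):
        if isinstance(reference[key], dict):
            out.extend(flatten_to_depth_of(data[key], reference[key]))
        else:
            out.append(data[key])
    return out

# flatten_to_depth_of({"a": 1, "b": {"ba": (2, 3), "bb": (4, 5)}},
#                     {"a": True, "b": {"ba": False, "bb": None}})
# -> [1, (2, 3), (4, 5)]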
Code Example #5
File: test_decays.py Project: rosea-tf/surreal
    def test_polynomial_parameter_using_global_time_step(self):
        max_time_steps = 10
        polynomial_decay = Decay.make("polynomial-decay", from_=3.0, to_=0.5, max_time_steps=max_time_steps)
        # Call without any parameters -> force component to use internal `current_time_step`.
        # Go over the max time steps and expect time_percentage to be capped at 1.0.
        for time_step in range(50):
            out = polynomial_decay()
            check(out, (3.0 - 0.5) * (1.0 - min(time_step / max_time_steps, 1.0)) ** 2 + 0.5)
Code Example #6
File: test_decays.py Project: rosea-tf/surreal
    def test_exponential_parameter_using_global_time_step(self):
        max_time_steps = 10
        decay_rate = 0.1
        exponential_decay = ExponentialDecay.make(
            from_=3.0, to_=0.5, max_time_steps=max_time_steps, decay_rate=decay_rate
        )
        # Call without any parameters -> force component to use internal `current_time_step`.
        # Go over the max time steps and expect time_percentage to be capped at 1.0.
        for time_step in range(100):
            out = exponential_decay()
            check(out, 0.5 + (3.0 - 0.5) * decay_rate ** min(time_step / max_time_steps, 1.0))
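
The three decay schedules exercised in these tests follow simple closed forms in the capped time percentage p = min(t / max_time_steps, 1.0). A standalone NumPy sketch of the formulas exactly as the checks above use them:

import numpy as np

def linear(p, from_=2.0, to_=0.5):
    return from_ - p * (from_ - to_)

def polynomial(p, from_=3.0, to_=0.5, power=2.0):
    return (from_ - to_) * (1.0 - p) ** power + to_

def exponential(p, from_=3.0, to_=0.5, decay_rate=0.1):
    return to_ + (from_ - to_) * decay_rate ** p

p = np.minimum(np.arange(20) / 10.0, 1.0)  # capped time percentage
print(linear(p), polynomial(p), exponential(p))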
Code Example #7
File: test_optimizers.py Project: rosea-tf/surreal
    def test_gradient_tape(self):
        # Var to optimize.
        var = tf.Variable(random.random())
        # Derivative of loss is dL/dv = 2*(v-1.0) = 2v - 2
        expected_grad = 2 * var.numpy() - 2.0

        # Must use gradient tape as we are in eager mode. In graph mode, we would do `get_gradients`, which does
        # not work here.
        with tf.GradientTape() as t:
            loss = self.L(var)

        check(t.gradient(loss, var), expected_grad)
Code Example #8
    def test_dddqn_loss_function(self):
        """
        Tests the dueling/double q-loss function assuming an n-step of 1.
        """
        # Batch of size=2.
        input_ = {
            "s": np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]),
            "a": np.array([2, 1]),
            "r": np.array([10.3, -4.25]),
            "t": np.array([False, True]),
            # make s' distinguishable from s via its values for the fake q-net to notice.
            "s_": np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]),
            "n": np.array([1, 1])
        }

        # Fake q-nets. Just have to be callables.
        # The q-net is first called by the loss function with s', then with s. Detect this difference here and return
        # different q-values for the different states.
        q_net = lambda s: dict(
            A=np.array([[-12.3, 1.2, 1.4], [12.2, -11.5, 9.2]]) if s[0][0] == 1.0
            else np.array([[10.0, -10.0, 12.4], [-0.101, -4.6, -9.3]]),
            V=np.array([1.0, -2.0])
        )
        target_q_net = lambda s_: dict(
            A=np.array([[-10.3, 1.5, 1.4], [8.2, -10.9, 9.3]]),
            V=np.array([0.1, -0.2])
        )
        """
        Calculation:
        batch of 2, gamma=0.9
        a' = [2 0]  <- argmax(a')Q(s'a')

        Qt(s'.) = 0.1+[-10.3, 1.5, 1.4]--2.4666(A-avg) -0.2+[8.2, -10.9, 9.3]-2.2(A-avg) -> Qt(s'a') = \ 
            [0.1+1.4+2.4666=3.9666] [0.0 <- terminal=True]

        a = [2 1]
        Q(s,a)  = 1.0+[12.4]-4.1333(A-avg) -2.0+[-4.6]--4.667(A-avg) = [9.2667 -1.933]

        L = E(batch)| 0.5((r + gamma Qt(s'( argmax(a') Q(s'a') )) ) - Q(s,a))^2 |
        L = (0.5(10.3 + 0.9*3.9666 - 9.2667)^2 + 0.5(-4.25 + 0.9*0.0 - -1.933)^2) / 2
        L = (0.5(4.60324)^2 + 0.5(-2.317)^2) / 2  <- td-errors are the numbers inside the (...)^2 brackets
        L = (21.1898184976 + 5.368489) / 4
        L = 26.5583074976 / 4 
        L = 6.6395768744
        """

        # Expect the mean over the batch.
        expected_loss = 6.6395768744
        expected_td_errors = [4.60333333, 2.317]  # absolute values
        out = DDDQNLoss()(input_, q_net, target_q_net,
                          namedtuple("FakeDDDQNConfig", ["gamma"])(gamma=0.9))
        check(out[0].numpy(), expected_loss, decimals=3)
        check(out[1].numpy(), expected_td_errors, decimals=2)
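
A NumPy re-derivation of the dueling/double-Q hand calculation above (illustrative only; it assumes the same faked A/V outputs and gamma=0.9, and is not the DDDQNLoss implementation):

import numpy as np

gamma = 0.9
r = np.array([10.3, -4.25])
t = np.array([False, True])
a = np.array([2, 1])

def dueling(A, V):
    # Q(s,.) = V(s) + A(s,.) - mean_a A(s,a)
    return V[:, None] + A - A.mean(axis=-1, keepdims=True)

# Online net on s' (action selection) and on s; target net on s' (action evaluation).
A_s_ = np.array([[-12.3, 1.2, 1.4], [12.2, -11.5, 9.2]])
A_s = np.array([[10.0, -10.0, 12.4], [-0.101, -4.6, -9.3]])
V_online = np.array([1.0, -2.0])
At_s_, Vt_s_ = np.array([[-10.3, 1.5, 1.4], [8.2, -10.9, 9.3]]), np.array([0.1, -0.2])

a_prime = np.argmax(dueling(A_s_, V_online), axis=-1)        # [2, 0]
qt_s_a_ = dueling(At_s_, Vt_s_)[np.arange(2), a_prime] * ~t  # [3.9667, 0.0]
q_s_a = dueling(A_s, V_online)[np.arange(2), a]              # [9.2667, -1.9333]

td_error = r + gamma * qt_s_a_ - q_s_a                       # [4.6033, -2.3167]
loss = np.mean(0.5 * td_error ** 2)                          # ~6.6396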
Code Example #9
    def test_get_records(self):
        """
        Tests if retrieval correctly manages capacity.
        """
        capacity = 10
        memory = ReplayBuffer(record_space=self.record_space,
                              capacity=capacity)

        # Insert 1 record.
        data = self.record_space.sample(1)
        memory.add_records(data)

        # Assert we can now fetch the single stored element.
        retrieved_data = memory.get_records(num_records=1)
        self.assertEqual(1, len(retrieved_data["terminals"]))
        check(data, retrieved_data)

        # Test duplicate sampling.
        retrieved_data = memory.get_records(num_records=5)
        self.assertEqual(5, len(retrieved_data["terminals"]))
        # Only one record in the memory -> returned samples should all be the exact same.
        check(retrieved_data["reward"][0], retrieved_data["reward"][1])
        check(retrieved_data["reward"][0], retrieved_data["reward"][2])
        check(retrieved_data["reward"][0], retrieved_data["reward"][3])
        check(retrieved_data["reward"][0], retrieved_data["reward"][4])

        # Now insert another one.
        data = self.record_space.sample()  # w/o batch rank
        memory.add_records(data)
        # Pull exactly two records and make sure they are NOT(!) the same.
        retrieved_data = memory.get_records(num_records=2)
        self.assertEqual(2, len(retrieved_data["terminals"]))
        self.assertNotEqual(retrieved_data["reward"][0],
                            retrieved_data["reward"][1])

        # Now insert over capacity.
        data = self.record_space.sample(capacity)
        memory.add_records(data)

        # Assert we can fetch exactly capacity elements.
        retrieved_data = memory.get_records(num_records=capacity)
        self.assertEqual(capacity, len(retrieved_data["terminals"]))
Code Example #10
    def test_dddqn_learning_on_grid_world_2x2(self):
        # Create an Env object.
        env = GridWorld("2x2", actors=1)

        # Add the preprocessor.
        preprocessor = Preprocessor(
            lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
        )
        # Create a Config.
        dqn_config = DDDQNConfig.make(
            "{}/../configs/dddqn_grid_world_2x2_learning.json".format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space
        )

        # Create an Algo object.
        algo = DDDQN(config=dqn_config, name="my-dddqn")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= 0.6)

        # Check learnt Q-function (using our dueling layer).
        a_and_v = algo.Q(one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]), depth=4))
        q = dueling(a_and_v, np.array([0, 1, 2, 3, 0, 1, 2, 3]))
        print(q)
        self.assertTrue(q[1] < min(q[2:]) and q[1] < q[0])  # q(s=0,a=right) is the worst
        check(q[5], 1.0, atol=0.4)  # Q(1,->) is close to 1.0.
        #self.assertTrue(q[5] > max(q[:4]) and q[5] > max(q[6:]))  # q(s=1,a=right) is the best
        #check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

        env.terminate()
Code Example #11
    def test_neg_log_likelihood_loss_function_w_simple_space(self):
        shape = (5, 4, 3)
        parameters_space = Tuple(Float(shape=shape),
                                 Float(shape=shape),
                                 main_axes="B")
        labels_space = Float(shape=shape, main_axes="B")

        loss_function = NegLogLikelihoodLoss(
            distribution=get_default_distribution_from_space(labels_space))

        parameters = parameters_space.sample(10)
        # Make sure stddev params are not too crazy (just like our adapters do clipping for the raw NN output).
        parameters = (parameters[0], np.clip(parameters[1], 0.1, 1.0))
        labels = labels_space.sample(10)

        expected_loss_per_item = np.sum(
            -np.log(sts.norm.pdf(labels, parameters[0], parameters[1])),
            axis=(-1, -2, -3))

        out = loss_function(parameters, labels)
        check(out, expected_loss_per_item, decimals=4)
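
For reference, the per-element term in the expected value above has the closed form -log N(x | mu, sigma) = 0.5*log(2*pi*sigma^2) + (x - mu)^2 / (2*sigma^2). A quick, purely illustrative check of the equivalence with scipy's pdf:

import numpy as np
import scipy.stats as sts

x, mu, sigma = 0.3, 0.1, 0.7
closed_form = 0.5 * np.log(2 * np.pi * sigma ** 2) + (x - mu) ** 2 / (2 * sigma ** 2)
via_scipy = -np.log(sts.norm.pdf(x, mu, sigma))
assert np.isclose(closed_form, via_scipy)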
Code Example #12
File: test_optimizers.py Project: rosea-tf/surreal
    def test_apply_gradients(self):
        lr = random.random()
        optimizer = SGD(learning_rate=lr)

        # Var to optimize.
        var = tf.Variable(random.random())
        var_value_orig = var.numpy()
        # Derivative of loss is dL/dv = 2*(v-1.0) = 2v - 2
        expected_grad = 2 * var_value_orig - 2.0

        # Must use gradient tape as we are in eager mode. In graph mode, we would do `get_gradients`, which does
        # not work here.
        with tf.GradientTape() as t:
            loss = self.L(var)

        optimizer.apply_gradients(grads_and_vars=[(t.gradient(loss, var),
                                                   var)])

        # Check against variable now. Should change by -learning_rate * grad.
        var_value_after = var.numpy()
        expected_new_value = var_value_orig - (lr * expected_grad)
        check(var_value_after, expected_new_value)
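
The expectation checked here is the plain SGD rule v_new = v - lr * dL/dv with dL/dv = 2v - 2 (i.e. the loss assumed by the comment above is L(v) = (v - 1)^2). A tiny worked example:

lr = 0.1
v = 0.3
grad = 2 * v - 2.0     # dL/dv for L(v) = (v - 1)^2  -> -1.4
v_new = v - lr * grad  # 0.3 - 0.1 * (-1.4) = 0.44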
Code Example #13
    def test_neg_log_likelihood_loss_function_w_container_space(self):
        parameters_space = Dict(
            {
                # Make sure stddev params are not too crazy (just like our adapters do clipping for the raw NN output).
                "a": Tuple(Float(shape=(2, 3)), Float(0.5, 1.0, shape=(2, 3))),  # normal: (mean, stddev)
                "b": Float(shape=(4,), low=-1.0, high=1.0)  # logits for the 4-way discrete distribution
            },
            main_axes="B")

        labels_space = Dict({
            "a": Float(shape=(2, 3)),
            "b": Int(4)
        }, main_axes="B")

        loss_function = NegLogLikelihoodLoss(
            distribution=get_default_distribution_from_space(labels_space))

        parameters = parameters_space.sample(2)
        # Softmax the discrete params.
        probs_b = softmax(parameters["b"])
        # probs_b = parameters["b"]
        labels = labels_space.sample(2)

        # Expected loss: Sum of all -log(llh)
        log_prob_per_item_a = np.sum(np.log(
            sts.norm.pdf(labels["a"], parameters["a"][0], parameters["a"][1])),
                                     axis=(-1, -2))
        log_prob_per_item_b = np.array([
            np.log(probs_b[0][labels["b"][0]]),
            np.log(probs_b[1][labels["b"][1]])
        ])

        expected_loss_per_item = -(log_prob_per_item_a + log_prob_per_item_b)

        out = loss_function(parameters, labels)
        check(out, expected_loss_per_item, decimals=4)
Code Example #14
File: test_optimizers.py Project: rosea-tf/surreal
    def test_minimize(self):
        # This test case does not work without graph mode -> skip it by returning early.
        return
        lr = random.random()
        optimizer = Adam(learning_rate=lr)

        # Var to optimize.
        var = tf.Variable(random.random())
        var_value_orig = var.numpy()
        # Derivative of loss is dL/dv = 2*(v-1.0) = 2v - 2
        expected_grad = 2 * var_value_orig - 2.0

        # Must use gradient tape as we are in eager mode. In graph mode, we would do `get_gradients`, which does
        # not work here.
        with tf.GradientTape() as t:
            loss = self.L(var)

        optimizer.minimize(loss, [var])

        # Check against variable now. Should change by -learning_rate * grad.
        var_value_after = var.numpy()
        expected_new_value = var_value_orig - (lr * expected_grad)
        check(var_value_after, expected_new_value)
Code Example #15
    def test_dads_learning_on_grid_world_4room(self):
        # Create an Env object.
        env = GridWorld("4-room")

        # Add the preprocessor.
        preprocessor = Preprocessor(
            lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
        )
        # Create a Config.
        config = DADSConfig.make(
            "{}/../configs/dads_grid_world_4room_learning.json".format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space
        )

        # Create an Algo object.
        algo = DADS(config=config, name="my-dads")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= 0.3)

        # Check learnt Q-function.
        check(algo.q(
            np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])
        ), [[0.8, -5.0, 0.9, 0.8], [0.8, 1.0, 0.9, 0.9]], decimals=1)  # a=up,down,left,right

        env.terminate()
Code Example #16
    def test_next_state_handling(self):
        """
        Tests if next-states can be stored efficiently (not using any space!) in the memory.

        NOTE: The memory does not care about terminal signals, it will always return the n-next-in-memory state
        regardless of whether this is a useful state (terminal=False) or not (terminal=True). In case of a
        terminal=True, the next state (whether it be the true terminal state, the reset state, or any other random
        state) does not matter anyway.
        """
        capacity = 10
        batch_size = 2

        # Test all classes of memories.
        for class_ in [ReplayBuffer, PrioritizedReplayBuffer]:
            memory = class_(record_space=self.record_space_no_next_state, capacity=capacity,
                            next_record_setup=dict(s="s_"))

            # Insert one batch of records (inserts must always be batch-sized).
            data = dict(
                s=dict(s1=np.array([0.0, 1.0]), s2=np.array([2.0, 3.0])),
                a=np.array([0, 1]), r=np.array([-0.0, -1.0]), t=np.array([False, True]),
                s_=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1]))
            )
            memory.add_records(data)

            # Check whether inserting the wrong batch size raises an exception.
            try:
                data = self.record_space_no_next_state.sample(batch_size + 1)
                data["s_"] = self.record_space_no_next_state["s"].sample(batch_size)
                memory.add_records(data)
                assert False, "ERROR: Should not get here. Error is expected."
            except SurrealError:
                pass

            # Assert we can now fetch a single element.
            retrieved_data = memory.get_records(num_records=1)
            self.assertEqual(1, len(retrieved_data["t"]))

            # Check the next state.
            if retrieved_data["s"]["s1"][0] == 0.0:
                self.assertTrue(retrieved_data["s_"]["s1"] == 0.1 and retrieved_data["s_"]["s2"] == 2.1)
            else:
                self.assertTrue(retrieved_data["s"]["s1"] == 1.0)
                self.assertTrue(retrieved_data["s_"]["s1"] == 1.1 and retrieved_data["s_"]["s2"] == 3.1)

            # Insert another 2xn records and then check for correct next-state returns when getting records.
            data = dict(
                s=dict(s1=np.array([0.1, 1.1]), s2=np.array([2.1, 3.1])),
                a=np.array([2, 3]), r=np.array([-2.0, -3.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.2, 1.2]), s2=np.array([2.2, 3.2])),
                a=np.array([4, 5]), r=np.array([-4.0, -5.0]), t=np.array([True, True]),
                s_=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=2)
                self.assertEqual(2, len(retrieved_data["t"]))

                # Check the next states (always 0.1 larger than state).
                for i in range(2):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)

            self.assertTrue(memory.size == 6)

            # Insert up to capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.3, 1.3]), s2=np.array([2.3, 3.3])),
                a=np.array([6, 7]), r=np.array([-6.0, -7.0]), t=np.array([True, False]),
                s_=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4]))
            )
            memory.add_records(data)
            data = dict(
                s=dict(s1=np.array([0.4, 1.4]), s2=np.array([2.4, 3.4])),
                a=np.array([8, 9]), r=np.array([-8.0, -9.0]), t=np.array([False, False]),
                s_=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=3)
                self.assertEqual(3, len(retrieved_data["t"]))

                # Check the next states (always 0.1 larger than state).
                for i in range(3):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)

            self.assertTrue(memory.size == 10)

            # Go a little bit (one batch) over capacity and check again.
            data = dict(
                s=dict(s1=np.array([0.5, 1.5]), s2=np.array([2.5, 3.5])),
                a=np.array([10, 11]), r=np.array([-10.0, -11.0]), t=np.array([True, True]),
                s_=dict(s1=np.array([0.6, 1.6]), s2=np.array([2.6, 3.6]))
            )
            memory.add_records(data)

            for _ in range(20):
                retrieved_data = memory.get_records(num_records=4)
                self.assertEqual(4, len(retrieved_data["t"]))

                # Check the next states (always 0.1 larger than state).
                for i in range(4):
                    check(retrieved_data["s"]["s1"][i], retrieved_data["s_"]["s1"][i] - 0.1)
                    check(retrieved_data["s"]["s2"][i], retrieved_data["s_"]["s2"][i] - 0.1)

            self.assertTrue(memory.size == 10)
Code Example #17
File: test_decays.py Project: rosea-tf/surreal
    def test_constant(self):
        constant = Constant.make(2.0)
        input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
        out = constant(input_)
        check(out, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0])
Code Example #18
    def test_sac_learning_on_grid_world_2x2(self):
        # Create an Env object.
        env = GridWorld("2x2", actors=1)

        # Add the preprocessor (not really necessary, as NN will automatically one-hot, but faster as states
        # are then stored in memory already preprocessed and won't have to be preprocessed again for batch-updates).
        preprocessor = Preprocessor(lambda inputs_: tf.one_hot(
            inputs_, depth=env.actors[0].state_space.num_categories))

        # Create a Config.
        config = SACConfig.make(
            "{}/../configs/sac_grid_world_2x2_learning.json".format(
                os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space,
            summaries=[
                "Ls_critic[0]", "L_actor", "L_alpha", "alpha",
                ("Q(0,^)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([0])})"
                 ),
                ("Q(0,->)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([1])})"
                 ),
                ("Q(0,v)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([2])})"
                 ),
                ("Q(0,<-)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([3])})"
                 ),
                ("Q(1,->)",
                 "Q[0]({'s': np.array([[0., 1., 0., 0.]]), 'a': np.array([1])})"
                 )
            ])

        # Create an Algo object.
        algo = SAC(config=config, name="my-sac")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=700, sync=True, render=debug.RenderEnvInLearningTests)

        # Check learnt Q-function.
        q = algo.Q[0](dict(s=one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]),
                                     depth=4),
                           a=np.array([0, 1, 2, 3, 0, 1, 2, 3])))
        print(q)
        self.assertTrue(q[1] < min(q[2:])
                        and q[1] < q[0])  # q(s=0,a=right) is the worst
        check(q[5], 1.0, decimals=1)  # Q(1,->) is close to 1.0.
        #check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= 0.7)

        env.terminate()
Code Example #19
    def test_sac_loss_function(self):
        # Batch of size=2.
        input_ = {
            "s": np.random.random(size=(
                2,
                2)),  # states don't matter for this test as Q-funcs are faked.
            "a": np.array([[-0.5], [0.5]]),  # action space = Float(shape=(1,))
            "r": np.array([9.4, -1.23]),
            "t": np.array([False, False]),
            "s_": np.random.random(size=(
                2,
                2))  # states don't matter for this test as Q-funcs are faked.
        }

        # Fake pi/q-nets. Just have to be callables, returning some q-values.
        def pi(s, log_likelihood=False):
            assert log_likelihood is True
            # Return fake action sample and log-likelihoods.
            # Actions according to action-space (Float(1,)), log-likelihoods always with shape=().
            return np.array([[-0.5], [0.5]]), np.array([-0.4, -1.0])

        pi.get_weights = lambda as_ref: []
        gamma = 1.0
        q_nets = [
            lambda s_a: np.array([10.0, -90.6]),
            lambda s_a: np.array([10.1, -90.5])
        ]
        q_nets[0].get_weights = lambda as_ref: []
        q_nets[1].get_weights = lambda as_ref: []
        target_q_nets = [
            lambda s_a: np.array([12.0, -8.0]),
            lambda s_a: np.array([22.3, 10.5])
        ]
        target_q_nets[0].get_weights = lambda as_ref: []
        target_q_nets[1].get_weights = lambda as_ref: []
        alpha = tf.Variable(0.5, dtype=tf.float64)
        entropy_target = 0.97

        out = SACLoss()(
            input_, alpha, entropy_target, pi, q_nets, target_q_nets,
            namedtuple("FakeSACConfig",
                       ["gamma", "entropy_target", "optimize_alpha"])(
                           gamma=gamma,
                           entropy_target=entropy_target,
                           optimize_alpha=True))

        # Critic Loss.
        """
        Calculation:
        batch of 2, gamma=1.0
        a' = pi(s') = [-0.5, 0.5]
        a' lllh = [-0.4, -1.0] -> sampled a's log likelihoods
        Q1t(s'a') = [12 -8]
        Q2t(s'a') = [22.3 10.5]
        Qt(s'a') = [12 -8]  (reduce min over two Q-nets)
        Q1(s,a) = [10 -90.6]
        Q2(s,a) = [10.1 -90.5]
        Li = E(batch)| 0.5( (r + gamma (Qt(s'a') - alpha*log(pi(a'|s'))) ) - Qi(s,a))^2 |

        L1 = 0.5 * | (9.4 + (12 - 0.5*-0.4) - 10)^2 + (-1.23 + (-8 - 0.5*-1.0) - -90.6)^2 | / 2
        L1 = 0.5 * |  (11.6)^2 + (81.87)^2 | / 2
        L1 = 3418.62845 / 2
        L1 = 1709.314225

        L2 = 0.5 * | (9.4 + (12 - 0.5*-0.4) - 10.1)^2 + (-1.23 + (-8 - 0.5*-1.0) - -90.5)^2 | / 2
        L2 = 0.5 * |  (11.5)^2 + (81.77)^2 | / 2
        L2 = 3409.29145 / 2
        L2 = 1704.645725
        """
        expected_critic_loss = [np.array(1709.314225), np.array(1704.645725)]
        check([out[0][i].numpy() for i in range(2)],
              expected_critic_loss,
              decimals=3)

        # Actor loss.
        """
        Calculation:
        batch of 2, gamma=1.0
        log(pi(a|s)) = a lllh = [-0.4, -1.0]
        Q1(s,a) = [10.0, -90.6]
        Q2(s,a) = [10.1, -90.5]
        Q(s,a) = [10.0, -90.6]  <- reduce_min
        L = E(batch)| ( alpha * log(pi(a|s)) - Q(s,a)) |
        L = [(alpha * -0.4 - 10.0) + (alpha * -1.0 - -90.6)] / 2
        L = [(0.5*-0.4 - 10.0) + (0.5*-1.0 - - 90.6)] / 2
        L = (-10.2 + 90.1) / 2
        L = 39.95
        """
        expected_actor_loss = 39.95
        check(out[3].numpy(), expected_actor_loss, decimals=3)

        # Alpha loss.
        """
        Calculation:
        batch of 2, gamma=1.0
        H = entropy_target = 0.97
        log(pi(a|s)) = a lllh = [-0.4, -1.0]
        L = E(batch)| (-alpha * log(pi(a|s)) - alpha H) |

        # In the SAC-paper, α is used directly, however the implementation uses log(α).
        # See the discussion in https://github.com/rail-berkeley/softlearning/issues/37.

        L = [(-log(alpha) * -0.4 - log(alpha)*0.97) + (-log(alpha) * -1.0 - log(alpha) * 0.97)] / 2
        L = [(-log(0.5)*-0.4 - log(0.5)*0.97) + (-log(0.5)*-1.0 - log(0.5)*0.97)] / 2
        L = [(0.69315*-0.4 - -0.69315*0.97) + (0.69315*-1.0 + 0.69315*0.97)] / 2
        L = (0.3950955 + -0.0207945) / 2
        L = 0.1871505
        """
        expected_alpha_loss = 0.1871505
        check(out[5].numpy(), expected_alpha_loss, decimals=3)
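
A NumPy re-derivation of the three SAC hand calculations above (critic, actor and alpha losses); this is only an illustrative sketch using the same faked outputs, not the SACLoss implementation:

import numpy as np

gamma, alpha, H = 1.0, 0.5, 0.97
r = np.array([9.4, -1.23])
log_pi = np.array([-0.4, -1.0])                                  # log-likelihoods of the sampled actions
q1, q2 = np.array([10.0, -90.6]), np.array([10.1, -90.5])        # Q1(s,a), Q2(s,a)
qt = np.minimum(np.array([12.0, -8.0]), np.array([22.3, 10.5]))  # min over the two target nets

td_target = r + gamma * (qt - alpha * log_pi)
critic_losses = [np.mean(0.5 * (td_target - q) ** 2) for q in (q1, q2)]  # [1709.3142, 1704.6457]
actor_loss = np.mean(alpha * log_pi - np.minimum(q1, q2))                # 39.95
alpha_loss = np.mean(-np.log(alpha) * log_pi - np.log(alpha) * H)        # ~0.18715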
Code Example #20
File: test_grid_world.py Project: rosea-tf/surreal
    def test_2x2_grid_world_with_2_actors(self):
        """
        Tests a minimalistic 2x2 GridWorld with two Actors.
        """
        env = GridWorld(world="2x2", actors=2)

        # Simple test runs with fixed actions.
        # X=player's position
        env.reset_all()  # ["XH", " G"]  X=player's position

        env.act(np.array([2, 1]))  # down: [" H", "XG"], # right: [" X", " G"]
        check(env.state, [1, 0])
        check(env.reward, [-0.1, -5.0])
        check(env.terminal, [False, True])
        env.act(np.array([1, 2]))  # right: [" H", " X"], # down: [" H", "XG"]
        check(env.state, [0, 1])  # 0=state got already reset (flow envs).
        check(env.reward, [1.0, -0.1])
        check(env.terminal, [True, False])

        env.reset_all()
        env.act(np.array([1, 1]))  # both Actors move right: [" X", " G"] -> in the hole
        check(env.state, [0, 0])
        check(env.reward, [-5.0, -5.0])
        check(env.terminal, [True, True])

        # Run against a wall.
        env.act(np.array([3, 0]))  # left: ["XH", " G"], up: ["XH", " G"]
        check(env.state, [0, 0])
        check(env.reward, [-0.1, -0.1])
        check(env.terminal, [False, False])
        env.act(np.array([2, 0]))  # down: [" H", "XG"], up: ["XH", " G"]
        check(env.state, [1, 0])
        check(env.reward, [-0.1, -0.1])
        check(env.terminal, [False, False])
        env.act(np.array([0, 2]))  # up: ["XH", " G"], down: [" H", "XG"]
        check(env.state, [0, 1])
        check(env.reward, [-0.1, -0.1])
        check(env.terminal, [False, False])
        env.act(np.array([1, 1]))  # right: [" X", " G"], right: [" H", " X"]
        check(env.state, [0, 0])
        check(env.reward, [-5.0, 1.0])
        check(env.terminal, [True, True])

        env.terminate()
Code Example #21
    def test_dqn2015_functionality(self):
        # Fake q-net/qt-net used for this test.
        def q(s, a):
            return np.sum(dense(dense(s, weights_q[0], weights_q[1]), weights_q[2], weights_q[3]) * one_hot(a, depth=4), axis=-1)

        def qt(s):
            return dense(dense(s, weights_qt[0], weights_qt[1]), weights_qt[2], weights_qt[3])

        env = GridWorld("2x2", actors=1)
        state_space = env.actors[0].state_space.with_batch()
        action_space = env.actors[0].action_space.with_batch()

        # Add the preprocessor.
        preprocessor = Preprocessor(
            lambda inputs_: tf.one_hot(inputs_, depth=state_space.num_categories)
        )
        preprocessed_space = preprocessor(state_space)

        # Add the Q-network.
        i = K.layers.Input(shape=preprocessed_space.shape, dtype=preprocessed_space.dtype)
        o = K.layers.Dense(2, activation="linear")(i)  # keep it very simple
        # o = K.layers.Dense(256)(o)
        q_network = K.Model(inputs=i, outputs=o)

        # Create a very simple DQN2015.
        dqn = DQN2015(config=DQN2015Config.make(
            "{}/../configs/dqn2015_grid_world_2x2_functionality.json".format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            q_network=q_network,
            state_space=state_space,
            action_space=action_space
        ), name="my-dqn")

        # Check slot of "x" in flattened mem.
        check(dqn.memory.next_record_setup["x"][1], [3])
        self.assertTrue(dqn.memory.batch_size is None)

        check(dqn.Q.get_weights(), dqn.Qt.get_weights())

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(dqn)

        # Set our weights fixed.
        weights = [
            np.array([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4]]),  # hidden layer kernel
            np.array([0.0, 0.0]),  # hidden layer bias
            np.array([[-0.4, -0.3, -0.2, -0.1], [0.4, 0.3, 0.2, 0.1]]),  # output layer kernel
            np.array([0.1, 0.1, 1.0, 0.0])  # output layer bias
        ]
        dqn.Q.set_weights(weights)

        # Perform one step in the env.
        expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
        check(expected_action, 2)  # expect to go down
        env.run(ticks=1)  # ts=0 -> do nothing
        # Check action taken.
        check(dqn.a.value, expected_action)
        # Check state of the env after action taken.
        check(env.state[0], 1)
        check(env.reward[0], -0.1)
        check(env.terminal[0], False)
        # Check memory of dqn (after one time step, should still be empty).
        check(dqn.memory.size, 0)
        self.assertTrue(dqn.memory.batch_size is None)

        # Perform one step in the env.
        expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
        check(expected_action, 2)  # expect to go down
        env.run(ticks=1)  # ts=1 -> no sync, no update
        # Check action taken.
        check(dqn.a.value, expected_action)
        # Check state of the env after action taken.
        check(env.state[0], 1)
        check(env.reward[0], -0.1)
        check(env.terminal[0], False)
        # Check memory of dqn.
        check(dqn.memory.size, 1)
        self.assertTrue(dqn.memory.batch_size == 1)  # batch_size is now established.
        check(dqn.memory.memory, [
            np.array([2, 0, 0, 0]),
            np.array([-0.1, 0., 0., 0.]),
            np.array([False, False, False, False]),
            np.array([[1., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
        ])
        # Check next states.
        check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

        # Perform one step in the env.
        # What are the weights after the update?
        weights_q_before_update = dqn.Q.get_weights()
        weights_q = copy.deepcopy(weights_q_before_update)
        weights_qt = dqn.Qt.get_weights()

        # Check action taken (the action is picked before the update!).
        expected_action = np.argmax(dqn.Q(dqn.Phi(np.array([1]))), axis=-1)

        env.run(ticks=1)  # ts=2 -> no sync, do update
        weights_q_after_update = dqn.Q.get_weights()
        check(dqn.a.value, expected_action)

        # Check new weight values after the update.
        loss = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
        for i, matrix in enumerate(weights_q_before_update):
            for idx in np.ndindex(matrix.shape):
                weights_q = copy.deepcopy(weights_q_before_update)
                weights_q[i][idx] += 0.0001
                lossd = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
                dL_over_dw = (lossd - loss) / 0.0001
                check(weights_q_after_update[i][idx], weights_q_before_update[i][idx] - dL_over_dw * dqn.optimizer.learning_rate(0.0), decimals=3)

        # Check state of the env after action taken.
        check(env.state[0], 1)
        check(env.reward[0], -0.1)
        check(env.terminal[0], False)
        # Check memory of dqn.
        check(dqn.memory.size, 2)
        check(dqn.memory.memory, [
            np.array([2, 2, 0, 0]),
            np.array([-0.1, -0.1, 0., 0.]),
            np.array([False, False, False, False]),
            np.array([[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
        ])
        # Check next states.
        check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

        env.terminate()
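
The weight-update check at the end of this test is a finite-difference gradient check: each weight is perturbed by a small epsilon, the loss is recomputed, and the resulting slope is compared against the SGD step the optimizer actually applied. A tiny self-contained sketch of that technique (the loss and names here are illustrative, not tied to the DQN code above):

import numpy as np

def loss(w):
    # Any differentiable scalar loss of the weights.
    return np.sum((w - 1.0) ** 2)

w = np.array([0.2, 0.7, 1.5])
lr, eps = 0.01, 1e-4

base = loss(w)
numeric_grad = np.zeros_like(w)
for idx in np.ndindex(w.shape):
    w_pert = w.copy()
    w_pert[idx] += eps
    numeric_grad[idx] = (loss(w_pert) - base) / eps

expected_after_sgd_step = w - lr * numeric_grad  # compare against the optimizer's actual result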
Code Example #22
File: test_decays.py Project: rosea-tf/surreal
    def test_linear_decay(self):
        linear_decay = LinearDecay.make({"from": 2.0, "to": 0.5})
        input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
        out = linear_decay(input_)
        check(out, 2.0 - input_ * (2.0 - 0.5))
Code Example #23
File: test_decays.py Project: rosea-tf/surreal
    def test_linear_decay_with_step_function(self):
        linear_decay = LinearDecay.make({"from": 2.0, "to": 0.5, "begin_time_percentage": 0.5, "end_time_percentage": 0.6})
        input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23, 0.51, 0.52, 0.55, 0.59])
        out = linear_decay(input_)
        check(out, np.array([2.0, 2.0, 0.5, 0.5, 2.0, 2.0, 0.5, 2.0, 1.85, 1.7, 1.25, 0.65]))
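
The expected values here follow a clipped-and-rescaled linear schedule: below `begin_time_percentage` the output stays at `from`, above `end_time_percentage` it stays at `to`, and in between the time percentage is rescaled into [0, 1] before the usual linear interpolation. A standalone sketch of that formula (not the library's implementation):

import numpy as np

def stepped_linear_decay(p, from_=2.0, to_=0.5, begin=0.5, end=0.6):
    p_rescaled = np.clip((p - begin) / (end - begin), 0.0, 1.0)
    return from_ - p_rescaled * (from_ - to_)

p = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23, 0.51, 0.52, 0.55, 0.59])
print(stepped_linear_decay(p))  # [2.0, 2.0, 0.5, 0.5, 2.0, 2.0, 0.5, 2.0, 1.85, 1.7, 1.25, 0.65]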
Code Example #24
    def test_update_records(self):
        memory = PrioritizedReplayBuffer(record_space=self.record_space, capacity=self.capacity)

        # Insert record samples.
        num_records = 2
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertTrue(memory.size == num_records)
        self.assertTrue(memory.index == num_records)

        # Fetch records, their indices and weights.
        batch, indices, weights = memory.get_records_with_indices_and_weights(num_records)
        check(weights, np.ones(shape=(num_records,)))
        self.assertEqual(num_records, len(indices))
        self.assertTrue(memory.size == num_records)
        self.assertTrue(memory.index == num_records)

        # Update weight of index 0 to very small.
        memory.update_records(np.array([0]), np.array([0.01]))
        # Expect to sample almost only index 1 (which still has a weight of 1.0).
        for _ in range(100):
            _, indices, weights = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 970)

        # Now update both weights to the same (random) value.
        # Expect both indices to be sampled equally often.
        for _ in range(100):
            rand = np.random.random()
            memory.update_records(np.array([0, 1]), np.array([rand, rand]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 400)
            self.assertLessEqual(np.sum(indices), 600)

        # Update weights to a 1:2 ratio.
        # Expect index 1 to be sampled twice as often as index 0.
        for _ in range(100):
            rand = np.random.random() * 10
            memory.update_records(np.array([0, 1]), np.array([rand, rand * 2]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 600)
            self.assertLessEqual(np.sum(indices), 750)

        # Update weights to a 1:4 ratio.
        # Expect index 1 to be sampled four times as often as index 0.
        for _ in range(100):
            rand = np.random.random() * 10
            memory.update_records(np.array([0, 1]), np.array([rand, rand * 4]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 750)
            self.assertLessEqual(np.sum(indices), 850)

        # Update weights to a 1:9 ratio.
        # Expect index 1 to be sampled nine times as often as index 0.
        for _ in range(100):
            rand = np.random.random() * 10
            memory.update_records(np.array([0, 1]), np.array([rand, rand * 9]))
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=1000)
            self.assertGreaterEqual(np.sum(indices), 850)
            self.assertLessEqual(np.sum(indices), 950)

        # Insert more record samples.
        num_records = 10
        data = self.record_space.sample(num_records)
        memory.add_records(data)
        self.assertTrue(memory.size == self.capacity)
        self.assertTrue(memory.index == 2)

        # Update weights to increase monotonically across indices 0..9, then sample batches of fewer than 10 records.
        memory.update_records(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                              np.array([0.1, 1., 3., 8., 16., 32., 64., 128., 256., 512.]))
        counts = Counter()
        for _ in range(1000):
            _, indices, _ = memory.get_records_with_indices_and_weights(num_records=np.random.randint(1, 6))
            for i in indices:
                counts[i] += 1
        print(counts)
        self.assertTrue(
            counts[9] >= counts[8] >= counts[7] >= counts[6] >= counts[5] >=
            counts[4] >= counts[3] >= counts[2] >= counts[1] >= counts[0]
        )
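
The index-sum bounds asserted above follow from proportional prioritization: index i is drawn with probability p_i / sum_j p_j, so with weights in a 1:2 ratio roughly two thirds of 1000 draws hit index 1 (expected sum ~667), with 1:4 about 800, and with 1:9 about 900. A quick check of those expectations (illustrative only):

import numpy as np

def expected_index_sum(weights, num_draws=1000):
    probs = np.asarray(weights, dtype=float)
    probs /= probs.sum()
    # With only indices 0 and 1, the sum over sampled indices equals num_draws * P(index 1).
    return num_draws * probs[1]

print(expected_index_sum([1, 2]))  # ~666.7 -> between 600 and 750
print(expected_index_sum([1, 4]))  # 800.0  -> between 750 and 850
print(expected_index_sum([1, 9]))  # 900.0  -> between 850 and 950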
Code Example #25
File: test_decays.py Project: rosea-tf/surreal
    def test_polynomial_parameter(self):
        polynomial_decay = Decay.make(type="polynomial-decay", from_=2.0, to_=0.5, power=2.0)
        input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
        out = polynomial_decay(input_)
        check(out, (2.0 - 0.5) * (1.0 - input_) ** 2 + 0.5)
Code Example #26
File: test_decays.py Project: rosea-tf/surreal
    def test_exponential_parameter(self):
        exponential_decay = Decay.make(type="exponential-decay", from_=2.0, to_=0.5, decay_rate=0.5)
        input_ = np.array([0.5, 0.1, 1.0, 0.9, 0.02, 0.01, 0.99, 0.23])
        out = exponential_decay(input_)
        check(out, 0.5 + (2.0 - 0.5) * 0.5 ** input_)