Code example #1
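These snippets appear to come from the test suite of the mesjou/price_simulator project and share the same setup. The sketch below lists the imports the code relies on; the standard-library and NumPy imports are used directly, while the exact module paths for the project classes (DQN, DiffDQN, EpsilonGreedy, ReplayBuffer) are assumptions and may differ in the repository.

import copy
import random

import numpy as np

# Project classes used in the examples -- the real import paths in
# mesjou/price_simulator may differ from this sketch:
# from price_simulator import DQN, DiffDQN, EpsilonGreedy, ReplayBuffer
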
def test_delayed_learning():
    possible_prices = [0.0, 1.0, 2.0, 3.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0),
                replay_memory=ReplayBuffer(50),
                batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    weights_before_learning = copy.deepcopy(agent.qnetwork_local.get_weights())
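    # After the first learn call the local network weights should be unchanged (learning is delayed).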
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert np.equal(np.array(weights_before_learning[0]),
                    np.array(agent.qnetwork_local.get_weights()[0])).all()
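    # A second learn call should finally update the local network weights.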
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert not np.equal(np.array(weights_before_learning[0]),
                        np.array(agent.qnetwork_local.get_weights()[0])).all()
Code example #2
def test_play_optimal_action():
    possible_prices = [1.0, 2.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0),
                replay_memory=ReplayBuffer(50),
                batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
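    # Repeatedly reward a price of 1.0 and punish a price of 2.0.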
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
        agent.learn(
            previous_reward=1.0,
            reward=-10.0,
            previous_action=0.0,
            action=np.float64(2.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
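    # With a purely greedy policy (eps=0.0), the agent should now play the rewarded price.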
    assert agent.play_price(state, possible_prices, 1, 1) == 1.0
Code example #3
def test_update_network():
    possible_prices = [0.0, 1.0, 2.0, 3.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0),
                replay_memory=ReplayBuffer(50),
                batch_size=1,
                update_target_after=10)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
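    # Local and target networks should start out with identical weights.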
    assert np.isclose(np.array(agent.qnetwork_local.get_weights()[0]),
                      np.array(agent.qnetwork_target.get_weights()[0])).all()
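    # Ten learning steps: the local network is trained, so it should diverge from the target network.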
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    assert not np.equal(np.array(agent.qnetwork_local.get_weights()[0]),
                        np.array(agent.qnetwork_target.get_weights()[0])).all()
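    # One more learn call should trigger the target-network update (update_target_after=10) and re-sync the weights.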
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert np.isclose(np.array(agent.qnetwork_local.get_weights()[0]),
                      np.array(agent.qnetwork_target.get_weights()[0])).all()
Code example #4
def test_random_action_selection():
    np.random.seed(1)
    possible_prices = [1.0, 2.0]
    agent = DiffDQN(decision=EpsilonGreedy(eps=0.0),
                    replay_memory=ReplayBuffer(2),
                    batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)

    # same action values (all weights are zero)
    agent.play_price(state, possible_prices, 0, 0)
    weights = agent.qnetwork_local.get_weights()
    for w in weights:
        w[w != 0.0] = 0.0
    agent.qnetwork_local.set_weights(weights)
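    # With identical action values, the greedy choice should fall back to random tie-breaking across both prices.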
    played_prices = []
    for _ in range(10):
        played_prices.append(agent.play_price(state, possible_prices, 0, 0))
    assert len(set(played_prices)) == 2

    # learn that 1.0 is better
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
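    # After learning, the agent should consistently play the rewarded price.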
    played_prices = []
    for _ in range(10):
        played_prices.append(agent.play_price(state, possible_prices, 0, 0))
    assert len(set(played_prices)) == 1
    assert list(set(played_prices))[0] == 1.0
Code example #5
File: test_buffer.py  Project: mesjou/price_simulator
def test_init():
    buffer = ReplayBuffer(buffer_size=100)
    assert len(buffer) == 0
Code example #6
File: test_buffer.py  Project: mesjou/price_simulator
def test_add_and_sample():
    buffer = ReplayBuffer(buffer_size=100)
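    # A single stored transition should be returned as-is when sampled.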
    buffer.add(state=1, action=2, reward=3, next_state=4)
    assert len(buffer) == 1
    assert buffer.sample(batch_size=1) == (1, 2, 3, 4)

    states = np.array([[1, 1, 1]] * 20)
    actions = np.array([[2, 2, 2]] * 20)
    rewards = np.array([[3, 3, 3]] * 20)
    next_states = np.array([[4, 4, 4]] * 20)
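    # Fill the buffer to capacity; a sampled batch should match the stored data.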
    buffer = ReplayBuffer(buffer_size=100)
    for _ in range(100):
        buffer.add(state=[1, 1, 1],
                   action=[2, 2, 2],
                   reward=[3, 3, 3],
                   next_state=[4, 4, 4])
    assert len(buffer) == 100
    assert np.array(
        buffer.sample(batch_size=20) == np.array(
            [states, actions, rewards, next_states])).all()

    buffer = ReplayBuffer(buffer_size=10)
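    # Once the buffer is full, adding another transition evicts the oldest entry (length stays at 10).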
    for _ in range(10):
        buffer.add(state=[1, 1, 1],
                   action=[2, 2, 2],
                   reward=[3, 3, 3],
                   next_state=[4, 4, 4])
    buffer.add(state=[100, 1, 1],
               action=[2, 2, 2],
               reward=[3, 3, 3],
               next_state=[4, 4, 4])
    assert len(buffer) == 10
    assert np.array(
        buffer.sample(batch_size=10) != np.array(
            [states, actions, rewards, next_states])).all()