import copy
import random

import numpy as np

# NOTE: the import paths below are assumptions about the project layout;
# adjust them to wherever DQN, DiffDQN, EpsilonGreedy, and ReplayBuffer live.
from agents import DQN, DiffDQN
from agents.decision import EpsilonGreedy
from agents.replay_buffer import ReplayBuffer


def test_delayed_learning():
    possible_prices = [0.0, 1.0, 2.0, 3.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(50), batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    weights_before_learning = copy.deepcopy(agent.qnetwork_local.get_weights())
    # The first learn() call is expected to leave the weights untouched:
    # learning is delayed by one step.
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert np.equal(
        np.array(weights_before_learning[0]),
        np.array(agent.qnetwork_local.get_weights()[0]),
    ).all()
    # The second learn() call triggers an actual gradient step, so the
    # weights must now differ.
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert (
        np.equal(
            np.array(weights_before_learning[0]),
            np.array(agent.qnetwork_local.get_weights()[0]),
        ).all()
        == False  # noqa: E712
    )


def test_play_optimal_action():
    possible_prices = [1.0, 2.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(50), batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    # Repeatedly reward action 1.0 and punish action 2.0.
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
        agent.learn(
            previous_reward=1.0,
            reward=-10.0,
            previous_action=0.0,
            action=np.float64(2.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    assert agent.play_price(state, possible_prices, 1, 1) == 1.0


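# test_play_optimal_action assumes learn() regresses the local network toward
# the one-step Q-learning target. A sketch of that target (the discount
# `gamma` and the `next_q_values` argument are illustrative assumptions, not
# the project's actual API):


def _sketch_q_target(reward, next_q_values, gamma=0.99):
    """Bellman target for one transition: r + gamma * max_a' Q_target(s', a')."""
    return reward + gamma * np.max(next_q_values)

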
def test_update_network():
    possible_prices = [0.0, 1.0, 2.0, 3.0]
    agent = DQN(
        decision=EpsilonGreedy(eps=0.0),
        replay_memory=ReplayBuffer(50),
        batch_size=1,
        update_target_after=10,
    )
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    # Freshly built, local and target networks share the same weights.
    assert np.isclose(
        np.array(agent.qnetwork_local.get_weights()[0]),
        np.array(agent.qnetwork_target.get_weights()[0]),
    ).all()
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    # After ten learning steps the local network has moved while the target
    # network is still frozen.
    assert (
        np.equal(
            np.array(agent.qnetwork_local.get_weights()[0]),
            np.array(agent.qnetwork_target.get_weights()[0]),
        ).all()
        == False  # noqa: E712, W503
    )
    # The next learning step triggers the periodic target update and syncs
    # the target network with the local one.
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert np.isclose(
        np.array(agent.qnetwork_local.get_weights()[0]),
        np.array(agent.qnetwork_target.get_weights()[0]),
    ).all()


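# test_update_network encodes a hard target-network update: the target stays
# frozen while the local network trains, and the local weights are copied over
# once update_target_after learning steps have accumulated. A minimal sketch
# of that bookkeeping (hypothetical helper, not the project's actual code):


def _sketch_maybe_sync_target(local_net, target_net, step, update_target_after):
    """Copy local weights into the target network every update_target_after steps."""
    if step % update_target_after == 0:
        target_net.set_weights(local_net.get_weights())

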
def test_random_action_selection():
    np.random.seed(1)
    possible_prices = [1.0, 2.0]
    agent = DiffDQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(2), batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)  # same action values (all weights are zero)
    agent.play_price(state, possible_prices, 0, 0)
    weights = agent.qnetwork_local.get_weights()
    for w in weights:
        w[w != 0.0] = 0.0
    agent.qnetwork_local.set_weights(weights)
    # With all weights zeroed, every action has the same value, so the agent
    # should break ties at random and end up playing both prices.
    played_prices = []
    for _ in range(10):
        played_prices.append(agent.play_price(state, possible_prices, 0, 0))
    assert len(set(played_prices)) == 2
    # learn that 1.0 is better
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    # Once 1.0 clearly dominates, greedy selection (eps=0.0) should play it
    # every time.
    played_prices = []
    for _ in range(10):
        played_prices.append(agent.play_price(state, possible_prices, 0, 0))
    assert len(set(played_prices)) == 1
    assert list(set(played_prices))[0] == 1.0


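# test_random_action_selection relies on ties being broken at random when all
# action values are equal; with eps=0.0 there is no epsilon-exploration, so
# the randomness must come from the tie-break itself. A sketch of such a
# decision rule (hypothetical, using NumPy's Generator API):


def _sketch_greedy_with_random_ties(q_values, eps=0.0, rng=None):
    """Explore with probability eps, otherwise argmax with uniform tie-breaking."""
    rng = rng or np.random.default_rng()
    q_values = np.asarray(q_values)
    if rng.random() < eps:
        return int(rng.integers(len(q_values)))  # explore uniformly
    best = np.flatnonzero(q_values == np.max(q_values))
    return int(rng.choice(best))  # break ties uniformly at random

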
def test_init():
    buffer = ReplayBuffer(buffer_size=100)
    assert len(buffer) == 0


def test_add_and_sample():
    # A single transition can be added and sampled back out.
    buffer = ReplayBuffer(buffer_size=100)
    buffer.add(state=1, action=2, reward=3, next_state=4)
    assert len(buffer) == 1
    assert buffer.sample(batch_size=1) == (1, 2, 3, 4)

    # A buffer filled with identical transitions returns them on sampling.
    states = np.array([[1, 1, 1]] * 20)
    actions = np.array([[2, 2, 2]] * 20)
    rewards = np.array([[3, 3, 3]] * 20)
    next_states = np.array([[4, 4, 4]] * 20)
    buffer = ReplayBuffer(buffer_size=100)
    for _ in range(100):
        buffer.add(state=[1, 1, 1], action=[2, 2, 2], reward=[3, 3, 3], next_state=[4, 4, 4])
    assert len(buffer) == 100
    assert np.array(
        buffer.sample(batch_size=20)
        == np.array([states, actions, rewards, next_states])
    ).all()

    # Adding to a full buffer evicts the oldest transition, so the contents
    # no longer match the original batch.
    buffer = ReplayBuffer(buffer_size=10)
    for _ in range(10):
        buffer.add(state=[1, 1, 1], action=[2, 2, 2], reward=[3, 3, 3], next_state=[4, 4, 4])
    buffer.add(state=[100, 1, 1], action=[2, 2, 2], reward=[3, 3, 3], next_state=[4, 4, 4])
    assert len(buffer) == 10
    assert np.array(
        buffer.sample(batch_size=10)
        != np.array([states, actions, rewards, next_states])
    ).all()


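# Together, test_init and test_add_and_sample pin down the buffer contract:
# it starts empty, add() appends one transition, sample() draws a batch, and
# the oldest entry is evicted once buffer_size is exceeded. A minimal sketch
# satisfying that contract (illustrative only, not the project's class; the
# real sample() apparently returns per-field arrays rather than raw tuples):


class _SketchReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.memory = []

    def __len__(self):
        return len(self.memory)

    def add(self, state, action, reward, next_state):
        if len(self.memory) == self.buffer_size:
            self.memory.pop(0)  # evict the oldest transition (FIFO)
        self.memory.append((state, action, reward, next_state))

    def sample(self, batch_size):
        # uniform sampling without replacement within one batch
        return random.sample(self.memory, k=batch_size)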