import copy
import random

import numpy as np

# The classes exercised below (EpsilonGreedy, DecreasingEpsilonGreedy, Temperature,
# Qlearning, DQN, DiffDQN, ReplayBuffer, DiscreteSynchronEnvironment,
# PrisonersDilemmaDemand, LogitDemand, AlwaysDefectAgent) are assumed importable
# from the package under test; the exact module paths depend on the project layout.


def test_explore():
    # eps=1 must always explore, eps=0 never; everything else may go either way.
    assert EpsilonGreedy(eps=0.5).explore(1, 1) in [True, False]
    assert EpsilonGreedy(eps=1).explore(1, 1) is True
    assert EpsilonGreedy(eps=0).explore(1, 1) is False
    assert DecreasingEpsilonGreedy().explore(1, 1) in [True, False]
    assert Temperature().explore(1, 1) in [True, False]
    assert Temperature(beta=0).explore(1, 1) is True

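# A minimal sketch of the decision-rule contract the assertions above rely on.
# The signature explore(t, n_periods) mirrors the test calls; the body is an
# assumption, not the library's implementation.
class _EpsilonGreedySketch:
    def __init__(self, eps=0.1):
        self.eps = eps

    def epsilon(self, t, n_periods):
        # A plain epsilon-greedy rule keeps epsilon constant over time.
        return self.eps

    def explore(self, t, n_periods):
        # Explore with probability epsilon: eps=1 always explores, eps=0 never.
        return np.random.random() < self.epsilon(t, n_periods)
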
def test_environment_prisoners():
    test_1 = DiscreteSynchronEnvironment(
        n_periods=10000,
        possible_prices=[2, 3],
        demand=PrisonersDilemmaDemand(),
        agents=[
            Qlearning(discount=0.95, learning_rate=0.3, decision=EpsilonGreedy(eps=0.1)),
            Qlearning(discount=0.95, learning_rate=0.3, decision=EpsilonGreedy(eps=0.1)),
        ],
    )
    test_2 = DiscreteSynchronEnvironment(
        n_periods=10,
        possible_prices=[1, 2],
        demand=PrisonersDilemmaDemand(),
        agents=[
            Qlearning(discount=0.95, learning_rate=0.5, decision=DecreasingEpsilonGreedy()),
            AlwaysDefectAgent(),
        ],
    )
    test_3 = DiscreteSynchronEnvironment(
        n_periods=10000,
        possible_prices=[1, 2],
        demand=PrisonersDilemmaDemand(),
        agents=[
            Qlearning(discount=0.95, learning_rate=0.5, decision=DecreasingEpsilonGreedy()),
            Qlearning(discount=0.5, learning_rate=0.1, decision=DecreasingEpsilonGreedy()),
        ],
    )
    assert test_1.play_game()
    assert test_2.play_game()
    assert test_3.play_game()

def test_correct_init():
    env = DiscreteSynchronEnvironment(
        n_periods=100,
        n_prices=100,
        history_after=0,
        agents=[
            Qlearning(decision=EpsilonGreedy(eps=1.0)),
            Qlearning(decision=EpsilonGreedy(eps=1.0)),
        ],
    )
    env.play_game()
    prices = np.array(env.price_history)
    # With pure exploration the two agents must not always play identical prices.
    assert not np.all(prices[:, 1] == prices[:, 0])

def test_delayed_learning():
    possible_prices = [0.0, 1.0, 2.0, 3.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(50), batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    weights_before_learning = copy.deepcopy(agent.qnetwork_local.get_weights())
    # The first learn() call only stores a transition; the weights stay unchanged.
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert np.equal(
        np.array(weights_before_learning[0]), np.array(agent.qnetwork_local.get_weights()[0])
    ).all()
    # The second learn() call performs an actual update, so the weights must change.
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    assert not np.equal(
        np.array(weights_before_learning[0]), np.array(agent.qnetwork_local.get_weights()[0])
    ).all()

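# The two assertions above pin down buffer-gated learning: the first learn()
# call only stores its transition, the second one actually trains. One gating
# rule consistent with that behaviour (a sketch, not the DQN internals):
def _should_train(n_stored_transitions, batch_size):
    # Train only once the buffer holds strictly more transitions than a batch.
    return n_stored_transitions > batch_size
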
def test_play_optimal_action():
    possible_prices = [1.0, 2.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(50), batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    # Repeatedly reward price 1.0 and punish price 2.0 ...
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
        agent.learn(
            previous_reward=1.0,
            reward=-10.0,
            previous_action=0.0,
            action=np.float64(2.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    # ... so a greedy agent (eps=0) must afterwards play the rewarded price.
    assert agent.play_price(state, possible_prices, 1, 1) == 1.0

def test_factory_init():
    env = DiscreteSynchronEnvironment(
        n_periods=1,
        n_prices=100,
        agents=[
            DQN(decision=EpsilonGreedy(eps=1.0), marginal_cost=0.0),
            DQN(decision=EpsilonGreedy(eps=1.0), marginal_cost=2.0),
        ],
    )
    env.play_game()
    # Both agents observe the same states and next states ...
    assert np.array(env.agents[0].replay_memory.sample(1)[0] == env.agents[1].replay_memory.sample(1)[0]).all()
    assert np.array(env.agents[0].replay_memory.sample(1)[3] == env.agents[1].replay_memory.sample(1)[3]).all()
    # ... but their different marginal costs lead to different rewards.
    assert np.array(env.agents[0].replay_memory.sample(1)[2] != env.agents[1].replay_memory.sample(1)[2]).all()

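# The sample(1) indices above suggest transitions stored as
# (state, action, reward, next_state) tuples: indices 0 and 3 match across the
# two agents while index 2, the reward, differs with marginal cost. The field
# order is inferred from this test, not taken from ReplayBuffer itself.
from collections import namedtuple

_TransitionSketch = namedtuple("_TransitionSketch", ["state", "action", "reward", "next_state"])
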
def test_environment_advanced_qlearning():
    test_1 = DiscreteSynchronEnvironment(
        n_periods=10000,
        possible_prices=[2, 3],
        demand=LogitDemand(),
        agents=[
            Qlearning(discount=0.95, learning_rate=0.3, decision=EpsilonGreedy(eps=0.1)),
            Qlearning(
                discount=0.95, learning_rate=0.3, marginal_cost=4.0, quality=5.0, decision=EpsilonGreedy(eps=0.1)
            ),
            AlwaysDefectAgent(marginal_cost=0.1),
        ],
    )
    assert test_1.play_game()

def test_update_network():
    possible_prices = [0.0, 1.0, 2.0, 3.0]
    agent = DQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(50), batch_size=1, update_target_after=10)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    # Local and target networks start out identical.
    assert np.isclose(
        np.array(agent.qnetwork_local.get_weights()[0]), np.array(agent.qnetwork_target.get_weights()[0])
    ).all()
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    # After these learning steps the local network has drifted away from the target ...
    assert not np.equal(
        np.array(agent.qnetwork_local.get_weights()[0]), np.array(agent.qnetwork_target.get_weights()[0])
    ).all()
    agent.learn(
        previous_reward=1.0,
        reward=10.0,
        previous_action=0.0,
        action=np.float64(1.0),
        action_space=possible_prices,
        previous_state=state,
        state=state,
        next_state=state,
    )
    # ... and the next learn() call reaches the sync point (update_target_after=10)
    # and copies the local weights into the target network.
    assert np.isclose(
        np.array(agent.qnetwork_local.get_weights()[0]), np.array(agent.qnetwork_target.get_weights()[0])
    ).all()

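# update_target_after controls a periodic hard sync of the target network, as
# the three assertions above trace out. A sketch of that pattern (the update
# counter is an assumption, not the library's internals):
def _maybe_sync_target(n_updates, update_target_after, local_net, target_net):
    # Every update_target_after gradient steps, copy the local weights over.
    if n_updates % update_target_after == 0:
        target_net.set_weights(local_net.get_weights())
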
def test_random_action_selection():
    np.random.seed(1)
    possible_prices = [1.0, 2.0]
    agent = DiffDQN(decision=EpsilonGreedy(eps=0.0), replay_memory=ReplayBuffer(2), batch_size=1)
    state = tuple(random.choices(possible_prices, k=2))
    agent.play_price(state, possible_prices, 0, 0)
    agent.play_price(state, possible_prices, 0, 0)
    # Zero out all weights so that both actions have the same action value.
    weights = agent.qnetwork_local.get_weights()
    for w in weights:
        w[w != 0.0] = 0.0
    agent.qnetwork_local.set_weights(weights)
    played_prices = []
    for _ in range(10):
        played_prices.append(agent.play_price(state, possible_prices, 0, 0))
    # Ties must be broken at random, so both prices should appear.
    assert len(set(played_prices)) == 2
    # Learn that 1.0 is better.
    for _ in range(10):
        agent.learn(
            previous_reward=1.0,
            reward=10.0,
            previous_action=0.0,
            action=np.float64(1.0),
            action_space=possible_prices,
            previous_state=state,
            state=state,
            next_state=state,
        )
    played_prices = []
    for _ in range(10):
        played_prices.append(agent.play_price(state, possible_prices, 0, 0))
    # A greedy agent now plays the rewarded price exclusively.
    assert set(played_prices) == {1.0}

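# Breaking ties at random among maximal action values, as the first half of
# this test requires, can be done by sampling from the argmax set (a sketch,
# not DiffDQN's code):
def _random_argmax(action_values):
    action_values = np.asarray(action_values)
    best = np.flatnonzero(action_values == action_values.max())
    return int(np.random.choice(best))
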
def test_play_price():
    agent = DQN(decision=EpsilonGreedy(eps=0.0))
    assert agent.play_price((1.0,), [1.0, 1.0], 0, 0) == 1.0
    agent = DiffDQN(decision=EpsilonGreedy(eps=0.0))
    assert agent.play_price((1.0,), [1.0, 1.0], 0, 0) == 1.0

def test_epsilon_greedy():
    # A plain EpsilonGreedy rule keeps epsilon constant, whatever the period.
    for e in [0.00001, 0.5, 0.1, 0.999999]:
        assert EpsilonGreedy(eps=e).epsilon(100000, 100000) == e

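# For contrast, DecreasingEpsilonGreedy presumably returns an epsilon that
# falls with t; a common schedule (an illustration, not necessarily this
# library's formula) is linear decay:
def _linear_decay_epsilon(t, n_periods, eps_start=1.0, eps_end=0.0):
    # Interpolate from eps_start at t=0 down to eps_end at t=n_periods.
    return eps_start + (eps_end - eps_start) * min(t / n_periods, 1.0)
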
def test_play_price_qlearning():
    agent = Qlearning(decision=EpsilonGreedy(eps=0.0))
    p = agent.play_price((1.0, 1.0), [1.0, 2.0], 0, 0)
    assert p in (1.0, 2.0)