while not terminated_1 and not terminated_2:
    # the first agent
    # print("agent 1")
    action_id = agent_1.action_based_on_policy(state_1, env)
    one_hot_action = one_hot(action_id, nr_actions)
    new_state, reward, terminated_1 = env.step(action_id, state_1)
    scaled_state_1 = scale_state(state_1, env)
    # histories_1.appending(reward, scaled_state_1, one_hot_action)
    # mark the cell just left in grey and the new position in red
    plt.scatter(state_1[0, 0], state_1[0, 1], s=100, c='#C1C7C9', marker='s')
    plt.scatter(new_state[0, 0], new_state[0, 1], s=50, c='red')
    plt.show()
    plt.pause(0.1)
    state_1, steps_1 = update_state_step(new_state, steps_1)

    # the second agent
    # print("agent 2")
    action_id = agent_2.action_based_on_policy(state_2, env)
    one_hot_action = one_hot(action_id, nr_actions)
    new_state, reward, terminated_2 = env.step(action_id, state_2)
    scaled_state_2 = scale_state(state_2, env)
    # histories_2.appending(reward, scaled_state_2, one_hot_action)
    # mark the cell just left in grey and the new position in blue
    plt.scatter(state_2[0, 0], state_2[0, 1], s=100, c='#C1C7C9', marker='s')
    plt.scatter(new_state[0, 0], new_state[0, 1], s=50, c='blue')
    plt.show()
    plt.pause(0.1)
    state_2, steps_2 = update_state_step(new_state, steps_2)
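The loop above leans on a few small helpers (one_hot, scale_state, update_state_step) whose definitions are not shown here. The sketch below is only an assumption of what they might look like; in particular, env.width and env.height are hypothetical attributes of the grid environment used for normalisation.

import numpy as np

def one_hot(action_id, nr_actions):
    # Encode the chosen action index as a one-hot row vector.
    encoding = np.zeros((1, nr_actions), dtype=np.float32)
    encoding[0, action_id] = 1.0
    return encoding

def scale_state(state, env):
    # Normalise the raw grid coordinates to [0, 1]
    # (env.width / env.height are assumed attributes).
    return state / np.array([[env.width, env.height]], dtype=np.float32)

def update_state_step(new_state, steps):
    # The new state becomes the current state and the step counter advances.
    return new_state, steps + 1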
while not terminated:
    action_id = agent_ler.action_based_on_Q_target(state, env, epsilon=epsilon)
    new_state, reward, terminated, info = env.step(action_id)
    new_state = single_shape_adaptor(new_state, nr_features)
    this_event = event(state, action_id, reward, new_state, terminated, env)
    replay_buffer.consider_this_event(this_event)
    state, steps = update_state_step(new_state, steps)

print("... " + str(steps) + " new events were added to the replay_buffer")

print("... updating the Q started")
for k in range(K):
    current_batch = replay_buffer.return_a_batch(batchsize=64)
    agent_ler.learn(current_batch, env)
print("... updating the Q finished")

print("... the Q-target update is started.")
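The collection loop assumes a replay_buffer object exposing consider_this_event and return_a_batch, and an event container holding one transition. Since those implementations are not shown, the following is a minimal sketch under those assumptions, using a bounded deque and a namedtuple; the real class may differ.

import random
from collections import deque, namedtuple

# One stored transition, matching the constructor call event(state, action_id, ...).
event = namedtuple("event", ["state", "action_id", "reward", "new_state", "terminated", "env"])

class ReplayBuffer:
    def __init__(self, capacity=10_000):
        # Oldest transitions are discarded automatically once capacity is reached.
        self.memory = deque(maxlen=capacity)

    def consider_this_event(self, this_event):
        # Store a single transition.
        self.memory.append(this_event)

    def return_a_batch(self, batchsize=64):
        # Sample a random mini-batch (without replacement) for one learning step.
        batchsize = min(batchsize, len(self.memory))
        return random.sample(list(self.memory), batchsize)

Sampling uniformly from a bounded buffer breaks the correlation between consecutive transitions, which is the usual reason for replaying experience before each batch of Q-updates.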