# Training loop: one episode per epoch.
# Relies on names defined elsewhere in the script: `env`, `agent`,
# `n_epochs`, `steps`, and `warmup`.
# NOTE(review): `steps` is compared against `warmup` but is never
# incremented in this chunk — confirm it is updated elsewhere.
for e in range(n_epochs):
    # Reset per-episode bookkeeping.
    frame = 0
    loss = 0.0
    Q_max = 0.0
    total_reward = 0

    # Start a fresh episode; take one random action to obtain the
    # initial observation.
    env.reset()
    state_t_1, reward_t, terminal = env.step(
        np.random.choice(env.enable_actions))

    while not terminal:
        state_t = state_t_1

        # Action selection: exploit the agent's policy once past the
        # warmup phase, otherwise explore uniformly at random.
        action_t = (agent.select_action(state_t)
                    if steps > warmup
                    else np.random.choice(env.enable_actions))

        # Apply the action and observe the resulting transition.
        state_t_1, reward_t, terminal = env.step(action_t)
        total_reward += reward_t

        # Record the transition in the agent's experience store.
        agent.store_experience(
            state_t, action_t, reward_t, state_t_1, terminal)

        # Diagnostic print of the agent's latest Q-values, the greedy
        # action index, and the index of the action actually taken.
        # NOTE(review): indexes into `agent.enable_actions` here but
        # samples from `env.enable_actions` above — confirm the two
        # lists are identical.
        print(agent.tmp_q_values, np.argmax(agent.tmp_q_values),
              agent.enable_actions.index(action_t))

        # for log
        frame += 1