# Episode inner loop: interact with the environment until the episode
# terminates, storing each transition and (after warmup) training the agent.
# Relies on names defined earlier in the script: env, agent, np, state_t_1,
# terminal, steps, warmup, frame, total_reward, loss, Q_max.
while not terminal:
    state_t = state_t_1

    # Select an action: pure random exploration while filling the replay
    # buffer during warmup, otherwise defer to the agent's policy
    # (presumably epsilon-greedy — confirm in the agent implementation).
    if steps > warmup:
        action_t = agent.select_action(state_t)
    else:
        action_t = np.random.choice(env.enable_actions)

    # Advance the environment one step and accumulate the episode reward.
    state_t_1, reward_t, terminal = env.step(action_t)
    total_reward += reward_t

    # Store the transition (s, a, r, s', done) in the agent's replay buffer.
    agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
    print(agent.tmp_q_values, np.argmax(agent.tmp_q_values), agent.enable_actions.index(action_t))

    # For logging.
    frame += 1
    steps += 1

    # Experience replay — no training (and no training stats) during warmup.
    # The two identical `steps > warmup` checks from the original were merged;
    # nothing between them modified `steps`, so behavior is unchanged.
    # NOTE(review): `loss` is read *before* backword(), so it reflects the
    # previous update's loss; `backword` is the agent's actual (misspelled)
    # method name — renaming it requires changing the agent class.
    if steps > warmup:
        loss += agent.current_loss
        Q_max += np.max(agent.Q_values([state_t]))
        agent.backword()