state_t_1, reward_t, terminal = env.step(action_t) total_reward += reward_t # store experience agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal) print(agent.tmp_q_values, np.argmax(agent.tmp_q_values), agent.enable_actions.index(action_t)) # for log frame += 1 steps += 1 if steps > warmup: loss += agent.current_loss Q_max += np.max(agent.Q_values([state_t])) # experience replay # warmup中は学習しない if steps > warmup: agent.backword() if steps % n_update_target_network: agent.update_target() print( "epoch: {:03d}/{:03d} | loss: {:.4f} | Q_max: {:.4f} | total reward: {} | steps: {}" .format(e, n_epochs - 1, loss / frame, Q_max / frame, total_reward, steps)) except KeyboardInterrupt: