terminal) print(agent.tmp_q_values, np.argmax(agent.tmp_q_values), agent.enable_actions.index(action_t)) # for log frame += 1 steps += 1 if steps > warmup: loss += agent.current_loss Q_max += np.max(agent.Q_values([state_t])) # experience replay # warmup中は学習しない if steps > warmup: agent.backword() if steps % n_update_target_network: agent.update_target() print( "epoch: {:03d}/{:03d} | loss: {:.4f} | Q_max: {:.4f} | total reward: {} | steps: {}" .format(e, n_epochs - 1, loss / frame, Q_max / frame, total_reward, steps)) except KeyboardInterrupt: pass finally: # ブラウザーを終了する。 env.driver.quit() # save model