def loop(
    env: gym.Env,
    agent: agents.Agent,
    num_episodes: int,
    epsilon: float,
    should_learn: bool,
):
    """Run *num_episodes* episodes of epsilon-greedy interaction with *env*.

    Each step the agent explores with probability *epsilon* (random action
    from the environment's action space) and otherwise exploits its policy
    via ``agent.act``. When *should_learn* is true, every transition is fed
    to ``agent.learn``. The environment is rendered after every step and
    after every reset, a progress line and running win ratio are printed
    each step, and the environment is closed before returning.

    NOTE(review): a truthy reward on a terminal step is counted as a win —
    this matches sparse-reward environments (e.g. FrozenLake); confirm for
    other environments.
    """
    wins = 0
    episode = 1
    obs = env.reset()
    env.render()
    while episode <= num_episodes:
        # Epsilon-greedy: a single draw decides explore vs. exploit.
        act = (
            env.action_space.sample()
            if random() < epsilon
            else agent.act(obs)
        )
        next_obs, reward, done, info = env.step(act)
        if should_learn:
            agent.learn(obs, act, next_obs, reward)
        env.render()
        print("\tEpisodes:", episode, "\tWins:", wins)
        print("\tWin Ratio:", wins / episode)
        if done:
            # Episode finished: reset, advance the counter, and credit a
            # win when the terminal reward is truthy.
            obs = env.reset()
            env.render()
            episode += 1
            if reward:
                wins += 1
        else:
            obs = next_obs
    env.close()
_agent = Agent((8,), 4) if os.path.exists(_f_checkpoint): _agent.net.load_checkpoint(_f_checkpoint) _writer = SummaryWriter(_d_log) _is_quit = False while _episode < _n_games: _observation = _env.reset() _done = False _score = 0.0 while not _done: _action = _agent.get_action(_observation) _next_observation, _reward, _done, _info = _env.step(_action) _score += _reward _agent.learn(_observation, _reward, _next_observation, _done) _observation = _next_observation _rgb = _env.render("rgb_array") _bgr = cv2.cvtColor(_rgb, cv2.COLOR_RGB2BGR) cv2.imshow("frame", _bgr) _key_code = cv2.waitKey(1) if _key_code in [27, ord('q')]: _is_quit = True break if _is_quit: break _scores.append(_score) _episode += 1 _avg_score = float(np.mean(_scores[-100:])) if _episode % 500 == 0: