def autoplay(method, environment, resume, render):
    game = Game(name=environments_to_names[environment], render=render)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n

    # build the agent for evaluation: a single episode, greedy policy (eps_start=0.0)
    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, 1, 1, eps_start=0.0)
    agent.load(resume)

    log.info(f'Evaluating agent, loaded from {resume}, starting ...')
    game.reset()

    for t in count():
        state = game.get_state()
        action = agent.select_action(state)
        transition, done = game.step(int(action.cpu().numpy()))
        # agent.eval(
        #     transition, 1, 0.0)
        time.sleep(0.1)
        if done:
            log.info(f'agent survived {t} steps')
            game.reset()
            break

    game.env.close()
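# The evaluation loop above calls agent.select_action with eps_start=0.0, so the
# agent acts greedily. A minimal epsilon-greedy sketch of such a method
# (illustrative only: the real agent classes come from agent_factory, and
# `policy_net` / `n_actions` are assumed attribute names):
import random
import torch

def select_action(self, state):
    # explore with probability epsilon, otherwise take the greedy Q-value action
    if random.random() < self.epsilon:
        return torch.tensor([[random.randrange(self.n_actions)]])
    with torch.no_grad():
        return self.policy_net(state).max(1)[1].view(1, 1)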
def play(environment):
    game = Game(name=environments_to_names[environment], render=True)
    done = False
    try:
        # read actions from the keyboard until the episode ends
        while not done:
            action = click.prompt('Please enter an action (0, 1, 2, 3..)')
            done = game.step(int(action))
        log.info('[INFO] done ...')
    except KeyboardInterrupt:
        log.info('[INFO] quitting ...')
        exit()
def train(method, environment, resume, episodes, lr, lr_episodes, min_lr, eval_only,
          replay_width, batch_size, gamma, update_rate, save_interval):
    history = History(method + '_' + environment,
                      ['steps', 'avg_reward', 'loss'],
                      resume is not None)
    history.flush()

    memory = ReplayMemory(replay_width)
    game = Game(name=environments_to_names[environment], memory=memory, render=False)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n

    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, episodes, update_rate,
                      step_size=lr_episodes, lr=lr, save_interval=save_interval)

    # resume from a checkpoint
    if resume is not None:
        agent.load(resume)

    avg_reward = MovingAverage(100)
    avg_loss = MovingAverage(100)

    log.info(f'Training for {episodes} episodes, starting ...')

    # main training loop
    for i in range(episodes):
        state = game.reset()
        done = False
        loss = None

        while not done:
            state = game.state
            action = agent.select_action(state)
            transition, done = game.step(int(action.to('cpu').numpy()))

            # only start training once the replay buffer can fill a batch
            if len(memory) > batch_size:
                batched = memory.sample(batch_size)
                loss = agent.train(batched, batch_size, gamma, i)
                avg_loss.add(loss)

        reward = game.rewards
        # agent.save_best(reward)
        agent.save()
        agent.scheduler.step()
        avg_reward.add(reward)

        # moving averages
        text = [
            f'steps: {agent.step_cnt}',
            f'game epochs: {i}/{episodes}',
            f'train loss: {float(avg_loss):.5}',
            f'avg reward: {float(avg_reward):.5}',
            # f'best reward: {float(agent.best_reward):.5}',
            f'reward: {float(reward):.5}',
            f'epsilon: {agent.epsilon:.3}',
        ]
        log.info(', '.join(text), update=True)

        if agent.step_cnt % save_interval == 0:
            history.record({
                'steps': agent.step_cnt,
                'avg_reward': float(avg_reward),
                'loss': float(avg_loss),
            })

    game.env.close()
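# The training loop relies on ReplayMemory and MovingAverage helpers that are not
# shown in this section. A minimal sketch of plausible implementations, assuming a
# uniform-sampling ring buffer and a fixed-window running mean (the repo's own
# versions may differ):
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        # oldest transitions are dropped automatically once capacity is reached
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

class MovingAverage:
    def __init__(self, window):
        self.values = deque(maxlen=window)

    def add(self, value):
        self.values.append(float(value))

    def __float__(self):
        # average over the most recent `window` values
        return sum(self.values) / len(self.values) if self.values else 0.0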
state, reward, done = env.render()
# the Conv2D input wants a 4D shape like (1, 64, 64, 1)
state = shapeState(state)

for time in range(500):
    action = agent.act(state)
    # In the original code it is unclear what object would normally be assigned to
    # next_state by env.step(): the variable is immediately overwritten by
    # `next_state = shapeState(pix)`, i.e. the image representation of the state.
    # The intent seems to be to take reward and done from the step call, while the
    # follow-up env.render() call fetches the actual image capture.
    # next_state, reward, done, _ = env.step(action)
    # pix = env.render()
    env.step(action)
    pix, reward, done = env.render()
    frames.append(pix)
    next_state = shapeState(pix)
    # penalize episode termination
    reward = reward if not done else -500
    print(time, ')', reward)
    agent.write_state(state, action, reward, next_state, done)
    state = next_state
    if done:
        print("episode: {}/{}, score: {}, e: {:.2}".format(
            e, EPISODES, time, agent.epsilon))
        if time > max_score:
            max_score = time
            best = frames
        break
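# shapeState is not defined in this snippet. Going by the comment about the
# (1, 64, 64, 1) Conv2D input, a sketch of what it likely does: grayscale, resize,
# then add batch and channel dimensions (the resize size, normalization, and use of
# OpenCV are assumptions):
import numpy as np
import cv2

def shapeState(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)   # collapse RGB to a single channel
    small = cv2.resize(gray, (64, 64))               # match the network's input size
    # reshape to (batch, height, width, channels) and scale pixels to [0, 1]
    return small.reshape(1, 64, 64, 1).astype(np.float32) / 255.0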