def append(self, observation, action, reward, done, next_observation):
    # Append one transition to the current game; when the game ends, store the
    # terminal observation, advance to the next game slot and initialise it.
    if self.symbolic_env:
        self.buffer[self.game_idx]["obs"].append(observation.numpy())
    else:
        self.buffer[self.game_idx]["obs"].append(
            postprocess_observation(observation.numpy(), self.bit_depth))
    self.buffer[self.game_idx]["action"].append(action.numpy())
    self.buffer[self.game_idx]["reward"].append(reward)
    self.buffer[self.game_idx]["nonterminal"].append(not done)
    self.num_steps += 1
    if done:
        # Store the terminal observation in the same format as the other observations
        if self.symbolic_env:
            self.buffer[self.game_idx]["terminal_obs"] = next_observation.numpy()
        else:
            self.buffer[self.game_idx]["terminal_obs"] = postprocess_observation(
                next_observation.numpy(), self.bit_depth)
        # When the buffer is full, overwriting begins again from the head
        self.game_idx = (self.game_idx + 1) % self.size
        self.full = self.full or self.game_idx == 0
        self.buffer[self.game_idx] = {
            "obs": [],
            "action": [],
            "reward": [],
            "nonterminal": []
        }
        self.num_game += 1
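# A minimal sketch of the state the episode-wise append above assumes. The
# attribute names come from the method itself; the argument list and initial
# values are assumptions, not the original constructor.
def __init__(self, size, symbolic_env, bit_depth):
    self.size = size                  # maximum number of stored games
    self.symbolic_env = symbolic_env  # store raw state vectors instead of images
    self.bit_depth = bit_depth        # used by postprocess_observation
    self.game_idx = 0                 # index of the game currently being filled
    self.full = False                 # set once game_idx has wrapped around
    self.num_steps, self.num_game = 0, 0
    # One dict of per-step lists for each game slot
    self.buffer = [{"obs": [], "action": [], "reward": [], "nonterminal": []}
                   for _ in range(size)]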
def append(self, observation, action, reward, done):
    self.observations[self.idx] = postprocess_observation(
        observation.numpy(), self.bit_depth
    )  # Decentre and discretise visual observations (to save memory)
    self.actions[self.idx] = action.numpy()
    self.rewards[self.idx] = reward
    self.nonterminals[self.idx] = not done
    self.idx = (self.idx + 1) % self.size
    self.full = self.full or self.idx == 0  # flag that the buffer has wrapped around
    self.steps, self.episodes = self.steps + 1, self.episodes + (1 if done else 0)
def append(self, observation, action, reward, done):
    if self.symbolic_env:
        self.observations[self.idx] = observation.numpy()
    else:
        self.observations[self.idx] = postprocess_observation(
            observation.numpy(), self.bit_depth
        )  # Decentre and discretise visual observations (to save memory)
    self.actions[self.idx] = action.numpy() if isinstance(action, torch.Tensor) else action
    self.rewards[self.idx] = reward
    self.nonterminals[self.idx] = not done
    if done:
        self.ends_idx.append(self.idx)  # remember where episodes terminate
    self.idx = (self.idx + 1) % self.size
    self.full = self.full or self.idx == 0
    self.steps, self.episodes = self.steps + 1, self.episodes + (1 if done else 0)
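# A minimal sketch of the preallocated circular storage the two flat-array
# variants above assume. Attribute names are taken from the methods;
# observation_size, action_size and the dtypes/shapes are assumptions.
import numpy as np

def __init__(self, size, symbolic_env, observation_size, action_size, bit_depth):
    self.size = size
    self.symbolic_env = symbolic_env
    self.bit_depth = bit_depth
    self.observations = (np.empty((size, observation_size), dtype=np.float32)
                         if symbolic_env
                         else np.empty((size, 3, 64, 64), dtype=np.uint8))
    self.actions = np.empty((size, action_size), dtype=np.float32)
    self.rewards = np.empty((size, ), dtype=np.float32)
    self.nonterminals = np.empty((size, 1), dtype=np.float32)
    self.ends_idx = []   # indices of terminal transitions (third variant only)
    self.idx = 0         # next write position in the ring buffer
    self.full = False    # True once idx has wrapped past size
    self.steps, self.episodes = 0, 0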
# Initialise dataset D with S seed episodes (collected here by manual play)
mapping = {
    ord("a"): 4,
    ord("d"): 3,
    ord("s"): 5,
    ord("q"): 2,
    ord("e"): 1,
    32: 0,  # space bar
    ord("l"): 0
}
for s in range(1, args.seed_episodes + 1):
    observation, done, t = env.reset(), False, 0
    while not done:
        # Show the current frame so the user can choose the next action
        x = postprocess_observation(observation.numpy(), args.bit_depth)
        cv2.imshow("Tetris", cv2.resize(x[0].transpose(1, 2, 0), (512, 512)))
        key = cv2.waitKey()
        if key == 27:  # Esc aborts the episode
            break
        idx = mapping[key]
        action = np.zeros((6, ))
        action[idx] = 1  # one-hot action vector
        t += 1
        # action = env.sample_random_action()
        next_observation, reward, done = env.step(action)
        print(reward, done)
        if t > 2000:  # cap the episode length
            done = True
        D.append(observation, action, reward, done)
        observation = next_observation