class AtariWrapper():
    """
    ALE wrapper that tries to mimic the options in the DQN paper, including the
    preprocessing (except resizing/cropping).
    """
    action_words = [
        'NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', "UPRIGHT", "UPLEFT",
        "DOWNRIGHT", "DOWNLEFT"
    ]
    _action_set = [0, 2, 3, 4, 5, 6, 7, 8, 9]
    # Valid actions for ALE.
    # Possible actions exposed by the wrapper are just indices 0..num_valid_actions-1;
    # we still need to map from these indices to the ALE action codes when stepping.

    possible_actions = list(range(len(_action_set)))

    def __init__(self,
                 rom_path,
                 seed=123,
                 frameskip=4,
                 show_display=False,
                 stack_num_states=4,
                 concatenate_state_every=4):
        """

        Parameters:
            frameskip: Either an int or a tuple (indicating a random range to
                choose from, with the top value excluded). Also known as action repeat.

            stack_num_states: Number of frames (channels) stacked into one state.

            concatenate_state_every: After how many frames one channel is appended to
                the state. Counted in absolute frames, independent of frameskip.
        """

        self.stack_num_states = stack_num_states
        self.concatenate_state_every = concatenate_state_every

        self.game_path = rom_path
        if not os.path.exists(self.game_path):
            raise IOError('You asked for ROM %s but that path does not exist' %
                          self.game_path)
        self.frameskip = frameskip

        try:
            self.ale = ALEInterface()
        except Exception as e:
            print(
                "ALEInterface could not be loaded. ale_python_interface import failed"
            )
            raise e

        # Set some default options
        self.ale.setInt(b'random_seed', seed)
        self.ale.setBool(b'sound', False)
        self.ale.setBool(b'display_screen', show_display)
        self.ale.setFloat(b'repeat_action_probability', 0.)

        # Load the rom
        self.ale.loadROM(self.game_path)

        (self.screen_width, self.screen_height) = self.ale.getScreenDims()
        self.latest_frame_fifo = deque(
            maxlen=2)  # Holds the two most recent frames, for max-pooling.
        self.state_fifo = deque(maxlen=stack_num_states)

    def _step(self, a, force_noop=False):
        """Perform one step of the environment.
        Automatically repeats the step self.frameskip number of times

        parameters:
            force_noop: Force it to perform a no-op ignoring the action supplied.
        """
        assert a in self.possible_actions + [0]

        if force_noop:
            action, num_steps = 0, 1
        else:
            action = self._action_set[a]
            if isinstance(self.frameskip, int):
                num_steps = self.frameskip
            else:
                num_steps = np.random.randint(self.frameskip[0],
                                              self.frameskip[1])

        reward = 0.0
        for i in range(num_steps):
            reward += self.ale.act(action)
            cur_frame = self.observe_raw(get_rgb=True)
            cur_frame_cropped = self.crop_frame(cur_frame)
            self.latest_frame_fifo.append(cur_frame_cropped)

            if i % self.concatenate_state_every == 0:
                curmax_frame = np.amax(self.latest_frame_fifo, axis=0)
                frame_lumi = self.convert_to_gray(curmax_frame)
                self.state_fifo.append(frame_lumi)

        # Transpose so we get HxWxC instead of CxHxW
        self.current_frame = np.array(np.transpose(self.state_fifo, (1, 2, 0)))
        self.current_frame = cv2.resize(self.current_frame, (84, 84))
        return self.current_frame, reward, self.ale.game_over(), {
            "ale.lives": self.ale.lives()
        }

    def step(self, *args, **kwargs):
        """Performs one step of the environment
        """
        lives_before = self.ale.lives()
        next_state, reward, done, info = self._step(*args, **kwargs)
        lives_after = self.ale.lives()

        # End the episode when a life is lost
        if lives_before > lives_after:
            done = True

        return next_state, reward, done, info

    def observe_raw(self, get_rgb=False):
        """Observe either RGB or Gray frames.
        Initialzing arrays forces it to not modify stale pointers
        """
        if get_rgb:
            cur_frame_rgb = np.zeros(
                (self.screen_height, self.screen_width, 3), dtype=np.uint8)
            self.ale.getScreenRGB(cur_frame_rgb)
            return cur_frame_rgb
        else:
            cur_frame_gray = np.zeros((self.screen_height, self.screen_width),
                                      dtype=np.uint8)
            self.ale.getScreenGrayscale(cur_frame_gray)
            return cur_frame_gray

    def crop_frame(self, frame):
        """Simply crops a frame. Does nothing by default.
        """
        return frame

    def convert_to_gray(self, img):
        """Get Luminescence channel
        """
        img_f = np.float32(img)
        img_lumi = 0.299 * img_f[:, :, 0] + \
                   0.587 * img_f[:, :, 1] + \
                   0.114 * img_f[:, :, 2]
        return np.uint8(img_lumi)

    def reset(self):
        """Reset the game
        """
        self.ale.reset_game()
        s = self.observe_raw(get_rgb=True)
        s = self.crop_frame(s)

        # Populate missing frames with blank ones.
        for _ in range(self.stack_num_states - 1):
            self.state_fifo.append(np.zeros(shape=(s.shape[0], s.shape[1])))

        self.latest_frame_fifo.append(s)

        # Push the latest frame.
        frame_lumi = self.convert_to_gray(s)
        self.state_fifo.append(frame_lumi)

        self.state = np.transpose(self.state_fifo, (1, 2, 0))
        self.state = cv2.resize(self.state, (84, 84))
        return self.state

    def get_action_meanings(self):
        """Return in text what the actions correspond to.
        """
        return [ACTION_MEANING[i] for i in self._action_set]

    def save_state(self):
        """Saves the current state and returns a identifier to saved state
        """
        return self.ale.cloneSystemState()

    def restore_state(self, ident):
        """Restore game state
        Restores the saved state of the system and perform a no-op
        so a new frame can be generated incase a restore is followed
        by an observe()
        """

        self.ale.restoreSystemState(ident)
        self.step(0, force_noop=True)
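
A brief usage sketch for the wrapper above. This is illustrative only: the ROM path is an assumption, and the module-level imports the class relies on (os, numpy as np, cv2, collections.deque, ALEInterface) are assumed to be in scope.

env = AtariWrapper(rom_path='roms/breakout.bin', frameskip=4)
state = env.reset()                                  # stacked 84x84 observation
done, total_reward = False, 0.0
while not done:
    a = np.random.choice(env.possible_actions)       # random policy, purely for illustration
    state, reward, done, info = env.step(a)
    total_reward += reward
print('episode reward:', total_reward, 'lives left:', info['ale.lives'])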
Example #2
class GameState(object):
  def __init__(self, rand_seed, options, display=False, no_op_max=30, thread_index=-1):
    if options.use_gym:
      self._display = options.display
    else:
      self.ale = ALEInterface()
      self.ale.setInt(b'random_seed', rand_seed)
      self.ale.setFloat(b'repeat_action_probability', options.repeat_action_probability)
      self.ale.setInt(b'frame_skip', options.frames_skip_in_ale)
      self.ale.setBool(b'color_averaging', options.color_averaging_in_ale)
    self._no_op_max = no_op_max
 
    self.options = options
    self.color_maximizing = options.color_maximizing_in_gs
    self.color_averaging  = options.color_averaging_in_gs
    self.color_no_change  = options.color_no_change_in_gs
    # for screen output in _process_frame()
    self.thread_index = thread_index
    self.record_gs_screen_dir = self.options.record_gs_screen_dir
    self.episode_record_dir = None
    self.episode = 1
    self.rooms = np.zeros((24), dtype=int)
    self.prev_room_no = 1
    self.room_no = 1
    self.new_room = -1

    if options.use_gym:
      # see https://github.com/openai/gym/issues/349
      def _seed(self, seed=None):
        self.ale.setFloat(b'repeat_action_probability', options.repeat_action_probability)
        from gym.utils import seeding
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below
        # 2**31.
        seed2 = seeding.hash_seed(seed1 + 1) % 2 ** 31
        # Empirically, we need to seed before loading the ROM.
        self.ale.setInt(b'random_seed', seed2)
        self.ale.loadROM(self.game_path)
        return [seed1, seed2]
      
      AtariEnv._seed = _seed
      self.gym = gym.make(options.gym_env)
      self.ale = self.gym.ale
      print(self.gym.action_space)
    else:
      if display:
        self._setup_display()
    
      self.ale.loadROM(options.rom.encode('ascii'))

      # collect minimal action set
      self.real_actions = self.ale.getMinimalActionSet()
      print("real_actions=", self.real_actions)
      if (len(self.real_actions) != self.options.action_size):
        print("***********************************************************")
        print("* action_size != len(real_actions)")
        print("***********************************************************")
        sys.exit(1)

    # height=210, width=160
    self._screen = np.empty((210 * 160 * 1), dtype=np.uint8)
    if (not options.use_gym) and (self.color_maximizing or self.color_averaging or self.color_no_change):
      self._screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
      self._prev_screen_RGB = np.empty((210 *  160 * 3), dtype=np.uint8)
    self._have_prev_screen_RGB = False

    # for pseudo-count
    self.psc_use = options.psc_use
    if options.psc_use:
      psc_beta = options.psc_beta
      if options.psc_beta_list is not None:
        psc_beta = options.psc_beta_list[thread_index]
      psc_pow = options.psc_pow
      if options.psc_pow_list is not None:
        psc_pow = options.psc_pow_list[thread_index]
      print("[DIVERSITY]th={}:psc_beta={}, psc_pow={}".format(thread_index, psc_beta, psc_pow))
      self.psc_frsize = options.psc_frsize
      self.psc_k = options.psc_frsize ** 2
      self.psc_range_k = np.array([i for i in range(self.psc_k)])
      self.psc_rev_pow = 1.0 / psc_pow
      self.psc_alpha = math.pow(0.1, psc_pow)
      self.psc_beta = psc_beta
      self.psc_maxval = options.psc_maxval
      if options.psc_multi:
        self.psc_vcount = np.zeros((24, self.psc_maxval + 1, self.psc_k), dtype=np.float64)
        self.psc_n = np.zeros(24, dtype=np.float64)
      else:
        self.psc_vcount = np.zeros((self.psc_maxval + 1, self.psc_k), dtype=np.float64)
        self.psc_n = 0

    self.reset()

  # for pseudo-count
  def psc_set_psc_info(self, psc_info):
    if psc_info is not None:
      self.psc_vcount = np.array(psc_info["psc_vcount"], dtype=np.float64)
      if options.psc_multi:
        self.psc_n = np.array(psc_info["psc_n"], dtype=np.float64)
      else:
        self.psc_n = psc_info["psc_n"]
 
  def psc_set_gs_info(self, gs_info):
    self.psc_vcount = np.array(gs_info["psc_vcount"], dtype=np.float64)
    if options.psc_multi:
      self.psc_n = np.array(gs_info["psc_n"], dtype=np.float64)
    else:
      self.psc_n = gs_info["psc_n"]
    self.rooms = gs_info["rooms"]
    self.episode = gs_info["episode"]
 
  # for pseudo-count
  #@profile
  def psc_add_image(self, psc_image):
    if psc_image.dtype != np.dtype('uint8'):
      print("Internal ERROR in dtype")
      sys.exit(1)
    range_k = self.psc_range_k
    if options.psc_multi:
      room_no = self.room_no
      n = self.psc_n[room_no]
    else:
      n = self.psc_n
    if n > 0:
      nr = (n + 1.0)/n
      if options.psc_multi:
        vcount = self.psc_vcount[room_no, psc_image, range_k]
        self.psc_vcount[room_no, psc_image, range_k] += 1.0
      else:
        vcount = self.psc_vcount[psc_image, range_k]
        self.psc_vcount[psc_image, range_k] += 1.0
      r_over_rp = np.prod(nr * vcount / (1.0 + vcount))
      denominator = 1.0 - r_over_rp
      if denominator <= 0.0:
        print("psc_add_image: denominator <= 0.0 : denominator=", denominator)
        denominator = 1.0e-20
      psc_count = r_over_rp / denominator
      psc_reward = self.psc_beta / math.pow(psc_count + self.psc_alpha, self.psc_rev_pow)
    else:
      if options.psc_multi:
        self.psc_vcount[room_no, psc_image, range_k] += 1.0
      else:
        self.psc_vcount[psc_image, range_k] += 1.0
      psc_count = 0.0
      psc_reward = self.psc_beta / math.pow(psc_count + self.psc_alpha, self.psc_rev_pow)
    
    if options.psc_multi:
      self.psc_n[room_no] += 1.0
    else:
      self.psc_n += 1

    if n % (self.options.score_log_interval * 10) == 0:
      print("[PSC]th={},psc_n={}:room={},psc_reward={:.8f},RM{:02d}".format(self.thread_index, n, self.room_no, psc_reward, self.room_no))

    return psc_reward   
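
  # How psc_add_image derives its bonus (in the spirit of the pseudo-count
  # exploration bonus of Bellemare et al., 2016): with per-pixel visit counts
  # c_i (vcount) and n total observed frames, r_over_rp equals
  # prod_i[(c_i / n) / ((c_i + 1) / (n + 1))] = rho(x) / rho'(x), the ratio of
  # the image probability before and after recording it.  The pseudo-count is
  # psc_count = r_over_rp / (1 - r_over_rp), and the reward bonus is
  # psc_beta / (psc_count + psc_alpha) ** (1 / psc_pow).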

  # for montezuma's revenge
  #@profile
  def update_montezuma_rooms(self):
    ram = self.ale.getRAM()
    # room_no = ram[0x83]
    room_no = ram[3]
    self.rooms[room_no] += 1
    if self.rooms[room_no] == 1:
      print("[PSC]th={} @@@ NEW ROOM({}) VISITED: visit counts={}".format(self.thread_index, room_no, self.rooms))
      self.new_room = room_no
    self.prev_room_no = self.room_no
    self.room_no = room_no

  def set_record_screen_dir(self, record_screen_dir):
    if options.use_gym:
      print("record_screen_dir", record_screen_dir)
      self.gym.monitor.start(record_screen_dir)
      self.reset()
    else:
      print("record_screen_dir", record_screen_dir)
      self.ale.setString(b'record_screen_dir', str.encode(record_screen_dir))
      self.ale.loadROM(self.options.rom.encode('ascii'))
      self.reset()

  def close_record_screen_dir(self):
    if options.use_gym:
      self.gym.monitor.close()
    else:
      pass

  #@profile
  def _process_action(self, action):
    if options.use_gym:
      observation, reward, terminal, _ = self.gym.step(action)
      return reward, terminal
    else:
      reward = self.ale.act(action)
      terminal = self.ale.game_over()
      self.terminal = terminal
      self._have_prev_screen_RGB = False
      return reward, terminal
    
  #@profile
  def _process_frame(self, action, reshape):
    if self.terminal:
      reward = 0
      terminal = True
    elif options.use_gym:
      observation, reward, terminal, _ = self.gym.step(action)
      self._screen_RGB = observation
      self.terminal = terminal
    else:
      # get previous screen
      if (self.color_maximizing or self.color_averaging) \
              and not self._have_prev_screen_RGB:
        self.ale.getScreenRGB(self._prev_screen_RGB)
        self._have_prev_screen_RGB = True

      # make action
      reward = self.ale.act(action)
      terminal = self.ale.game_over()
      self.terminal = terminal

    # screen shape is (210, 160, 1)
    if self.color_maximizing or self.color_averaging: # impossible in gym
      self.ale.getScreenRGB(self._screen_RGB)
      if self._have_prev_screen_RGB:
        if self.color_maximizing:
          screen = np.maximum(self._prev_screen_RGB, self._screen_RGB)
        else: # self.color_averaging:
          screen = np.mean((self._prev_screen_RGB, self._screen_RGB), axis=0).astype(np.uint8)
      else:
        screen = self._screen_RGB
      screen = screen.reshape((210, 160, 3))
      self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
      # swap screen_RGB
      swap_screen_RGB = self._prev_screen_RGB
      self._prev_screen_RGB = self._screen_RGB
      self._screen_RGB = swap_screen_RGB
      self._have_prev_screen_RGB = True
    elif self.color_no_change:
      if not options.use_gym:
        self.ale.getScreenRGB(self._screen_RGB)
      screen = self._screen_RGB
      screen = screen.reshape((210, 160, 3))
      self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
    else:
      self.ale.getScreenGrayscale(self._screen)
    
    # reshape it into (210, 160)
    reshaped_screen = np.reshape(self._screen, (210, 160))
    
    # set uncropped frame for screen output
    self.uncropped_screen = reshaped_screen

    # resize to height=110, width=84
    if self.options.crop_frame:
      resized_screen = cv2.resize(reshaped_screen, (84, 110))
      x_t = resized_screen[18:102,:]
    else:
      x_t = cv2.resize(reshaped_screen, (84, 84))
    x_t_uint8 = x_t
    
    if reshape:
      x_t = np.reshape(x_t, (84, 84, 1))
    x_t = x_t.astype(np.float32)
    x_t *= (1.0/255.0)
    return reward, terminal, x_t, x_t_uint8

  #@profile
  def pseudo_count(self, x_t):
    # update covered rooms
    if self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0":
      self.update_montezuma_rooms()
    
    psc_reward = 0.0
    if self.psc_use:
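      # downsample the frame to psc_frsize x psc_frsize and quantize pixel
      # values from 0..255 down to 0..psc_maxval before counting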
      psc_image = cv2.resize(x_t, (self.psc_frsize, self.psc_frsize))
      psc_image = np.reshape(psc_image, (self.psc_k))
      psc_image = np.uint8(psc_image * (self.psc_maxval / 255.0))
      psc_reward = self.psc_add_image(psc_image)
    
    return psc_reward
    
  def _setup_display(self):
    if sys.platform == 'darwin':
      import pygame
      pygame.init()
      self.ale.setBool(b'sound', False)
    elif sys.platform.startswith('linux'):
      self.ale.setBool(b'sound', True)
    self.ale.setBool(b'display_screen', True)

  def reset(self):
    if options.use_gym:
      self.gym.reset()
    else:
      self.ale.reset_game()
    
    # randomize initial state
    if self._no_op_max > 0:
      no_op = np.random.randint(0, self._no_op_max // self.options.frames_skip_in_ale + 1)
      if options.use_gym:
        no_op = no_op // 3 # gym skips 2-4 frames randomly
      for _ in range(no_op):
        if options.use_gym:
          self.gym.step(0)
        else:
          self.ale.act(0)

    self._have_prev_screen_RGB = False
    self.terminal = False
    _, _, x_t, x_t_uint8 = self._process_frame(0, False)
    _ = self.pseudo_count(x_t_uint8)
    
    self.reward = 0
    self.s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    self.lives = float(self.ale.lives())
    self.initial_lives = self.lives

    if (self.thread_index == 0) and (self.record_gs_screen_dir is not None):
      episode_dir = "episode{:03d}".format(self.episode)
      self.episode_record_dir = os.path.join(self.record_gs_screen_dir, episode_dir)
      os.makedirs(self.episode_record_dir)
      self.episode += 1
      self.stepNo = 1
      print("game_state: writing screen images to ", self.episode_record_dir)

    self.new_room = -1
    
  #@profile
  def process(self, action):
    if options.use_gym:
      real_action = action
      if self._display:
        self.gym.render()
    else:
      # convert original 18 action index to minimal action set index
      real_action = self.real_actions[action]
    reward = 0

    if self.options.stack_frames_in_gs:
      s_t1 = []
      terminal = False
      for _ in range(self.options.frames_skip_in_gs):
        if not terminal:
          r, t, x_t1, x_t_uint8 = self._process_frame(real_action, False)
          reward = reward + r
          terminal = terminal or t
        s_t1.append(x_t1)
      self.s_t1 = np.stack(s_t1, axis = 2)
      # for _ in range(self.options.frames_skip_in_gs):
      #   r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
      #   reward = reward + r
      #   self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2)
      #   if t:
      #     break
    else:
      # altered for speed up (reduce getScreen and color_maximizing)
      for _ in range(self.options.frames_skip_in_gs - 1):
        r, t = self._process_action(real_action)
        reward = reward + r
        if t:
          self.terminal = True
          break

      r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
      reward = reward + r
      self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2)

    self.reward = reward
    self.terminal = t

    self.psc_reward = self.pseudo_count(x_t_uint8)
    self.lives = float(self.ale.lives())

    if self.episode_record_dir is not None:
      filename = "{:06d}.png".format(self.stepNo)
      filename = os.path.join(self.episode_record_dir, filename)
      self.stepNo += 1
      screen_image = x_t1.reshape((84, 84)) * 255.
      cv2.imwrite(filename, screen_image)


  def update(self):
    self.s_t = self.s_t1
Example #3
class VPG(object):
    
    def __init__(self):
        
        self.ale = ALEInterface();
        self.ale.loadROM(get_game_path('boxing'));
        self.legal_actions = self.ale.getMinimalActionSet();
        self.policyModel = PolicyModel(self.legal_actions);
        #load model
        if True == os.path.exists('model'): self.policyModel.load_weights('./model/vpg_model');
        self.status_size_ = 4
        self.gamma_ = 1; # the reward is too small
        
    def status2tensor(self,status):
        
        status = tf.convert_to_tensor(status, dtype = tf.float32);
        status = tf.transpose(status,[1,2,0]);
        status = tf.expand_dims(status,0);
        return status;
        
    def preprocess(self, image):
        
        frame = image[25:185,:,:];
        frame = cv2.resize(frame,(84,84)) / 255.0;
        return frame;
        
    def PlayOneEpisode(self):
        
        self.ale.reset_game();
        trajectory = list();
        status = list();
        # initial status
        for i in range(self.status_size_):
            current_frame = self.preprocess(self.ale.getScreenGrayscale());
            status.append(current_frame);
            assert False == self.ale.game_over();
        # play until game over
        while False == self.ale.game_over():
            # display screen
            cv2.imshow('screen',self.ale.getScreenRGB());
            cv2.waitKey(10);
            # choose action 
            input = self.status2tensor(status);
            V, P = self.policyModel(input);
            action_index = tf.random.categorical(P,1);
            reward = 0;
            for i in range(self.status_size_):
                reward += self.ale.act(self.legal_actions[action_index]);
            current_frame = self.preprocess(self.ale.getScreenGrayscale());
            status.append(current_frame);
            game_over = self.ale.game_over();
            trajectory.append((status[0:self.status_size_],action_index,reward,status[1:],game_over));
            status = status[1:];
        total_reward = 0;
        for status in reversed(trajectory):
            total_reward = status[2] + self.gamma_ * total_reward;
        return trajectory, total_reward;
    
    def train(self, loop_time = 1000):
        
        optimizer = tf.keras.optimizers.Adam(1e-3);
        # setup checkpoint and log utils
        checkpoint = tf.train.Checkpoint(model = self.policyModel, optimizer = optimizer, optimizer_step = optimizer.iterations);
        checkpoint.restore(tf.train.latest_checkpoint('checkpoints_vpg'));
        log = tf.summary.create_file_writer('checkpoints_vpg');
        for i in range(loop_time):
            trajectory, total_reward = self.PlayOneEpisode();
            avg_policy_loss = tf.keras.metrics.Mean(name = 'policy loss', dtype = tf.float32);
            avg_value_loss = tf.keras.metrics.Mean(name = 'value loss', dtype = tf.float32);
            for status in trajectory:
                # policy loss
                with tf.GradientTape() as tape:
                    Vt, Pt = self.policyModel(self.status2tensor(status[0]));
                    Vtp1, Ptp1 = self.policyModel(self.status2tensor(status[3]));
                    action_mask = tf.one_hot(status[1],len(self.legal_actions));
                    log_probs = tf.math.reduce_sum(action_mask * tf.math.log(Pt), axis = 1);
                    advantage = -Vt + status[2] + self.gamma_ * Vtp1;
                    policy_loss = -tf.math.reduce_mean(log_probs * advantage);
                    value_loss = tf.math.squared_difference(Vt, total_reward);
                    loss = policy_loss + value_loss;
                    avg_policy_loss.update_state(policy_loss);
                    avg_value_loss.update_state(value_loss);
                # write loss to summary
                if tf.equal(optimizer.iterations % 100, 0):
                    with log.as_default():
                        tf.summary.scalar('policy loss',avg_policy_loss.result(), step = optimizer.iterations);
                        tf.summary.scalar('value loss',avg_value_loss.result(), step = optimizer.iterations);
                    avg_policy_loss.reset_states();
                    avg_value_loss.reset_states();
                # train policy and value
                grads = tape.gradient(loss,self.policyModel.variables);
                optimizer.apply_gradients(zip(grads,self.policyModel.variables));
            # save model every episode
            checkpoint.save(os.path.join('checkpoints_vpg','ckpt'));
        # save final model
        if False == os.path.exists('model'): os.mkdir('model');
        #tf.saved_model.save(self.policyModel,'./model/vpg_model');
        self.policyModel.save_weights('./model/vpg_model');
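
For clarity, the per-transition loss that train() assembles can be reproduced in isolation on dummy tensors. The sketch below is illustrative only: the probabilities, values, reward, and return are made-up numbers, and the terms mirror the policy-gradient and value-regression losses above.

import tensorflow as tf

probs = tf.constant([[0.2, 0.5, 0.3]])                    # policy output P(a|s), batch of 1
v_t, v_tp1 = tf.constant([[0.8]]), tf.constant([[1.0]])   # value head outputs V(s), V(s')
reward, gamma, episode_return = 1.0, 1.0, 5.0             # dummy transition data
action_mask = tf.one_hot([1], 3)                          # action index 1 was taken
log_prob = tf.math.reduce_sum(action_mask * tf.math.log(probs), axis=1)
advantage = -v_t + reward + gamma * v_tp1                 # one-step TD advantage estimate
policy_loss = -tf.math.reduce_mean(log_prob * advantage)
value_loss = tf.math.squared_difference(v_t, episode_return)
loss = policy_loss + value_loss
print(loss)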
Example #4
class DQN(object):
    
    SHOW = False;
    SCALE = 10000;
    MEMORY_LIMIT = 4 * SCALE;
    BATCH_SIZE  = 32;
    BURNIN_STEP = 5 * SCALE;
    TRAIN_FREQUENCY = 4;
    UPDATE_FREQUENCY = SCALE;
    STATUS_SIZE = 4;
    GAMMA = 0.99;
    TEST_INTERVAL = 1000;
    ep_end = 0.1;
    ep_start = 1.;
    ep_end_t = MEMORY_LIMIT;
    learn_start = 5 * SCALE;
    
    def __init__(self):
        
        # ale related members
        self.ale = ALEInterface();
        self.ale.loadROM(get_game_path('boxing'));
        self.legal_actions = self.ale.getMinimalActionSet();
        self.status = list();
        # use qnet_latest to hold the latest updated weights 
        self.qnet_latest = QNet(len(self.legal_actions));
        # use qnet_target to hold the target model weights
        self.qnet_target = QNet(len(self.legal_actions));
        if True == os.path.exists('model'):
            self.qnet_latest.load_weights('./model/dqn_model');
        # use qnet_target as the rollout model
        self.qnet_target.set_weights(self.qnet_latest.get_weights());
        # loss
        self.loss = Loss(len(self.legal_actions), self.GAMMA);
        # status transition memory
        self.memory = list();
        # optimizer
        self.optimizer = tf.keras.optimizers.Adam(tf.keras.optimizers.schedules.ExponentialDecay(0.00025, 5 * self.SCALE, 0.96));
        # episode count
        self.ep_count = 0;

    def convertImgToTensor(self,status):
        
        status = tf.constant(status, dtype = tf.float32); # status.shape = (4, 48, 48)
        status = tf.transpose(status, (1, 2, 0)); # status.shape = (48, 48, 4)
        status = tf.expand_dims(status, axis = 0); # status.shape = (1, 48, 48, 4)
        return status;
    
    def convertBatchToTensor(self,batch):
        
        st, at, rt, stp1, et = zip(*batch);
        # st.shape = batchsize*[1,48,48,4]
        st = tf.squeeze(tf.concat(st, axis = 0));
        at = tf.squeeze(tf.concat(at, axis = 0));
        rt = tf.squeeze(tf.concat(rt, axis = 0));
        stp1 = tf.squeeze(tf.concat(stp1, axis = 0));
        et = tf.squeeze(tf.concat(et, axis = 0));
        return (st,at,rt,stp1,et);
    
    def getObservation(self):
        
        image = self.ale.getScreenGrayscale();
        frame = image[25:185,:,:];
        frame = cv2.resize(frame,(84,84)) / 255.0;
        return frame;
    
    def remember(self, transition):
        
        if len(self.memory) > self.MEMORY_LIMIT: self.memory.pop(0);
        self.memory.append(transition);
    
    def reset_game(self):
        
        self.ale.reset_game();
        self.status = list();
        for i in range(self.STATUS_SIZE):
            current_frame = self.getObservation();
            self.status.append(current_frame);
            assert False == self.ale.game_over();

    def rollout(self):
        
        if self.ale.game_over() or len(self.status) != self.STATUS_SIZE:
            self.reset_game();
        if self.SHOW:
            # display screen
            cv2.imshow('screen',self.ale.getScreenRGB());
            cv2.waitKey(1);
        # choose action 
        st = self.convertImgToTensor(self.status);
        Qt = self.qnet_target(st); # Qt.shape = (1, action_num)
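        # epsilon-greedy schedule: epsilon stays at ep_start (1.0) until learn_start
        # optimizer iterations have passed, then anneals linearly to ep_end (0.1)
        # over the next ep_end_t iterations.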
        ep = self.ep_end + max(0., (self.ep_start - self.ep_end) * (self.ep_end_t - max(0., int(self.optimizer.iterations) - self.learn_start)) / self.ep_end_t);
        if np.random.uniform(low = 0., high = 1., size = ()) < ep:
            # explore at first
            action_index = tf.constant(np.random.randint(low = 0, high = len(self.legal_actions), size = (1,1)), dtype = tf.int64); # action_index.shape = (1, 1)
        else:
            # exploit at last
            action_index = tf.random.categorical(tf.keras.layers.Softmax(axis = -1)(Qt),1); # action_index.shape = (1, 1)
        reward = 0;
        for i in range(self.STATUS_SIZE):
            reward += self.ale.act(self.legal_actions[action_index]);
        self.status.append(self.getObservation());
        self.status.pop(0);
        stp1 = self.convertImgToTensor(self.status);
        game_over = 1. if self.ale.game_over() else 0.;
        self.remember((st, action_index, float(reward), stp1, game_over));
        return game_over;
    
    def train(self, loop_time = 10000000):

        # setup checkpoint and log utils
        checkpoint = tf.train.Checkpoint(model = self.qnet_target, optimizer = self.optimizer);
        checkpoint.restore(tf.train.latest_checkpoint('checkpoints_dqn'));
        log = tf.summary.create_file_writer('checkpoints_dqn');
        avg_reward = tf.keras.metrics.Mean(name = 'reward', dtype = tf.float32);
        self.reset_game();
        for i in range(loop_time):
            game_over = self.rollout();
            if game_over: self.ep_count += 1;
            if game_over and self.ep_count % self.TEST_INTERVAL == 0:
                # evaluate the updated model
                for _ in range(10): avg_reward.update_state(self.eval(steps = 1000));
                with log.as_default():
                    tf.summary.scalar('reward', avg_reward.result(), step = self.optimizer.iterations);
                print('Step #%d Reward: %.6f lr: %.6f' % (self.optimizer.iterations, avg_reward.result(), self.optimizer._hyper['learning_rate'](self.optimizer.iterations)));
                avg_reward.reset_states();
            # do nothing if collected samples are not enough
            if i < self.BURNIN_STEP or len(self.memory) < self.BATCH_SIZE:
                continue;
            # update qnet_latest at certain frequency
            if i % self.TRAIN_FREQUENCY == 0:
                avg_loss = tf.keras.metrics.Mean(name = 'loss', dtype = tf.float32);
                # random sample from memory
                batch = random.sample(self.memory, self.BATCH_SIZE);
                st, at, rt, stp1, et = self.convertBatchToTensor(batch);
                # policy loss
                with tf.GradientTape() as tape:
                    Qt = self.qnet_latest(st);
                    Qtp1 = self.qnet_latest(stp1);
                    loss = self.loss([Qt, Qtp1, rt, at, et]);
                    avg_loss.update_state(loss);
                # write loss to summary
                if tf.equal(self.optimizer.iterations % 100, 0):
                    with log.as_default():
                        tf.summary.scalar('loss',avg_loss.result(), step = self.optimizer.iterations);
                    avg_loss.reset_states();
                # train qnet_latest
                grads = tape.gradient(loss,self.qnet_latest.trainable_variables);
                self.optimizer.apply_gradients(zip(grads,self.qnet_latest.trainable_variables));
            # save model every episode
            if i % self.UPDATE_FREQUENCY == 0:
                self.qnet_target.set_weights(self.qnet_latest.get_weights());
                checkpoint.save(os.path.join('checkpoints_dqn','ckpt'));
        # save final model
        if False == os.path.exists('model'): os.mkdir('model');
        #tf.saved_model.save(self.qnet,'./model/vpg_model');
        self.qnet_target.save_weights('./model/dqn_model');

    def eval(self, steps = None):
        self.reset_game(); # also repopulates self.status with fresh frames
        # play one episode
        total_reward = 0;
        step = 0;
        while False == self.ale.game_over() and (steps is None or step < steps):
            if self.SHOW:
                # display screen
                cv2.imshow('screen',self.ale.getScreenRGB());
                cv2.waitKey(1);
            st = self.convertImgToTensor(self.status);
            Qt = self.qnet_latest(st);
            action_index = tf.random.categorical(tf.keras.layers.Softmax(axis = -1)(Qt),1);
            for i in range(self.STATUS_SIZE):
                total_reward += self.ale.act(self.legal_actions[action_index]);
            self.status.append(self.getObservation());
            self.status.pop(0);
            step += 1;
        return total_reward;
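
A minimal way to drive the DQN class above, assuming the QNet, Loss, and get_game_path helpers it references are importable and a Boxing ROM is available; this is an illustrative sketch, not part of the original example.

if __name__ == '__main__':
    agent = DQN()
    agent.train(loop_time = 200000)    # interleaves rollouts, replay updates and target-network syncs
    print('evaluation reward:', agent.eval(steps = 1000))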
Example #5
class GameState(object):
    def __init__(self,
                 rand_seed,
                 options,
                 display=False,
                 no_op_max=30,
                 thread_index=-1):
        if options.use_gym:
            self._display = options.display
        else:
            self.ale = ALEInterface()
            self.ale.setInt(b'random_seed', rand_seed)
            self.ale.setFloat(b'repeat_action_probability',
                              options.repeat_action_probability)
            self.ale.setInt(b'frame_skip', options.frames_skip_in_ale)
            self.ale.setBool(b'color_averaging',
                             options.color_averaging_in_ale)
        self._no_op_max = no_op_max

        self.options = options
        self.color_maximizing = options.color_maximizing_in_gs
        self.color_averaging = options.color_averaging_in_gs
        self.color_no_change = options.color_no_change_in_gs
        # for screen output in _process_frame()
        self.thread_index = thread_index
        self.record_gs_screen_dir = self.options.record_gs_screen_dir
        self.episode_record_dir = None
        self.episode = 1
        self.rooms = np.zeros((24), dtype=int)
        self.prev_room_no = 1
        self.room_no = 1
        self.new_room = -1

        if options.use_gym:
            # see https://github.com/openai/gym/issues/349
            def _seed(self, seed=None):
                self.ale.setFloat(b'repeat_action_probability',
                                  options.repeat_action_probability)
                from gym.utils import seeding
                self.np_random, seed1 = seeding.np_random(seed)
                # Derive a random seed. This gets passed as a uint, but gets
                # checked as an int elsewhere, so we need to keep it below
                # 2**31.
                seed2 = seeding.hash_seed(seed1 + 1) % 2**31
                # Empirically, we need to seed before loading the ROM.
                self.ale.setInt(b'random_seed', seed2)
                self.ale.loadROM(self.game_path)
                return [seed1, seed2]

            AtariEnv._seed = _seed
            self.gym = gym.make(options.gym_env)
            self.ale = self.gym.ale
            print(self.gym.action_space)
        else:
            if display:
                self._setup_display()

            self.ale.loadROM(options.rom.encode('ascii'))

            # collect minimal action set
            self.real_actions = self.ale.getMinimalActionSet()
            print("real_actions=", self.real_actions)
            if (len(self.real_actions) != self.options.action_size):
                print(
                    "***********************************************************"
                )
                print("* action_size != len(real_actions)")
                print(
                    "***********************************************************"
                )
                sys.exit(1)

        # height=210, width=160
        self._screen = np.empty((210 * 160 * 1), dtype=np.uint8)
        if (not options.use_gym) and (self.color_maximizing
                                      or self.color_averaging
                                      or self.color_no_change):
            self._screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
            self._prev_screen_RGB = np.empty((210 * 160 * 3), dtype=np.uint8)
        self._have_prev_screen_RGB = False

        # for pseudo-count
        self.psc_use = options.psc_use
        if options.psc_use:
            self.psc_frsize = options.psc_frsize
            self.psc_k = options.psc_frsize**2
            self.psc_rev_pow = 1.0 / options.psc_pow
            self.psc_alpha = math.pow(0.1, options.psc_pow)
            self.psc_beta = options.psc_beta
            self.psc_maxval = options.psc_maxval
            self.psc_vcount = np.zeros((self.psc_k, self.psc_maxval + 1),
                                       dtype=np.float64)
            self.psc_n = 0

        self.reset()

    # for pseudo-count
    def psc_set_psc_info(self, psc_info):
        if psc_info["psc_n"] != 0:
            self.psc_vcount = np.array(psc_info["psc_vcount"],
                                       dtype=np.float64)
            self.psc_n = psc_info["psc_n"]

    def psc_set_gs_info(self, gs_info):
        self.psc_vcount = np.array(gs_info["psc_vcount"], dtype=np.float64)
        self.psc_n = gs_info["psc_n"]
        self.rooms = gs_info["rooms"]
        self.episode = gs_info["episode"]

    # for pseudo-count
    def psc_add_image(self, psc_image):
        if psc_image.dtype != np.dtype('uint8'):
            print("Internal ERROR in dtype")
            sys.exit(1)
        k = self.psc_k
        n = self.psc_n
        if n > 0:
            nr = (n + 1.0) / n
            vcount = self.psc_vcount[range(k), psc_image]
            self.psc_vcount[range(k), psc_image] += 1.0
            r_over_rp = np.prod(nr * vcount / (1.0 + vcount))
            denominator = 1.0 - r_over_rp
            if denominator <= 0.0:
                print("psc_add_image: denominator <= 0.0 : denominator=",
                      denominator)
                denominator = 1.0e-20
            psc_count = r_over_rp / denominator
            psc_reward = self.psc_beta / math.pow(psc_count + self.psc_alpha,
                                                  self.psc_rev_pow)
        else:
            self.psc_vcount[range(k), psc_image] += 1.0
            psc_reward = 0.0

        self.psc_n += 1

        if self.psc_n % (self.options.score_log_interval * 10) == 0:
            room = -1
            if self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0":
                ram = self.ale.getRAM()
                room = ram[3]
            print("[PSC]th={},psc_n={}:room={},psc_reward={:.8f},RM{:02d}".
                  format(self.thread_index, self.psc_n, room, psc_reward,
                         self.room_no))

        return psc_reward

    # for montezuma's revenge
    def update_montezuma_rooms(self):
        ram = self.ale.getRAM()
        # room_no = ram[0x83]
        room_no = ram[3]
        self.rooms[room_no] += 1
        if self.rooms[room_no] == 1:
            print(
                "[PSC]th={} @@@ NEW ROOM({}) VISITED: visit counts={}".format(
                    self.thread_index, room_no, self.rooms))
            self.new_room = room_no
        self.prev_room_no = self.room_no
        self.room_no = room_no

    def set_record_screen_dir(self, record_screen_dir):
        if options.use_gym:
            print("record_screen_dir", record_screen_dir)
            self.gym.monitor.start(record_screen_dir)
            self.reset()
        else:
            print("record_screen_dir", record_screen_dir)
            self.ale.setString(b'record_screen_dir',
                               str.encode(record_screen_dir))
            self.ale.loadROM(self.options.rom.encode('ascii'))
            self.reset()

    def close_record_screen_dir(self):
        if options.use_gym:
            self.gym.monitor.close()
        else:
            pass

    def _process_action(self, action):
        if options.use_gym:
            observation, reward, terminal, _ = self.gym.step(action)
            return reward, terminal
        else:
            reward = self.ale.act(action)
            terminal = self.ale.game_over()
            self.terminal = terminal
            self._have_prev_screen_RGB = False
            return reward, terminal

    def _process_frame(self, action, reshape):
        if self.terminal:
            reward = 0
            terminal = True
        elif options.use_gym:
            observation, reward, terminal, _ = self.gym.step(action)
            self._screen_RGB = observation
            self.terminal = terminal
        else:
            # get previous screen
            if (self.color_maximizing or self.color_averaging) \
                    and not self._have_prev_screen_RGB:
                self.ale.getScreenRGB(self._prev_screen_RGB)
                self._have_prev_screen_RGB = True

            # make action
            reward = self.ale.act(action)
            terminal = self.ale.game_over()
            self.terminal = terminal

        # screen shape is (210, 160, 1)
        if self.color_maximizing or self.color_averaging:  # impossible in gym
            self.ale.getScreenRGB(self._screen_RGB)
            if self._have_prev_screen_RGB:
                if self.color_maximizing:
                    screen = np.maximum(self._prev_screen_RGB,
                                        self._screen_RGB)
                else:  # self.color_averaging:
                    screen = np.mean((self._prev_screen_RGB, self._screen_RGB),
                                     axis=0).astype(np.uint8)
            else:
                screen = self._screen_RGB
            screen = screen.reshape((210, 160, 3))
            self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
            # swap screen_RGB
            swap_screen_RGB = self._prev_screen_RGB
            self._prev_screen_RGB = self._screen_RGB
            self._screen_RGB = swap_screen_RGB
            self._have_prev_screen_RGB = True
        elif self.color_no_change:
            if not options.use_gym:
                self.ale.getScreenRGB(self._screen_RGB)
            screen = self._screen_RGB
            screen = screen.reshape((210, 160, 3))
            self._screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        else:
            self.ale.getScreenGrayscale(self._screen)

        # reshape it into (210, 160)
        reshaped_screen = np.reshape(self._screen, (210, 160))

        # set uncropped frame for screen output
        self.uncropped_screen = reshaped_screen

        # resize to height=110, width=84
        if self.options.crop_frame:
            resized_screen = cv2.resize(reshaped_screen, (84, 110))
            x_t = resized_screen[18:102, :]
        else:
            x_t = cv2.resize(reshaped_screen, (84, 84))
        x_t_uint8 = x_t

        if reshape:
            x_t = np.reshape(x_t, (84, 84, 1))
        x_t = x_t.astype(np.float32)
        x_t *= (1.0 / 255.0)
        return reward, terminal, x_t, x_t_uint8

    def pseudo_count(self, x_t):
        psc_reward = 0.0
        if self.psc_use:
            psc_image = cv2.resize(x_t, (self.psc_frsize, self.psc_frsize))
            psc_image = np.reshape(psc_image, (self.psc_k))
            psc_image = np.uint8(psc_image * (self.psc_maxval / 255.0))
            psc_reward = self.psc_add_image(psc_image)

        # update covered rooms
        if self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0":
            self.update_montezuma_rooms()

        return psc_reward

    def _setup_display(self):
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            self.ale.setBool(b'sound', False)
        elif sys.platform.startswith('linux'):
            self.ale.setBool(b'sound', True)
        self.ale.setBool(b'display_screen', True)

    def reset(self):
        if options.use_gym:
            self.gym.reset()
        else:
            self.ale.reset_game()

        # randomize initial state
        if self._no_op_max > 0:
            no_op = np.random.randint(
                0, self._no_op_max // self.options.frames_skip_in_ale + 1)
            if options.use_gym:
                no_op = no_op // 3  # gym skips 2-4 frames randomly
            for _ in range(no_op):
                if options.use_gym:
                    self.gym.step(0)
                else:
                    self.ale.act(0)

        self._have_prev_screen_RGB = False
        self.terminal = False
        _, _, x_t, x_t_uint8 = self._process_frame(0, False)
        _ = self.pseudo_count(x_t_uint8)

        self.reward = 0
        self.s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

        self.lives = float(self.ale.lives())
        self.initial_lives = self.lives

        if (self.thread_index == 0) and (self.record_gs_screen_dir
                                         is not None):
            episode_dir = "episode{:03d}".format(self.episode)
            self.episode_record_dir = os.path.join(self.record_gs_screen_dir,
                                                   episode_dir)
            os.makedirs(self.episode_record_dir)
            self.episode += 1
            self.stepNo = 1
            print("game_state: writing screen images to ",
                  self.episode_record_dir)

        self.new_room = -1

    def process(self, action):
        if options.use_gym:
            real_action = action
            if self._display:
                self.gym.render()
        else:
            # convert original 18 action index to minimal action set index
            real_action = self.real_actions[action]
        reward = 0

        if self.options.stack_frames_in_gs:
            s_t1 = []
            terminal = False
            for _ in range(self.options.frames_skip_in_gs):
                if not terminal:
                    r, t, x_t1, x_t_uint8 = self._process_frame(
                        real_action, False)
                    reward = reward + r
                    terminal = terminal or t
                s_t1.append(x_t1)
            self.s_t1 = np.stack(s_t1, axis=2)
            # for _ in range(self.options.frames_skip_in_gs):
            #   r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
            #   reward = reward + r
            #   self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2)
            #   if t:
            #     break
        else:
            # altered for speed up (reduce getScreen and color_maximizing)
            for _ in range(self.options.frames_skip_in_gs - 1):
                r, t = self._process_action(real_action)
                reward = reward + r
                if t:
                    self.terminal = True
                    break

            r, t, x_t1, x_t_uint8 = self._process_frame(real_action, True)
            reward = reward + r
            self.s_t1 = np.append(self.s_t[:, :, 1:], x_t1, axis=2)

        self.reward = reward
        self.terminal = t

        self.psc_reward = self.pseudo_count(x_t_uint8)
        self.lives = float(self.ale.lives())

        if self.episode_record_dir is not None:
            filename = "{:06d}.png".format(self.stepNo)
            filename = os.path.join(self.episode_record_dir, filename)
            self.stepNo += 1
            screen_image = x_t1.reshape((84, 84)) * 255.
            cv2.imwrite(filename, screen_image)

    def update(self):
        self.s_t = self.s_t1
Example #6
class AtariPlayer(gym.Env):
    """
    A wrapper for ALE emulator, with configurations to mimic DeepMind DQN settings.
    Info:
        score: the accumulated reward in the current game
        gameOver: True when the current game is Over
    """
    def __init__(self,
                 rom_file,
                 viz=0,
                 frame_skip=4,
                 nullop_start=30,
                 live_lost_as_eoe=True,
                 max_num_frames=0):
        """
        Args:
            rom_file: path to the rom
            frame_skip: skip every k frames and repeat the action
            viz: visualization to be done.
                Set to 0 to disable.
                Set to a positive number to be the delay between frames to show.
                Set to a string to be a directory to store frames.
            nullop_start: start with random number of null ops.
            live_lost_as_eoe: consider loss of lives as end of episode. Useful for training.
            max_num_frames: maximum number of frames per episode.
        """
        super(AtariPlayer, self).__init__()
        assert os.path.isfile(rom_file), \
            "rom {} not found. Please download at {}".format(rom_file, ROM_URL)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Error)
        except AttributeError:
            print("You're not using latest ALE")

        # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86
        with _ALE_LOCK:
            self.ale = ALEInterface()
            self.ale.setInt(b"random_seed", np.random.randint(0, 30000))
            self.ale.setInt(b"max_num_frames_per_episode", max_num_frames)
            self.ale.setBool(b"showinfo", False)

            self.ale.setInt(b"frame_skip", 1)
            self.ale.setBool(b'color_averaging', False)
            # manual.pdf suggests otherwise.
            self.ale.setFloat(b'repeat_action_probability', 0.0)

            # viz setup
            if isinstance(viz, str):
                assert os.path.isdir(viz), viz
                self.ale.setString(b'record_screen_dir', viz)
                viz = 0
            if isinstance(viz, int):
                viz = float(viz)
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
                cv2.startWindowThread()
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start

        self.action_space = spaces.Discrete(len(self.actions))
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.height, self.width),
                                            dtype=np.uint8)
        self._restart_episode()

    def get_action_meanings(self):
        return [ACTION_MEANING[i] for i in self.actions]

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def _current_state(self):
        """
        returns: a gray-scale (h, w) uint8 image
        """
        ret = self._grab_raw_image()
        # avoid missing frame issue: max-pooled over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow(self.windowname, ret)
                cv2.waitKey(int(self.viz * 1000))
        ret = ret.astype('float32')
        # 0.299, 0.587, 0.114, same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        return ret.astype('uint8')  # to save some memory

    def _restart_episode(self):
        with _ALE_LOCK:
            self.ale.reset_game()

        # random null-ops start
        n = np.random.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def reset(self):
        if self.ale.game_over():
            self._restart_episode()
        return self._current_state()

    def step(self, act):
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        isOver = self.ale.game_over()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives

        info = {'ale.lives': newlives}
        return self._current_state(), r, isOver, info
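
Because AtariPlayer subclasses gym.Env, it can be driven with the usual reset/step loop. The sketch below is illustrative only; the ROM filename is an assumption.

env = AtariPlayer('breakout.bin', viz=0, frame_skip=4, live_lost_as_eoe=True)
obs = env.reset()                      # grayscale (height, width) uint8 frame
done, score = False, 0.0
while not done:
    act = env.action_space.sample()    # random action over the minimal action set
    obs, reward, done, info = env.step(act)
    score += reward
print('episode score:', score, 'lives:', info['ale.lives'])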