import numpy as np
from ale_python_interface import ALEInterface


class pyrlcade_environment(object):
    def __init__(self, rom_file, ale_frame_skip):
        self.ale = ALEInterface()
        self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode")
        # ALE has typed setters; the original called a non-existent set().
        self.ale.setInt("random_seed", 123)
        # ALE exposes 'color_averaging' (bool); disabling it replaces the
        # non-existent "disable_color_averaging" option used in the original.
        self.ale.setBool("color_averaging", False)
        self.ale.setInt("frame_skip", ale_frame_skip)
        self.ale.loadROM(rom_file)
        self.legal_actions = self.ale.getMinimalActionSet()
        ram_size = self.ale.getRAMSize()
        self.ram = np.zeros((ram_size), dtype=np.uint8)
        self.ale.getRAM(self.ram)  # fills self.ram in place
        self.state = self.ram

    def reset_state(self):
        self.ale.reset_game()

    def set_action(self, a):
        self.action = a

    def step(self):
        self.reward = self.ale.act(self.action)
        is_terminal = self.ale.game_over()
        return is_terminal

    def get_state(self):
        self.ale.getRAM(self.ram)
        return self.ram

    def get_reward(self):
        return self.reward
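# Added usage sketch (illustrative, not from the original source): drive the
# RAM-state wrapper above with a uniform-random policy. The ROM path is an
# assumption.
env = pyrlcade_environment('roms/breakout.bin', ale_frame_skip=4)
env.reset_state()
done = False
while not done:
    env.set_action(np.random.choice(env.legal_actions))
    done = env.step()
    ram = env.get_state()       # 128-byte RAM snapshot, the state here
    reward = env.get_reward()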
import os
import sys
import logging

import cv2
from ale_python_interface import ALEInterface

logger = logging.getLogger(__name__)


class Environment:
    def __init__(self, rom_file, args):
        self.ale = ALEInterface()
        if args.display_screen:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', True)

        self.ale.setInt('frame_skip', args.frame_skip)
        self.ale.setFloat('repeat_action_probability', args.repeat_action_probability)
        self.ale.setBool('color_averaging', args.color_averaging)

        if args.random_seed:
            self.ale.setInt('random_seed', args.random_seed)

        if args.record_screen_path:
            if not os.path.exists(args.record_screen_path):
                logger.info("Creating folder %s" % args.record_screen_path)
                os.makedirs(args.record_screen_path)
            logger.info("Recording screens to %s", args.record_screen_path)
            self.ale.setString('record_screen_dir', args.record_screen_path)

        if args.record_sound_filename:
            logger.info("Recording sound to %s", args.record_sound_filename)
            self.ale.setBool('sound', True)
            self.ale.setString('record_sound_filename', args.record_sound_filename)

        self.ale.loadROM(rom_file)

        if args.minimal_action_set:
            self.actions = self.ale.getMinimalActionSet()
            logger.info("Using minimal action set with size %d" % len(self.actions))
        else:
            self.actions = self.ale.getLegalActionSet()
            logger.info("Using full action set with size %d" % len(self.actions))
        logger.debug("Actions: " + str(self.actions))

        self.dims = (args.screen_height, args.screen_width)

    def numActions(self):
        return len(self.actions)

    def restart(self):
        self.ale.reset_game()

    def act(self, action):
        reward = self.ale.act(self.actions[action])
        return reward

    def getScreen(self):
        screen = self.ale.getScreenGrayscale()
        # note: cv2.resize interprets its size argument as (width, height)
        resized = cv2.resize(screen, self.dims)
        return resized

    def isTerminal(self):
        return self.ale.game_over()
import cv2
import numpy as np
from ale_python_interface import ALEInterface


class Emulator(object):
    def __init__(self, settings):
        self.ale = ALEInterface()
        self.ale.setInt('frame_skip', settings['frame_skip'])
        self.ale.setInt('random_seed', np.random.RandomState().randint(1000))
        self.ale.setBool('color_averaging', False)
        self.ale.loadROM('roms/' + settings['rom_name'])
        self.actions = self.ale.getMinimalActionSet()
        self.width = settings['screen_width']
        self.height = settings['screen_height']

    def reset(self):
        self.ale.reset_game()

    def image(self):
        screen = self.ale.getScreenGrayscale()
        # note: cv2.resize expects (width, height); the original passes
        # (height, width), which only matches when the two are equal
        screen = cv2.resize(screen, (self.height, self.width), interpolation=cv2.INTER_LINEAR)
        return np.reshape(screen, (self.height, self.width))

    def full_image(self):
        screen = self.ale.getScreenRGB()
        return screen

    def act(self, action):
        return self.ale.act(self.actions[action])

    def terminal(self):
        return self.ale.game_over()
class Emulator(object):
    FRAME_SKIP = 4
    SCREEN_WIDTH = 84
    SCREEN_HEIGHT = 84

    def __init__(self, rom):
        self.ale = ALEInterface()
        self.max_num_frames_per_episode = 100000  # self.ale.getInt('max_num_frames_per_episode')
        self.ale.setInt('frame_skip', self.FRAME_SKIP)
        self.ale.loadROM('roms/' + rom)
        self.actions = self.ale.getMinimalActionSet()

    def reset(self):
        self.ale.reset_game()

    def image(self):
        screen = self.ale.getScreenGrayscale()
        screen = cv2.resize(screen, (self.SCREEN_HEIGHT, self.SCREEN_WIDTH))
        return np.reshape(screen, (self.SCREEN_HEIGHT, self.SCREEN_WIDTH))

    def act(self, action):
        return self.ale.act(action)

    def terminal(self):
        return self.ale.game_over()
import sys

import cv2
import numpy as np
from ale_python_interface import ALEInterface


class Emulate:
    def __init__(self, rom_file, display_screen=False, frame_skip=4,
                 screen_height=84, screen_width=84, repeat_action_probability=0,
                 color_averaging=True, random_seed=0,
                 record_screen_path='screen_pics', record_sound_filename=None,
                 minimal_action_set=True):
        self.ale = ALEInterface()
        if display_screen:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', True)

        self.ale.setInt('frame_skip', frame_skip)
        self.ale.setFloat('repeat_action_probability', repeat_action_probability)
        self.ale.setBool('color_averaging', color_averaging)

        if random_seed:
            self.ale.setInt('random_seed', random_seed)

        self.ale.loadROM(rom_file)

        if minimal_action_set:
            self.actions = self.ale.getMinimalActionSet()
        else:
            self.actions = self.ale.getLegalActionSet()

        self.dims = (screen_width, screen_height)

    def numActions(self):
        return len(self.actions)

    def getActions(self):
        return self.actions

    def restart(self):
        self.ale.reset_game()

    def act(self, action):
        reward = self.ale.act(self.actions[action])
        return reward

    def getScreen(self):
        screen = self.ale.getScreenGrayscale()
        resized = cv2.resize(screen, self.dims)
        return resized

    def getScreenGray(self):
        screen = self.ale.getScreenGrayscale()
        resized = cv2.resize(screen, self.dims)
        rotated = np.rot90(resized, k=1)
        return rotated

    def getScreenColor(self):
        screen = self.ale.getScreenRGB()
        resized = cv2.resize(screen, self.dims)
        rotated = np.rot90(resized, k=1)
        return rotated

    def isTerminal(self):
        return self.ale.game_over()
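# Added usage sketch for the Emulate wrapper above; the ROM path is an
# assumption and the random policy is illustrative only.
import random

env = Emulate('roms/breakout.bin', display_screen=False)
env.restart()
total = 0
while not env.isTerminal():
    total += env.act(random.randrange(env.numActions()))
    frame = env.getScreenGray()  # rotated 84x84 grayscale frame
print('score:', total)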
# EMULATOR_DISPLAY, FRAME_SKIP, REPEAT_ACTION_PROBABILITY, COLOR_AVERAGING,
# RANDOM_SEED, RECORD_SCENE_PATH, ROM_PATH and DIMS are module-level
# configuration constants in the original project.
class Emulator:
    def __init__(self):
        self.ale = ALEInterface()
        # turn off the sound
        self.ale.setBool('sound', False)
        self.ale.setBool('display_screen', EMULATOR_DISPLAY)
        self.ale.setInt('frame_skip', FRAME_SKIP)
        self.ale.setFloat('repeat_action_probability', REPEAT_ACTION_PROBABILITY)
        self.ale.setBool('color_averaging', COLOR_AVERAGING)
        self.ale.setInt('random_seed', RANDOM_SEED)
        if RECORD_SCENE_PATH:
            self.ale.setString('record_screen_dir', RECORD_SCENE_PATH)
        self.ale.loadROM(ROM_PATH)
        self.actions = self.ale.getMinimalActionSet()
        logger.info("Actions: " + str(self.actions))
        self.dims = DIMS
        # self.start_lives = self.ale.lives()

    def getActions(self):
        return self.actions

    def numActions(self):
        return len(self.actions)

    def restart(self):
        self.ale.reset_game()  # can be omitted

    def act(self, action):
        reward = self.ale.act(self.actions[action])
        return reward

    def getScreen(self):
        # why grayscale ?
        screen = self.ale.getScreenGrayscale()
        resized = cv2.resize(screen, self.dims)
        # normalize
        # resized /= COLOR_SCALE
        return resized

    def isTerminal(self):
        # while training deepmind only ends when agent dies
        # terminate = DEATH_END and TRAIN and (self.ale.lives() < self.start_lives)
        return self.ale.game_over()
# MDP, Serializable, OBS_RAM, OBS_IMAGE, to_rgb and to_ram come from the
# surrounding module in the original project.
class AtariMDP(MDP, Serializable):
    def __init__(self, rom_path, obs_type=OBS_RAM, frame_skip=4):
        Serializable.__init__(self, rom_path, obs_type, frame_skip)
        self.options = (rom_path, obs_type, frame_skip)

        self.ale = ALEInterface()
        self.ale.loadROM(rom_path)
        self._rom_path = rom_path
        self._obs_type = obs_type
        self._action_set = self.ale.getMinimalActionSet()
        self.frame_skip = frame_skip

    def get_image(self):
        return to_rgb(self.ale)

    def get_ram(self):
        return to_ram(self.ale)

    def game_over(self):
        return self.ale.game_over()

    def reset_game(self):
        return self.ale.reset_game()

    @property
    def n_actions(self):
        return len(self.action_set)

    def get_obs(self):
        if self._obs_type == OBS_RAM:
            return self.get_ram()[None, :]
        else:
            assert self._obs_type == OBS_IMAGE
            return self.get_image()[None, :, :, :]

    def step(self, a):
        reward = 0.0
        action = self.action_set[a]
        for _ in range(self.frame_skip):
            reward += self.ale.act(action)
        ob = self.get_obs().reshape(1, -1)
        return ob, np.array([reward]), self.ale.game_over()

    # return: (states, observations)
    def reset(self):
        self.ale.reset_game()
        return self.get_obs()

    @property
    def action_set(self):
        return self._action_set

    def plot(self):
        import cv2
        cv2.imshow("atarigame", self.get_image())  # pylint: disable=E1101
        cv2.waitKey(10)  # pylint: disable=E1101
from collections import deque

import cv2
import numpy as np
from ale_python_interface import ALEInterface


class Environment:
    def __init__(self, show_screen, history_length):
        self.ale = ALEInterface()
        self.ale.setInt('frame_skip', 4)
        self.history = None
        self.history_length = history_length
        if show_screen:
            self.display_screen()
        self.load_game()
        (screen_width, screen_height) = self.ale.getScreenDims()
        self.screen_data = np.empty((screen_height, screen_width, 1), dtype=np.uint8)  # 210x160 screen data
        self.dims = (84, 84)  # input size for neural network
        # ALE action ids: 3=right, 0=noop, 1=fire, 4=left (the original
        # comment listed them in the wrong order)
        self.actions = [3, 0, 1, 4]

    def display_screen(self):
        self.ale.setBool("display_screen", True)

    def turn_on_sound(self):
        self.ale.setBool("sound", True)

    def restart(self):
        """reset game"""
        self.ale.reset_game()

    def act(self, action):
        """:returns reward of an action"""
        return self.ale.act(self.actions[action])

    def __get_screen(self):
        """:returns Grayscale thresholded resized screen image"""
        self.ale.getScreenGrayscale(self.screen_data)
        resized = cv2.resize(self.screen_data, self.dims)
        return resized

    def get_state(self):
        binary_screen = self.__get_screen()
        if self.history is None:
            self.history = deque(maxlen=self.history_length)
            for _ in range(self.history_length - 1):
                self.history.append(binary_screen)
        self.history.append(binary_screen)
        result = np.stack(self.history, axis=0)
        return result

    def isTerminal(self):
        """checks if game is over"""
        return self.ale.game_over()

    def load_game(self):
        """load game from file"""
        self.ale.loadROM("Breakout.bin")
from ale_python_interface import ALEInterface

# rom_directory is a module-level path in the original project.


class Breakout(object):
    steps_between_actions = 4

    def __init__(self):
        self.ale = ALEInterface()
        self.ale.setInt('random_seed', 123)
        self.ale.setBool("display_screen", False)
        self.ale.setBool("sound", False)
        self.ale.loadROM("%s/breakout.bin" % rom_directory)
        self.current_state = [
            self.ale.getScreenRGB(), self.ale.getScreenRGB()
        ]

    def start_episode(self):
        self.ale.reset_game()

    def take_action(self, action):
        assert not self.terminated

        def step():
            reward = self.ale.act(action)
            self.roll_state()
            return reward

        reward = sum(step() for _ in range(self.steps_between_actions))
        return (reward, self.current_state)

    def roll_state(self):
        assert len(self.current_state) == 2
        self.current_state = [self.current_state[1], self.ale.getScreenRGB()]
        assert len(self.current_state) == 2

    @property
    def actions(self):
        return self.ale.getMinimalActionSet()

    @property
    def terminated(self):
        return self.ale.game_over() or self.ale.lives() < 5
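# Added usage sketch: random play with the Breakout wrapper above. Assumes
# rom_directory points at a folder containing breakout.bin.
import random

game = Breakout()
game.start_episode()
score = 0
while not game.terminated:
    a = random.choice(game.actions)          # raw ALE action value
    reward, state = game.take_action(a)      # state is the last two RGB frames
    score += reward
print('episode score:', score)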
import cv2  # the original imported cv2 lazily inside __init__, but next() also needs it
import numpy as np
from ale_python_interface import ALEInterface


class emulator:
    def __init__(self, rom_name, vis):
        self.ale = ALEInterface()
        self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode")
        self.ale.setInt("random_seed", 123)
        self.ale.setInt("frame_skip", 4)
        self.ale.loadROM('roms/' + rom_name)
        self.legal_actions = self.ale.getMinimalActionSet()
        self.action_map = dict()
        for i in range(len(self.legal_actions)):
            self.action_map[self.legal_actions[i]] = i
        # print(self.legal_actions)
        self.screen_width, self.screen_height = self.ale.getScreenDims()
        print("width/height: " + str(self.screen_width) + "/" + str(self.screen_height))
        self.vis = vis
        if vis:
            cv2.startWindowThread()
            cv2.namedWindow("preview")

    def get_image(self):
        numpy_surface = np.zeros(self.screen_height * self.screen_width * 3, dtype=np.uint8)
        self.ale.getScreenRGB(numpy_surface)
        image = np.reshape(numpy_surface, (self.screen_height, self.screen_width, 3))
        return image

    def newGame(self):
        self.ale.reset_game()
        return self.get_image()

    def next(self, action_indx):
        reward = self.ale.act(action_indx)
        nextstate = self.get_image()
        # scipy.misc.imsave('test.png', nextstate)
        if self.vis:
            cv2.imshow('preview', nextstate)
        return nextstate, reward, self.ale.game_over()
import sys

from ale_python_interface import ALEInterface


class Game():
    """ Wrapper around the ALEInterface class. """

    def __init__(self, rom_file, sdl=False):
        self.ale = ALEInterface()
        # Setup SDL
        if sdl:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool(b'sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool(b'sound', True)
            self.ale.setBool(b'display_screen', True)
        # Load rom
        self.ale.loadROM(str.encode(rom_file))

    def get_action_set(self):
        return self.ale.getLegalActionSet()

    def get_minimal_action_set(self):
        return self.ale.getMinimalActionSet()

    def game_over(self):
        return self.ale.game_over()

    def act(self, action):
        return self.ale.act(action)

    def reset_game(self):
        self.ale.reset_game()

    def get_frame(self):
        return self.ale.getScreenRGB()
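# Added usage sketch: the canonical ALE random-agent loop, driven through the
# Game wrapper above. The ROM path is an assumption.
import random

game = Game('roms/breakout.bin', sdl=False)
actions = game.get_minimal_action_set()
for episode in range(3):
    total = 0
    while not game.game_over():
        total += game.act(random.choice(actions))
    print('episode %d score: %d' % (episode, total))
    game.reset_game()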
import os
import sys

from ale_python_interface import ALEInterface


class AleInterface(object):
    def __init__(self, game, args):
        self.game = game
        self.ale = ALEInterface()
        # if sys.platform == 'darwin':
        #     self.ale.setBool('sound', False)  # Sound doesn't work on OSX
        # elif sys.platform.startswith('linux'):
        #     self.ale.setBool('sound', True)
        # self.ale.setBool('display_screen', True)
        # self.ale.setBool('display_screen', args.display_screen)

        self.ale.setInt('frame_skip', args.frame_skip)
        self.ale.setFloat('repeat_action_probability', args.repeat_action_probability)
        self.ale.setBool('color_averaging', args.color_averaging)
        self.ale.setInt('random_seed', args.random_seed)
        # if rand_seed is not None:
        #     self.ale.setInt('random_seed', rand_seed)

        rom_file = "./roms/%s.bin" % game
        if not os.path.exists(rom_file):
            print("not found rom file:", rom_file)
            sys.exit(-1)
        self.ale.loadROM(rom_file)

        self.actions = self.ale.getMinimalActionSet()

    def get_actions_num(self):
        return len(self.actions)

    def act(self, action):
        reward = self.ale.act(self.actions[action])
        return reward

    def get_screen_gray(self):
        return self.ale.getScreenGrayscale()

    def get_screen_rgb(self):
        return self.ale.getScreenRGB()

    def game_over(self):
        return self.ale.game_over()

    def reset_game(self):
        return self.ale.reset_game()
import sys
from random import randrange

from ale_python_interface import ALEInterface


def get_random_baseline(gamepath):
    ale = ALEInterface()
    ale.setInt('random_seed', 42)

    recordings_dir = './recordings/breakout/'
    USE_SDL = True
    if USE_SDL:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX
            # ale.setString("record_screen_dir", recordings_dir)
        elif sys.platform.startswith('linux'):
            ale.setBool('sound', True)
        ale.setBool('display_screen', True)

    # Load the ROM file
    ale.loadROM(gamepath)

    # Get the list of legal actions
    legal_actions = ale.getLegalActionSet()

    # Play 10 episodes (the original comment said 5 but the loop runs 10)
    rewards = []
    for episode in range(10):
        total_reward = 0
        while not ale.game_over():
            a = legal_actions[randrange(len(legal_actions))]
            reward = ale.act(a)
            total_reward += reward
        rewards.append(total_reward)
        # print('Episode', episode, 'ended with score:', total_reward)
        ale.reset_game()
    avg_reward = sum(rewards) / float(len(rewards))
    return avg_reward
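# Added example call (hypothetical ROM path): the average score of a
# uniform-random policy is a useful sanity baseline for learned agents.
baseline = get_random_baseline('roms/breakout.bin')
print('random-policy baseline: %.1f' % baseline)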
import numpy as np
from ale_python_interface import ALEInterface


class Atari:
    def __init__(self, rom_name):
        self.ale = ALEInterface()
        self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode")
        self.ale.setInt("random_seed", 123)
        self.ale.setInt("frame_skip", 4)
        self.ale.loadROM('./' + rom_name)
        self.screen_width, self.screen_height = self.ale.getScreenDims()
        self.legal_actions = self.ale.getMinimalActionSet()
        self.action_map = dict()
        for i in range(len(self.legal_actions)):
            self.action_map[self.legal_actions[i]] = i
        # print(len(self.legal_actions))
        self.windowname = rom_name
        # cv2.startWindowThread()
        # cv2.namedWindow(rom_name)

    def get_image(self):
        numpy_surface = np.zeros(self.screen_height * self.screen_width * 3, dtype=np.uint8)
        self.ale.getScreenRGB(numpy_surface)
        image = np.reshape(numpy_surface, (self.screen_height, self.screen_width, 3))
        return image

    def newGame(self):
        self.ale.reset_game()
        return self.get_image()

    def next(self, action):
        # action is a one-hot vector over the minimal action set
        reward = self.ale.act(self.legal_actions[np.argmax(action)])
        nextstate = self.get_image()
        # cv2.imshow(self.windowname, nextstate)
        if self.ale.game_over():
            self.newGame()
        # print("reward %d" % reward)
        return nextstate, reward, self.ale.game_over()
import random

import numpy as np
from ale_python_interface import ALEInterface

# State is the frame-stacking state class from the original project.


class AtariEnvironment:
    def __init__(self, args, outputDir):
        self.outputDir = outputDir
        self.screenCaptureFrequency = args.screen_capture_freq

        self.ale = ALEInterface()
        self.ale.setInt(b'random_seed', 123456)
        random.seed(123456)
        # Fix https://groups.google.com/forum/#!topic/deep-q-learning/p4FAIaabwlo
        self.ale.setFloat(b'repeat_action_probability', 0.0)

        # Load the ROM file
        self.ale.loadROM(args.rom)

        self.actionSet = self.ale.getMinimalActionSet()
        self.gameNumber = 0
        self.stepNumber = 0
        self.resetGame()

    def getNumActions(self):
        return len(self.actionSet)

    def getState(self):
        return self.state

    def getGameNumber(self):
        return self.gameNumber

    def getFrameNumber(self):
        return self.ale.getFrameNumber()

    def getEpisodeFrameNumber(self):
        return self.ale.getEpisodeFrameNumber()

    def getEpisodeStepNumber(self):
        return self.episodeStepNumber

    def getStepNumber(self):
        return self.stepNumber

    def getGameScore(self):
        return self.gameScore

    def isGameOver(self):
        return self.ale.game_over()

    def step(self, action):
        previousLives = self.ale.lives()
        reward = 0
        isTerminal = 0
        self.stepNumber += 1
        self.episodeStepNumber += 1

        for i in range(4):
            prevScreenRGB = self.ale.getScreenRGB()
            reward += self.ale.act(self.actionSet[action])
            screenRGB = self.ale.getScreenRGB()

            # Detect end of episode. I don't think I'm handling this right in
            # terms of the overall game loop (??)
            if self.ale.lives() < previousLives or self.ale.game_over():
                isTerminal = 1
                break

        # if self.gameNumber % self.screenCaptureFrequency == 0:
        #     dir = self.outputDir + '/screen_cap/game-%06d' % (self.gameNumber)
        #     if not os.path.isdir(dir):
        #         os.makedirs(dir)
        #     self.ale.saveScreenPNG(dir + '/frame-%06d.png' % (self.getEpisodeFrameNumber()))

        maxedScreen = np.maximum(screenRGB, prevScreenRGB)
        self.state = self.state.stateByAddingScreen(maxedScreen, self.ale.getFrameNumber())
        self.gameScore += reward
        return reward, self.state, isTerminal

    def resetGame(self):
        if self.ale.game_over():
            self.gameNumber += 1
        self.ale.reset_game()
        self.state = State().stateByAddingScreen(self.ale.getScreenRGB(), self.ale.getFrameNumber())
        self.gameScore = 0
        self.episodeStepNumber = 0  # environment steps vs ALE frames. Will probably be 4*frame number
import numpy as np
from ale_python_interface import ALEInterface
from scipy.misc import imresize

# BaseEnvironment is the project's abstract environment base class.


class ALEEnvironment(BaseEnvironment):
    """
    The :class:`ALEEnvironment` class takes care of the interface to the ALE
    and tries to do nothing else. It's meant for advanced users who need fine
    control over every aspect of the process. It has many functions that are
    simply wrappers of the underlying ALE but with pythonic names/usage.

    Parameters
    ----------
    rom : byte string
        Specifies the directory to load the rom from. Must be a byte string:
        b'dir_for_rom/rom.bin'
    display_screen : boolean
        Default False. Whether or not to show the game. True takes longer to
        run but can be fun to watch
    step_cap : int
        Default None. Maximum number of steps to run in an episode. Breakout
        can sometimes not return terminal even when game is ended. This fixes
        that and will return terminal after stepping above this count
    """

    def __init__(self, rom, resize_shape=(84, 84), skip_frame=1,
                 repeat_action_probability=0.0, step_cap=None,
                 loss_of_life_termination=False,
                 loss_of_life_negative_reward=False, grayscale=True,
                 display_screen=False, seed=np.random.RandomState()):
        # set up emulator
        self.ale = ALEInterface()
        if display_screen:
            self.ale.setBool(b'display_screen', True)
        self.ale.setInt(b'frame_skip', skip_frame)
        self.ale.setInt(b'random_seed', seed.randint(0, 9999))
        self.ale.setFloat(b'repeat_action_probability', repeat_action_probability)
        self.ale.setBool(b'color_averaging', False)
        self.ale.loadROM(rom.encode())

        # setup gamescreen object. I think this is faster than recreating an empty each time
        width, height = self.ale.getScreenDims()
        channels = 1 if grayscale else 3
        self.grayscale = grayscale
        # buffer must match the requested colorspace (the original always
        # allocated one channel, which breaks the RGB path)
        self.gamescreen = np.empty((height, width, channels), dtype=np.uint8)

        self.resize_shape = resize_shape
        self.skip_frame = skip_frame
        self.step_cap = step_cap
        self.curr_step_count = 0

        # setup action converter
        # ALE returns legal action indexes, convert these to just numbers
        self.action_inds = self.ale.getMinimalActionSet()

        # setup lives
        self.loss_of_life_negative_reward = loss_of_life_negative_reward
        self.cur_lives = self.ale.lives()
        self.loss_of_life_termination = loss_of_life_termination
        self.life_lost = False

    def reset(self):
        self.ale.reset_game()
        self.cur_lives = self.ale.lives()
        self.life_lost = False
        self.curr_step_count = 0

    def step(self, action):
        self.curr_step_count += 1
        ale_action = self.action_inds[action]
        return self._step(ale_action)

    def _step(self, ale_action):
        if not self.loss_of_life_termination and not self.loss_of_life_negative_reward:
            return self.ale.act(ale_action)
        else:
            rew = self.ale.act(ale_action)
            new_lives = self.ale.lives()
            if new_lives < self.cur_lives:
                # if loss of life is negative reward subtract 1 from reward
                if self.loss_of_life_negative_reward:
                    rew -= 1
                self.cur_lives = new_lives
                self.life_lost = True
            return rew

    def get_state(self):
        if self.grayscale:
            self.gamescreen = self.ale.getScreenGrayscale(self.gamescreen)
        else:
            self.gamescreen = self.ale.getScreenRGB(self.gamescreen)
        # if resize_shape is none then don't resize
        if self.resize_shape is not None:
            # if grayscale we remove the last dimension (channel)
            if self.grayscale:
                processedImg = imresize(self.gamescreen[:, :, 0], self.resize_shape)
            else:
                processedImg = imresize(self.gamescreen, self.resize_shape)
            return processedImg
        # no resizing requested; hand back the raw buffer (the original fell
        # through and returned an undefined name here)
        return self.gamescreen

    def get_state_shape(self):
        return self.resize_shape

    def get_terminal(self):
        if self.loss_of_life_termination and self.life_lost:
            return True
        elif self.step_cap is not None and self.curr_step_count > self.step_cap:
            return True
        else:
            return self.ale.game_over()

    def get_num_actions(self):
        return len(self.action_inds)
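# Added usage sketch (assumes a BaseEnvironment definition and breakout.bin on
# disk; the random policy is illustrative only).
env = ALEEnvironment('breakout.bin', skip_frame=4, step_cap=10000)
env.reset()
ret = 0
while not env.get_terminal():
    a = np.random.randint(env.get_num_actions())
    ret += env.step(a)
    frame = env.get_state()  # 84x84 uint8 grayscale
print('return:', ret)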
import logging
import random
import sys

import cv2
import numpy as np
from ale_python_interface import ALEInterface

logger = logging.getLogger(__name__)

# IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES, FRAMES_IN_POOL, ACTION_REPEAT,
# MAX_START_WAIT and BLEND_METHOD are module-level constants in the original
# project.


class Emulator:
    def __init__(self, rom_path, rom_name, visualize, actor_id, rseed,
                 single_life_episodes=False):
        self.ale = ALEInterface()
        self.ale.setInt("random_seed", rseed * (actor_id + 1))
        # For fuller control on explicit action repeat (>= ALE 0.5.0)
        self.ale.setFloat("repeat_action_probability", 0.0)
        # Disable frame_skip and color_averaging
        # See: http://is.gd/tYzVpj
        self.ale.setInt("frame_skip", 1)
        self.ale.setBool("color_averaging", False)
        self.ale.loadROM(rom_path + "/" + rom_name + ".bin")
        self.legal_actions = self.ale.getMinimalActionSet()
        self.screen_width, self.screen_height = self.ale.getScreenDims()
        # self.ale.setBool('display_screen', True)

        # Processed historical frames that will be fed in to the network
        # (i.e., four 84x84 images)
        self.screen_images_processed = np.zeros((IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES))
        self.rgb_screen = np.zeros((self.screen_height, self.screen_width, 3), dtype=np.uint8)
        self.gray_screen = np.zeros((self.screen_height, self.screen_width, 1), dtype=np.uint8)

        self.frame_pool = np.empty((2, self.screen_height, self.screen_width))
        self.current = 0
        self.lives = self.ale.lives()

        self.visualize = visualize
        self.visualize_processed = False
        self.windowname = rom_name + ' ' + str(actor_id)
        if self.visualize:
            logger.debug("Opening emulator window...")
            # from skimage import io
            # io.use_plugin('qt')
            cv2.startWindowThread()
            cv2.namedWindow(self.windowname)
            logger.debug("Emulator window opened")
        if self.visualize_processed:
            logger.debug("Opening processed frame window...")
            cv2.startWindowThread()
            logger.debug("Processed frame window opened")
            cv2.namedWindow(self.windowname + "_processed")

        self.single_life_episodes = single_life_episodes

    def get_screen_image(self):
        """ Add screen (luminance) to frame pool """
        # [screen_image, screen_image_rgb] = [self.ale.getScreenGrayscale(),
        #                                     self.ale.getScreenRGB()]
        self.ale.getScreenGrayscale(self.gray_screen)
        self.ale.getScreenRGB(self.rgb_screen)
        self.frame_pool[self.current] = np.squeeze(self.gray_screen)
        self.current = (self.current + 1) % FRAMES_IN_POOL
        return self.rgb_screen

    def new_game(self):
        """ Restart game """
        self.ale.reset_game()
        self.lives = self.ale.lives()
        if MAX_START_WAIT < 0:
            logger.debug("Cannot time travel yet.")
            sys.exit()
        elif MAX_START_WAIT > 0:
            wait = random.randint(0, MAX_START_WAIT)
        else:
            wait = 0
        for _ in range(wait):
            self.ale.act(self.legal_actions[0])

    def process_frame_pool(self):
        """ Preprocess frame pool """
        img = None
        if BLEND_METHOD == "max_pool":
            img = np.amax(self.frame_pool, axis=0)
        # img = resize(img[:210, :], (84, 84))
        img = cv2.resize(img[:210, :], (84, 84), interpolation=cv2.INTER_LINEAR)
        img = img.astype(np.float32)
        img *= (1.0 / 255.0)
        return img
        # Reduce height to 210, if not so
        # cropped_img = img[:210, :]
        # Downsample to 110x84
        # down_sampled_img = resize(cropped_img, (84, 84))
        # Crop to 84x84 playing area
        # stackable_image = down_sampled_img[:, 26:110]
        # return stackable_image

    def action_repeat(self, a):
        """ Repeat action and grab screen into frame pool """
        reward = 0
        for i in range(ACTION_REPEAT):
            reward += self.ale.act(self.legal_actions[a])
            new_screen_image_rgb = self.get_screen_image()
        return reward, new_screen_image_rgb

    def get_reshaped_state(self, state):
        return np.reshape(state, (1, IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES))
        # return np.reshape(self.screen_images_processed,
        #                   (1, IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES))

    def get_initial_state(self):
        """ Get the initial state """
        # the original mutated this module constant without declaring it
        global MAX_START_WAIT
        self.new_game()
        for step in range(NR_IMAGES):
            reward, new_screen_image_rgb = self.action_repeat(0)
            self.screen_images_processed[:, :, step] = self.process_frame_pool()
            self.show_screen(new_screen_image_rgb)
        if self.is_terminal():
            MAX_START_WAIT -= 1
            return self.get_initial_state()
        return np.copy(self.screen_images_processed)  # get_reshaped_state()

    def next(self, action):
        """ Get the next state, reward, and game over signal """
        reward, new_screen_image_rgb = self.action_repeat(np.argmax(action))
        self.screen_images_processed[:, :, 0:3] = \
            self.screen_images_processed[:, :, 1:4]
        self.screen_images_processed[:, :, 3] = self.process_frame_pool()
        self.show_screen(new_screen_image_rgb)
        terminal = self.is_terminal()
        self.lives = self.ale.lives()
        return np.copy(self.screen_images_processed), reward, terminal  # get_reshaped_state(), reward, terminal

    def show_screen(self, image):
        """ Show visuals for raw and processed images """
        if self.visualize:
            # io.imshow(image[:210, :], fancy=True)
            cv2.imshow(self.windowname, image[:210, :])
        if self.visualize_processed:
            # io.imshow(self.screen_images_processed[:, :, 3], fancy=True)
            cv2.imshow(self.windowname + "_processed",
                       self.screen_images_processed[:, :, 3])

    def is_terminal(self):
        if self.single_life_episodes:
            return (self.is_over() or (self.lives > self.ale.lives()))
        else:
            return self.is_over()

    def is_over(self):
        return self.ale.game_over()
# Method fragment from a DQN agent class: self.model (a Keras model),
# self.epsilon, self.num_stacked_frames and self.valid_action_set are defined
# elsewhere in the original.
    def predict_move(self, state):
        q_values = self.model.predict(
            state.reshape(1, 84, 84, self.num_stacked_frames), batch_size=1)
        optimal_policy = np.argmax(q_values)  # Index of the best Q Value
        if np.random.random() < self.epsilon:
            # pick a random *index*; the original sampled an action value from
            # valid_action_set and then indexed with it, which is a bug
            optimal_policy = np.random.randint(len(self.valid_action_set))
        # Translate the index of the highest Q value action to an action
        # in the games set. This is required as the games action set may look
        # like this: [0, 1, 2, 3, 6, 17, 18]
        optimal_action = self.valid_action_set[optimal_policy]
        return optimal_action, q_values[0, optimal_policy]


# Separate fragment: a random-policy evaluation loop; `ale` and
# `specific_actions` come from the enclosing script.
try:
    for episode in range(10):
        total_reward = 0
        while not ale.game_over():
            a = specific_actions[randrange(len(specific_actions))]
            reward = ale.act(a)
            total_reward += reward
        print('Episode ended with score:', total_reward)
        ale.reset_game()
except KeyboardInterrupt:
    print('Shutting Down')
import numpy as np


class ArcadeLearningEnvironment(Environment):
    """
    [Arcade Learning Environment](https://github.com/mgbellemare/Arcade-Learning-Environment)
    adapter (specification key: `ale`, `arcade_learning_environment`).

    May require:
    ```bash
    sudo apt-get install libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake

    git clone https://github.com/mgbellemare/Arcade-Learning-Environment.git
    cd Arcade-Learning-Environment

    mkdir build && cd build
    cmake -DUSE_SDL=ON -DUSE_RLGLUE=OFF -DBUILD_EXAMPLES=ON ..
    make -j 4
    cd ..

    pip3 install .
    ```

    Args:
        level (string): ALE rom file
            (<span style="color:#C00000"><b>required</b></span>).
        life_loss_terminal (bool): Signals a terminal state on loss of life
            (<span style="color:#00C000"><b>default</b></span>: false).
        life_loss_punishment (float): Reward/Penalty on loss of life (negative values are a penalty)
            (<span style="color:#00C000"><b>default</b></span>: 0.0).
        repeat_action_probability (float): Repeats last action with given probability
            (<span style="color:#00C000"><b>default</b></span>: 0.0).
        visualize (bool): Whether to visualize interaction
            (<span style="color:#00C000"><b>default</b></span>: false).
        frame_skip (int > 0): Number of times to repeat an action without observing
            (<span style="color:#00C000"><b>default</b></span>: 1).
        seed (int): Random seed
            (<span style="color:#00C000"><b>default</b></span>: none).
    """

    def __init__(self, level, life_loss_terminal=False, life_loss_punishment=0.0,
                 repeat_action_probability=0.0, visualize=False, frame_skip=1,
                 seed=None):
        super().__init__()

        from ale_python_interface import ALEInterface

        self.environment = ALEInterface()
        self.rom_file = level

        self.life_loss_terminal = life_loss_terminal
        self.life_loss_punishment = life_loss_punishment

        self.environment.setFloat(b'repeat_action_probability', repeat_action_probability)
        self.environment.setBool(b'display_screen', visualize)
        self.environment.setInt(b'frame_skip', frame_skip)
        if seed is not None:
            self.environment.setInt(b'random_seed', seed)

        # All set commands must be done before loading the ROM.
        self.environment.loadROM(rom_file=self.rom_file.encode())
        self.available_actions = tuple(self.environment.getLegalActionSet())

        # Full list of actions:
        # No-Op, Fire, Up, Right, Left, Down, Up Right, Up Left, Down Right,
        # Down Left, Up Fire, Right Fire, Left Fire, Down Fire, Up Right Fire,
        # Up Left Fire, Down Right Fire, Down Left Fire

    def __str__(self):
        return super().__str__() + '({})'.format(self.rom_file)

    def states(self):
        width, height = self.environment.getScreenDims()
        return dict(type='float', shape=(height, width, 3))

    def actions(self):
        return dict(type='int', num_values=len(self.available_actions))

    def close(self):
        self.environment.__del__()
        self.environment = None

    def get_states(self):
        screen = np.copy(self.environment.getScreenRGB(screen_data=self.screen))
        screen = screen.astype(dtype=np.float32) / 255.0
        return screen

    def reset(self):
        self.environment.reset_game()
        width, height = self.environment.getScreenDims()
        self.screen = np.empty((height, width, 3), dtype=np.uint8)
        self.lives = self.environment.lives()
        return self.get_states()

    def execute(self, actions):
        reward = self.environment.act(action=self.available_actions[actions])
        terminal = self.environment.game_over()
        states = self.get_states()
        next_lives = self.environment.lives()
        if next_lives < self.lives:
            if self.life_loss_terminal:
                terminal = True
            elif self.life_loss_punishment > 0.0:
                reward -= self.life_loss_punishment
            self.lives = next_lives
        return states, terminal, reward
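# Added usage sketch: a reset/execute loop for the adapter above. Assumes the
# Environment base class (Tensorforce-style) is importable; the ROM path and
# fixed NOOP action are illustrative only.
env = ArcadeLearningEnvironment(level='roms/pong.bin', frame_skip=4)
states = env.reset()
terminal = False
total = 0.0
while not terminal:
    action = 0  # NOOP; an agent would supply this index
    states, terminal, reward = env.execute(action)
    total += reward
env.close()
print('return:', total)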
import sys

import numpy as np
from ale_python_interface import ALEInterface

# screen_utils, file_utils and the upper-case constants are modules/settings
# from the original project.


def train_agent(gamepath, agent, n_episodes, display_screen, record_weights,
                reduce_exploration_prob_amount, n_frames_to_skip):
    """
    :description: trains an agent to play a game

    :type gamepath: string
    :param gamepath: path to the binary of the game to be played

    :type agent: subclass RLAlgorithm
    :param agent: the algorithm/agent that learns to play the game

    :type n_episodes: int
    :param n_episodes: number of episodes of the game on which to train
    """
    # load the ale interface to interact with
    ale = ALEInterface()
    ale.setInt('random_seed', 42)
    # ALE settings must be made before loadROM; the original set frame_skip
    # after loading, where it has no effect.
    ale.setInt("frame_skip", n_frames_to_skip)

    # display/recording settings, doesn't seem to work currently
    recordings_dir = './recordings/breakout/'
    # previously "USE_SDL"
    if display_screen:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX
            # ale.setString("record_screen_dir", recordings_dir)
        elif sys.platform.startswith('linux'):
            ale.setBool('sound', True)
        ale.setBool('display_screen', True)

    ale.loadROM(gamepath)

    screen_preprocessor = screen_utils.RGBScreenPreprocessor()
    rewards = []
    best_reward = 0
    print('starting training...')
    for episode in range(n_episodes):
        action = 0
        reward = 0
        newAction = None
        total_reward = 0
        counter = 0
        lives = ale.lives()

        screen = np.zeros((32, 32, 3), dtype=np.int8)
        state = {
            "screen": screen,
            "objects": None,
            "prev_objects": None,
            "prev_action": 0,
            "action": 0
        }

        while not ale.game_over():
            # if newAction is None then we are training an off-policy algorithm
            # otherwise, we are training an on policy algorithm
            if newAction is None:
                action = agent.getAction(state)
            else:
                action = newAction

            reward += ale.act(action)
            if ale.lives() < lives:
                lives = ale.lives()
                reward -= 1
            total_reward += reward

            new_screen = ale.getScreenRGB()
            new_screen = screen_preprocessor.preprocess(new_screen)
            new_state = {
                "screen": new_screen,
                "objects": None,
                "prev_objects": state["objects"],
                "prev_action": state["action"],
                "action": action
            }
            newAction = agent.incorporateFeedback(state, action, reward, new_state)
            state = new_state
            reward = 0

        rewards.append(total_reward)
        if total_reward > best_reward and record_weights:
            best_reward = total_reward
            print("Best reward: {}".format(total_reward))

        if episode % PRINT_TRAINING_INFO_PERIOD == 0:
            print('\n############################')
            print('### training information ###')
            print("Average reward: {}".format(np.mean(rewards)))
            print("Last 50: {}".format(np.mean(rewards[-NUM_EPISODES_AVERAGE_REWARD_OVER:])))
            print("Exploration probability: {}".format(agent.explorationProb))
            print('action: {}'.format(action))
            print('size of weights dict: {}'.format(len(agent.weights)))
            print('current objects: {}'.format(state['objects']))
            print('previous objects: {}'.format(state['prev_objects']))
            avg_feat_weight = np.mean([v for k, v in agent.weights.items()])
            print('average feature weight: {}'.format(avg_feat_weight))
            print('############################')
            print('############################\n')

        if episode != 0 and episode % RECORD_WEIGHTS_PERIOD == 0 and record_weights:
            file_utils.save_rewards(rewards, filename='episode-{}-{}-rewards'.format(episode, type(agent).__name__))
            file_utils.save_weights(agent.weights, filename='episode-{}-{}-weights'.format(episode, type(agent).__name__))

        if agent.explorationProb > MINIMUM_EXPLORATION_EPSILON:
            agent.explorationProb -= reduce_exploration_prob_amount

        print('episode: {} ended with score: {}'.format(episode, total_reward))
        ale.reset_game()
    return rewards
import random

# Controller, StateExtractor and the Action constants come from the original
# enduro project.


class Agent(object):
    def __init__(self):
        self._ale = ALEInterface()
        self._ale.setInt('random_seed', 123)
        self._ale.setFloat('repeat_action_probability', 0.0)
        self._ale.setBool('color_averaging', False)
        self._ale.loadROM('roms/enduro.bin')
        self._controller = Controller(self._ale)
        self._extractor = StateExtractor(self._ale)
        self._image = None
        self.curr_action = 0

    def run(self, learn, episodes=1, draw=False):
        """ Implements the playing/learning loop.

        Args:
            learn(bool): Whether the self.learn() function should be called.
            episodes (int): The number of episodes to run the agent for.
            draw (bool): Whether to overlay the environment state on the frame.

        Returns:
            None
        """
        if learn:
            self.init_Q()
        action = random.choice(self.getActionsSet())  # init_action for q_learning
        for e in range(episodes):
            # Observe the environment to set the initial state
            (grid, self._image) = self._extractor.run(draw=draw, scale=4.0)
            self.initialise(grid)
            num_frames = self._ale.getFrameNumber()

            # Each episode lasts 6500 frames
            while self._ale.getFrameNumber() - num_frames < 6500:
                # Take an action
                self.act(action)

                # Update the environment grid
                s_grid = grid
                (grid, self._image) = self._extractor.run(draw=draw, scale=4.0)
                self.sense(grid)
                s_next_grid = grid

                # Perform learning if required
                if learn:
                    # self.learn(s_grid, s_next_grid)  # for q learning
                    action = self.learn(s_grid, s_next_grid, action)

                self.callback(learn, e + 1, self._ale.getFrameNumber() - num_frames)
            self.end_state(e)
            self._ale.reset_game()

    def getActionsSet(self):
        """ Returns the set of all possible actions """
        return [Action.ACCELERATE, Action.RIGHT, Action.LEFT, Action.BREAK]

    def move(self, action):
        """ Executes the action and advances the game to the next state.

        Args:
            action (int): The action which should executed. Make sure to use
                          the constants returned by self.getActionsSet()

        Returns:
            int: The obtained reward after executing the action
        """
        return self._controller.move(action)

    def initialise(self, grid):
        """ Called at the beginning of each episode, mainly used for state
        initialisation.

        Args:
            grid (np.ndarray): 11x10 array with the initial environment grid.

        Returns:
            None
        """
        raise NotImplementedError

    def act(self, action):
        # signature aligned with the call in run(); the original declared
        # act(self) but invoked self.act(action)
        """ Called at each loop iteration to choose and execute an action.

        Returns:
            None
        """
        raise NotImplementedError

    def sense(self, grid):
        """ Called at each loop iteration to construct the new state from the
        update environment grid.

        Returns:
            None
        """
        raise NotImplementedError

    def learn(self):
        """ Called at each loop iteration when the agent is learning. It
        should implement the learning procedure.

        Returns:
            None
        """
        raise NotImplementedError

    def callback(self, learn, episode, iteration):
        """ Called at each loop iteration mainly for reporting purposes.

        Args:
            learn (bool): Indicates whether the agent is learning or not.
            episode (int): The number of the current episode.
            iteration (int): The number of the current iteration.

        Returns:
            None
        """

    def get_surround(self):
        raise NotImplementedError
import os
import threading
import time

import cv2
import numpy as np
import six
from ale_python_interface import ALEInterface

# get_rng, get_dataset_path, logger, execute_only_once, StatCounter,
# DiscreteActionSpace, RLEnvironment and ROM_URL come from tensorpack.
_ALE_LOCK = threading.Lock()


class AtariPlayer(RLEnvironment):
    """
    A wrapper for atari emulator.
    Will automatically restart when a real episode ends (isOver might be just
    lost of lives but not game over).
    """

    def __init__(self, rom_file, viz=0, height_range=(None, None),
                 frame_skip=4, image_shape=(84, 84), nullop_start=30,
                 live_lost_as_eoe=True, env_name="Boxing-v0"):
        """
        :param rom_file: path to the rom
        :param frame_skip: skip every k frames and repeat the action
        :param image_shape: (w, h)
        :param height_range: (h1, h2) to cut
        :param viz: visualization to be done.
            Set to 0 to disable.
            Set to a positive number to be the delay between frames to show.
            Set to a string to be a directory to store frames.
        :param nullop_start: start with random number of null ops
        :param live_lost_as_eoe: consider lost of lives as end of episode. useful for training.
        """
        super(AtariPlayer, self).__init__()
        if not os.path.isfile(rom_file) and '/' not in rom_file:
            rom_file = get_dataset_path('atari_rom', rom_file)
        assert os.path.isfile(rom_file), \
            "rom {} not found. Please download at {}".format(rom_file, ROM_URL)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Warning)
        except AttributeError:
            if execute_only_once():
                logger.warn("You're not using latest ALE")

        # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86
        with _ALE_LOCK:
            self.ale = ALEInterface()
            self.rng = get_rng(self)
            self.ale.setInt(b"random_seed", self.rng.randint(0, 30000))
            self.ale.setBool(b"showinfo", False)
            self.ale.setInt(b"frame_skip", 1)
            self.ale.setBool(b'color_averaging', False)
            # manual.pdf suggests otherwise.
            self.ale.setFloat(b'repeat_action_probability', 0.0)

            # viz setup
            if isinstance(viz, six.string_types):
                assert os.path.isdir(viz), viz
                self.ale.setString(b'record_screen_dir', viz)
                viz = 0
            if isinstance(viz, int):
                viz = float(viz)
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
                cv2.startWindowThread()
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start
        self.height_range = height_range
        self.image_shape = image_shape

        self.current_episode_score = StatCounter()
        self.restart_episode()

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def current_state(self):
        """
        :returns: a gray-scale (h, w, 1) uint8 image
        """
        ret = self._grab_raw_image()
        # max-pooled over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow(self.windowname, ret)
                time.sleep(self.viz)
        ret = ret[self.height_range[0]:self.height_range[1], :].astype('float32')
        # 0.299,0.587,0.114. same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        ret = cv2.resize(ret, self.image_shape)
        ret = np.expand_dims(ret, axis=2)
        return ret.astype('uint8')  # to save some memory

    def get_action_space(self):
        return DiscreteActionSpace(len(self.actions))

    def finish_episode(self):
        self.stats['score'].append(self.current_episode_score.sum)

    def restart_episode(self):
        self.current_episode_score.reset()
        with _ALE_LOCK:
            self.ale.reset_game()

        # random null-ops start
        n = self.rng.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def action(self, act):
        """
        :param act: an index of the action
        :returns: (reward, isOver)
        """
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        self.current_episode_score.feed(r)
        isOver = self.ale.game_over()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives
        if isOver:
            self.finish_episode()
        if self.ale.game_over():
            self.restart_episode()
        return (r, isOver)
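# Added usage sketch (assumes tensorpack's helper utilities are importable
# and breakout.bin is available; the random policy is illustrative only):
player = AtariPlayer('breakout.bin', viz=0, frame_skip=4)
for _ in range(100):
    obs = player.current_state()                  # (84, 84, 1) uint8 frame
    a = np.random.randint(len(player.actions))    # random action index
    reward, is_over = player.action(a)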
# Older variant of the AtariPlayer wrapper above; get_rng, log_once,
# StatCounter, DiscreteActionSpace and RLEnvironment come from tensorpack.
class AtariPlayer(RLEnvironment):
    """
    A wrapper for atari emulator.
    NOTE: will automatically restart when a real episode ends
    """

    def __init__(self, rom_file, viz=0, height_range=(None, None),
                 frame_skip=4, image_shape=(84, 84), nullop_start=30,
                 live_lost_as_eoe=True):
        """
        :param rom_file: path to the rom
        :param frame_skip: skip every k frames and repeat the action
        :param image_shape: (w, h)
        :param height_range: (h1, h2) to cut
        :param viz: visualization to be done.
            Set to 0 to disable.
            Set to a positive number to be the delay between frames to show.
            Set to a string to be a directory to store frames.
        :param nullop_start: start with random number of null ops
        :param live_lost_as_eoe: consider lost of lives as end of episode. useful for training.
        """
        super(AtariPlayer, self).__init__()
        self.ale = ALEInterface()
        self.rng = get_rng(self)

        self.ale.setInt("random_seed", self.rng.randint(0, 10000))
        self.ale.setBool("showinfo", False)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Warning)
        except AttributeError:
            log_once()

        self.ale.setInt("frame_skip", 1)
        self.ale.setBool('color_averaging', False)
        # manual.pdf suggests otherwise.
        self.ale.setFloat('repeat_action_probability', 0.0)

        # viz setup
        if isinstance(viz, six.string_types):
            assert os.path.isdir(viz), viz
            self.ale.setString('record_screen_dir', viz)
            viz = 0
        if isinstance(viz, int):
            viz = float(viz)
        self.viz = viz
        if self.viz and isinstance(self.viz, float):
            self.windowname = os.path.basename(rom_file)
            cv2.startWindowThread()
            cv2.namedWindow(self.windowname)

        self.ale.loadROM(rom_file)
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start
        self.height_range = height_range
        self.image_shape = image_shape

        self.current_episode_score = StatCounter()
        self.restart_episode()

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def current_state(self):
        """
        :returns: a gray-scale (h, w, 1) image
        """
        ret = self._grab_raw_image()
        # max-pooled over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                # m = cv2.resize(ret, (1920, 1200))
                cv2.imshow(self.windowname, ret)
                time.sleep(self.viz)
        ret = ret[self.height_range[0]:self.height_range[1], :]
        # 0.299,0.587,0.114. same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)
        ret = cv2.resize(ret, self.image_shape)
        ret = np.expand_dims(ret, axis=2)
        return ret

    def get_action_space(self):
        return DiscreteActionSpace(len(self.actions))

    def restart_episode(self):
        if self.current_episode_score.count > 0:
            self.stats['score'].append(self.current_episode_score.sum)
        self.current_episode_score.reset()
        self.ale.reset_game()

        # random null-ops start
        n = self.rng.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def action(self, act):
        """
        :param act: an index of the action
        :returns: (reward, isOver)
        """
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        self.current_episode_score.feed(r)
        isOver = self.ale.game_over()
        if isOver:
            self.restart_episode()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives
        return (r, isOver)

    def get_stat(self):
        try:
            return {
                'avg_score': np.mean(self.stats['score']),
                'max_score': float(np.max(self.stats['score']))
            }
        except ValueError:
            return {}
import os
from os import listdir
from os.path import isfile, join

import numpy as np
from ale_python_interface import ALEInterface

# BaseEnvironment, Space and AtariProcessor come from the surrounding project.


class ALEEnvironment(BaseEnvironment):
    """
    A wrapper of Arcade Learning Environment, which inherits all members of
    ``BaseEnvironment``.
    """
    # 63 games
    ADVENTURE = "adventure"
    AIR_RAID = "air_raid"
    ALIEN = "alien"
    AMIDAR = "amidar"
    ASSAULT = "assault"
    ASTERIX = "asterix"
    ASTEROIDS = "asteroids"
    ATLANTIS = "atlantis"  # fixed typo: was "aslantis"
    BANK_HEIST = "bank_heist"
    BATTLE_ZONE = "battle_zone"
    BEAM_RIDER = "beam_rider"
    BERZERK = "berzerk"
    BOWLING = "bowling"
    BOXING = "boxing"
    BREAKOUT = "breakout"
    CARNIVAL = "carnival"
    CENTIPEDE = "centipede"
    CHOPPER_COMMAND = "chopper_command"
    CRAZY_CLIMBER = "crazy_climber"
    DEFENDER = "defender"
    DEMON_ATTACK = "demon_attack"
    DOUBLE_DUNK = "double_dunk"
    ELEVATOR_ACTION = "elevator_action"
    ENDURO = "enduro"
    FISHING_DERBY = "fishing_derby"
    FREEWAY = "freeway"
    FROSTBITE = "frostbite"
    GOPHER = "gopher"
    GRAVITAR = "gravitar"
    HERO = "hero"
    ICE_HOCKEY = "ice_hockey"
    JAMESBOND = "jamesbond"
    JOURNEY_ESCAPE = "journey_escape"
    KABOOM = "kaboom"
    KANGAROO = "kangaroo"
    KRULL = "krull"
    KUNGFU_MASTER = "kung_fu_master"
    MONTEZUMA = "montezuma_revenge"
    MS_PACMAN = "ms_pacman"
    UNKNOWN = "name_this_game"
    PHOENIX = "phoenix"
    PITFALL = "pitfall"
    PONG = "pong"
    POOYAN = "pooyan"
    PRIVATE_EYE = "private_eye"
    QBERT = "qbert"
    RIVERRAID = "riverraid"
    ROAD_RUNNER = "road_runner"
    ROBOTANK = "robotank"
    SEAQUEST = "seaquest"
    SKIING = "skiing"
    SOLARIS = "solaris"
    SPACE_INVADERS = "space_invaders"
    STAR_GUNNER = "star_gunner"
    TENNIS = "tennis"
    TIME_PILOT = "time_pilot"
    TUTANKHAM = "tutankham"
    UP_N_DOWN = "up_n_down"
    VENTURE = "venture"
    VIDEO_PINBALL = "video_pinball"
    WIZARD_OF_WOR = "wizard_of_wor"
    YARS_REVENGE = "yars_revenge"
    ZAXXON = "zaxxon"

    def __init__(self, rom_name, frame_skip=4, repeat_action_probability=0.,
                 max_episode_steps=10000, loss_of_life_termination=False,
                 loss_of_life_negative_reward=False,
                 bitwise_max_on_two_consecutive_frames=False, is_render=False,
                 seed=None, startup_policy=None, disable_actions=None,
                 num_of_sub_actions=-1,
                 state_processor=AtariProcessor(resize_shape=(84, 84),
                                                convert_to_grayscale=True)):
        os.environ['SDL_VIDEO_CENTERED'] = '1'

        file_exist = isfile(ALEEnvironment.get_rom_path(rom_name))
        if not file_exist:
            raise ValueError("Rom not found! Please put rom " + rom_name +
                             ".bin into: " + ALEEnvironment.get_rom_path())

        self.__rom_name = rom_name
        self.__ale = ALEInterface()

        if frame_skip < 0:
            print("Invalid frame_skip param! Set default frame_skip = 4")
            self.__frame_skip = 4
        else:
            self.__frame_skip = frame_skip

        if repeat_action_probability < 0 or repeat_action_probability > 1:
            raise ValueError("Invalid repeat_action_probability")
        else:
            self.__repeat_action_probability = repeat_action_probability

        self.__max_episode_steps = max_episode_steps
        self.__loss_of_life_termination = loss_of_life_termination
        self.__loss_of_life_negative_reward = loss_of_life_negative_reward
        self.__max_2_frames = bitwise_max_on_two_consecutive_frames

        # Max 2 frames only work with grayscale
        self.__grayscale = False
        if state_processor is not None and type(state_processor) is AtariProcessor \
                and state_processor.get_grayscale():
            self.__grayscale = True

        if self.__max_2_frames and self.__frame_skip > 1 and self.__grayscale:
            self.__max_2_frames = True
        else:
            self.__max_2_frames = False

        self.__is_render = is_render
        self.__processor = state_processor

        if seed is None or seed <= 0 or seed >= 9999:
            if seed is not None and (seed < 0 or seed >= 9999):
                print("Invalid seed! Default seed = randint(0, 9999)")
            self.__seed = np.random.randint(0, 9999)
            self.__random_seed = True
        else:
            self.__random_seed = False
            self.__seed = seed

        self.__current_steps = 0
        self.__is_life_lost = False
        self.__is_terminal = False
        self.__current_lives = 0
        self.__action_reduction = num_of_sub_actions
        self.__scr_width, self.__scr_height, self.__action_set = self.__init_ale()
        self.__prev_buffer = np.empty((self.__scr_height, self.__scr_width, 3), dtype=np.uint8)
        self.__current_buffer = np.empty((self.__scr_height, self.__scr_width, 3), dtype=np.uint8)
        self.__current_state = None
        self.__prev_state = None
        self.__startup_policy = startup_policy
        if disable_actions is None:
            self.__dis_act = []
        else:
            self.__dis_act = disable_actions

        if self.__processor.get_number_of_objectives() > 1:
            self.__multi_objs = True
        else:
            self.__multi_objs = False

    def get_processor(self):
        return self.__processor

    def __init_ale(self):
        self.__ale.setBool(b'display_screen', self.__is_render)

        if self.__max_2_frames and self.__frame_skip > 1:
            self.__ale.setInt(b'frame_skip', 1)
        else:
            self.__ale.setInt(b'frame_skip', self.__frame_skip)

        self.__ale.setInt(b'random_seed', self.__seed)
        self.__ale.setFloat(b'repeat_action_probability', self.__repeat_action_probability)
        self.__ale.setBool(b'color_averaging', False)
        self.__ale.loadROM(ALEEnvironment.get_rom_path(self.__rom_name).encode())

        width, height = self.__ale.getScreenDims()
        return width, height, self.__ale.getMinimalActionSet()

    def clone(self):
        if self.__random_seed:
            seed = np.random.randint(0, 9999)
        else:
            seed = self.__seed
        return ALEEnvironment(self.__rom_name, self.__frame_skip,
                              self.__repeat_action_probability,
                              self.__max_episode_steps,
                              self.__loss_of_life_termination,
                              self.__loss_of_life_negative_reward,
                              self.__max_2_frames, self.__is_render, seed,
                              self.__startup_policy, self.__dis_act,
                              self.__action_reduction,
                              self.__processor.clone())

    def step_all(self, a):
        if isinstance(a, (list, np.ndarray)):
            if len(a) <= 0:
                raise ValueError('Empty action list!')
            a = a[0]
        self.__current_steps += 1
        act = self.__action_set[a]
        rew = self._step(act)
        next_state = self.get_state()
        _is_terminal = self.is_terminal()
        return next_state, rew, _is_terminal, self.__current_steps

    def reset(self):
        self.__ale.reset_game()
        self.__current_lives = self.__ale.lives()
        self.__is_life_lost = False
        self.__is_terminal = False
        self.__current_state = None
        self.__prev_state = None

        action_space = self.get_action_space()
        v_range, is_range = action_space.get_range()
        if len(v_range) > 1:
            self.step(1)

        # No op steps
        if self.__startup_policy is not None:
            max_steps = int(self.__startup_policy.get_max_steps())
            for _ in range(max_steps):
                act = self.__startup_policy.step(self.get_state(), action_space)
                self.step(act)

        # Start training from this point
        self.__current_steps = 0

        # Reset processor
        self.__processor.reset()
        return self.get_state()

    def _pre_step(self, act):
        if self.__max_2_frames and self.__frame_skip > 1:
            rew = 0
            for i in range(self.__frame_skip - 2):
                rew += self.__ale.act(act)
                self.__prev_buffer = self.__ale.getScreenRGB(self.__prev_buffer)

            self.__prev_buffer = self.__ale.getScreenRGB(self.__prev_buffer)
            rew += self.__ale.act(act)
            self.__current_buffer = self.__ale.getScreenRGB(self.__current_buffer)
            self.__is_terminal = self.__ale.game_over()

            self.__prev_state = self.__processor.process(self.__prev_buffer)
            self.__current_state = self.__processor.process(self.__current_buffer)
            self.__current_state = np.maximum.reduce([self.__prev_state, self.__current_state])
        else:
            rew = self.__ale.act(act)
            self.__current_buffer = self.__ale.getScreenRGB(self.__current_buffer)
            self.__is_terminal = self.__ale.game_over()

            if self.__processor is not None:
                self.__current_state = self.__processor.process(self.__current_buffer)

        if self.__multi_objs and self.__processor is not None:
            all_rewards = self.__processor.get_rewards(rew)
            return all_rewards
        else:
            return rew

    def _step(self, act):
        for i in range(len(self.__dis_act)):
            if act == self.__dis_act[i]:
                act = 0

        if not self.__loss_of_life_termination and not self.__loss_of_life_negative_reward:
            if not self.__is_terminal:
                next_lives = self.__ale.lives()
                if next_lives < self.__current_lives:
                    act = 1
                    self.__current_lives = next_lives
            return self._pre_step(act)
        else:
            rew = self._pre_step(act)
            next_lives = self.__ale.lives()
            if next_lives < self.__current_lives:
                if self.__loss_of_life_negative_reward:
                    rew -= 1
                self.__current_lives = next_lives
                self.__is_life_lost = True
            return rew

    def get_state(self):
        if not self.__max_2_frames:
            if self.__processor is not None:
                return self.__current_state
            else:
                return self.__current_buffer
        else:
            return self.__current_state

    def is_terminal(self):
        if self.__loss_of_life_termination and self.__is_life_lost:
            return True
        elif self.__max_episode_steps is not None and self.__current_steps > self.__max_episode_steps:
            return True
        else:
            return self.__is_terminal

    @staticmethod
    def get_rom_path(rom=None):
        if rom is None:
            return os.path.dirname(os.path.abspath(__file__)) + "/roms/"
        else:
            return os.path.dirname(os.path.abspath(__file__)) + "/roms/" + rom + ".bin"

    @staticmethod
    def list_all_roms():
        return [
            f for f in listdir(ALEEnvironment.get_rom_path())
            if isfile(join(ALEEnvironment.get_rom_path(), f))
        ]

    def get_state_space(self):
        if self.__processor is None:
            shape = self.__current_buffer.shape
        else:
            shape = self.__processor.process(self.__current_buffer).shape
        min_value = np.zeros(shape, dtype=np.uint8)
        max_value = np.full(shape, 255)
        return Space(min_value, max_value, True)

    def get_action_space(self):
        if self.__action_reduction >= 1:
            return Space(0, self.__action_reduction - 1, True)
        else:
            return Space(0, len(self.__action_set) - 1, True)

    def step(self, act):
        if isinstance(act, (list, np.ndarray)):
            if len(act) <= 0:
                raise ValueError('Empty action list!')
            act = act[0]
        self.__current_steps += 1
        act = self.__action_set[act]
        rew = self._step(act)
        return rew

    def get_current_steps(self):
        return self.__current_steps

    def is_atari(self):
        return True

    def is_render(self):
        return self.__is_render

    def get_number_of_objectives(self):
        if self.__processor is None:
            return 1
        else:
            return self.__processor.get_number_of_objectives()

    def get_number_of_agents(self):
        if self.__processor is None:
            return 1
        else:
            return self.__processor.get_number_of_agents()

    def get_state_processor(self):
        return self.__processor
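# Added usage sketch for the wrapper above (assumes its BaseEnvironment,
# AtariProcessor and Space dependencies plus roms/breakout.bin; the fixed
# NOOP policy and step budget are illustrative only):
env = ALEEnvironment(ALEEnvironment.BREAKOUT, frame_skip=4)
state = env.reset()
done = False
steps = 0
while not done and steps < 1000:
    state, reward, done, steps = env.step_all(0)  # action index 0 = NOOP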
import random

import numpy as np
from ale_python_interface import ALEInterface
from scipy.misc import imresize

# IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES, FRAMES_IN_POOL, ACTION_REPEAT,
# MAX_START_WAIT, ObservationPool, FramePool and BaseEnvironment come from
# the surrounding project.


class AtariEmulator(BaseEnvironment):
    def __init__(self, actor_id, args):
        self.ale = ALEInterface()
        self.ale.setInt(b"random_seed", args.random_seed * (actor_id + 1))
        # For fuller control on explicit action repeat (>= ALE 0.5.0)
        self.ale.setFloat(b"repeat_action_probability", 0.0)
        # Disable frame_skip and color_averaging
        # See: http://is.gd/tYzVpj
        self.ale.setInt(b"frame_skip", 1)
        self.ale.setBool(b"color_averaging", False)
        full_rom_path = args.rom_path + "/" + args.game + ".bin"
        self.ale.loadROM(str.encode(full_rom_path))
        self.legal_actions = self.ale.getMinimalActionSet()
        self.screen_width, self.screen_height = self.ale.getScreenDims()
        self.lives = self.ale.lives()

        self.random_start = args.random_start
        self.single_life_episodes = args.single_life_episodes
        self.call_on_new_frame = args.visualize

        # Processed historical frames that will be fed in to the network
        # (i.e., four 84x84 images)
        self.observation_pool = ObservationPool(np.zeros((IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES), dtype=np.uint8))
        self.rgb_screen = np.zeros((self.screen_height, self.screen_width, 3), dtype=np.uint8)
        self.gray_screen = np.zeros((self.screen_height, self.screen_width, 1), dtype=np.uint8)
        self.frame_pool = FramePool(np.empty((2, self.screen_height, self.screen_width), dtype=np.uint8),
                                    self.__process_frame_pool)

    def get_legal_actions(self):
        return self.legal_actions

    def __get_screen_image(self):
        """
        Get the current frame luminance
        :return: the current frame
        """
        self.ale.getScreenGrayscale(self.gray_screen)
        if self.call_on_new_frame:
            self.ale.getScreenRGB(self.rgb_screen)
            self.on_new_frame(self.rgb_screen)
        return np.squeeze(self.gray_screen)

    def on_new_frame(self, frame):
        pass

    def __new_game(self):
        """ Restart game """
        self.ale.reset_game()
        self.lives = self.ale.lives()
        if self.random_start:
            wait = random.randint(0, MAX_START_WAIT)
            for _ in range(wait):
                self.ale.act(self.legal_actions[0])

    def __process_frame_pool(self, frame_pool):
        """ Preprocess frame pool """
        img = np.amax(frame_pool, axis=0)
        img = imresize(img, (84, 84), interp='nearest')
        img = img.astype(np.uint8)
        return img

    def __action_repeat(self, a, times=ACTION_REPEAT):
        """ Repeat action and grab screen into frame pool """
        reward = 0
        for i in range(times - FRAMES_IN_POOL):
            reward += self.ale.act(self.legal_actions[a])
        # Only need to add the last FRAMES_IN_POOL frames to the frame pool
        for i in range(FRAMES_IN_POOL):
            reward += self.ale.act(self.legal_actions[a])
            self.frame_pool.new_frame(self.__get_screen_image())
        return reward

    def get_initial_state(self):
        """ Get the initial state """
        self.__new_game()
        for step in range(NR_IMAGES):
            _ = self.__action_repeat(0)
            self.observation_pool.new_observation(self.frame_pool.get_processed_frame())
        if self.__is_terminal():
            raise Exception('This should never happen.')
        return self.observation_pool.get_pooled_observations()

    def next(self, action):
        """ Get the next state, reward, and game over signal """
        reward = self.__action_repeat(np.argmax(action))
        self.observation_pool.new_observation(self.frame_pool.get_processed_frame())
        terminal = self.__is_terminal()
        self.lives = self.ale.lives()
        observation = self.observation_pool.get_pooled_observations()
        return observation, reward, terminal

    def __is_terminal(self):
        if self.single_life_episodes:
            return self.__is_over() or (self.lives > self.ale.lives())
        else:
            return self.__is_over()

    def __is_over(self):
        return self.ale.game_over()

    def get_noop(self):
        return [1.0, 0.0]
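# Added usage sketch (hypothetical args namespace; ObservationPool, FramePool
# and the module constants must come from the original project):
from argparse import Namespace

args = Namespace(random_seed=1, rom_path='roms', game='breakout',
                 random_start=True, single_life_episodes=False, visualize=False)
emu = AtariEmulator(actor_id=0, args=args)
obs = emu.get_initial_state()
one_hot = np.zeros(len(emu.get_legal_actions()))
one_hot[0] = 1.0  # NOOP, encoded one-hot as next() expects
obs, reward, terminal = emu.next(one_hot)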
class AleEnv():
    def __init__(self, rom, display_screen, use_env_frame_skip, frame_repeat):
        self.actions = None
        self.rom = rom
        self.display_screen = display_screen
        self.use_env_frame_skip = use_env_frame_skip
        self.frame_repeat = frame_repeat

    def initialize(self):
        self.ale = ALEInterface()
        self.ale.setInt("random_seed", random.randint(1, 1000))
        if self.display_screen:
            self.ale.setBool('display_screen', True)
        if self.use_env_frame_skip:
            self.ale.setInt('frame_skip', self.frame_repeat)
            self.ale.setBool('color_averaging', True)
        self.ale.setFloat('repeat_action_probability', 0)
        self.ale.loadROM(self.rom)
        self.actions = self.ale.getMinimalActionSet()
        print('actions: %s' % self.actions)
        (self.screen_width, self.screen_height) = self.ale.getScreenDims()
        print("width/height: " + str(self.screen_width) + "/" + str(self.screen_height))
        self.initialized = True

    def get_actions(self, rom=None):
        if self.actions is None and rom is not None:
            ale = ALEInterface()
            ale.loadROM(rom)
            self.actions = ale.getMinimalActionSet()
        return self.actions

    @property
    def state_dtype(self):
        return np.uint8

    @property
    def continuous_action(self):
        return False

    def reset_game(self):
        self.ale.reset_game()

    def lives(self):
        return self.ale.lives()

    def getScreenRGB(self):
        return self.ale.getScreenRGB()

    def getState(self, debug_display=False, debug_input=None):
        screen = self.ale.getScreenGrayscale()
        if screen is not None and debug_display:
            debug_input.show(screen.reshape(screen.shape[0], screen.shape[1]))
        return screen.reshape(self.screen_height, self.screen_width)

    def act(self, action):
        return self.ale.act(action)

    def game_over(self):
        return self.ale.game_over()

    def finish(self):
        return
class AleAgent:
    ##
    # @param processing_cls Class for processing game visual input
    def __init__(self, processing_cls, game_rom=None, encoder_model=None,
                 encoder_weights=None, NFQ_model=None, NFQ_weights=None):
        assert game_rom is not None
        self.game = ALEInterface()
        if encoder_weights is not None and encoder_model is not None:
            self.encoder = Encoder(path_to_model=encoder_model, path_to_weights=encoder_weights)
        else:
            self.encoder = Encoder()

        self.processor = processing_cls()

        # Get & Set the desired settings
        self.game.setInt('random_seed', 0)
        self.game.setInt('frame_skip', 4)

        # Set USE_SDL to true to display the screen. ALE must be compiled
        # with SDL enabled for this to work. On OSX, pygame init is used to
        # proxy-call SDL_main.
        USE_SDL = True
        if USE_SDL:
            if sys.platform == 'darwin':
                pygame.init()
                self.game.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.game.setBool('sound', False)  # no sound
            self.game.setBool('display_screen', True)

        # Load the ROM file
        self.game.loadROM(game_rom)

        # Get the list of legal actions
        self.legal_actions = self.game.getLegalActionSet()
        # Get actions applicable in the current game
        self.minimal_actions = self.game.getMinimalActionSet()

        if NFQ_model is not None and NFQ_weights is not None:
            self.NFQ = NFQ(
                self.encoder.out_dim,
                len(self.minimal_actions),
                model_path=NFQ_model,
                weights_path=NFQ_weights)
        else:
            self.NFQ = NFQ(self.encoder.out_dim, len(self.minimal_actions))

        (self.screen_width, self.screen_height) = self.game.getScreenDims()
        self.screen_data = np.zeros(
            (self.screen_height, self.screen_width), dtype=np.uint8)

    ##
    # Run the reinforcement learning training loop
    def train(self, num_of_episodes=1500, eps=0.995, key_binding=None):
        pygame.init()
        for episode in range(num_of_episodes):
            total_reward = 0
            moves = 0
            hits = 0
            print('Starting episode:', episode + 1)

            if key_binding:
                eps = 0.05
            else:
                # NOTE: was `eps -= 2/num_of_episodes`, which is always 0 under
                # integer division; a float decay is clearly what was intended.
                eps -= 2.0 / num_of_episodes

            self.game.getScreenGrayscale(self.screen_data)
            pooled_data = self.processor.process(self.screen_data)
            next_state = self.encoder.encode(pooled_data)

            while not self.game.game_over():
                current_state = next_state
                x = None
                if key_binding:
                    key_pressed = pygame.key.get_pressed()
                    x = key_binding(key_pressed)
                if x is None:
                    r = np.random.rand()
                    if r < eps:
                        x = np.random.randint(self.minimal_actions.size)
                    else:
                        x = self.NFQ.predict_action(current_state)

                a = self.minimal_actions[x]
                # Apply an action and get the resulting reward
                reward = self.game.act(a)

                # record only every 3 frames
                # if not moves % 3:
                self.game.getScreenGrayscale(self.screen_data)
                pooled_data = self.processor.process(self.screen_data)
                next_state = self.encoder.encode(pooled_data)

                transition = np.append(current_state, x)
                transition = np.append(transition, next_state)
                transition = np.append(transition, reward)
                self.NFQ.add_transition(transition)

                total_reward += reward
                if reward > 0:
                    hits += 1
                moves += 1
                if eps > 0.1:
                    eps -= 0.00001
            # end while

            print('Epsilon:', eps)
            print('Episode', episode + 1, 'ended with score:', total_reward)
            print('Hits:', hits)
            self.game.reset_game()
            self.NFQ.train()
            hits = 0
            moves = 0
            self.NFQ.save_net()
        # end for

    ##
    # Play the game!
    def play(self):
        total_reward = 0
        moves = 1
        while not self.game.game_over():
            self.game.getScreenGrayscale(self.screen_data)
            pooled_data = self.processor.process(self.screen_data)
            current_state = self.encoder.encode(pooled_data)
            x = self.NFQ.predict_action(current_state)
            a = self.minimal_actions[x]
            reward = self.game.act(a)
            total_reward += reward
            moves += 1
        print('The game ended with score:', total_reward, 'after:', moves, 'moves')
class ALEEnvironment(Environment):
    def __init__(self, rom_file, args):
        from ale_python_interface import ALEInterface
        self.ale = ALEInterface()

        # Set ALE configuration
        self.ale.setInt(b'frame_skip', args.frame_skip)
        self.ale.setFloat(b'repeat_action_probability', args.repeat_action_probability)
        self.ale.setBool(b'color_averaging', args.color_averaging)
        if args.random_seed:
            self.ale.setInt(b'random_seed', args.random_seed)
        if args.record_screen_path:
            if not os.path.exists(args.record_screen_path):
                os.makedirs(args.record_screen_path)
            self.ale.setString(b'record_screen_dir', args.record_screen_path.encode())
        if args.record_sound_filename:
            self.ale.setBool(b'sound', True)
            self.ale.setString(b'record_sound_filename', args.record_sound_filename.encode())

        # Load ROM
        self.ale.loadROM(rom_file.encode())

        # Set game difficulty and mode (after loading)
        self.ale.setDifficulty(args.game_difficulty)
        self.ale.setMode(args.game_mode)

        # Whether to use the minimal action set or the full legal set
        if args.minimal_action_set:
            self.actions = self.ale.getMinimalActionSet()
        else:
            self.actions = self.ale.getLegalActionSet()

        # Life-loss tracking
        self.life_lost = False

        # Initialize base class
        super(ALEEnvironment, self).__init__(args)

    def action_dim(self):
        return len(self.actions)

    def reset(self):
        # In test mode, the game is simply initialized. In train mode, if the
        # game is in a terminal state due to a life loss but not yet game over,
        # then only the life-loss flag is reset so that the next episode starts
        # from the current state. Otherwise, the game is simply initialized.
        if (self.mode == 'test'
                or not self.life_lost        # `reset` called in the middle of an episode
                or self.ale.game_over()):    # all lives are lost
            self.ale.reset_game()
        self.life_lost = False
        screen = self._get_state(self.ale.getScreenRGB())
        return screen

    def step(self, action, action_b=0, ignore_screen=False):
        lives = self.ale.lives()
        # Act on the environment
        reward = self.ale.act(self.actions[action], self.actions[action_b] + 18)
        # Check if a life was lost
        self.life_lost = (not lives == self.ale.lives())
        # Check terminal state
        terminal = (self.ale.game_over() or self.life_lost) \
            if self.mode == 'train' else self.ale.game_over()
        # Check whether to skip grabbing the screen (in case of RobotEnvironment)
        if ignore_screen:
            screen = None
        else:
            # Get screen from ALE
            screen = self._get_state(self.ale.getScreenRGB())
            # Wait for the next frame to start
            self.fps_control.wait_next_frame()
        return screen, reward, terminal
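# Hedged usage sketch (not from the original source): a minimal rollout with
# the ALEEnvironment above. The `args` field names mirror what the constructor
# reads; the Environment base class is assumed to provide `mode`, `_get_state`,
# and `fps_control`.
from types import SimpleNamespace

args = SimpleNamespace(
    frame_skip=4, repeat_action_probability=0.0, color_averaging=False,
    random_seed=123, record_screen_path=None, record_sound_filename=None,
    game_difficulty=0, game_mode=0, minimal_action_set=True)
env = ALEEnvironment('roms/breakout.bin', args)  # hypothetical ROM path
screen = env.reset()
terminal = False
while not terminal:
    screen, reward, terminal = env.step(0)  # NOOP placeholder policy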
class GameManager(object):
    """This class takes care of the interactions between an agent and
    a game across episodes, as well as overall logging of performance.
    """

    def __init__(self, game_name, agent, results_dir, n_epochs=1,
                 n_episodes=None, n_frames=None, remove_old_results_dir=False,
                 use_minimal_action_set=True, min_time_between_frames=0):
        """game_name is one of the supported games (there are many), as a
        string: "space_invaders.bin"
        agent is an instance of a subclass of the Agent interface
        results_dir is a string representing a directory in which results and
        logs are placed. If it does not exist, it is created.
        use_minimal_action_set determines whether the agent is offered all
        possible actions, or only those (minimal) that are applicable to the
        specific game.
        min_time_between_frames is the minimum required time in seconds
        between frames. If 0, the game is unrestricted.
        """
        self.game_name = game_name
        self.agent = agent
        self.use_minimal_action_set = use_minimal_action_set
        self.min_time_between_frames = min_time_between_frames
        self.n_epochs = n_epochs
        self.n_episodes = n_episodes
        self.n_frames = n_frames
        if ((n_episodes is None and n_frames is None) or
                (n_episodes is not None and n_frames is not None)):
            raise ValueError("Exactly one of n_episodes and n_frames "
                             "must be defined")

        self.initialize_results_dir(results_dir, remove_old_results_dir)

        self.log = util.logging.Logger(
            ("settings", "step", "episode", "epoch", "overall"),
            "settings",
            os.path.join(self.results_dir, "GameManager.log"))
        self.stats = util.logging.CSVLogger(
            os.path.join(self.results_dir, "stats.log"),
            header="epoch,episode,total_reward,n_frames,wall_time",
            print_items=True)

        self._object_cache = dict()

        self.initialize_ale()
        self.initialize_agent()

        self.dump_settings()

    def initialize_results_dir(self, results_dir, remove_existing=False):
        """Creates the whole path of directories if they do not exist.
        If they do exist, raises an error unless remove_existing is True,
        in which case the existing directory is deleted.
        """
        now = datetime.now().strftime("%Y%m%d-%H-%M")
        # drop .bin, append current time down to the minute
        results_dir = os.path.join(results_dir, self.game_name[:-4] + now)
        if remove_existing:
            if os.path.exists(results_dir):
                shutil.rmtree(results_dir)
        # Should raise an error if the directory exists
        os.makedirs(results_dir)
        self.results_dir = results_dir

    def initialize_ale(self):
        self.ale = ALEInterface()
        self.ale.loadROM(os.path.join(ROM_RELATIVE_LOCATION, self.game_name))

    def initialize_agent(self):
        RSC = namedtuple("RawStateCallbacks", ["raw", "grey", "rgb", "ram"])
        raw_state_callbacks = RSC(self.get_screen,
                                  self.get_screen_grayscale,
                                  self.get_screen_RGB,
                                  self.get_RAM)
        self.agent.set_raw_state_callbacks(raw_state_callbacks)
        self.agent.set_results_dir(self.results_dir)
        if self.use_minimal_action_set:
            actions = self.ale.getMinimalActionSet()
        else:
            actions = self.ale.getLegalActionSet()
        self.agent.set_available_actions(actions)

    def rest(self, already_elapsed):
        rest_time = self.min_time_between_frames - already_elapsed
        if rest_time > 0:
            sleep(rest_time)

    def run(self):
        """Runs self.n_epochs epochs, where the agent's learning is
        reset for each new epoch. Each epoch lasts self.n_episodes or
        self.n_frames, whichever is defined.
        """
        self.log.overall("Starting run")
        run_start = time()
        for epoch in xrange(self.n_epochs):
            self.agent.reset()
            self.n_epoch = epoch
            self._run_epoch()
        self.log.overall("End of run ({:.2f} s)".format(time() - run_start))

    def _run_epoch(self):
        self.n_episode = 0
        start = time()
        while not self._stop_condition_met():
            self._run_episode()
            self.n_episode += 1
        wall_time = time() - start
        frames = self.ale.getFrameNumber()
        self.log.epoch("Finished epoch after {:.2f} seconds".format(wall_time))

    def _run_episode(self):
        self.ale.reset_game()
        self.agent.on_episode_start()
        total_reward = 0
        episode_start = time()

        while (not self.ale.game_over()) and (not self._stop_condition_met()):
            timestep_start = time()
            action = self.agent.select_action()
            reward = self.ale.act(action)
            self.agent.receive_reward(reward)
            total_reward += reward
            self.rest(time() - timestep_start)

        wall_time = time() - episode_start
        self.agent.on_episode_end()

        # Stats format: CSV with epoch, episode, total_reward, n_frames, wall_time
        self.stats.write(self.n_epoch, self.n_episode, total_reward,
                         self.ale.getEpisodeFrameNumber(),
                         "{:.2f}".format(wall_time))

    def _stop_condition_met(self):
        if self.n_episodes:
            return self.n_episode >= self.n_episodes
        return self.ale.getFrameNumber() >= self.n_frames

    # Methods for state perception
    def get_screen(self):
        """Returns a matrix containing the current game screen in raw pixel
        data, i.e. before conversion to RGB. Handles reuse of the np.array
        object, so it will overwrite what is in the old object"""
        return self._cached("raw", self.ale.getScreen)

    def get_screen_grayscale(self):
        """Returns an np.array with the screen grayscale colours. Handles
        reuse of the np.array object, so it will overwrite what is in the
        old object.
        """
        return self._cached("gray", self.ale.getScreenGrayscale)

    def get_screen_RGB(self):
        """Returns a numpy array with the screen's RGB colours. The first
        positions contain the red colours, followed by the green colours and
        then the blue colours"""
        return self._cached("rgb", self.ale.getScreenRGB)

    def get_RAM(self):
        """Returns a vector containing the current RAM content (byte-level).
        Handles reuse of the np.array object, so it will overwrite what is in
        the old object"""
        return self._cached("ram", self.ale.getRAM)

    def _cached(self, key, func):
        if key in self._object_cache:
            func(self._object_cache[key])
        else:
            self._object_cache[key] = func()
        return self._object_cache[key]

    def dump_settings(self):
        import json

        settings = self.get_settings()
        path = os.path.join(self.results_dir, "settings")
        with open(path, "w") as f:
            json.dump(settings, f, indent=4)

    def get_settings(self):
        """Returns a dict representing the settings needed to reproduce this
        object and its subobjects
        """
        return {
            "game_name": self.game_name,
            "n_epochs": self.n_epochs,
            "n_episodes": self.n_episodes,
            "n_frames": self.n_frames,
            "agent": self.agent.get_settings(),
            "results_dir": self.results_dir,
            "use_minimal_action_set": self.use_minimal_action_set,
        }
class AtariEmulator:
    def __init__(self, rom, visualization=False, save=False, windowName='AtariGame'):
        self.ale = ALEInterface()
        # self.ale.setInt(b'frame_skip', 1)
        self.ale.setInt(b"random_seed", 123)
        # self.ale.setFloat(b'repeat_action_probability', 0)  # default = 0.25
        self.ale.loadROM(b'roms/' + rom)
        self.legalActions = self.ale.getMinimalActionSet()
        self.life_lost = False
        self.mode = 'train'
        self.visualization = visualization and not save
        self.windowName = windowName
        self.save = save
        self.totalReward = 0
        if self.visualization:
            cv2.namedWindow(self.windowName)
        elif self.save:
            self.index = 0
            self.bestReward = 0
            self.totalReward = 0
            if os.path.exists('result'):
                shutil.rmtree('result')
            if os.path.exists('best_result'):
                shutil.rmtree('best_result')
            if not os.path.exists('result'):
                os.mkdir('result')
            if not os.path.exists('best_result'):
                os.mkdir('best_result')

    def start(self):
        # In train mode: if life_lost is True but the game is not over,
        # don't restart the game
        if self.mode == 'test' or not self.life_lost or self.ale.game_over():
            self.ale.reset_game()
        self.life_lost = False
        return cv2.resize(self.ale.getScreenGrayscale(), (84, 110))[26:]

    def isTerminal(self):
        if self.mode == 'train':
            return self.ale.game_over() or self.life_lost
        return self.ale.game_over()

    def next(self, action):  # index of action in legalActions
        lives = self.ale.lives()  # the remaining lives
        reward = 0
        for i in range(4):  # action repeat
            reward += self.ale.act(self.legalActions[action])
            self.life_lost = (lives != self.ale.lives())  # after acting, check for life loss
            if self.mode == 'train' and self.life_lost:
                reward -= 1
            if self.isTerminal():
                break
        self.totalReward += reward
        state = self.ale.getScreenGrayscale()
        rgb_state = self.ale.getScreenRGB()
        if self.visualization:
            cv2.imshow(self.windowName, rgb_state)
            cv2.waitKey(10)
        elif self.save:
            cv2.imwrite(os.path.join('result', '%04d.png') % self.index, rgb_state)
            self.index += 1
            if self.isTerminal():
                print('Scores: %d, index: %d' % (self.totalReward, self.index))
                if self.totalReward > self.bestReward:
                    self.bestReward = self.totalReward
                    copyDir('result', 'best_result')
                self.index = 0
                self.totalReward = 0
        return cv2.resize(state, (84, 110))[26:], reward, self.isTerminal()

    def setMode(self, mode):
        self.mode = mode

    def randomStart(self, s_t):
        channels = s_t.shape[-1]
        self.start()
        for i in range(np.random.randint(channels, 30) + 1):
            s_t_plus_1, r_t, isTerminal = self.next(0)
            s_t[..., 0:channels - 1] = s_t[..., 1:channels]
            s_t[..., -1] = s_t_plus_1
            if isTerminal:
                self.start()
class Emulator:
    def __init__(self, rom_path, rom_name, visualize, actor_id, rseed,
                 single_life_episodes=False):
        self.ale = ALEInterface()

        self.ale.setInt("random_seed", rseed * (actor_id + 1))
        # For fuller control over explicit action repeat (>= ALE 0.5.0)
        self.ale.setFloat("repeat_action_probability", 0.0)
        # Disable frame_skip and color_averaging
        # See: http://is.gd/tYzVpj
        self.ale.setInt("frame_skip", 1)
        self.ale.setBool("color_averaging", False)
        self.ale.loadROM(rom_path + "/" + rom_name + ".bin")
        self.legal_actions = self.ale.getMinimalActionSet()
        self.screen_width, self.screen_height = self.ale.getScreenDims()
        #self.ale.setBool('display_screen', True)

        # Processed historical frames that will be fed in to the network
        # (i.e., four 84x84 images)
        self.screen_images_processed = np.zeros(
            (IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES))
        self.rgb_screen = np.zeros((self.screen_height, self.screen_width, 3),
                                   dtype=np.uint8)
        self.gray_screen = np.zeros((self.screen_height, self.screen_width, 1),
                                    dtype=np.uint8)

        self.frame_pool = np.empty((2, self.screen_height, self.screen_width))
        self.current = 0
        self.lives = self.ale.lives()

        self.visualize = visualize
        self.visualize_processed = False
        self.windowname = rom_name + ' ' + str(actor_id)
        if self.visualize:
            logger.debug("Opening emulator window...")
            #from skimage import io
            #io.use_plugin('qt')
            cv2.startWindowThread()
            cv2.namedWindow(self.windowname)
            logger.debug("Emulator window opened")

        if self.visualize_processed:
            logger.debug("Opening processed frame window...")
            cv2.startWindowThread()
            logger.debug("Processed frame window opened")
            cv2.namedWindow(self.windowname + "_processed")

        self.single_life_episodes = single_life_episodes

    def get_screen_image(self):
        """ Add screen (luminance) to frame pool """
        self.ale.getScreenGrayscale(self.gray_screen)
        self.ale.getScreenRGB(self.rgb_screen)
        self.frame_pool[self.current] = np.squeeze(self.gray_screen)
        self.current = (self.current + 1) % FRAMES_IN_POOL
        return self.rgb_screen

    def new_game(self):
        """ Restart game """
        self.ale.reset_game()
        self.lives = self.ale.lives()

        if MAX_START_WAIT < 0:
            logger.debug("Cannot time travel yet.")
            sys.exit()
        elif MAX_START_WAIT > 0:
            wait = random.randint(0, MAX_START_WAIT)
        else:
            wait = 0
        for _ in xrange(wait):
            self.ale.act(self.legal_actions[0])

    def process_frame_pool(self):
        """ Preprocess frame pool: blend pooled frames, resize to 84x84,
        and scale to [0, 1] """
        img = None
        if BLEND_METHOD == "max_pool":
            img = np.amax(self.frame_pool, axis=0)
        img = cv2.resize(img[:210, :], (84, 84), interpolation=cv2.INTER_LINEAR)
        img = img.astype(np.float32)
        img *= (1.0 / 255.0)
        return img

    def action_repeat(self, a):
        """ Repeat action and grab screen into frame pool """
        reward = 0
        for i in xrange(ACTION_REPEAT):
            reward += self.ale.act(self.legal_actions[a])
            new_screen_image_rgb = self.get_screen_image()
        return reward, new_screen_image_rgb

    def get_reshaped_state(self, state):
        return np.reshape(state, (1, IMG_SIZE_X, IMG_SIZE_Y, NR_IMAGES))

    def get_initial_state(self):
        """ Get the initial state """
        # NOTE: the original decremented the module-level MAX_START_WAIT
        # without a global declaration, which raises UnboundLocalError;
        # declaring it global restores the intended retry behaviour.
        global MAX_START_WAIT
        self.new_game()
        for step in xrange(NR_IMAGES):
            reward, new_screen_image_rgb = self.action_repeat(0)
            self.screen_images_processed[:, :, step] = self.process_frame_pool()
            self.show_screen(new_screen_image_rgb)
        if self.is_terminal():
            MAX_START_WAIT -= 1
            return self.get_initial_state()
        return np.copy(self.screen_images_processed)

    def next(self, action):
        """ Get the next state, reward, and game over signal """
        reward, new_screen_image_rgb = self.action_repeat(np.argmax(action))
        self.screen_images_processed[:, :, 0:3] = \
            self.screen_images_processed[:, :, 1:4]
        self.screen_images_processed[:, :, 3] = self.process_frame_pool()
        self.show_screen(new_screen_image_rgb)
        terminal = self.is_terminal()
        self.lives = self.ale.lives()
        return np.copy(self.screen_images_processed), reward, terminal

    def show_screen(self, image):
        """ Show visuals for raw and processed images """
        if self.visualize:
            cv2.imshow(self.windowname, image[:210, :])
        if self.visualize_processed:
            cv2.imshow(self.windowname + "_processed",
                       self.screen_images_processed[:, :, 3])

    def is_terminal(self):
        if self.single_life_episodes:
            return self.is_over() or (self.lives > self.ale.lives())
        else:
            return self.is_over()

    def is_over(self):
        return self.ale.game_over()
class AtariPlayer(gym.Env):
    """
    A wrapper for the ALE emulator, with configurations to mimic DeepMind DQN settings.

    Info:
        score: the accumulated reward in the current game
        gameOver: True when the current game is over
    """

    def __init__(self, rom_file, viz=0,
                 frame_skip=4, nullop_start=30,
                 live_lost_as_eoe=True, max_num_frames=0):
        """
        Args:
            rom_file: path to the rom
            frame_skip: skip every k frames and repeat the action
            viz: visualization to be done.
                Set to 0 to disable.
                Set to a positive number to be the delay between frames to show.
                Set to a string to be a directory to store frames.
            nullop_start: start with a random number of null ops.
            live_lost_as_eoe: consider the loss of a life as end of episode. Useful for training.
            max_num_frames: maximum number of frames per episode.
        """
        super(AtariPlayer, self).__init__()
        if not os.path.isfile(rom_file) and '/' not in rom_file:
            rom_file = get_dataset_path('atari_rom', rom_file)
        assert os.path.isfile(rom_file), \
            "rom {} not found. Please download at {}".format(rom_file, ROM_URL)

        try:
            ALEInterface.setLoggerMode(ALEInterface.Logger.Error)
        except AttributeError:
            if execute_only_once():
                logger.warn("You're not using latest ALE")

        # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86
        with _ALE_LOCK:
            self.ale = ALEInterface()
            self.rng = get_rng(self)
            self.ale.setInt(b"random_seed", self.rng.randint(0, 30000))
            self.ale.setInt(b"max_num_frames_per_episode", max_num_frames)
            self.ale.setBool(b"showinfo", False)

            self.ale.setInt(b"frame_skip", 1)
            self.ale.setBool(b'color_averaging', False)
            # manual.pdf suggests otherwise.
            self.ale.setFloat(b'repeat_action_probability', 0.0)

            # viz setup
            if isinstance(viz, six.string_types):
                assert os.path.isdir(viz), viz
                self.ale.setString(b'record_screen_dir', viz)
                viz = 0
            if isinstance(viz, int):
                viz = float(viz)
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))
        self.width, self.height = self.ale.getScreenDims()
        self.actions = self.ale.getMinimalActionSet()

        self.live_lost_as_eoe = live_lost_as_eoe
        self.frame_skip = frame_skip
        self.nullop_start = nullop_start

        self.action_space = spaces.Discrete(len(self.actions))
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.height, self.width), dtype=np.uint8)
        self._restart_episode()

    def get_action_meanings(self):
        return [ACTION_MEANING[i] for i in self.actions]

    def _grab_raw_image(self):
        """
        :returns: the current 3-channel image
        """
        m = self.ale.getScreenRGB()
        return m.reshape((self.height, self.width, 3))

    def _current_state(self):
        """
        :returns: a gray-scale (h, w) uint8 image
        """
        ret = self._grab_raw_image()
        # max-pooled over the last screen
        ret = np.maximum(ret, self.last_raw_screen)
        if self.viz:
            if isinstance(self.viz, float):
                cv2.imshow(self.windowname, ret)
                cv2.waitKey(int(self.viz * 1000))
        ret = ret.astype('float32')
        # weights 0.299, 0.587, 0.114 -- same as rgb2y in torch/image
        ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY)[:, :]
        return ret.astype('uint8')  # to save some memory

    def _restart_episode(self):
        with _ALE_LOCK:
            self.ale.reset_game()

        # random null-ops start
        n = self.rng.randint(self.nullop_start)
        self.last_raw_screen = self._grab_raw_image()
        for k in range(n):
            if k == n - 1:
                self.last_raw_screen = self._grab_raw_image()
            self.ale.act(0)

    def reset(self):
        if self.ale.game_over():
            self._restart_episode()
        return self._current_state()

    def render(self, *args, **kwargs):
        pass  # visualization for this env is through the viz= argument when creating the player

    def step(self, act):
        oldlives = self.ale.lives()
        r = 0
        for k in range(self.frame_skip):
            if k == self.frame_skip - 1:
                self.last_raw_screen = self._grab_raw_image()
            r += self.ale.act(self.actions[act])
            newlives = self.ale.lives()
            if self.ale.game_over() or \
                    (self.live_lost_as_eoe and newlives < oldlives):
                break

        isOver = self.ale.game_over()
        if self.live_lost_as_eoe:
            isOver = isOver or newlives < oldlives

        info = {'ale.lives': newlives}
        return self._current_state(), r, isOver, info
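# Hedged usage sketch (not from the original source): AtariPlayer follows the
# classic gym step()/reset() API, so a random rollout looks like this.
player = AtariPlayer('breakout.bin', viz=0)
obs = player.reset()
done, score = False, 0
while not done:
    act = player.action_space.sample()
    obs, rew, done, info = player.step(act)
    score += rew
print('episode score:', score, 'lives left:', info['ale.lives'])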
def main():
    if len(sys.argv) < 2:
        dir_rom = '/Users/lguan/Documents/Study/Research/Atari-2600-Roms/K-P/ms_pacman.bin'
    else:
        dir_rom = sys.argv[1]

    ale = ALEInterface()

    # Get & Set the desired settings
    ale.setInt(b'random_seed', 123)

    # Set USE_SDL to true to display the screen. ALE must be compiled
    # with SDL enabled for this to work. On OSX, pygame init is used to
    # proxy-call SDL_main.
    USE_SDL = False
    if USE_SDL:
        # mac OS
        if sys.platform == 'darwin':
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX
        elif sys.platform.startswith('linux'):
            ale.setBool('sound', True)
        ale.setBool('display_screen', True)

    # Load the ROM file
    rom_file = str.encode(dir_rom)
    print('- Loading ROM - %s' % dir_rom)
    ale.loadROM(rom_file)
    print('- Complete loading ROM')

    (game_surface_width, game_surface_height) = ale.getScreenDims()
    print("game surface width/height: " + str(game_surface_width) + "/" +
          str(game_surface_height))

    (display_width, display_height) = (800, 640)
    print('display width/height', (display_width, display_height))

    available_action = ale.getLegalActionSet()
    print(available_action)

    # init pygame
    pygame.init()
    display_screen = pygame.display.set_mode((display_width, display_height))
    pygame.display.set_caption("Arcade Learning Environment Player Agent Display")

    # init clock
    clock = pygame.time.Clock()
    is_exit = False

    # Play 10 episodes
    for episode in range(10):
        if is_exit:
            break

        total_reward = 0
        while not ale.game_over() and not is_exit:
            a = getActionFromKeyboard()
            # Apply an action and get the resulting reward
            reward = ale.act(a)
            total_reward += reward

            # clear screen
            display_screen.fill((0, 0, 0))
            # render game surface
            renderGameSurface(ale, display_screen,
                              (game_surface_width, game_surface_height))
            # display related info
            displayRelatedInfo(display_screen, a, total_reward)

            pygame.display.flip()

            # process pygame event queue
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    is_exit = True
                    break
                if event.type == pygame.KEYDOWN and event.key == pygame.K_q:
                    is_exit = True
                    break

            # delay to 60fps
            clock.tick(60.)

        print('Episode %d ended with score: %d' % (episode, total_reward))
        ale.reset_game()
class ALEEnvironment():
    def __init__(self, rom_file, args):
        self.ale = ALEInterface()
        self.histLen = 4

        if args.display_screen:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', True)

        self.ale.setInt('frame_skip', args.frame_skip)
        self.ale.setFloat('repeat_action_probability', 0.0)
        self.ale.setBool('color_averaging', args.color_averaging)

        #if args.random_seed:
        #    self.ale.setInt('random_seed', args.random_seed)
        # fix the random seed across all environments
        self.ale.setInt('random_seed', 0)

        self.ale.loadROM(rom_file)

        if args.minimal_action_set:
            self.actions = self.ale.getMinimalActionSet()
            logger.info("Using minimal action set with size %d" % len(self.actions))
        else:
            self.actions = self.ale.getLegalActionSet()
            logger.info("Using full action set with size %d" % len(self.actions))
        logger.debug("Actions: " + str(self.actions))

        self.screen_width = args.screen_width
        self.screen_height = args.screen_height

        self.mode = "train"
        self.life_lost = False
        self.initScreen = self.getScreen()
        print("size of screen is:", self.initScreen.shape)
        im = Image.fromarray(self.initScreen)
        im.save('initial_screen.jpeg')
        im = Image.open('initial_screen.jpeg')
        pix = im.load()
        # print("devil's color", pix[13, 62])
        # print("agent's color", pix[42, 33])
        # draw = ImageDraw.Draw(im)
        # draw.rectangle([(37, 29), (48, 37)], outline='red')
        # draw.rectangle([(69, 68), (73, 71)], outline='white')
        # draw.rectangle([(7, 41), (11, 45)], outline='white')
        # draw.rectangle([(11, 58), (15, 66)], outline='white')
        # draw.rectangle([(70, 20), (73, 35)], outline='white')  # right door
        # draw.rectangle([(11, 68), (15, 71)], outline='white')
        # im.save('first_subgoal_box.jpeg')
        # use this tool to get bounding boxes: http://nicodjimenez.github.io/boxLabel/annotate.html

        self.goalSet = []
        # goal 0
        self.goalSet.append([[69, 68], [73, 71]])  # Lower Right Ladder: box for detecting the first subgoal
        # self.goalSet.append([[11, 58], [15, 66]])  # lower left ladder 3
        # self.goalSet.append([[11, 68], [15, 71]])  # lower left ladder 3
        # goal 2
        self.goalSet.append([[7, 41], [11, 45]])   # Key: the second subgoal
        self.goalSet.append([[11, 68], [15, 71]])  # lower left ladder 3
        # goal 4
        self.goalSet.append([[69, 68], [73, 71]])  # Lower Right Ladder again: the third subgoal
        # goal 6
        self.goalSet.append([[70, 20], [73, 35]])  # Right Door: the 4th subgoal

        self.goalCenterLoc = []
        for goal in self.goalSet:
            goalCenter = [float(goal[0][0] + goal[1][0]) / 2,
                          float(goal[0][1] + goal[1][1]) / 2]
            self.goalCenterLoc.append(goalCenter)

        self.agentOriginLoc = [42, 33]
        self.agentLastX = 42
        self.agentLastY = 33
        self.devilLastX = 0
        self.devilLastY = 0
        self.reachedGoal = [0, 0, 0, 0, 0, 0, 0]
        self.histState = self.initializeHistState()

    def initializeHistState(self):
        histState = np.concatenate((self.getState(), self.getState()), axis=2)
        histState = np.concatenate((histState, self.getState()), axis=2)
        histState = np.concatenate((histState, self.getState()), axis=2)
        return histState

    def numActions(self):
        return len(self.actions)

    def resetGoalReach(self):
        self.reachedGoal = [0, 0, 0, 0, 0, 0, 0]

    def restart(self):
        self.ale.reset_game()
        self.life_lost = False
        self.reachedGoal = [0, 0, 0, 0, 0, 0, 0]
        for i in range(19):
            self.act(0)  # wait for initialization
        self.histState = self.initializeHistState()
        self.agentLastX = self.agentOriginLoc[0]
        self.agentLastY = self.agentOriginLoc[1]

    """
    def restart(self):
        # In test mode, the game is simply initialized. In train mode, if the
        # game is in a terminal state due to a life loss but not yet game over,
        # then only the life-loss flag is reset so that the next game starts
        # from the current state. Otherwise, the game is simply initialized.
        if (self.mode == 'test'
                or not self.life_lost   # `reset` called in the middle of an episode
                or self.ale.game_over()):  # all lives are lost
            self.ale.reset_game()
        self.life_lost = False
        self.reachedGoal = [0, 0, 0]
        for i in range(19):
            self.act(0)  # wait for initialization
        self.histState = self.initializeHistState()
        self.agentLastX = self.agentOriginLoc[0]
        self.agentLastY = self.agentOriginLoc[1]
    """

    def beginNextLife(self):
        self.life_lost = False
        self.reachedGoal = [0, 0, 0, 0, 0, 0, 0]
        for i in range(19):
            self.act(0)  # wait for initialization
        self.histState = self.initializeHistState()
        self.agentLastX = self.agentOriginLoc[0]
        self.agentLastY = self.agentOriginLoc[1]

    def act(self, action):
        lives = self.ale.lives()
        reward = self.ale.act(self.actions[action])
        self.life_lost = (not lives == self.ale.lives())
        currState = self.getState()
        self.histState = np.concatenate((self.histState[:, :, 1:], currState), axis=2)
        return reward

    def getScreen(self):
        screen = self.ale.getScreenGrayscale()
        resized = cv2.resize(screen, (self.screen_width, self.screen_height))
        return resized

    def getScreenRGB(self):
        screen = self.ale.getScreenRGB()
        resized = cv2.resize(screen, (self.screen_width, self.screen_height))
        return resized

    def getAgentLoc(self, img):
        man = [200, 72, 72]
        mask = np.zeros(np.shape(img))
        mask[:, :, 0] = man[0]
        mask[:, :, 1] = man[1]
        mask[:, :, 2] = man[2]

        diff = img - mask
        indxs = np.where(diff == 0)
        diff[np.where(diff < 0)] = 0
        diff[np.where(diff > 0)] = 0
        diff[indxs] = 255
        if np.shape(indxs[0])[0] == 0:
            mean_x = self.agentLastX
            mean_y = self.agentLastY
        else:
            mean_y = np.sum(indxs[0]) / np.shape(indxs[0])[0]
            mean_x = np.sum(indxs[1]) / np.shape(indxs[1])[0]
        self.agentLastX = mean_x
        self.agentLastY = mean_y
        return (mean_x, mean_y)

    def getDevilLoc(self, img):
        devilColor = [236, 236, 236]
        mask = np.zeros(np.shape(img))
        mask[:, :, 0] = devilColor[0]
        mask[:, :, 1] = devilColor[1]
        mask[:, :, 2] = devilColor[2]

        diff = img - mask
        indxs = np.where(diff == 0)
        diff[np.where(diff < 0)] = 0
        diff[np.where(diff > 0)] = 0
        diff[indxs] = 255
        if np.shape(indxs[0])[0] == 0:
            mean_x = self.devilLastX
            mean_y = self.devilLastY
        else:
            mean_y = np.sum(indxs[0]) / np.shape(indxs[0])[0]
            mean_x = np.sum(indxs[1]) / np.shape(indxs[1])[0]
        self.devilLastX = mean_x
        self.devilLastY = mean_y
        return (mean_x, mean_y)

    def distanceReward(self, lastGoal, goal):
        if lastGoal == -1:
            lastGoalCenter = self.agentOriginLoc
        else:
            lastGoalCenter = self.goalCenterLoc[lastGoal]
        goalCenter = self.goalCenterLoc[goal]
        # NOTE: getAgentLoc() requires the current RGB frame; the original
        # called it without arguments.
        agentX, agentY = self.getAgentLoc(self.getScreenRGB())
        dis = np.sqrt((goalCenter[0] - agentX) ** 2 + (goalCenter[1] - agentY) ** 2)
        disLast = np.sqrt((lastGoalCenter[0] - agentX) ** 2 +
                          (lastGoalCenter[1] - agentY) ** 2)
        disGoals = np.sqrt((goalCenter[0] - lastGoalCenter[0]) ** 2 +
                           (goalCenter[1] - lastGoalCenter[1]) ** 2)
        return 0.001 * (disLast - dis) / disGoals

    # add color channel for input of network
    def getState(self):
        screen = self.ale.getScreenGrayscale()
        resized = cv2.resize(screen, (self.screen_width, self.screen_height))
        return np.reshape(resized, (84, 84, 1))

    def getStackedState(self):
        return self.histState

    def isTerminal(self):
        if self.mode == 'train':
            return self.ale.game_over() or self.life_lost
        return self.ale.game_over()

    def isGameOver(self):
        return self.ale.game_over()

    def isLifeLost(self):
        return self.life_lost

    def reset(self):
        self.ale.reset_game()
        self.life_lost = False

    def goalReached(self, goal):
        # these are the original tasks where bounding boxes are used to
        # detect the location of the agent
        subset = [0, 2, 3, 4, 6]
        if goal in subset:
            goal_index = subset.index(goal)
            goalPosition = self.goalSet[goal_index]
            goalScreen = self.initScreen
            stateScreen = self.getScreen()
            count = 0
            for y in range(goalPosition[0][0], goalPosition[1][0]):
                for x in range(goalPosition[0][1], goalPosition[1][1]):
                    if goalScreen[x][y] != stateScreen[x][y]:
                        count = count + 1
            # 30 is the total number of pixels of the agent
            if float(count) / 30 > 0.3:
                self.reachedGoal[goal] = 1
                return True
        if goal == 1:
            # detect if the agent is to the left of the devil
            # return self.agent_left_devil()
            return self.detect_left_ladder()
        ############## -- DML modified -- ###########
        # if goal == 4:
        #     # detect if the agent is to the right of the devil
        #     # return self.agent_right_devil()
        #     return self.detect_right_ladder()
        ################### -- end -- ################
        if goal == 5:
            # detect if the agent is back at the original location
            return self.original_location_reached()
        return False

    def detect_right_ladder(self):
        goalPosition = self.goalSet[0]
        goalScreen = self.initScreen
        stateScreen = self.getScreen()
        count = 0
        for y in range(goalPosition[0][0], goalPosition[1][0]):
            for x in range(goalPosition[0][1], goalPosition[1][1]):
                if goalScreen[x][y] != stateScreen[x][y]:
                    count = count + 1
        # 30 is the total number of pixels of the agent
        if float(count) / 30 > 0.3:
            goal = 5
            self.reachedGoal[goal] = 1
            return True
        return False

    def detect_left_ladder(self):
        goalPosition = self.goalSet[2]
        goalScreen = self.initScreen
        stateScreen = self.getScreen()
        count = 0
        for y in range(goalPosition[0][0], goalPosition[1][0]):
            for x in range(goalPosition[0][1], goalPosition[1][1]):
                if goalScreen[x][y] != stateScreen[x][y]:
                    count = count + 1
        # 30 is the total number of pixels of the agent
        if float(count) / 30 > 0.3:
            goal = 5
            self.reachedGoal[goal] = 1
            return True
        return False

    def original_location_reached(self):
        img = self.getScreenRGB()
        (x, y) = self.getAgentLoc(img)
        if abs(x - 42) <= 2 and abs(y - 33) <= 2:
            return True
        else:
            return False

    def pause(self):
        os.system('read -s -n 1 -p "Press any key to continue...\n"')

    def agent_left_devil(self):
        img = self.ale.getScreenRGB()
        (x, y) = self.getAgentLoc(img)
        (a, b) = self.getDevilLoc(img)
        if (a - x > 40) and (abs(y - b) <= 40):
            return True
        else:
            return False

    def agent_right_devil(self):
        img = self.getScreenRGB()
        (x, y) = self.getAgentLoc(img)
        (a, b) = self.getDevilLoc(img)
        # if (x - a > 25) and (abs(y - b) <= 40):
        if (x - a > 40) and (abs(y - b) <= 40):
            return True
        else:
            return False

    def goalNotReachedBefore(self, goal):
        if self.reachedGoal[goal] == 1:
            return False
        return True
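# Hedged standalone sketch of the template-difference test that goalReached()
# and the detect_*_ladder() helpers above repeat inline (not from the original
# source; the box follows the class's [[x0, y0], [x1, y1]] convention, and the
# swapped loop indexing mirrors the original code).
def box_changed(ref_screen, cur_screen, box, agent_pixels=30, thresh=0.3):
    (x0, y0), (x1, y1) = box
    count = 0
    for col in range(x0, x1):      # the box x-range indexes screen columns
        for row in range(y0, y1):  # the box y-range indexes screen rows
            if ref_screen[row][col] != cur_screen[row][col]:
                count += 1
    # the agent covers roughly `agent_pixels` pixels; a goal box counts as
    # reached when enough of it differs from the initial screen
    return float(count) / agent_pixels > thresh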
class KungFuMaster(object):
    def __init__(self,
                 rom='/home/josema/AI/ALE/Arcade-Learning-Environment/Roms/kung_fu_master.bin',
                 trainsessionname='test'):
        self.agent = None
        self.isAuto = True
        self.gui_visible = False
        self.userquit = False
        self.optimalPolicyUser = False  # optimal policy set by user
        self.trainsessionname = trainsessionname
        self.elapsedtime = 0  # elapsed time for this experiment
        self.keys = 0

        # Configuration
        self.pause = False  # game is paused
        self.debug = False
        self.sleeptime = 0.0
        self.command = 0
        self.iteration = 0
        self.cumreward = 0
        self.cumreward100 = 0  # cumulative reward for statistics
        self.cumscore100 = 0
        self.ngoalreached = 0
        self.max_level = 1

        self.hiscore = 0
        self.hireward = -1000000
        self.resfile = open("data/" + self.trainsessionname + ".dat", "a+")
        self.legal_actions = 0
        self.rom = rom
        self.key_status = []

    def init(self, agent):  # init after creation (uses args set from cli)
        self.ale = ALEInterface()
        self.ale.setInt('random_seed', 123)
        ram_size = self.ale.getRAMSize()
        self.ram = np.zeros((ram_size), dtype=np.uint8)
        if self.gui_visible:
            os.environ['SDL_VIDEO_CENTERED'] = '1'
            if sys.platform == 'darwin':
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                pygame.init()
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', False)

        self.ale.loadROM(self.rom)
        self.legal_actions = self.ale.getLegalActionSet()

        if self.gui_visible:
            (self.screen_width, self.screen_height) = self.ale.getScreenDims()
            print("width/height: " + str(self.screen_width) + "/" + str(self.screen_height))
            (display_width, display_height) = (1024, 420)
            self.screen = pygame.display.set_mode((display_width, display_height))
            pygame.display.set_caption("Reinforcement Learning - Sapienza - Jose M Salas")
            self.numpy_surface = np.zeros(
                (self.screen_height, self.screen_width, 3), dtype=np.uint8)
            self.game_surface = pygame.Surface((self.screen_width, self.screen_height))
            pygame.display.flip()
            # init clock
            self.clock = pygame.time.Clock()

        self.agent = agent
        self.nactions = len(self.legal_actions)  # 0: not moving, 1: left, 2: right, 3: up, 4: down
        for i in range(self.nactions):
            self.key_status.append(False)
        print(self.nactions)

        # ns = 89999        # number of states if we use enemy-type RAM info without level number
        #FINAL ns = 489999  # number of states if we use enemy-type RAM info
        ns = 4899999        # number of states if we use enemy-type RAM info
        # ns = 48999
        print('Number of states: %d' % ns)
        self.agent.init(ns, self.nactions)  # 1 for RA not used here

    def initScreen(self):
        if self.gui_visible:
            if sys.platform == 'darwin':
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                pygame.init()
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', False)

            (self.screen_width, self.screen_height) = self.ale.getScreenDims()
            print("width/height: " + str(self.screen_width) + "/" + str(self.screen_height))
            (display_width, display_height) = (1024, 420)
            self.screen = pygame.display.set_mode((display_width, display_height))
            pygame.display.set_caption("Reinforcement Learning - Sapienza - Jose M Salas")
            self.numpy_surface = np.zeros(
                (self.screen_height, self.screen_width, 3), dtype=np.uint8)
            self.game_surface = pygame.Surface((self.screen_width, self.screen_height))
            pygame.display.flip()
            # init clock
            self.clock = pygame.time.Clock()

    def reset(self):
        self.pos_x = 0
        self.pos_y = 0

        # Kung Fu Master observations
        self.enemy_pos = 0
        self.n_enemies = 0
        self.my_pos = 0
        self.danger_pos = 0
        self.danger_type = 0
        self.enemy_type = 0  # 0, 1, 2, 3, 80, 81, 82, 40
        self.blocked = 0
        self.prev_blocked = 0
        self.hold_hit = 0
        self.time_left1 = 0
        self.time_left2 = 0
        self.my_energy = 39
        self.previous_my_energy = 39
        self.lifes = 3
        self.previous_lifes = 3
        self.got_hit = 0
        self.got_blocked = 0
        self.got_unblocked = 0
        self.still_blocked = False
        self.starting_pos = 0
        self.level = 1

        self.score = 0
        self.cumreward = 0
        self.cumscore = 0
        self.action_reward = 0
        # accumulate reward over all events that happened during this action
        # until the next different state
        self.current_reward = 0

        self.prev_state = None   # previous state
        self.firstAction = True  # first action of the episode
        self.finished = False    # episode finished
        self.newstate = True     # new state reached
        self.numactions = 0      # number of actions in this episode

        self.iteration += 1
        # choose greedy action selection for the entire episode
        self.agent.optimal = self.optimalPolicyUser or (self.iteration % 100) == 0

    def pair_function(self):
        # Combine the number of enemies, the player-blocked flag and the
        # danger-type information into 7 different states
        if self.n_enemies > 0:
            self.danger_type = 0
        pair = (int)((0.5 * (self.n_enemies + self.danger_type) *
                      (self.n_enemies + self.danger_type + 1) +
                      self.danger_type + 1) * (1 - (self.blocked / 128)))
        if pair > 8:
            return 5  # game not started yet
        else:
            return pair

    def enemy_type_s(self):
        if self.enemy_type > 127:
            return (self.enemy_type - 128 + 4)
        elif self.enemy_type == 64:
            return 8
        else:
            return self.enemy_type

    def getstate(self):
        x = (int)((self.level - 1) * 1000000 +
                  self.pair_function() * 100000 +
                  (self.enemy_type_s() * 10000) +
                  np.rint(self.my_pos / 32) * 1000 +
                  np.rint(self.enemy_pos / 32) * 100 +
                  np.rint(self.danger_pos / 32) * 10 +
                  np.rint(self.hold_hit / 16))
        #3FINAL x = (int)((self.enemy_type_s()*1000) + (self.level-1)*100000 + self.pair_function()*10000 + np.rint(self.enemy_pos/32)*100 + np.rint(self.danger_pos/32)*10 + np.rint(self.hold_hit/16))
        #2NO LEVEL x = (int)((self.enemy_type_s()*1000) + self.pair_function()*10000 + np.rint(self.enemy_pos/32)*100 + np.rint(self.danger_pos/32)*10 + np.rint(self.hold_hit/16))
        #1NO ENEMY TYPE x = (int)((self.level-1)*10000 + self.pair_function()*1000 + np.rint(self.enemy_pos/32)*100 + np.rint(self.danger_pos/32)*10 + np.rint(self.hold_hit/16))
        return x

    def goal_reached(self):
        #return (self.my_energy > 0 and self.time_left1 == 0 and self.time_left2 < 5)  # and self.my_energy == 39
        return (self.level == 5)

    def update(self, a):
        self.command = a

        # Update RAM
        self.ale.getRAM(self.ram)

        # Get info from RAM
        self.enemy_pos = self.ram[72]
        self.n_enemies = self.ram[91]
        self.danger_pos = self.ram[73]
        self.my_pos = self.ram[74]
        self.hold_hit = self.ram[77]
        self.enemy_type = self.ram[54]
        if self.level < self.ram[31]:
            self.starting_pos = self.ram[74]
        self.level = self.ram[31]
        self.max_level = max(self.level, self.max_level)

        # Danger/Enemy position:
        #   49  = no danger
        #   50  = danger approaching from left
        #   208 = danger approaching from right
        #   ram[96] = 6, danger comes from top
        #   ram[96] = 29, danger comes from bottom
        #   ram[96] = 188, none
        if self.ram[96] == 6:
            self.danger_type = 0
        elif self.ram[96] == 29:
            self.danger_type = 1
        else:
            self.danger_type = 2

        self.time_left1 = self.ram[27]
        self.time_left2 = self.ram[28]
        self.previous_my_energy = self.my_energy
        self.my_energy = self.ram[75]

        if (self.my_energy < self.previous_my_energy and
                not self.still_blocked and self.ram[34] == 0):
            self.got_hit = STATES['GotHit']
        else:
            self.got_hit = 0

        self.previous_lifes = self.lifes
        self.lifes = self.ram[29]
        self.prev_blocked = self.blocked
        self.blocked = self.ram[61]
        if self.blocked > self.prev_blocked and not self.still_blocked:
            self.got_blocked = STATES['GotBlocked']
            self.still_blocked = True
            self.got_unblocked = 0
        elif self.blocked < self.prev_blocked and self.still_blocked:
            self.got_unblocked = STATES['GotUnblocked']
            self.still_blocked = False
            self.got_blocked = 0
        else:
            self.got_blocked = 0
            self.got_unblocked = 0

        self.prev_state = self.getstate()  # remember previous state

        # accumulate reward over all events that happened during this action
        # until the next different state
        self.current_reward = 0
        self.numactions += 1  # total number of actions executed in this episode

        if self.firstAction:
            self.starting_pos = self.ram[74]
            self.firstAction = False
            self.current_reward = self.ale.act(a)
        else:
            self.current_reward = self.ale.act(a)

        if self.ram[34] == 0:  # only when playing
            if ((a == 3 and self.starting_pos < self.my_pos) or
                    (a == 4 and self.starting_pos > self.my_pos)):
                self.action_reward = STATES['MoveFW']
            elif ((a == 3 and self.starting_pos > self.my_pos) or
                  (a == 4 and self.starting_pos < self.my_pos)):
                self.action_reward = STATES['MoveBW']
            else:
                self.action_reward = STATES['NotMoving']

        self.score += self.current_reward
        self.current_reward += self.action_reward

        # check if the episode terminated
        if self.goal_reached():
            self.current_reward += STATES['Alive']
            self.ngoalreached += 1
            self.finished = True

        if self.ale.game_over():
            self.current_reward += STATES['Dead']
            if self.level > 1:
                print('game over in level ' + str(self.level))
            if self.my_energy > 0 and self.lifes == 3:
                print('Game over alive????')
            self.ale.reset_game()
            self.finished = True

        if self.level > 2:
            if not self.gui_visible:
                self.gui_visible = True
                self.initScreen()

    def input(self):
        self.isPressed = False
        if self.gui_visible:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    return False
                if event.type == pygame.KEYDOWN:
                    if event.key == pygame.K_SPACE:
                        self.pause = not self.pause
                        print("Game paused: ", self.pause)
                    elif event.key == pygame.K_a:
                        self.isAuto = not self.isAuto
                        self.sleeptime = int(self.isAuto) * 0.07
                    elif event.key == pygame.K_s:
                        self.sleeptime = 1.0
                        self.agent.debug = False
                    elif event.key == pygame.K_d:
                        self.sleeptime = 0.07
                        self.agent.debug = False
                    elif event.key == pygame.K_f:
                        self.sleeptime = 0.005
                        self.agent.debug = False
                    elif event.key == pygame.K_g:
                        self.sleeptime = 0.0
                        self.agent.debug = False
                    elif event.key == pygame.K_o:
                        self.optimalPolicyUser = not self.optimalPolicyUser
                        print("Best policy: ", self.optimalPolicyUser)
                    elif event.key == pygame.K_q:
                        self.userquit = True
                        print("User quit !!!")
                    else:
                        pressed = pygame.key.get_pressed()
                        self.keys = 0
                        self.keys |= pressed[pygame.K_UP]
                        self.keys |= pressed[pygame.K_DOWN] << 1
                        self.keys |= pressed[pygame.K_LEFT] << 2
                        self.keys |= pressed[pygame.K_RIGHT] << 3
                        self.keys |= pressed[pygame.K_z] << 4
                        self.command = key_action_tform_table[self.keys]
                        self.key_status[self.command] = True
                if event.type == pygame.KEYUP:
                    pressed = pygame.key.get_pressed()
                    self.keys = 0
                    self.keys |= pressed[pygame.K_UP]
                    self.keys |= pressed[pygame.K_DOWN] << 1
                    self.keys |= pressed[pygame.K_LEFT] << 2
                    self.keys |= pressed[pygame.K_RIGHT] << 3
                    self.keys |= pressed[pygame.K_z] << 4
                    self.command = key_action_tform_table[self.keys]
                    self.key_status[self.command] = False
                    if not (True in self.key_status):
                        self.command = 0
        return True

    def getUserAction(self):
        return self.command

    def getreward(self):
        r = (np.rint(self.current_reward) + self.got_hit + self.got_blocked +
             self.got_unblocked - np.rint(self.blocked / 128))
        self.cumreward += r
        return r

    def print_report(self, printall=False):
        toprint = printall
        ch = ' '
        if self.agent.optimal:
            ch = '*'
            toprint = True

        s = 'Iter %6d, sc: %3d, l: %d, na: %4d, r: %5d %c' % (
            self.iteration, self.score, self.level, self.numactions,
            self.cumreward, ch)

        if self.score > self.hiscore:
            self.hiscore = self.score
            s += ' HISCORE '
            toprint = True
        if self.cumreward > self.hireward:
            self.hireward = self.cumreward
            s += ' HIREWARD '
            toprint = True

        if toprint:
            print(s)

        self.cumreward100 += self.cumreward
        self.cumscore100 += self.score
        numiter = 100
        if self.iteration % numiter == 0:
            #self.doSave()
            pgoal = float(self.ngoalreached * 100) / numiter
            print('-' * 118)
            print("%s %6d avg last 100: reward %d | score %.2f | level %d | p goals %.1f %%" %
                  (self.trainsessionname, self.iteration, self.cumreward100 / 100,
                   float(self.cumscore100) / 100, self.max_level, pgoal))
            print('-' * 118)
            self.cumreward100 = 0
            self.cumscore100 = 0
            self.ngoalreached = 0

        sys.stdout.flush()
        self.resfile.write("%d,%d,%d,%d\n" % (self.score, self.cumreward,
                                              self.goal_reached(), self.numactions))
        self.resfile.flush()

    def draw(self):
        if self.gui_visible:
            self.screen.fill((0, 0, 0))
            self.ale.getScreenRGB(self.numpy_surface)
            pygame.surfarray.blit_array(
                self.game_surface, np.transpose(self.numpy_surface, (1, 0, 2)))
            self.screen.blit(
                pygame.transform.scale2x(
                    pygame.transform.scale(
                        self.game_surface,
                        (self.screen_height, self.screen_height))), (0, 0))

            # Display RAM bytes
            font = pygame.font.SysFont("Ubuntu Mono", 32)
            text = font.render("RAM: ", 1, (255, 208, 208))
            self.screen.blit(text, (430, 10))

            font = pygame.font.SysFont("Ubuntu Mono", 25)
            height = font.get_height() * 1.2

            line_pos = 40
            ram_pos = 0
            while ram_pos < 128:
                ram_string = ''.join(["%02X " % self.ram[x]
                                      for x in range(ram_pos, min(ram_pos + 16, 128))])
                text = font.render(ram_string, 1, (255, 255, 255))
                self.screen.blit(text, (440, line_pos))
                line_pos += height
                ram_pos += 16

            # display current action
            font = pygame.font.SysFont("Ubuntu Mono", 32)
            text = font.render("Current Action: " + str(self.command), 1, (208, 208, 255))
            height = font.get_height() * 1.2
            self.screen.blit(text, (430, line_pos))
            line_pos += height

            # display reward
            font = pygame.font.SysFont("Ubuntu Mono", 30)
            text = font.render("Total Reward: " + str(self.cumreward), 1, (208, 255, 255))
            self.screen.blit(text, (430, line_pos))

            pygame.display.flip()
        else:
            return 0

    def quit(self):
        self.resfile.close()
        pygame.quit()
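# The pair_function above builds on the Cantor pairing function,
# pi(k1, k2) = (k1 + k2)(k1 + k2 + 1)/2 + k2, which maps a pair of small
# non-negative integers to a unique integer (pair_function then adds 1 and a
# factor for the blocked flag). A hedged standalone sketch:
def cantor_pair(k1, k2):
    return (k1 + k2) * (k1 + k2 + 1) // 2 + k2

assert cantor_pair(1, 2) == 8  # e.g., one enemy paired with danger_type 2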
class ALEEnvironment(Environment):
    def __init__(self, rom_file, args):
        from ale_python_interface import ALEInterface
        self.ale = ALEInterface()
        if args.display_screen:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', True)

        self.ale.setInt('frame_skip', args.frame_skip)
        self.ale.setFloat('repeat_action_probability', args.repeat_action_probability)
        self.ale.setBool('color_averaging', args.color_averaging)

        if args.random_seed:
            self.ale.setInt('random_seed', args.random_seed)

        if args.record_screen_path:
            if not os.path.exists(args.record_screen_path):
                logger.info("Creating folder %s" % args.record_screen_path)
                os.makedirs(args.record_screen_path)
            logger.info("Recording screens to %s", args.record_screen_path)
            self.ale.setString('record_screen_dir', args.record_screen_path)

        if args.record_sound_filename:
            logger.info("Recording sound to %s", args.record_sound_filename)
            self.ale.setBool('sound', True)
            self.ale.setString('record_sound_filename', args.record_sound_filename)

        self.ale.loadROM(rom_file)

        if args.minimal_action_set:
            self.actions = self.ale.getMinimalActionSet()
            logger.info("Using minimal action set with size %d" % len(self.actions))
        else:
            self.actions = self.ale.getLegalActionSet()
            logger.info("Using full action set with size %d" % len(self.actions))
        logger.debug("Actions: " + str(self.actions))

        self.screen_width = args.screen_width
        self.screen_height = args.screen_height

    def numActions(self):
        return len(self.actions)

    def restart(self):
        self.ale.reset_game()

    def act(self, action):
        reward = self.ale.act(self.actions[action])
        return reward

    def getScreen(self):
        screen = self.ale.getScreenGrayscale()
        resized = cv2.resize(screen, (self.screen_width, self.screen_height))
        return resized

    def isTerminal(self):
        return self.ale.game_over()
def train_agent(gamepath, agent, n_episodes, display_screen, record_weights,
                reduce_exploration_prob_amount, n_frames_to_skip):
    """
    :description: trains an agent to play a game

    :type gamepath: string
    :param gamepath: path to the binary of the game to be played

    :type agent: subclass of RLAlgorithm
    :param agent: the algorithm/agent that learns to play the game

    :type n_episodes: int
    :param n_episodes: number of episodes of the game on which to train
    """
    # load the ALE interface to interact with
    ale = ALEInterface()
    ale.setInt('random_seed', 42)

    # display/recording settings, doesn't seem to work currently
    recordings_dir = './recordings/breakout/'
    # previously "USE_SDL"
    if display_screen:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX
            #ale.setString("record_screen_dir", recordings_dir)
        elif sys.platform.startswith('linux'):
            ale.setBool('sound', True)
        ale.setBool('display_screen', True)

    ale.loadROM(gamepath)
    ale.setInt("frame_skip", n_frames_to_skip)

    screen_preprocessor = screen_utils.RGBScreenPreprocessor()

    rewards = []
    best_reward = 0
    print('starting training...')
    for episode in xrange(n_episodes):
        action = 0
        reward = 0
        newAction = None
        total_reward = 0
        counter = 0
        lives = ale.lives()

        screen = np.zeros((32, 32, 3), dtype=np.int8)
        state = {"screen": screen,
                 "objects": None,
                 "prev_objects": None,
                 "prev_action": 0,
                 "action": 0}

        while not ale.game_over():
            # if newAction is None then we are training an off-policy algorithm;
            # otherwise, we are training an on-policy algorithm
            if newAction is None:
                action = agent.getAction(state)
            else:
                action = newAction

            reward += ale.act(action)
            if ale.lives() < lives:
                lives = ale.lives()
                reward -= 1
            total_reward += reward

            new_screen = ale.getScreenRGB()
            new_screen = screen_preprocessor.preprocess(new_screen)
            new_state = {"screen": new_screen,
                         "objects": None,
                         "prev_objects": state["objects"],
                         "prev_action": state["action"],
                         "action": action}
            newAction = agent.incorporateFeedback(state, action, reward, new_state)

            state = new_state
            reward = 0

        rewards.append(total_reward)

        if total_reward > best_reward and record_weights:
            best_reward = total_reward
            print("Best reward: {}".format(total_reward))

        if episode % PRINT_TRAINING_INFO_PERIOD == 0:
            print('\n############################')
            print('### training information ###')
            print("Average reward: {}".format(np.mean(rewards)))
            print("Last 50: {}".format(np.mean(rewards[-NUM_EPISODES_AVERAGE_REWARD_OVER:])))
            print("Exploration probability: {}".format(agent.explorationProb))
            print('action: {}'.format(action))
            print('size of weights dict: {}'.format(len(agent.weights)))
            print('current objects: {}'.format(state['objects']))
            print('previous objects: {}'.format(state['prev_objects']))
            avg_feat_weight = np.mean([v for k, v in agent.weights.iteritems()])
            print('average feature weight: {}'.format(avg_feat_weight))
            print('############################')
            print('############################\n')

        if episode != 0 and episode % RECORD_WEIGHTS_PERIOD == 0 and record_weights:
            file_utils.save_rewards(
                rewards,
                filename='episode-{}-{}-rewards'.format(episode, type(agent).__name__))
            file_utils.save_weights(
                agent.weights,
                filename='episode-{}-{}-weights'.format(episode, type(agent).__name__))

        if agent.explorationProb > MINIMUM_EXPLORATION_EPSILON:
            agent.explorationProb -= reduce_exploration_prob_amount

        print('episode: {} ended with score: {}'.format(episode, total_reward))
        ale.reset_game()
    return rewards
class ALE(Environment): """ Arcade Learning Environment (ALE). https://github.com/mgbellemare/Arcade-Learning-Environment """ def __init__(self, rom, frame_skip=1, repeat_action_probability=0.0, loss_of_life_termination=False, loss_of_life_reward=0, display_screen=False, seed=np.random.RandomState()): """ Initialize ALE. Args: rom: Rom filename and directory. frame_skip: Repeat action for n frames. Default 1. repeat_action_probability: Repeats last action with given probability. Default 0. loss_of_life_termination: Signals a terminal state on loss of life. Default False. loss_of_life_reward: Reward/Penalty on loss of life (negative values are a penalty). Default 0. display_screen: Displays the emulator screen. Default False. seed: numpy RandomState used to draw ALE's random seed """ self.ale = ALEInterface() self.rom = rom self.ale.setBool(b'display_screen', display_screen) self.ale.setInt(b'random_seed', seed.randint(0, 9999)) self.ale.setFloat(b'repeat_action_probability', repeat_action_probability) self.ale.setBool(b'color_averaging', False) self.ale.setInt(b'frame_skip', frame_skip) # All set commands must be done before loading the ROM. self.ale.loadROM(rom.encode()) # Setup gamescreen object. width, height = self.ale.getScreenDims() self.gamescreen = np.empty((height, width, 3), dtype=np.uint8) self.frame_skip = frame_skip # Setup action converter. # ALE returns legal action indexes, convert these to just numbers. self.action_inds = self.ale.getMinimalActionSet() # Setup lives self.loss_of_life_reward = loss_of_life_reward self.cur_lives = self.ale.lives() self.loss_of_life_termination = loss_of_life_termination self.life_lost = False def __str__(self): return 'ALE({})'.format(self.rom) def close(self): self.ale = None def reset(self): self.ale.reset_game() self.cur_lives = self.ale.lives() self.life_lost = False # Clear gamescreen. self.gamescreen = np.zeros(self.gamescreen.shape, dtype=np.uint8) return self.current_state def execute(self, action): # Convert action to ale action. ale_action = self.action_inds[action] # Get reward and process terminal & next state. rew = self.ale.act(ale_action) if self.loss_of_life_termination or self.loss_of_life_reward != 0: new_lives = self.ale.lives() if new_lives < self.cur_lives: self.cur_lives = new_lives self.life_lost = True rew += self.loss_of_life_reward terminal = self.is_terminal state_tp1 = self.current_state return state_tp1, terminal, rew @property def states(self): return dict(shape=self.gamescreen.shape, type=float) @property def actions(self): return dict(type='int', num_actions=len(self.action_inds), names=self.action_names) @property def current_state(self): self.gamescreen = self.ale.getScreenRGB(self.gamescreen) return np.copy(self.gamescreen) @property def is_terminal(self): if self.loss_of_life_termination and self.life_lost: return True else: return self.ale.game_over() @property def action_names(self): action_names = [ 'No-Op', 'Fire', 'Up', 'Right', 'Left', 'Down', 'Up Right', 'Up Left', 'Down Right', 'Down Left', 'Up Fire', 'Right Fire', 'Left Fire', 'Down Fire', 'Up Right Fire', 'Up Left Fire', 'Down Right Fire', 'Down Left Fire' ] return np.asarray(action_names)[self.action_inds]
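# A minimal driver sketch for the ALE wrapper above, assuming a local Breakout
# ROM; `execute` takes an index into the minimal action set and returns
# (next_state, terminal, reward) as defined in the class.
import numpy as np

env = ALE('roms/breakout.bin', frame_skip=4, loss_of_life_termination=True,
          loss_of_life_reward=-1, seed=np.random.RandomState(0))
state = env.reset()
terminal = False
while not terminal:
    action = np.random.randint(env.actions['num_actions'])  # random policy
    state, terminal, reward = env.execute(action)
env.close()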
def train(gamepath, n_episodes, display_screen, record_weights, reduce_exploration_prob_amount, n_frames_to_skip, exploration_prob, verbose, discount, learning_rate, load_weights, frozen_target_update_period, use_replay_mem): """ :description: trains an agent to play a game :type gamepath: string :param gamepath: path to the binary of the game to be played :type n_episodes: int :param n_episodes: number of episodes of the game on which to train display_screen : whether or not to display the screen of the game record_weights : whether or not to save the weights of the network reduce_exploration_prob_amount : amount to reduce exploration prob each episode; to leave exploration_prob unchanged, set to 0 n_frames_to_skip : how frequently to determine a new action to use exploration_prob : probability of choosing a random action verbose : whether or not to print information about the run periodically discount : discount factor used in learning learning_rate : the scaling factor for the sgd update load_weights : whether or not to load weights for the network (set the files directly below) frozen_target_update_period : the number of episodes between resetting the target of the network """ # load the ale interface to interact with ale = ALEInterface() ale.setInt('random_seed', 42) # ALE reads options when the ROM is loaded, so frame_skip must be set before loadROM ale.setInt("frame_skip", n_frames_to_skip) # display/recording settings, doesn't seem to work currently recordings_dir = './recordings/breakout/' # previously "USE_SDL" if display_screen: if sys.platform == 'darwin': import pygame pygame.init() ale.setBool('sound', False) # Sound doesn't work on OSX #ale.setString("record_screen_dir", recordings_dir); elif sys.platform.startswith('linux'): ale.setBool('sound', True) ale.setBool('display_screen', True) ale.loadROM(gamepath) # real actions for breakout are [0,1,3,4] real_actions = ale.getMinimalActionSet() # use a list of actions [0,1,2,3] to index into the array of real actions actions = np.arange(len(real_actions)) # these theano variables are used to define the symbolic input of the network features = T.dvector('features') action = T.lscalar('action') reward = T.dscalar('reward') next_features = T.dvector('next_features') # load weights by file name # currently must be loaded by individual hidden layers if load_weights: hidden_layer_1 = file_utils.load_model('weights/hidden0_replay.pkl') hidden_layer_2 = file_utils.load_model('weights/hidden1_replay.pkl') # only the first two hidden layers are checkpointed, so the third is freshly initialized hidden_layer_3 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, n_hid=len(actions), layer_name='hidden3', activation='relu') else: # defining the hidden layer network structure # the n_hid of a prior layer must equal the n_vis of a subsequent layer # for q-learning the output layer must be of len(actions) hidden_layer_1 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, n_hid=NNET_INPUT_DIMENSION, layer_name='hidden1', activation='relu') hidden_layer_2 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, n_hid=NNET_INPUT_DIMENSION, layer_name='hidden2', activation='relu') hidden_layer_3 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, n_hid=len(actions), layer_name='hidden3', activation='relu') # the output layer is currently necessary when using tanh units in the # hidden layer in order to prevent a theano warning # currently the relu unit setting of the hidden and output layers is leaky w/ alpha=0.01 output_layer = OutputLayer(layer_name='output', activation='relu') # pass a list of layers to the constructor of the network (here called "mlp") layers = [hidden_layer_1, hidden_layer_2, hidden_layer_3, output_layer] qnetwork = QNetwork(layers, discount=discount, learning_rate=learning_rate) # this call gets the symbolic output of the network # along with the parameter updates
loss, updates = qnetwork.get_loss_and_updates(features, action, reward, next_features) # this defines the theano symbolic function used to train the network # 1st argument is a list of inputs, here the symbolic variables above # 2nd argument is the symbolic output expected # 3rd argument is the dictionary of parameter updates # 4th argument is the compilation mode train_model = theano.function( [theano.Param(features, default=np.zeros(NNET_INPUT_DIMENSION)), theano.Param(action, default=0), theano.Param(reward, default=0), theano.Param(next_features, default=np.zeros(NNET_INPUT_DIMENSION))], outputs=loss, updates=updates, mode='FAST_RUN') sym_action = qnetwork.get_action(features) get_action = theano.function([features], sym_action) # some containers for collecting information about the training process rewards = [] losses = [] best_reward = 4 sequence_examples = [] sampled_examples = [] # the preprocessor and feature extractor to use preprocessor = screen_utils.RGBScreenPreprocessor() feature_extractor = feature_extractors.NNetOpenCVBoundingBoxExtractor(max_features=MAX_FEATURES) if use_replay_mem: replay_mem = ReplayMemory() # main training loop, each episode is a full playthrough of the game for episode in xrange(n_episodes): # this implements the frozen target component of the network # by setting the frozen layers of the network to a copy of the current layers if episode % frozen_target_update_period == 0: qnetwork.frozen_layers = copy.deepcopy(qnetwork.layers) # some variables for collecting information about this particular run of the game total_reward = 0 action = 1 counter = 0 reward = 0 loss = 0 previous_param_0 = None # lives is used for a reward heuristic: subtract 1 from the reward whenever a life is lost
lives = ale.lives() # the initial screen and state screen = np.zeros((preprocessor.dim, preprocessor.dim, preprocessor.channels)) state = { "screen" : screen, "objects" : None, "prev_objects": None, "features": np.zeros(MAX_FEATURES)} # start the actual play through of the game while not ale.game_over(): counter += 1 # get the current features, which is the representation of the state provided to # the "agent" (here just the network directly) features = state["features"] # epsilon greedy action selection (note that exploration_prob is reduced by # reduce_exploration_prob_amount after every game) if random.random() < exploration_prob: action = random.choice(actions) else: # to choose an action from the network, we fprop # the current state and take the argmax of the output # layer (i.e., the action that corresponds to the # maximum q value) action = get_action(features) # take the action and receive the reward reward += ale.act(real_actions[action]) # reward heuristic: penalize the loss of a life (remove this block if it proves unhelpful) if ale.lives() < lives: lives = ale.lives() reward -= 1 # get the next screen, preprocess it, initialize the next state next_screen = ale.getScreenRGB() next_screen = preprocessor.preprocess(next_screen) next_state = {"screen": next_screen, "objects": None, "prev_objects": state["objects"]} # get the features for the next state next_features = feature_extractor(next_state, action=None) if use_replay_mem: sars_tuple = (features, action, reward, next_features) replay_mem.store(sars_tuple) num_samples = 5 if replay_mem.isFull() else 1 for i in range(0, num_samples): random_train_tuple = replay_mem.sample() loss += train_model(*random_train_tuple) # collect for pca (concatenate, rather than numerically add, the feature vectors) sequence_examples.append(list(sars_tuple[0]) + [sars_tuple[1]] + [sars_tuple[2]] + list(sars_tuple[3])) sequence_examples = sequence_examples[-100:] sampled_examples.append(list(random_train_tuple[0]) + [random_train_tuple[1]] + [random_train_tuple[2]] + list(random_train_tuple[3])) sampled_examples = sampled_examples[-100:] else: # call the train model function loss += train_model(features, action, reward, next_features) # prepare for the next loop through the game next_state["features"] = next_features state = next_state # weird counter value to avoid interaction with any other counter # loop that might be added, not necessary right now if verbose and counter % PRINT_TRAINING_INFO_PERIOD == 0: print('*' * 15 + ' training information ' + '*' * 15) print('episode: {}'.format(episode)) print('reward: \t{}'.format(reward)) print('avg reward: \t{}'.format(np.mean(rewards))) print('avg reward (last 25): \t{}'.format(np.mean(rewards[-NUM_EPISODES_AVERAGE_REWARD_OVER:]))) print('action: \t{}'.format(real_actions[action])) print('exploration prob: {}'.format(exploration_prob)) param_info = [(p.eval(), p.name) for p in qnetwork.get_params()] for index, (val, name) in enumerate(param_info): if previous_param_0 is None and index == 0: previous_param_0 = val print('parameter {} value: \n{}'.format(name, val)) if index == 0: diff = val - previous_param_0 print('difference from previous param {}: \n{}'.format(name, diff)) print('features: \t{}'.format(features)) print('next_features: \t{}'.format(next_features)) scaled_sequence = preprocessing.scale(np.array(sequence_examples)) scaled_sampled = preprocessing.scale(np.array(sampled_examples)) pca = PCA() _ = pca.fit_transform(scaled_sequence) print('variance explained by first component for sequence: {}%'.format(pca.explained_variance_ratio_[0] * 100))
_ = pca.fit_transform(scaled_sampled) print('variance explained by first component for sampled: {}%'.format(pca.explained_variance_ratio_[0] * 100)) print('*' * 52) print('\n') # collect info and total reward and also reset the reward to 0 if we reach this point total_reward += reward reward = 0 # collect stats from this game run losses.append(loss) rewards.append(total_reward) # if we got a best reward, inform the user if total_reward > best_reward: best_reward = total_reward print("best reward!: {}".format(total_reward)) # record the weights if record_weights=True # must record the weights of the individual layers # only save hidden layers b/c output layer does not have weights if episode != 0 and episode % RECORD_WEIGHTS_PERIOD == 0 and record_weights: file_utils.save_rewards(rewards) file_utils.save_model(qnetwork.layers[0], 'weights/hidden0_{}.pkl'.format(episode)) file_utils.save_model(qnetwork.layers[1], 'weights/hidden1_{}.pkl'.format(episode)) # reduce exploration policy over time if exploration_prob > MINIMUM_EXPLORATION_EPSILON: exploration_prob -= reduce_exploration_prob_amount # inform user of how the episode went and reset the game print('episode: {} ended with score: {}\tloss: {}'.format(episode, rewards[-1], losses[-1])) ale.reset_game() # return the list of rewards attained return rewards
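# A hypothetical invocation of the train() loop above; every hyper-parameter
# value and the ROM path are illustrative assumptions, not recommendations
# taken from the source.
rewards = train(gamepath='roms/breakout.bin', n_episodes=10000,
                display_screen=False, record_weights=True,
                reduce_exploration_prob_amount=1e-5, n_frames_to_skip=4,
                exploration_prob=1.0, verbose=True, discount=0.99,
                learning_rate=0.001, load_weights=False,
                frozen_target_update_period=50, use_replay_mem=True)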
class GameState(object): def __init__(self, rand_seed, display=False, no_op_max=7): self.ale = ALEInterface() self.ale.setInt('random_seed', rand_seed) self._no_op_max = no_op_max if display: self._setup_display() self.ale.loadROM(ROM) # collect minimal action set self.real_actions = self.ale.getMinimalActionSet() # height=210, width=160 self._screen = np.empty((210, 160, 1), dtype=np.uint8) self.reset() def _process_frame(self, action, reshape): reward = self.ale.act(action) terminal = self.ale.game_over() # screen shape is (210, 160, 1) self.ale.getScreenGrayscale(self._screen) # reshape it into (210, 160) reshaped_screen = np.reshape(self._screen, (210, 160)) # resize to height=110, width=84 resized_screen = cv2.resize(reshaped_screen, (84, 110)) x_t = resized_screen[18:102,:] if reshape: x_t = np.reshape(x_t, (84, 84, 1)) x_t = x_t.astype(np.float32) x_t *= (1.0/255.0) return reward, terminal, x_t def _setup_display(self): if sys.platform == 'darwin': import pygame pygame.init() self.ale.setBool('sound', False) elif sys.platform.startswith('linux'): self.ale.setBool('sound', True) self.ale.setBool('display_screen', True) def reset(self): self.ale.reset_game() # randomize initial state if self._no_op_max > 0: no_op = np.random.randint(0, self._no_op_max + 1) for _ in range(no_op): self.ale.act(0) _, _, x_t = self._process_frame(0, False) self.reward = 0 self.terminal = False self.s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2) def process(self, action): # convert original 18 action index to minimal action set index real_action = self.real_actions[action] r, t, x_t1 = self._process_frame(real_action, True) self.reward = r self.terminal = t self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2) def update(self): self.s_t = self.s_t1
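# Sketch of the A3C-style interaction loop GameState above is built for; it
# assumes the module-level ROM constant points at a valid ROM file. `process`
# advances the emulator and fills s_t1; `update` commits s_t1 to s_t.
game = GameState(rand_seed=113)
for _ in range(1000):
    action = 0  # index into game.real_actions; fixed NOOP for illustration
    game.process(action)
    if game.terminal:
        game.reset()
    else:
        game.update()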
class ALE(object): def __init__(self, init_seed, init_rand): self.ale = ALEInterface() self.ale.setInt(b'random_seed', init_seed) self.ale.setBool(b'display_screen', False) self.ale.setBool(b'sound', False) self.ale.setFloat(b'repeat_action_probability', 0.0) self.ale.loadROM('./breakout.bin') self.action_size = 4 self._screen = None self.reward = 0 self.terminal = True self.init_rand = init_rand #def setSetting(self, action_repeat, random_init_step, screen_type): def setSetting(self, action_repeat, screen_type): self.action_repeat = action_repeat self.screen_type = screen_type #self.random_init_step = random_init_step def _step(self, action): # map action index 2 to ALE action 4 (LEFT); with 0=NOOP, 1=FIRE and 3=RIGHT this covers Breakout's minimal action set [0, 1, 3, 4] if action == 2: action = 4 self.reward = self.ale.act(action) self.terminal = self.ale.game_over() if self.screen_type == 0: self._screen = self.ale.getScreenRGB() elif self.screen_type == 1: self._screen = self.ale.getScreenGrayscale() else: sys.stderr.write('screen_type error!') exit() def state(self): return self.reward, self.screen, self.terminal def act(self, action): cumulated_reward = 0 for _ in range(self.action_repeat): self._step(action) cumulated_reward += self.reward if self.terminal: break self.reward = cumulated_reward return self.state() def train_act(self, action): cumulated_reward = 0 for _ in range(self.action_repeat): self._step(action) cumulated_reward += self.reward if self.terminal: break self.reward = cumulated_reward return (self._screen, self.state()) def new_game(self): if self.ale.game_over(): self.ale.reset_game() if self.screen_type == 0: self._screen = self.ale.getScreenRGB() elif self.screen_type == 1: self._screen = self.ale.getScreenGrayscale() else: sys.stderr.write('screen_type error!') exit() self._step(0) #for _ in range(random.randint(0, self.random_init_step - 1)): for _ in range(self.init_rand): self._step(0) return self.screen @property def screen(self): return cv2.resize( cv2.cvtColor(self._screen, cv2.COLOR_RGB2GRAY) / 255., (84, 84))
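# Usage sketch for the Breakout-specific ALE wrapper above; `setSetting` must
# run before `act`, since `act` reads action_repeat and screen_type.
env = ALE(init_seed=1, init_rand=10)
env.setSetting(action_repeat=4, screen_type=0)  # 0 keeps an RGB internal buffer
screen = env.new_game()
terminal = False
while not terminal:
    reward, screen, terminal = env.act(0)  # action indices 0..3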
class ALEEnvironment(Environment): def __init__(self, rom_file, args): from ale_python_interface import ALEInterface self.ale = ALEInterface() if args.display_screen: if sys.platform == 'darwin': import pygame pygame.init() self.ale.setBool('sound', False) # Sound doesn't work on OSX elif sys.platform.startswith('linux'): self.ale.setBool('sound', True) self.ale.setBool('display_screen', True) self.ale.setInt('frame_skip', args.frame_skip) self.ale.setFloat('repeat_action_probability', args.repeat_action_probability) self.ale.setBool('color_averaging', args.color_averaging) if args.random_seed: self.ale.setInt('random_seed', args.random_seed) if args.record_screen_path: if not os.path.exists(args.record_screen_path): logger.info("Creating folder %s" % args.record_screen_path) os.makedirs(args.record_screen_path) logger.info("Recording screens to %s", args.record_screen_path) self.ale.setString('record_screen_dir', args.record_screen_path) if args.record_sound_filename: logger.info("Recording sound to %s", args.record_sound_filename) self.ale.setBool('sound', True) self.ale.setString('record_sound_filename', args.record_sound_filename) self.ale.loadROM(rom_file) if args.minimal_action_set: self.actions = self.ale.getMinimalActionSet() logger.info("Using minimal action set with size %d" % len(self.actions)) else: self.actions = self.ale.getLegalActionSet() logger.info("Using full action set with size %d" % len(self.actions)) logger.debug("Actions: " + str(self.actions)) self.screen_width = args.screen_width self.screen_height = args.screen_height self.life_lost = False self.mode = 'train' # default mode; callers switch this to 'test' for evaluation (restart() and isTerminal() read it) def numActions(self): return len(self.actions) def restart(self): # In test mode, the game is simply initialized. In train mode, if the game # is in terminal state due to a life loss but not yet game over, then only # life loss flag is reset so that the next game starts from the current # state. Otherwise, the game is simply initialized. if (self.mode == 'test' or not self.life_lost or # `reset` called in a middle of episode self.ale.game_over() # all lives are lost ): self.ale.reset_game() self.life_lost = False def act(self, action): lives = self.ale.lives() reward = self.ale.act(self.actions[action]) self.life_lost = (not lives == self.ale.lives()) return reward def getScreen(self): screen = self.ale.getScreenGrayscale() resized = cv2.resize(screen, (self.screen_width, self.screen_height)) return resized def isTerminal(self): if self.mode == 'train': return self.ale.game_over() or self.life_lost return self.ale.game_over()
class ALE_Environment(EnvironmentBase): """ Environment Specifications: Number of Actions = 18 Original Frame Dimensions = 210 x 160 Frame Dimensions = 84 x 84 Frame Data Type = np.uint8 Reward = Game Score Summary Name: frames_per_episode """ def __init__(self, config, games_directory=None, rom_filename=None, summary=None): super().__init__() """ Parameters: Name: Type Default: Description(omitted when self-explanatory): display_screen bool False Display game screen agent_render bool False Display current frame the way the agent sees it frame_skip int 4 See ALE Documentation repeat_action_probability float 0.25 in [0,1], see ALE Documentation max_num_frames int 18000 Max number of frames per episode color_averaging bool True If true, it averages over the skipped frames. Otherwise, it takes the maximum over the skipped frames. frame_stack int 4 Stack of frames for agent, see Mnih et al. (2015) save_summary bool False Save the summary of the environment """ assert isinstance(config, Config) self.display_screen = check_attribute_else_default( config, 'display_screen', False) self.agent_render = check_attribute_else_default( config, 'agent_render', False) self.frame_skip = check_attribute_else_default(config, 'frame_skip', 4) self.repeat_action_probability = check_attribute_else_default( config, 'repeat_action_probability', 0.25) max_num_frames = check_attribute_else_default(config, 'max_num_frames', 18000) self.color_averaging = check_attribute_else_default( config, 'color_averaging', True) if self.color_averaging: self.aggregate_func = np.average else: self.aggregate_func = np.amax self.frame_stack = check_attribute_else_default( config, 'frame_stack', 4) self.save_summary = check_attribute_else_default( config, 'save_summary', False) if self.save_summary: assert isinstance(summary, dict) self.summary = summary check_dict_else_default(self.summary, "frames_per_episode", []) " Environment variables" self.env = ALEInterface() self.env.setInt(b'frame_skip', 1) self.env.setInt(b'random_seed', 0) self.env.setFloat(b'repeat_action_probability', 0) self.env.setInt(b"max_num_frames_per_episode", max_num_frames) self.env.setBool(b"color_averaging", False) self.env.setBool(b'display_screen', self.display_screen) self.rom_file = str.encode(games_directory + rom_filename) self.frame_count = 0 " Loading ROM " self.env.loadROM(self.rom_file) """ Fixed Parameters: Frame Format: "NCHW" (batch_size, channels, height, width). Decided to adopt this format because it's the fastest to process in tensorflow with a gpu. Frame Height and Width: 84, the default value in the literature.
""" " Inner state of the environment " self.height = 84 self.width = 84 self.current_state = np.zeros( [self.frame_stack, self.height, self.width], dtype=np.uint8) self.original_height = 210 self.original_width = 160 self.history = np.zeros( [self.frame_skip, self.original_height, self.original_width], np.uint8) self.reset() self.observations_dimensions = self.current_state.shape self.frame_dims = self.current_state[0].shape self.actions = self.env.getLegalActionSet() self.previous_action = 0 def reset(self): if self.save_summary and (self.frame_count != 0): self.summary['frames_per_episode'].append(self.frame_count) self.env.reset_game() self.frame_count = 0 original_frame = np.squeeze(self.env.getScreenGrayscale()) self.history[-1] = original_frame fixed_state = self.fix_state() self.current_state[-1] = fixed_state self.previous_action = 0 # self.agent_state_display() # For debugging purposes def add_frame(self, frame): self.current_state[:-1] = self.current_state[1:] self.current_state[-1] = frame def update(self, action): reward = 0 for _ in range(self.frame_skip): if not self.env.game_over(): p = np.random.rand() current_action = self.previous_action if p <= self.repeat_action_probability else action reward += self.env.act(current_action) self.history[:-1] = self.history[1:] self.history[-1] = np.squeeze(self.env.getScreenGrayscale()) self.frame_count += 1 new_frame = self.fix_state() self.add_frame(new_frame) terminal = self.env.game_over() self.previous_action = action # self.agent_state_display() # For debugging purposes only return self.current_state, reward, terminal def fix_state(self): agg_state = self.aggregate_func(self.history, axis=0) fixed_agg_state = resize(agg_state, (self.height, self.width), mode='constant', preserve_range=True) fixed_agg_state = np.array(fixed_agg_state, dtype=np.uint8) return fixed_agg_state def agent_state_display(self): if self.agent_render: state = self.current_state[-1] plt.imshow(state) plt.pause(0.05) " Getters " def get_current_state(self): return self.current_state def get_state_for_er_buffer(self): return self.current_state[-1] def get_num_actions(self): return 18 " Setters " def set_render(self, display_screen=False): self.env.setBool(b'display_screen', display_screen) self.env.loadROM(self.rom_file)
class Agent(object): def __init__(self): self._ale = ALEInterface() self._ale.setInt('random_seed', 123) self._ale.setFloat('repeat_action_probability', 0.0) self._ale.setBool('color_averaging', False) self._ale.loadROM('roms/enduro.bin') self._controller = Controller(self._ale) self._extractor = StateExtractor(self._ale) self._image = None self._speed_range = 50 def run(self, learn, episodes=1, draw=False): """ Implements the playing/learning loop. Args: learn(bool): Whether the self.learn() function should be called. episodes (int): The number of episodes to run the agent for. draw (bool): Whether to overlay the environment state on the frame. Returns: None """ for e in range(episodes): self._relative_speed = -self._speed_range # Observe the environment to set the initial state (road, cars, grid, self._image) = self._extractor.run(draw=draw, scale=4.0) self.initialise(road, cars, self._relative_speed, grid) num_frames = self._ale.getFrameNumber() # Each episode lasts 6500 frames while self._ale.getFrameNumber() - num_frames < 6500: # Take an action self.act() # Update the environment grid (road, cars, grid, self._image) = self._extractor.run(draw=draw, scale=4.0) if self.collision(cars): self._relative_speed = -self._speed_range self.sense(road, cars, self._relative_speed, grid) # Perform learning if required if learn: self.learn() self.callback(learn, e + 1, self._ale.getFrameNumber() - num_frames) self._ale.reset_game() def collision(self, cars): if not cars['others']: return False x, y, _, _ = cars['self'] min_dist = sys.float_info.max min_angle = 0. for c in cars['others']: cx, cy, _, _ = c dist = np.sqrt((cx - x)**2 + (cy - y)**2) if dist < min_dist: min_dist = dist min_angle = np.arctan2(y - cy, cx - x) return min_dist < 18. and 0.1 * np.pi < min_angle and min_angle < 0.9 * np.pi def getActionsSet(self): """ Returns the set of all possible actions """ return [Action.ACCELERATE, Action.RIGHT, Action.LEFT, Action.BRAKE] def move(self, action): """ Executes the action and advances the game to the next state. Args: action (int): The action which should be executed. Make sure to use the constants returned by self.getActionsSet() Returns: int: The obtained reward after executing the action """ if action == Action.ACCELERATE: self._relative_speed = min(self._relative_speed + 1, self._speed_range) elif action == Action.BRAKE: self._relative_speed = max(self._relative_speed - 1, -self._speed_range) return self._controller.move(action) def initialise(self, road, cars, speed, grid): """ Called at the beginning of each episode, mainly used for state initialisation. For more information on the arguments have a look at the README.md Args: road: 2-dimensional array containing [x, y] points in pixel coordinates of the road grid cars: dictionary which contains the location and the size of the agent and the opponents in pixel coordinates speed: the relative speed of the agent with respect to the others grid: 2-dimensional numpy array containing the latest grid representation of the environment Returns: None """ raise NotImplementedError def act(self): """ Called at each loop iteration to choose and execute an action. Returns: None """ raise NotImplementedError def sense(self, road, cars, speed, grid): """ Called at each loop iteration to construct the new state from the updated environment grid.
For more information on the arguments have a look at the README.md Args: road: 2-dimensional array containing [x, y] points in pixel coordinates of the road grid cars: dictionary which contains the location and the size of the agent and the opponents in pixel coordinates speed: the relative speed of the agent with respect to the others grid: 2-dimensional numpy array containing the latest grid representation of the environment Returns: None """ raise NotImplementedError def learn(self): """ Called at each loop iteration when the agent is learning. It should implement the learning procedure. Returns: None """ raise NotImplementedError def callback(self, learn, episode, iteration): """ Called at each loop iteration mainly for reporting purposes. Args: learn (bool): Indicates whether the agent is learning or not. episode (int): The number of the current episode. iteration (int): The number of the current iteration. Returns: None """ raise NotImplementedError
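# A minimal concrete subclass of the Enduro Agent above: it wires random
# actions through the abstract hooks, purely to show the expected control flow
# (initialise -> act -> sense -> learn -> callback per iteration). It assumes
# the Controller/StateExtractor helpers and the Enduro ROM are available.
import random

class RandomAgent(Agent):
    def initialise(self, road, cars, speed, grid):
        self.total_reward = 0
    def act(self):
        # move() both executes the action and returns the obtained reward
        self.total_reward += self.move(random.choice(self.getActionsSet()))
    def sense(self, road, cars, speed, grid):
        pass
    def learn(self):
        pass
    def callback(self, learn, episode, iteration):
        print('episode {}, iteration {}, reward {}'.format(episode, iteration, self.total_reward))

RandomAgent().run(learn=False, episodes=1, draw=False)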
class AtariPlayer(RLEnvironment): """ A wrapper for atari emulator. NOTE: will automatically restart when a real episode ends """ def __init__(self, rom_file, viz=0, height_range=(None,None), frame_skip=4, image_shape=(84, 84), nullop_start=30, live_lost_as_eoe=True): """ :param rom_file: path to the rom :param frame_skip: skip every k frames and repeat the action :param image_shape: (w, h) :param height_range: (h1, h2) to cut :param viz: visualization to be done. Set to 0 to disable. Set to a positive number to be the delay between frames to show. Set to a string to be a directory to store frames. :param nullop_start: start with random number of null ops :param live_lost_as_eoe: consider loss of lives as end of episode. useful for training. """ super(AtariPlayer, self).__init__() if not os.path.isfile(rom_file) and '/' not in rom_file: rom_file = get_dataset_dir('atari_rom', rom_file) assert os.path.isfile(rom_file), \ "rom {} not found. Please download at {}".format(rom_file, ROM_URL) try: ALEInterface.setLoggerMode(ALEInterface.Logger.Warning) except AttributeError: log_once() # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86 with _ALE_LOCK: self.ale = ALEInterface() self.rng = get_rng(self) self.ale.setInt(b"random_seed", self.rng.randint(0, 10000)) self.ale.setBool(b"showinfo", False) self.ale.setInt(b"frame_skip", 1) self.ale.setBool(b'color_averaging', False) # manual.pdf suggests otherwise. self.ale.setFloat(b'repeat_action_probability', 0.0) # viz setup if isinstance(viz, six.string_types): assert os.path.isdir(viz), viz self.ale.setString(b'record_screen_dir', viz) viz = 0 if isinstance(viz, int): viz = float(viz) self.viz = viz if self.viz and isinstance(self.viz, float): self.windowname = os.path.basename(rom_file) cv2.startWindowThread() cv2.namedWindow(self.windowname) self.ale.loadROM(rom_file.encode('utf-8')) self.width, self.height = self.ale.getScreenDims() self.actions = self.ale.getMinimalActionSet() self.live_lost_as_eoe = live_lost_as_eoe self.frame_skip = frame_skip self.nullop_start = nullop_start self.height_range = height_range self.image_shape = image_shape self.current_episode_score = StatCounter() self.restart_episode() def _grab_raw_image(self): """ :returns: the current 3-channel image """ m = self.ale.getScreenRGB() return m.reshape((self.height, self.width, 3)) def current_state(self): """ :returns: a gray-scale (h, w, 1) float32 image """ ret = self._grab_raw_image() # max-pooled over the last screen ret = np.maximum(ret, self.last_raw_screen) if self.viz: if isinstance(self.viz, float): #m = cv2.resize(ret, (1920,1200)) cv2.imshow(self.windowname, ret) time.sleep(self.viz) ret = ret[self.height_range[0]:self.height_range[1],:].astype('float32') # 0.299, 0.587, 0.114, same as rgb2y in torch/image
ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY) ret = cv2.resize(ret, self.image_shape) ret = np.expand_dims(ret, axis=2) return ret def get_action_space(self): return DiscreteActionSpace(len(self.actions)) def restart_episode(self): if self.current_episode_score.count > 0: self.stats['score'].append(self.current_episode_score.sum) self.current_episode_score.reset() self.ale.reset_game() # random null-ops start n = self.rng.randint(self.nullop_start) self.last_raw_screen = self._grab_raw_image() for k in range(n): if k == n - 1: self.last_raw_screen = self._grab_raw_image() self.ale.act(0) def action(self, act): """ :param act: an index of the action :returns: (reward, isOver) """ oldlives = self.ale.lives() r = 0 for k in range(self.frame_skip): if k == self.frame_skip - 1: self.last_raw_screen = self._grab_raw_image() r += self.ale.act(self.actions[act]) newlives = self.ale.lives() if self.ale.game_over() or \ (self.live_lost_as_eoe and newlives < oldlives): break self.current_episode_score.feed(r) isOver = self.ale.game_over() if isOver: self.restart_episode() if self.live_lost_as_eoe: isOver = isOver or newlives < oldlives return (r, isOver)
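# Usage sketch for AtariPlayer above, assuming a locally available ROM file;
# `action` takes an index into the minimal action set and returns
# (reward, isOver), restarting internally when the real episode ends.
player = AtariPlayer('breakout.bin', viz=0, frame_skip=4,
                     image_shape=(84, 84), live_lost_as_eoe=True)
for _ in range(100):
    state = player.current_state()  # (84, 84, 1) float32 grayscale frame
    reward, is_over = player.action(0)  # NOOP index, for illustration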
class AtariGame(Task): ''' RL task based on Arcade Game. ''' def __init__(self, rom_path, num_frames=4, live=False, skip_frame=0, mode='normal'): self.ale = ALEInterface() if live: USE_SDL = True if USE_SDL: if sys.platform == 'darwin': import pygame pygame.init() self.ale.setBool('sound', False) # Sound doesn't work on OSX elif sys.platform.startswith('linux'): self.ale.setBool('sound', True) self.ale.setBool('display_screen', True) self.mode = mode self.live = live self.ale.loadROM(rom_path) self.num_frames = num_frames self.frames = [] self.frame_id = 0 self.cum_reward = 0 self.skip_frame = skip_frame if mode == 'small': img = T.matrix('img') self.max_pool = theano.function([img], max_pool_2d(img, [4, 4])) self.img_shape = (16, 16) else: self.img_shape = (84, 84) # image shape according to DQN Nature paper. while len(self.frames) < self.num_frames: self.step(choice(self.valid_actions, 1)[0]) self.reset() def copy(self): import dill as pickle return pickle.loads(pickle.dumps(self)) def reset(self): self.ale.reset_game() self.frame_id = 0 self.cum_reward = 0 if self.skip_frame: for frame_i in range(self.skip_frame): self.step(choice(self.valid_actions, 1)[0]) @property def _curr_frame(self): img = self.ale.getScreenRGB() img = rgb2yuv(img)[:, :, 0] # get Y channel, according to Nature paper. # print 'RAM', self.ale.getRAM() if self.mode == 'small': img = self.max_pool(img) img = imresize(img, self.img_shape, interp='bicubic') return img @property def curr_state(self): ''' return raw pixels. ''' return np.array(self.frames, dtype=floatX) / floatX(255.) # normalize @property def state_shape(self): return self.curr_state.shape @property def num_actions(self): return len(self.valid_actions) @property def valid_actions(self): return self.ale.getLegalActionSet() def step(self, action): reward = self.ale.act(action) if len(self.frames) == self.num_frames: self.frames = self.frames[1:] self.frames.append(self._curr_frame) self.frame_id += 1 #print 'frame_id', self.frame_id self.cum_reward += reward return reward # TODO: scale the gradient up. def is_end(self): if np.abs(self.cum_reward) > 0: return True return self.ale.game_over() def visualize(self, fig=1, fname=None, format='png'): import matplotlib.pyplot as plt fig = plt.figure(fig, figsize=(5,5)) plt.clf() plt.axis('off') #res = plt.imshow(self.ale.getScreenRGB()) res = plt.imshow(self._curr_frame, interpolation='none') if fname: plt.savefig(fname, format=format) else: plt.show() return res
class AtariEnvironment: def __init__(self, args, outputDir): self.outputDir = outputDir self.screenCaptureFrequency = args.screen_capture_freq self.ale = ALEInterface() self.ale.setInt(b'random_seed', 123456) random.seed(123456) # Fix https://groups.google.com/forum/#!topic/deep-q-learning/p4FAIaabwlo self.ale.setFloat(b'repeat_action_probability', 0.0) # Load the ROM file self.ale.loadROM(args.rom) self.actionSet = self.ale.getMinimalActionSet() self.gameNumber = 0 self.stepNumber = 0 self.resetGame() def getNumActions(self): return len(self.actionSet) def getState(self): return self.state def getGameNumber(self): return self.gameNumber def getFrameNumber(self): return self.ale.getFrameNumber() def getEpisodeFrameNumber(self): return self.ale.getEpisodeFrameNumber() def getEpisodeStepNumber(self): return self.episodeStepNumber def getStepNumber(self): return self.stepNumber def getGameScore(self): return self.gameScore def isGameOver(self): return self.ale.game_over() def step(self, action): previousLives = self.ale.lives() reward = 0 isTerminal = 0 self.stepNumber += 1 self.episodeStepNumber += 1 for i in range(4): prevScreenRGB = self.ale.getScreenRGB() reward += self.ale.act(self.actionSet[action]) screenRGB = self.ale.getScreenRGB() # Detect end of episode, I don't think I'm handling this right in terms # of the overall game loop (??) if self.ale.lives() < previousLives or self.ale.game_over(): isTerminal = 1 break if self.gameNumber % self.screenCaptureFrequency == 0: capture_dir = self.outputDir + '/screen_cap/game-%06d' % (self.gameNumber) if not os.path.isdir(capture_dir): os.makedirs(capture_dir) self.ale.saveScreenPNG(capture_dir + '/frame-%06d.png' % (self.getEpisodeFrameNumber())) maxedScreen = np.maximum(screenRGB, prevScreenRGB) self.state = self.state.stateByAddingScreen(maxedScreen, self.ale.getFrameNumber()) self.gameScore += reward return reward, self.state, isTerminal def resetGame(self): if self.ale.game_over(): self.gameNumber += 1 self.ale.reset_game() self.state = State().stateByAddingScreen(self.ale.getScreenRGB(), self.ale.getFrameNumber()) self.gameScore = 0 self.episodeStepNumber = 0 # environment steps vs ALE frames. Will probably be 4*frame number
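# Hypothetical driver for AtariEnvironment above. The `args` namespace mirrors
# the two attributes the constructor reads; the ROM path, output directory and
# NOOP policy are illustrative, and `State` is the snippet's own frame-stacking
# helper used internally by step().
from types import SimpleNamespace

args = SimpleNamespace(rom=b'breakout.bin', screen_capture_freq=100)
env = AtariEnvironment(args, outputDir='./out')
while not env.isGameOver():
    reward, state, is_terminal = env.step(0)
print('score: {}'.format(env.getGameScore()))
env.resetGame()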
trainStart = False episode = 0 t0 = time.time() cumulativeReward = 0.0 cost_average = 0.0 moving_average = 0.0 lastMemoryPointerPosition = 0 frameCount = 0 frameCountLast = frameCount terminal = 0 testFlag = False ale.reset_game() trainThread = threading.Thread(target=train) trainThread.start() # for frameCount in xrange(maxFrame): life0 = ale.lives() # rate = explorationRate if not testFlag else testExplorationRate # perceive if np.random.rand(1) > explorationRate: actionIndex = forward(memory.History,sess,Q_train) # actionIndex = np.argmax(sess.run(Q_train.y, feed_dict={Q_train.x_image: [memory.History]}),axis=1)
def main(): pygame.init() ale = ALEInterface() ale.setInt(b'random_seed', 123) ale.setBool(b'display_screen', True) ale.setInt(b'frame_skip', 4) # ale.setFloat(b'repeat_action_probability', .7) # ale.setBool(b'color_averaging', True) game = 'breakout' #ACKTR tasks#, 'space_invaders', 'seaquest', 'qbert', 'pong', 'beam_rider', 'breakout' rom = home + '/Documents/ALE/roms/supported/' + game + '.bin' ale.loadROM(str.encode(rom)) legal_actions = ale.getLegalActionSet() rewards, num_episodes = [], 5 config = [] agent = DQN_agent(config) for episode in range(num_episodes): total_reward = 0 exp_state = [] exp_action = 0 exp_reward = 0 exp_next_state = [] while not ale.game_over(): # Save frame frame = ale.getScreenGrayscale() frame = cv2.resize(frame, (84, 84)) exp_next_state.append(frame) # Make action action = random.choice(legal_actions) reward = ale.act(action) total_reward += reward exp_reward += reward # Make experience if len(exp_next_state) == 4: state_ready = np.reshape(np.stack(exp_next_state), [4 * 84, 84]) # cv2.imshow('image',state_ready) # cv2.waitKey(0) exp_action = action if len(exp_state) == 0: exp_state = exp_next_state else: experience = [ exp_state, exp_action, exp_reward, exp_next_state ] exp_reward = 0 exp_state = exp_next_state exp_next_state = [] print('Episode %d reward %d.' % (episode, total_reward)) rewards.append(total_reward) ale.reset_game() average = sum(rewards) / len(rewards) print('Average for %d episodes: %d' % (num_episodes, average))
class Environment: """Epoch-based training and evaluation harness around ALEInterface.""" BUFFER_LEN = 2 EPISODE_FRAMES = 18000 EPOCH_COUNT = 200 EPOCH_STEPS = 250000 EVAL_EPS = 0.001 FRAMES_SKIP = 4 FRAME_HEIGHT = 84 FRAME_WIDTH = 84 MAX_NO_OP = 30 MAX_REWARD = 1 def __init__(self, rom_name, rng, display_screen = False): self.api = ALEInterface() self.api.setInt('random_seed', rng.randint(333)) self.api.setBool('display_screen', display_screen) self.api.setFloat('repeat_action_probability', 0.0) self.rom_name = rom_name self.display_screen = display_screen self.rng = rng self.repeat = Environment.FRAMES_SKIP self.buffer_len = Environment.BUFFER_LEN self.height = Environment.FRAME_HEIGHT self.width = Environment.FRAME_WIDTH self.episode_steps = Environment.EPISODE_FRAMES / Environment.FRAMES_SKIP self.merge_id = 0 self.max_reward = Environment.MAX_REWARD self.eval_eps = Environment.EVAL_EPS self.log_dir = '' self.network_dir = '' self.api.loadROM('../rom/' + self.rom_name) self.minimal_actions = self.api.getMinimalActionSet() original_width, original_height = self.api.getScreenDims() self.merge_frame = np.zeros((self.buffer_len , original_height , original_width) , dtype = np.uint8) def get_action_count(self): return len(self.minimal_actions) def train(self, agent, store_freq, folder = None, start_epoch = 0): self._open_log_files(agent, folder) obs = np.zeros((self.height, self.width), dtype = np.uint8) epoch_count = Environment.EPOCH_COUNT for epoch in xrange(start_epoch, epoch_count): self.need_reset = True steps_left = Environment.EPOCH_STEPS print "\n" + "=" * 50 print "Epoch #%d" % (epoch + 1) episode = 0 train_start = time.time() while steps_left > 0: num_step, _ = self._run_episode(agent, steps_left, obs) steps_left -= num_step episode += 1 if steps_left == 0 or episode % 10 == 0: print "Finished episode #%d, steps_left = %d" \ % (episode, steps_left) train_end = time.time() valid_values = agent.get_validate_values() eval_values = self.evaluate(agent) test_end = time.time() train_time = train_end - train_start test_time = test_end - train_end step_per_sec = Environment.EPOCH_STEPS * 1.
/ max(1, train_time) print "\tFinished epoch #%d, episode trained = %d\n" \ "\tValidate values = %.3f, evaluate reward = %.3f\n"\ "\tTrain time = %.0fs, test time = %.0fs, steps/sec = %.4f" \ % (epoch + 1, episode, valid_values, eval_values\ , train_time, test_time, step_per_sec) self._update_log_files(agent, epoch + 1, episode , valid_values, eval_values , train_time, test_time , step_per_sec, store_freq) gc.collect() def evaluate(self, agent, episodes = 30, obs = None): print "\n***Start evaluating" if obs is None: obs = np.zeros((self.height, self.width), dtype = np.uint8) sum_reward = 0.0 sum_step = 0.0 for episode in xrange(episodes): self.need_reset = True step, reward = self._run_episode(agent, self.episode_steps, obs , self.eval_eps, evaluating = True) sum_reward += reward sum_step += step print "Finished episode %d, reward = %d, step = %d" \ % (episode + 1, reward, step) self.need_reset = True print "Average reward per episode = %.4f" % (sum_reward / episodes) print "Average step per episode = %.4f" % (sum_step / episodes) return sum_reward / episodes def _prepare_game(self): if self.need_reset or self.api.game_over(): self.api.reset_game() self.need_reset = False if Environment.MAX_NO_OP > 0: num_no_op = self.rng.randint(Environment.MAX_NO_OP + 1) \ + self.buffer_len for _ in xrange(num_no_op): self.api.act(0) for _ in xrange(self.buffer_len): self._update_buffer() def _run_episode(self, agent, steps_left, obs , eps = 0.0, evaluating = False): self._prepare_game() start_lives = self.api.lives() step_count = 0 sum_reward = 0 is_terminal = False while step_count < steps_left and not is_terminal: self._get_screen(obs) action_id, _ = agent.get_action(obs, eps, evaluating) reward = self._repeat_action(self.minimal_actions[action_id]) reward_clip = reward if self.max_reward > 0: reward_clip = np.clip(reward, -self.max_reward, self.max_reward) life_lost = not evaluating and self.api.lives() < start_lives is_terminal = self.api.game_over() or life_lost \ or step_count + 1 >= steps_left agent.add_experience(obs, is_terminal, action_id, reward_clip , evaluating) sum_reward += reward step_count += 1 return step_count, sum_reward def _update_buffer(self): self.api.getScreenGrayscale(self.merge_frame[self.merge_id, ...]) self.merge_id = (self.merge_id + 1) % self.buffer_len def _repeat_action(self, action): reward = 0 for i in xrange(self.repeat): reward += self.api.act(action) if i + self.buffer_len >= self.repeat: self._update_buffer() return reward def _get_screen(self, resized_frame): self._resize_frame(self.merge_frame.max(axis = 0), resized_frame) def _resize_frame(self, src_frame, dst_frame): cv2.resize(src = src_frame, dst = dst_frame, dsize = (self.width, self.height), interpolation = cv2.INTER_LINEAR) def _open_log_files(self, agent, folder): time_str = time.strftime("_%m-%d-%H-%M", time.localtime()) base_rom_name = os.path.splitext(os.path.basename(self.rom_name))[0] if folder is not None: self.log_dir = folder self.network_dir = self.log_dir + '/network' else: self.log_dir = '../run_results/' + base_rom_name + time_str self.network_dir = self.log_dir + '/network' info_name = get_next_name(self.log_dir, 'info', 'txt') git_name = get_next_name(self.log_dir, 'git-diff', '') try: os.stat(self.log_dir) except OSError: os.makedirs(self.log_dir) try: os.stat(self.network_dir) except OSError: os.makedirs(self.network_dir) with open(os.path.join(self.log_dir, info_name), 'w') as f: f.write('Commit: ' + subprocess.check_output(['git', 'rev-parse' , 'HEAD'])) f.write('Run command: ') 
f.write(' '.join(pipes.quote(x) for x in sys.argv)) f.write('\n\n') f.write(agent.get_info()) write_info(f, Environment) write_info(f, agent.__class__) write_info(f, agent.network.__class__) # From https://github.com/spragunr/deep_q_rl/pull/49/files with open(os.path.join(self.log_dir, git_name), 'w') as f: f.write(subprocess.check_output(['git', 'diff', 'HEAD'])) if folder is not None: return with open(os.path.join(self.log_dir, 'results.csv'), 'w') as f: f.write("epoch,episode_train,validate_values,evaluate_reward"\ ",train_time,test_time,steps_per_second\n") mem = psutil.virtual_memory() with open(os.path.join(self.log_dir, 'memory.csv'), 'w') as f: f.write("epoch,available,free,buffers,cached"\ ",available_readable,used_percent\n") f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \ (0, mem.available, mem.free, mem.buffers, mem.cached , bytes2human(mem.available), mem.percent)) def _update_log_files(self, agent, epoch, episode, valid_values , eval_values, train_time, test_time, step_per_sec , store_freq): print "Updating log files" with open(self.log_dir + '/results.csv', 'a') as f: f.write("%d,%d,%.4f,%.4f,%d,%d,%.4f\n" % \ (epoch, episode, valid_values, eval_values , train_time, test_time, step_per_sec)) mem = psutil.virtual_memory() with open(self.log_dir + '/memory.csv', 'a') as f: f.write("%d,%d,%d,%d,%d,%s,%.1f\n" % \ (epoch, mem.available, mem.free, mem.buffers, mem.cached , bytes2human(mem.available), mem.percent)) agent.dump_network(self.network_dir + ('/%03d' % (epoch)) + '.npz') if (store_freq >= 0 and epoch >= Environment.EPOCH_COUNT) or \ (store_freq > 0 and (epoch % store_freq == 0)): agent.dump_exp(self.network_dir + '/exp.npz') def _setup_record(self, network_file): file_name, _ = os.path.splitext(os.path.basename(network_file)) time_str = time.strftime("_%m-%d-%H-%M", time.localtime()) img_dir = os.path.dirname(network_file) + '/images_' \ + file_name + time_str rom_name, _ = os.path.splitext(self.rom_name) out_name = os.path.dirname(network_file) + '/' + rom_name + '_' \ + file_name + time_str + '.mov' print out_name try: os.stat(img_dir) except OSError: os.makedirs(img_dir) self.api.setString('record_screen_dir', img_dir) self.api.loadROM('../rom/' + self.rom_name) return img_dir, out_name def record_run(self, agent, network_file, episode_id = 1): if episode_id > 1: self.evaluate(agent, episode_id - 1) system_state = self.api.cloneSystemState() img_dir, out_name = self._setup_record(network_file) if episode_id > 1: self.api.restoreSystemState(system_state) self.evaluate(agent, 1) script = \ """ { ffmpeg -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s } || { avconv -r 60 -i %s/%%06d.png -f mov -c:v libx264 %s } """ % (img_dir, out_name, img_dir, out_name) os.system(script)
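# Sketch of driving the epoch-based Environment above. `DQNAgent` is a
# hypothetical stand-in for an agent exposing the methods this class calls
# (get_action, add_experience, get_validate_values, dump_network, dump_exp,
# get_info); no such class is defined in the source.
import numpy as np

rng = np.random.RandomState(333)
env = Environment('breakout.bin', rng, display_screen=False)
agent = DQNAgent(num_actions=env.get_action_count(), rng=rng)  # hypothetical agent
env.train(agent, store_freq=10)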
class AtariEmulator: def __init__(self, args): ''' Initialize Atari environment ''' # Parameters self.buffer_length = args.buffer_length self.screen_dims = args.screen_dims self.frame_skip = args.frame_skip self.blend_method = args.blend_method self.reward_processing = args.reward_processing self.max_start_wait = args.max_start_wait self.history_length = args.history_length self.start_frames_needed = self.buffer_length - 1 + ((args.history_length - 1) * self.frame_skip) #Initialize ALE instance self.ale = ALEInterface() self.ale.setFloat(b'repeat_action_probability', 0.0) if args.watch: self.ale.setBool(b'sound', True) self.ale.setBool(b'display_screen', True) self.ale.loadROM(str.encode(args.rom_path + '/' + args.game + '.bin')) self.buffer = np.empty((self.buffer_length, 210, 160)) self.current = 0 self.action_set = self.ale.getMinimalActionSet() self.lives = self.ale.lives() self.reset() def get_possible_actions(self): ''' Return list of possible actions for game ''' return self.action_set def get_screen(self): ''' Add screen to frame buffer ''' self.buffer[self.current] = np.squeeze(self.ale.getScreenGrayscale()) self.current = (self.current + 1) % self.buffer_length def reset(self): self.ale.reset_game() self.lives = self.ale.lives() if self.max_start_wait < 0: print("ERROR: max start wait decreased beyond 0") sys.exit() elif self.max_start_wait <= self.start_frames_needed: wait = 0 else: wait = random.randint(0, self.max_start_wait - self.start_frames_needed) for _ in range(wait): self.ale.act(self.action_set[0]) # Fill frame buffer self.get_screen() for _ in range(self.buffer_length - 1): self.ale.act(self.action_set[0]) self.get_screen() # get initial_states state = [(self.preprocess(), 0, 0, False, 0)] # (frame, action, reward, terminal, raw_reward), matching run_step for step in range(self.history_length - 1): state.append(self.run_step(0)) # make sure agent hasn't died yet if self.isTerminal(): print("Agent lost during start wait. Decreasing max_start_wait by 1") self.max_start_wait -= 1 return self.reset() return state def run_step(self, action): ''' Apply action to game and return (frame, action, reward, terminal, raw_reward) ''' raw_reward = 0 for step in range(self.frame_skip): raw_reward += self.ale.act(self.action_set[action]) self.get_screen() reward = None if self.reward_processing == 'clip': reward = np.clip(raw_reward, -1, 1) else: reward = raw_reward terminal = self.isTerminal() self.lives = self.ale.lives() return (self.preprocess(), action, reward, terminal, raw_reward) def preprocess(self): ''' Preprocess frame for agent ''' img = None if self.blend_method == "max": img = np.amax(self.buffer, axis=0) return cv2.resize(img, self.screen_dims, interpolation=cv2.INTER_LINEAR) def isTerminal(self): return (self.isGameOver() or (self.lives > self.ale.lives())) def isGameOver(self): return self.ale.game_over()
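# Hypothetical args namespace for AtariEmulator above, mirroring the attributes
# its constructor reads; all values and the ROM location are illustrative.
from types import SimpleNamespace

args = SimpleNamespace(buffer_length=2, screen_dims=(84, 84), frame_skip=4,
                       blend_method='max', reward_processing='clip',
                       max_start_wait=30, history_length=4, watch=False,
                       rom_path='./roms', game='breakout')
emulator = AtariEmulator(args)
initial_state = emulator.reset()
frame, action, reward, terminal, raw_reward = emulator.run_step(0)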
class MyEnv(Environment): VALIDATION_MODE = 0 def __init__(self, rng, rom="ale/breakout.bin", frame_skip=4, ale_options=[{"key": "random_seed", "value": 0}, {"key": "color_averaging", "value": True}, {"key": "repeat_action_probability", "value": 0.}]): self._mode = -1 self._modeScore = 0.0 self._modeEpisodeCount = 0 self._frameSkip = frame_skip if frame_skip >= 1 else 1 self._randomState = rng self._ale = ALEInterface() for option in ale_options: t = type(option["value"]) if t is int: self._ale.setInt(option["key"], option["value"]) elif t is float: self._ale.setFloat(option["key"], option["value"]) elif t is bool: self._ale.setBool(option["key"], option["value"]) else: raise ValueError("Option {} ({}) is not an int, bool or float.".format(option["key"], t)) self._ale.loadROM(rom) w, h = self._ale.getScreenDims() self._screen = np.empty((h, w), dtype=np.uint8) self._reducedScreen = np.empty((84, 84), dtype=np.uint8) self._actions = self._ale.getMinimalActionSet() def reset(self, mode): if mode == MyEnv.VALIDATION_MODE: if self._mode != MyEnv.VALIDATION_MODE: self._mode = MyEnv.VALIDATION_MODE self._modeScore = 0.0 self._modeEpisodeCount = 0 else: self._modeEpisodeCount += 1 elif self._mode != -1: # and thus mode == -1 self._mode = -1 self._ale.reset_game() for _ in range(self._randomState.randint(15)): self._ale.act(0) self._ale.getScreenGrayscale(self._screen) cv2.resize(self._screen, (84, 84), self._reducedScreen, interpolation=cv2.INTER_NEAREST) return [4 * [84 * [84 * [0]]]] def act(self, action): action = self._actions[action] reward = 0 for _ in range(self._frameSkip): reward += self._ale.act(action) if self.inTerminalState(): break self._ale.getScreenGrayscale(self._screen) cv2.resize(self._screen, (84, 84), self._reducedScreen, interpolation=cv2.INTER_NEAREST) self._modeScore += reward return np.sign(reward) def summarizePerformance(self, test_data_set): if self.inTerminalState() == False: self._modeEpisodeCount += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._modeScore / self._modeEpisodeCount, self._modeEpisodeCount)) def inputDimensions(self): return [(4, 84, 84)] def observationType(self, subject): return np.uint8 def nActions(self): return len(self._actions) def observe(self): return [np.array(self._reducedScreen)] def inTerminalState(self): return self._ale.game_over()
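# Usage sketch for MyEnv above; actions are indices into the minimal action
# set, `act` returns the sign-clipped reward, and mode -1 selects training.
import numpy as np

env = MyEnv(np.random.RandomState(0), rom='ale/breakout.bin', frame_skip=4)
env.reset(mode=-1)
while not env.inTerminalState():
    clipped_reward = env.act(0)
frame = env.observe()[0]  # 84x84 uint8 frame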
class agent(object):
    def __init__(self):
        self.ale = ALEInterface()

        # Get & Set the desired settings
        self.ale.setInt('random_seed', 123)

        # Set USE_SDL to true to display the screen. ALE must be compiled
        # with SDL enabled for this to work. On OSX, pygame init is used to
        # proxy-call SDL_main.
        USE_SDL = False
        if USE_SDL:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', True)

        # Load the ROM file
        self.ale.loadROM("ms_pacman.bin")

        # Persistent state:
        self.tetas = []
        # Q: a table of action values indexed by state and action, initially zero
        self.Q = self.txtToMap('qvalues.txt')
        # N: a table of frequencies for state-action pairs, initially zero
        self.N = self.txtToMap('nvalues.txt')
        # The previous state, action, and reward, initially null
        self.s = None
        self.a = None
        self.r = 0
        self.actions = self.ale.getMinimalActionSet()
        print(self.actions)

    def Q_LEARNING_AGENT(self, state, reward):
        if self.ale.game_over():
            self.updateQ(self.s, None, reward)
        if self.s is not None:
            self.incrementN(self.s, self.a)
            val = self.computeNewQ(self.s, self.a, self.r, state)
            self.updateQ(self.s, self.a, val)
        self.s = state
        self.a = self.chooseAct(state)
        self.r = reward
        return self.a

    def computeNewQ(self, s, a, reward, state):
        qsa = self.getQ(s, a)
        maxQ = self.getQ(state, self.actions[0])
        for act in self.actions:
            val = self.getQ(state, act)
            if val > maxQ:
                maxQ = val
        n = self.getN(s, a)
        alp = self.alpha(n)
        v = qsa + alp * (reward + 0.9 * maxQ - qsa)
        return v

    def chooseAct(self, state):
        # Epsilon-greedy: with probability 1/10, pick a random action
        v = randrange(10)
        if v == 5:
            return self.actions[randrange(len(self.actions))]
        a = self.actions[0]
        maxQ = self.getQ(state, self.actions[0])
        for act in self.actions:
            val = self.getQ(state, act)
            if val > maxQ:
                maxQ = val
                a = act
        return a

    def alpha(self, Nsa):
        # Constant learning rate; Nsa is available for a decaying schedule
        return 0.9

    def updateQ(self, s, a, value):
        self.Q[str(s) + "/" + str(a)] = value

    def getQ(self, s, a):
        return self.Q.get(str(s) + "/" + str(a), 0)

    def incrementN(self, s, a):
        self.N[str(s) + "/" + str(a)] = self.getN(s, a) + 1

    def getN(self, s, a):
        return self.N.get(str(s) + "/" + str(a), 0)

    def play(self, number):
        for episode in range(number):
            total_reward = 0
            self.s = None
            self.a = None
            reward = 0
            while not self.ale.game_over():
                state = hash(get_feature(self.ale.getScreen()))
                action = self.Q_LEARNING_AGENT(state, reward)
                # Apply an action and get the resulting reward
                reward = self.ale.act(action)
                total_reward += reward
            print('Episode', episode, 'ended with score:', total_reward)
            self.ale.reset_game()
        self.mapToTxt(self.Q, 'qvalues.txt')
        self.mapToTxt(self.N, 'nvalues.txt')

    def mapToTxt(self, hMap, filepath):
        # 'w' rather than 'r+' so stale entries are overwritten, not left behind
        f = open(filepath, 'w')
        for elem in hMap.keys():
            toWrite = str(elem) + " " + str(hMap[elem]) + "\n"
            f.write(toWrite)
        f.close()

    def txtToMap(self, filepath):
        newMap = {}
        f = open(filepath)
        while True:
            string = f.readline()
            if not string:
                break
            tmp = self.stringSplitter(string)
            newMap[tmp[0]] = float(tmp[1])
        f.close()
        return newMap

    def stringSplitter(self, string):
        i = string.find(' ')
        head = string[:i]
        rest = string[i + 1:len(string) - 1]  # getting rid of the trailing \n
        return (head, rest)
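
# Usage sketch: train for a handful of episodes. Assumes ms_pacman.bin,
# qvalues.txt, and nvalues.txt exist on disk, and that randrange (from the
# random module) and get_feature() (an external helper used in play()) are
# available in the snippet's module.
ms_pacman_agent = agent()
ms_pacman_agent.play(5)  # five episodes; Q and N tables are persisted afterwards
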
class ALEEnvironment:
    def __init__(self, rom_file, args=None):
        from ale_python_interface import ALEInterface
        self.ale = ALEInterface()

        # Set Env Variables
        self.ale.setInt('frame_skip', 1)
        self.ale.setFloat('repeat_action_probability', 0.0)
        self.ale.setBool('color_averaging', False)
        self.ale.setInt('random_seed', 123)
        self.ale.setBool('sound', False)
        self.ale.setBool('display_screen', False)

        self.frame_skip = 4
        self.initial_skip_actions = 5
        self.screen_width = 160   # 84
        self.screen_height = 210  # 84
        self.channels = 3
        self.last_screen = np.zeros((self.screen_height, self.screen_width, self.channels))

        self.ale.loadROM(rom_file)
        self.actions = self.ale.getMinimalActionSet()
        self.life_lost = False
        self.training = True

    def reset(self, train=True):
        self.training = train
        # Only fully reset the game on game over, or when a life loss should
        # not count as an episode boundary
        if self.ale.game_over() or not (train and self.life_lost):
            self.ale.reset_game()
            self.last_screen.fill(0.0)
            for i in range(self.initial_skip_actions):
                self.step(0)
        state = self._get_screen()  # self.get_screens()
        return state

    def step(self, action):
        reward = 0
        lives = self.ale.lives()
        for i in range(self.frame_skip):
            reward += self.ale.act(self.actions[action])
            if i == (self.frame_skip - 1):
                self._get_screen()  # get screen to update last screen
        screen = self._get_screen()
        # self._add_screen(screen)
        state = screen  # self.get_screens()
        self.life_lost = not (lives == self.ale.lives())
        terminal = self.ale.game_over() or (self.life_lost and self.training)
        info = []
        return state, reward, terminal, info

    def numActions(self):
        return len(self.actions)

    def _get_screen(self):
        screen = self.ale.getScreenRGB()  # Grayscale()
        # resized = np.array(cv2.resize(screen, (self.screen_width, self.screen_height)))
        # Pixel-wise max over the current and previous frame to remove flicker
        out_screen = np.maximum(screen, self.last_screen)
        self.last_screen = screen
        return out_screen
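
# Usage sketch, assuming a ROM at roms/breakout.bin:
env = ALEEnvironment('roms/breakout.bin')
state = env.reset(train=True)
state, reward, terminal, info = env.step(0)  # state is a max-pooled RGB frame
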
class GameState(object):
    def __init__(self, rand_seed, display=False):
        self.ale = ALEInterface()
        self.ale.setInt('random_seed', rand_seed)

        if display:
            self._setup_display()

        self.ale.loadROM(ROM)

        # height=210, width=160
        self.screen = np.empty((210, 160, 1), dtype=np.uint8)

        no_action = 0
        self.reward = self.ale.act(no_action)
        self.terminal = self.ale.game_over()

        # screen shape is (210, 160, 1)
        self.ale.getScreenGrayscale(self.screen)

        # reshape to (210, 160)
        reshaped_screen = np.reshape(self.screen, (210, 160))

        # resize to height=110, width=84
        resized_screen = cv2.resize(reshaped_screen, (84, 110))
        x_t = resized_screen[18:102, :]
        x_t = x_t.astype(np.float32)
        x_t *= (1.0 / 255.0)

        self.s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

        # collect only the actions this game actually uses
        self.real_actions = self.ale.getMinimalActionSet()

    def _setup_display(self):
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            self.ale.setBool('sound', False)
        elif sys.platform.startswith('linux'):
            self.ale.setBool('sound', True)
        self.ale.setBool('display_screen', True)

    def process(self, action):
        # map the action index to one of the actions actually used
        # (out of the 18 legal ones)
        real_action = self.real_actions[action]

        self.reward = self.ale.act(real_action)
        # self.reward = self.ale.act(action)
        self.terminal = self.ale.game_over()

        # screen shape is (210, 160, 1)
        self.ale.getScreenGrayscale(self.screen)

        # reshape to (210, 160); height=210, width=160
        reshaped_screen = np.reshape(self.screen, (210, 160))

        # resize to height=110, width=84
        resized_screen = cv2.resize(reshaped_screen, (84, 110))
        x_t1 = resized_screen[18:102, :]
        x_t1 = np.reshape(x_t1, (84, 84, 1))
        x_t1 = x_t1.astype(np.float32)
        x_t1 *= (1.0 / 255.0)

        # newest frame first, followed by the three most recent frames
        self.s_t1 = np.append(x_t1, self.s_t[:, :, 0:3], axis=2)

        if self.terminal:
            self.ale.reset_game()

    def update(self):
        self.s_t = self.s_t1
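
# Usage sketch: the ROM constant referenced in __init__ must be defined at
# module level. s_t / s_t1 hold the current and next 84x84x4 frame stacks.
game_state = GameState(rand_seed=113)
game_state.process(0)   # act, then build s_t1 from the new frame
game_state.update()     # promote s_t1 to s_t
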
class AtariEnv(object):
    def __init__(self, frame_skip=None, repeat_action_probability=0.0,
                 state_shape=[84, 84], rom_path=None, game_name='pong',
                 random_state=None, rendering=False, record_dir=None,
                 obs_showing=False, channel_weights=[0.5870, 0.2989, 0.1140]):
        self.ale = ALEInterface()
        self.frame_skip = frame_skip
        self.state_shape = state_shape
        if random_state is None:
            random_state = np.random.RandomState(1234)
        self.rng = random_state
        self.channel_weights = channel_weights
        self.ale.setInt(b'random_seed', self.rng.randint(1000))
        self.ale.setFloat(b'repeat_action_probability', repeat_action_probability)
        self.ale.setBool(b'color_averaging', False)
        if rendering:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool(b'sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool(b'sound', True)
            self.ale.setBool(b'display_screen', True)
        if rendering and record_dir is not None:
            # should be set before loadROM
            self.ale.setString(b'record_screen_dir', record_dir.encode())
            # note: no leading slash here, or os.path.join would discard record_dir
            self.ale.setString(b'record_sound_filename',
                               os.path.join(record_dir, 'sound.wav').encode())
            self.ale.setInt(b'fragsize', 64)  # to ensure proper sound sync (see ALE doc)
        self.ale.loadROM(str.encode(rom_path + game_name + '.bin'))
        self.legal_actions = self.ale.getMinimalActionSet()
        self.nb_actions = len(self.legal_actions)
        (self.screen_width, self.screen_height) = self.ale.getScreenDims()
        self._buffer = np.empty((self.screen_height, self.screen_width, 3), dtype=np.uint8)
        self.obs_showing = obs_showing

    def reset(self):
        self.ale.reset_game()
        return self.get_state()

    def step(self, action):
        reward = 0.0
        if self.frame_skip is None:
            num_steps = 1
        elif isinstance(self.frame_skip, int):
            num_steps = self.frame_skip
        else:
            num_steps = self.rng.randint(self.frame_skip[0], self.frame_skip[1])
        for i in range(num_steps):
            reward += self.ale.act(self.legal_actions[action])
        return self.get_state(), reward, self.ale.game_over(), {}

    def _get_image(self):
        self.ale.getScreenRGB(self._buffer)
        # weighted sum of the RGB channels to produce a grayscale image
        gray = self.channel_weights[0] * self._buffer[:, :, 0] + \
               self.channel_weights[1] * self._buffer[:, :, 1] + \
               self.channel_weights[2] * self._buffer[:, :, 2]
        x = cv2.resize(gray, tuple(self.state_shape), interpolation=cv2.INTER_LINEAR)
        return x

    def get_state(self):
        return self._get_image()

    def get_lives(self):
        return self.ale.lives()
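
# Usage sketch, assuming a Pong ROM under roms/ (rom_path must end with a
# slash, since the constructor concatenates rom_path + game_name + '.bin'):
env = AtariEnv(frame_skip=4, rom_path='roms/', game_name='pong',
               random_state=np.random.RandomState(42))
state = env.reset()
state, reward, done, info = env.step(0)  # state is an 84x84 grayscale array
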
class Environment:
    def __init__(self, render=False):
        self.ale = ALEInterface()
        self.ale.setInt(b'random_seed', 0)
        self.ale.setFloat(b'repeat_action_probability', 0.0)
        self.ale.setBool(b'color_averaging', True)
        self.ale.setInt(b'frame_skip', 4)
        self.ale.setBool(b'display_screen', render)
        self.ale.loadROM(ENV.encode('ascii'))
        self._screen = np.empty((210, 160, 1), dtype=np.uint8)
        self._no_op_max = 7
        self.img_buffer = []

    def set_render(self, render):
        if not render:
            self.ale.setBool(b'display_screen', render)

    def reset(self):
        self.ale.reset_game()

        # randomize initial state with up to _no_op_max no-ops
        if self._no_op_max > 0:
            no_op = np.random.randint(0, self._no_op_max + 1)
            for _ in range(no_op):
                self.ale.act(0)

        self.img_buffer = []
        self.img_buffer.append(self.ale.getScreenRGB())

        self.ale.getScreenGrayscale(self._screen)
        screen = np.reshape(self._screen, (210, 160))
        screen = cv2.resize(screen, (84, 110))
        screen = screen[18:102, :]
        screen = screen.astype(np.float32)
        screen /= 255.0

        self.frame_buffer = np.stack((screen, screen, screen, screen), axis=2)
        return self.frame_buffer

    def process(self, action, gif=False):
        # actions are offset by 4 into ALE's action set (a game-specific choice)
        reward = self.ale.act(4 + action)
        done = self.ale.game_over()

        if gif:
            self.img_buffer.append(self.ale.getScreenRGB())

        self.ale.getScreenGrayscale(self._screen)
        screen = np.reshape(self._screen, (210, 160))
        screen = cv2.resize(screen, (84, 110))
        screen = np.reshape(screen[18:102, :], (84, 84, 1))
        screen = screen.astype(np.float32)
        screen *= (1 / 255.0)

        self.frame_buffer = np.append(self.frame_buffer[:, :, 1:], screen, axis=2)
        return self.frame_buffer, reward, done, ""

    def save_gif(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        imageio.mimsave(path, self.img_buffer, duration=0.001)
        self.img_buffer = []

    def close(self):
        self.ale.setBool(b'display_screen', False)
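
# Usage sketch: ENV must be a module-level string holding the ROM path. The gif
# output path below is hypothetical.
env = Environment(render=False)
stack = env.reset()                                # initial 84x84x4 stack
stack, reward, done, _ = env.process(0, gif=True)
env.save_gif('gifs/episode.gif')
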
class AtariEnvironment:
    def __init__(self, frame_shape, frame_postprocess=lambda x: x):
        self.ale = ALEInterface()
        self.ale.setBool(b"display_screen", cfg.display_screen)
        self.ale.setInt(b"frame_skip", 1)
        self.ale.setBool(b"color_averaging", False)
        self.ale.setInt(b"random_seed", cfg.random_seed)
        self.ale.setFloat(b"repeat_action_probability", cfg.sticky_prob)

        self.ale.loadROM(str.encode(cfg.rom))

        self.ale.setMode(cfg.mode)
        self.ale.setDifficulty(cfg.difficulty)

        self.action_set = self.ale.getLegalActionSet()
        # self.action_set = self.ale.getMinimalActionSet()
        assert len(self.action_set) == cfg.num_actions

        screen_dims = tuple(reversed(self.ale.getScreenDims())) + (1,)
        self._frame_buffer = CircularBuffer(cfg.frame_buffer_size, screen_dims, np.uint8)
        self._frame_stack = CircularBuffer(cfg.frame_history_size, frame_shape, np.uint8)
        self._frame_postprocess = frame_postprocess

        self._episode_count = 0
        self.reset(inc_episode_count=False)

    def _is_terminal(self):
        return self.ale.game_over()

    def _get_single_frame(self):
        # pixel-wise max over the buffered frames to remove sprite flicker
        stacked_frames = np.concatenate(self._frame_buffer, axis=2)
        maxed_frame = np.amax(stacked_frames, axis=2)
        expanded_frame = np.expand_dims(maxed_frame, 2)  # was axis=3; a 2-D array has no axis 3
        frame = self._frame_postprocess(expanded_frame)
        return frame

    def reset(self, inc_episode_count=True):
        self._episode_frames = 0
        self._episode_reward = 0
        if inc_episode_count:
            self._episode_count += 1
        self.ale.reset_game()

        for _ in range(cfg.frame_buffer_size):
            self._frame_buffer.append(self.ale.getScreenGrayscale())
        for _ in range(cfg.frame_history_size):
            self._frame_stack.append(self._get_single_frame())

    def act(self, action):
        assert not self._is_terminal()

        cum_reward = 0
        for _ in range(cfg.frame_skip):
            cum_reward += self.ale.act(self.action_set[action])
            self._frame_buffer.append(self.ale.getScreenGrayscale())
        self._frame_stack.append(self._get_single_frame())

        self._episode_frames += cfg.frame_skip
        self._episode_reward += cum_reward
        cum_reward = np.clip(cum_reward, -1, 1)

        return cum_reward, self.state, self._is_terminal()

    @property
    def state(self):
        assert len(self._frame_buffer) == cfg.frame_buffer_size
        assert len(self._frame_stack) == cfg.frame_history_size
        return np.concatenate(self._frame_stack, axis=-1)

    @property
    def episode_reward(self):
        return self._episode_reward

    @property
    def episode_frames(self):
        return self._episode_frames

    @property
    def episode_steps(self):
        return self._episode_frames // cfg.frame_skip

    @property
    def episode_count(self):
        return self._episode_count
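
# Usage sketch: cfg (a module-level config with the fields read above) and
# CircularBuffer are external to this snippet; frame_shape must match what
# frame_postprocess produces (here the full 210x160 grayscale screen).
env = AtariEnvironment(frame_shape=(210, 160, 1))
clipped_reward, state, terminal = env.act(0)
print(env.episode_reward, env.episode_steps)
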