class WrappedFlappyBird:
    """Thin wrapper around PLE's FlappyBird exposing a DQN-style frame_step() API."""

    def __init__(self):
        self.score_counter = 0
        self.game = FlappyBird()
        self.env = PLE(self.game, fps=30, display_screen=True)

    def frame_step(self, action_vector):
        """Apply a one-hot action, advance one frame, and return (frame, reward, game_over).

        Index 0 of action_vector means flap (key 119); index 1 means the
        alternate key (1). The game is reset automatically on death.
        """
        # Decode the one-hot action into a PLE key press (or nothing at all).
        key = 119 if action_vector[0] == 1 else (1 if action_vector[1] == 1 else None)
        if key is not None:
            self.env.act(key)
        frame = self.env.getScreenRGB()
        reward = self.get_action_reward()
        terminal = self.game.game_over()
        if terminal:
            self.game.reset()
        return frame, reward, terminal

    def get_action_reward(self):
        """Return -1 on death, 1 when the score just increased, else 0.1 (survival bonus)."""
        if self.game.game_over():
            # Death: reset the internal score tracker for the next episode.
            self.score_counter = 0
            return -1
        current_score = self.game.getScore()
        if current_score > self.score_counter:
            self.score_counter = current_score
            return 1
        return 0.1
class Agent:
    """Evolution-strategy agent for FlappyBird (PLE).

    Wraps a Model whose weights are optimized by EvolutionStrategy; the
    fitness function is the average episode reward from get_reward().
    Fixed for Python 3: `xrange` -> `range`, `print` statement -> function,
    and `dict.values()` materialized before building the observation array.
    """

    AGENT_HISTORY_LENGTH = 1        # number of past observations fed to the model
    NUM_OF_ACTIONS = 2              # flap (key 119) or do nothing (None)
    POPULATION_SIZE = 15            # ES population per iteration
    EPS_AVG = 1                     # episodes averaged per fitness evaluation
    SIGMA = 0.1                     # ES perturbation std-dev
    LEARNING_RATE = 0.03            # ES learning rate
    INITIAL_EXPLORATION = 0.0       # starting epsilon for random actions
    FINAL_EXPLORATION = 0.0         # floor for epsilon
    EXPLORATION_DEC_STEPS = 100000  # steps over which epsilon decays

    def __init__(self):
        self.model = Model()
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        # Expose the raw game-state accessor through the PLE wrapper.
        self.env.getGameState = self.game.getGameState
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward,
                                    self.POPULATION_SIZE, self.SIGMA,
                                    self.LEARNING_RATE)
        self.exploration = self.INITIAL_EXPLORATION

    def get_predicted_action(self, sequence):
        """Return the key for the model's preferred action: 119 (flap) or None (no-op)."""
        prediction = self.model.predict(np.array(sequence))
        x = np.argmax(prediction)
        return 119 if x == 1 else None

    def load(self, filename='weights.pkl'):
        """Load pickled weights into both the model and the ES optimizer."""
        with open(filename, 'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()

    def get_observation(self):
        """Return the current game state as a flat numpy array."""
        state = self.env.getGameState()
        # list() is required on Python 3, where dict.values() is a view object
        # and np.array() would otherwise produce a useless 0-d object array.
        return np.array(list(state.values()))

    def save(self, filename='weights.pkl'):
        """Pickle the current ES weights to disk."""
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def play(self, episodes):
        """Run `episodes` games greedily with the trained weights, rendering to screen."""
        self.env.display_screen = True
        self.model.set_weights(self.es.weights)
        for episode in range(episodes):
            self.env.reset_game()
            observation = self.get_observation()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            score = 0
            while not done:
                action = self.get_predicted_action(sequence)
                self.env.act(action)
                observation = self.get_observation()
                # Slide the observation-history window forward by one frame.
                sequence = sequence[1:]
                sequence.append(observation)
                done = self.env.game_over()
                if self.game.getScore() > score:
                    score = self.game.getScore()
                    print("score: %d" % score)
        self.env.display_screen = False

    def train(self, iterations):
        """Run the evolution strategy for `iterations` optimization steps."""
        self.es.run(iterations, print_step=1)

    def get_reward(self, weights):
        """Fitness function for ES: average total reward over EPS_AVG episodes."""
        total_reward = 0.0
        self.model.set_weights(weights)
        for episode in range(self.EPS_AVG):
            self.env.reset_game()
            observation = self.get_observation()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                # Linearly decay epsilon toward FINAL_EXPLORATION.
                self.exploration = max(
                    self.FINAL_EXPLORATION,
                    self.exploration - self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS)
                if random.random() < self.exploration:
                    action = random.choice([119, None])
                else:
                    action = self.get_predicted_action(sequence)
                reward = self.env.act(action)
                # Tiny random jitter breaks ties between equal-fitness candidates.
                reward += random.choice([0.0001, -0.0001])
                total_reward += reward
                observation = self.get_observation()
                sequence = sequence[1:]
                sequence.append(observation)
                done = self.env.game_over()
        return total_reward / self.EPS_AVG
class GymFlappy(gym.Env, EzPickle):
    """Gym-style wrapper around PLE's FlappyBird with full game-state save/restore.

    Observations are 8 min-max-normalized features from game.getGameState();
    the action space is 2-way (1 = flap, 0 = no-op). __getstate__/__setstate__
    snapshot and restore the underlying pygame/PLE state so rollouts can be
    branched from a saved point (e.g. for tree search or parallel envs).
    """

    def __init__(self, config=None):
        EzPickle.__init__(self)
        # Aid options
        self.pre_play = True      # burn the first ~second of each episode in reset()
        self.force_calm = False   # when True, every step() is followed by a forced no-op
        self.positive_counts = 0  # external counter, cleared via reset_counts()
        self.display_screen = False
        if config:
            self.display_screen = config['display_screen']
        self.observation_space = spaces.Box(0, 1, shape=(8, ), dtype=np.float32)
        self.action_space = weightedDiscrete(2)  #spaces.Discrete(2)
        # Normalization bounds for the observation features.
        # NOTE(review): assumed to bracket the actual game value ranges — confirm.
        self.vel_max = 15
        self.vel_min = -15
        self.dist_max = 500
        self.dist_min = 0
        self.y_max = 500
        self.y_min = 0
        self.game = FlappyBird(graphics="fancy")
        self.p = PLE(self.game, fps=30, frame_skip=1, num_steps=1,
                     force_fps=True, display_screen=self.display_screen, rng=0)
        # Share a single RNG between PLE, the game, and the player so that a
        # restored rng state (see __setstate__) reproduces the same rollout.
        self.p.rng = self.game.rng
        self.game.player.rng = self.game.rng
        self.p.init()
        self.current_t = 0  # per-episode step counter, drives the time limit
        self.max_t = 1000

    def _get_obs(self):
        """Return the 8-feature observation, min-max normalized (nominally to [0, 1])."""
        state = self.game.getGameState()
        obs = np.empty((8, ))
        obs[0] = (state["player_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[1] = (state["next_pipe_dist_to_player"] - self.dist_min) / (self.dist_max - self.dist_min)
        obs[2] = (state["next_pipe_top_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[3] = (state["next_pipe_bottom_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[4] = (state["next_next_pipe_dist_to_player"] - self.dist_min) / (self.dist_max - self.dist_min)
        obs[5] = (state["next_next_pipe_top_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[6] = (state["next_next_pipe_bottom_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[7] = (state["player_vel"] - self.vel_min) / (self.vel_max - self.vel_min)
        return obs

    def reset(self):
        """Reset the game; if pre_play is set, burn 25 frames (flapping every 10th)."""
        self.current_t = 0
        self.p.reset_game()
        if self.pre_play:
            # Get rid of the first second of game.
            # force_calm is temporarily disabled so the warm-up frames are not doubled.
            ini_fc = self.force_calm
            self.force_calm = False
            for i in range(25):
                a = 0
                if i % 10 == 0:
                    a = 1
                self.step(np.array([a]))
            self.force_calm = ini_fc
        return self._get_obs()

    def step(self, action):
        """Advance one frame. Returns the Gym 4-tuple (obs, reward, done, info)."""
        self.current_t += 1
        # NOTE(review): 0 is not the flap key; presumably PLE treats it as a
        # no-op key press — confirm against PLE.act() semantics.
        reward = self.p.act(119 if action == 1 else 0)
        if self.force_calm:
            # ensures each action is followed by no action
            for i in range(1):
                r = self.p.act(0)
                reward += r
        done = self.current_t >= self.max_t or self.p.game_over()
        done = done or self._double_check_done()
        info = {}
        return self._get_obs(), reward, done, info

    def __getstate__(self):
        """Snapshot the full game state into a plain tuple (see __setstate__ for layout).

        Images/surfaces are excluded because they are heavy and would need
        their own serialization; everything else is deep-copied so parallel
        environments cannot share mutable state.
        """
        dc = lambda x: copy.deepcopy(x)
        # get all game attributes
        _game_state = self.game.__dict__
        _player_state = self.game.player.__dict__
        _pipe_state = self.game.pipe_group.__dict__
        pipe_sprites = self.game.pipe_group.spritedict
        pipe_xs = []
        pipe_ys = []
        pipe_rects = []
        # Record each pipe sprite's x position, gap start, and draw rect.
        for _, sprite in enumerate(pipe_sprites):
            pipe_xs.append(dc(sprite.x))
            pipe_ys.append(dc(sprite.gap_start))
            pipe_rects.append(dc(pipe_sprites[sprite]))
        lives = dc(self.game.lives)
        score = dc(self.game.getScore())
        pscore = dc(self.p.previous_score)
        # remove images (heavy and require additional serialization):
        __game_state = {}
        __player_state = {}
        for attr in _game_state:
            if attr in [
                    'screen', 'images', 'clock', 'player', 'backdrop',
                    "pipe_group"
            ]:
                pass
            else:
                __game_state[attr] = _game_state[attr]
        for attr in _player_state:
            if attr in ['image', 'image_assets']:
                pass
            else:
                __player_state[attr] = _player_state[attr]
        # accomodate multiple envs in parallel
        game_state = dc(__game_state)
        player_state = dc(__player_state)
        pipe_state = _pipe_state
        # this is a non-PLE parameter that needs to be reset too
        envtime = dc(self.current_t)
        rng_state = self.game.rng.get_state()
        stategroup = (game_state, player_state, pipe_state,
                      (pipe_xs, pipe_rects, pipe_ys), lives, envtime,
                      rng_state, score, pscore)
        return stategroup

    def __setstate__(self, stategroup):
        '''
        Restore a snapshot produced by __getstate__ and return the observation.

        Stategroup layout (ugly yet somewhat functional):
        0 game_state dictionary (game.__dict__)
        1 player_state dictionary (game.player.__dict__)
        2 pipe_state dictionary (game.pipe_group.__dict__) -- currently unused
        3 (x positions, draw rects, gap starts) of pipes in game (tuple of lists)
        4 lives (game.lives, used in game.game_over())
        5 current time (self.current_t)
        6 rng state
        7 score (game.score)
        8 previous score (p.previous_score)
        '''
        # use update to preserve images we didn't save
        self.game.__dict__.update(stategroup[0])
        self.game.player.__dict__.update(stategroup[1])
        #self.game.pipe_group.__dict__.update(stategroup[2])  # was introducing reference crossing
        # Restore each pipe sprite in place; order must match __getstate__'s
        # iteration order over the same spritedict.
        pipe_sprites = self.game.pipe_group.spritedict
        for i, sprite in enumerate(pipe_sprites):
            sprite.x = stategroup[3][0][i]
            pipe_sprites[sprite] = stategroup[3][1][i]
            sprite.gap_start = stategroup[3][2][i]
        self.game.lives = stategroup[4]
        # prevent Gym env to return false dones
        self.current_t = stategroup[5]
        self.game.rng.set_state(stategroup[6])
        # fix stupid reward
        self.game.score = stategroup[7]
        self.p.previous_score = stategroup[8]
        return self._get_obs()

    def get_state(self):
        """Public alias for __getstate__ (explicit snapshot API)."""
        return self.__getstate__()

    def set_state(self, state):
        """Public alias for __setstate__; returns the restored observation."""
        return self.__setstate__(state)

    def reset_counts(self):
        """Zero the external positive-outcome counter."""
        self.positive_counts = 0

    def _double_check_done(self):
        '''
        Manually inspects game to detect collisions
        Worthy of suicide but necessary...
        '''
        # Check pipe collisions
        # NOTE(review): `hit` does not depend on `p` and is recomputed on every
        # iteration of the outer loop — loop-invariant, but left as-is.
        for p in self.game.pipe_group:
            hit = pygame.sprite.spritecollide(self.game.player,
                                              self.game.pipe_group, False)
            is_in_pipe = (p.x - p.width / 2 - 20) <= self.game.player.pos_x < (p.x + p.width / 2)
            for h in hit:
                # do check to see if its within the gap.
                top_pipe_check = (
                    (self.game.player.pos_y - self.game.player.height / 2 + 12)
                    <= h.gap_start) and is_in_pipe
                bot_pipe_check = (
                    (self.game.player.pos_y + self.game.player.height) >
                    h.gap_start + self.game.pipe_gap) and is_in_pipe
                boom = bot_pipe_check or top_pipe_check
                if boom:
                    return True
        # floor limit
        if self.game.player.pos_y >= 0.79 * self.game.height - self.game.player.height:
            return True
        # went above the screen
        if self.game.player.pos_y <= 0:
            return True
        return False