class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        # don't bother returning an info dictionary like gym
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
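A minimal usage sketch (not part of the original snippet) showing how the Env wrapper above could be driven with random actions; it assumes FlappyBird, PLE and numpy are imported as in the class definition.

# Hypothetical driver for the Env wrapper above: random actions for a few episodes.
import numpy as np

env = Env()
for episode in range(3):
    obs = env.reset()
    done = False
    total = 0.0
    while not done:
        action = np.random.randint(2)          # 0 = do nothing, 1 = flap
        obs, reward, done = env.step(action)   # the wrapper returns (obs, reward, done)
        total += reward
    print("episode reward:", total)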
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """
    reward_values = {"positive": 1.0, "negative": 0.0, "tick": 0.0, "loss": 0.0, "win": 0.0}
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True, rng=None,
              reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    totalscore = 0
    score = 0
    count = nb_episodes
    while nb_episodes > 0:
        # pick an action
        # TODO: for training use agent.training_policy instead
        action = agent.policy(agent.state_binner(env.game.getGameState()))

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition

        score += reward

        # reset the environment if the game is over
        if env.game_over():
            totalscore += score  # accumulate the episode score (was `totalscore += 1`, which made the average meaningless)
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    print("average for this run is: %d" % (totalscore / count))
def test():
    # create the environment
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # create the replay buffer
    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay pool for DQN

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_dim, act_dim=act_dim,
                  e_greed=0.3, e_greed_decrement=1e-6)

    # load the model
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if steps == 1:
            continue
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # greedy prediction: always pick the best action
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48, (255, 0, 0), (255, 255, 255))

    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())

    env.init()
    reward = 0.0
    nb_frames = 10000

    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()
        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
class Game:
    def __init__(self, game="pixelcopter", fps=30):
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
        self.game_name = game
        if game == "flappy":
            engine = FlappyBird()
        elif game == "pixelcopter":
            engine = Pixelcopter()
        else:
            assert False, "This game is not available"
        engine.rewards["loss"] = -5  # reward at terminal state
        self.reward_terminal = -5
        self.game = PLE(engine, fps=fps, display_screen=False)
        self.game.init()
        self.game.act(0)  # Start the game by providing an arbitrary key as input
        self.key_input = self.game.getActionSet()
        self.reward = 0

    def game_over(self):
        return self.game.game_over()

    def reset_game(self):
        self.game.reset_game()
        self.game.act(0)  # Start the game

    def get_image(self):
        return self.game.getScreenRGB()

    def get_torch_image(self):
        image = self.game.getScreenRGB()
        if self.game_name == "flappy":
            image = image[:, :-96, :]  # Remove ground
            image = cv2.cvtColor(cv2.resize(image, (84, 84)), cv2.COLOR_BGR2GRAY)
            image = np.reshape(image, (84, 84, 1))
        elif self.game_name == "pixelcopter":
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            image = np.reshape(image, (48, 48, 1))
            image[image > 0] = 1
        image = image.transpose(2, 0, 1)  # CHW
        image = image.astype(np.float32)
        image = torch.from_numpy(image)
        return image

    def act(self, action_idx):
        self.reward = self.game.act(self.key_input[action_idx])
        return self.reward
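A rough usage sketch (an assumption, not from the original code) for the Game wrapper above: a short random-play loop that pulls preprocessed frames, assuming torch, cv2 and numpy are available as in the class.

# Hypothetical random-play loop for the Game wrapper above.
import random

game = Game(game="pixelcopter")
for _ in range(200):
    if game.game_over():
        game.reset_game()
    frame = game.get_torch_image()                   # CHW float tensor, ready for a conv net
    action_idx = random.randrange(len(game.key_input))
    reward = game.act(action_idx)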
def test():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus "do nothing", so 2 actions

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6)
    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # test part: run 5 episodes and average
    eval_reward = []
    for i in range(5):
        env.init()
        episode_reward = 0
        isOver = False
        step = 0
        while not isOver:
            if step == 0:
                reward = env.act(None)
                done = False
            else:
                time.sleep(0.01)  # Windows runs too fast, slow it down
                obs = list(env.getGameState().values())
                action = agent.predict(obs)
                if action == 1:
                    act = actions["up"]
                else:
                    act = None
                reward = env.act(act)
                isOver = env.game_over()
                episode_reward += reward
            step += 1
        eval_reward.append(episode_reward)
        if step > MAX_STEP:
            break
        env.reset_game()
    return np.mean(eval_reward)
def run():
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    # agent = myAgentHere(allowed_actions=p.getActionSet())

    p.init()
    reward = 0.0

    for i in range(150):
        if p.game_over():
            p.reset_game()
        observation = p.getScreenRGB()
        new_image = convert_image(observation)
        cv.imwrite("Imagenes/Gray_Image" + str(i) + ".jpg", new_image)
        action = None
        reward = p.act(action)
class SnakeEnv():
    def __init__(self, height=32, width=32, fps=15, frame_history_size=4):
        # create the game environment and initialize the attribute values
        self.game = Snake(height=height, width=width, init_length=4)
        reward_dict = {"positive": 1.0, "negative": -1.0, "tick": 0.0, "loss": -1.0, "win": 1.0}
        self.environment = PLE(self.game, fps=fps, reward_values=reward_dict, num_steps=2)
        self.init_env()  # initialize the game
        self.allowed_actions = self.environment.getActionSet()  # the list of allowed actions to be taken by an agent
        self.num_actions = len(self.allowed_actions) - 1  # number of actions that are allowed in this env
        self.frame_hist = frame_history(height=height, width=width,
                                        frame_history_size=frame_history_size, num_channels=3)
        self.input_shape = self.frame_hist.get_history().shape  # shape of the game input screen

    def init_env(self):
        # initialize the variables and screen of the game
        self.environment.init()

    def get_current_state(self):
        # get the current state of the game: the current screen (snake and food positions)
        # together with a sequence of past frames
        cur_frame = np.transpose(self.environment.getScreenRGB(), (2, 0, 1))
        # cur_frame = np.transpose(np.expand_dims(self.environment.getScreenGrayscale(), axis=0), (2, 0, 1))
        self.frame_hist.push(cur_frame)
        return self.frame_hist.get_history()

    def check_game_over(self):
        # check if the game has terminated
        return self.environment.game_over()

    def reset(self):
        # resets the game to initial values and refreshes the screen
        # with a new small snake and a random food position
        self.environment.reset_game()
        _ = self.environment.act(None)
        self.frame_hist.reset(np.transpose(self.environment.getScreenRGB(), (2, 0, 1)))
        # self.frame_hist.reset(np.transpose(np.expand_dims(self.environment.getScreenGrayscale(), axis=0), (2, 0, 1)))
        return self.frame_hist.get_history()

    def take_action(self, action):
        # lets the snake take the chosen action of moving in some direction
        reward = self.environment.act(self.allowed_actions[action])
        next_state = self.get_current_state()
        done = self.check_game_over()
        return next_state, reward, done, 0
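A small, hedged example of how SnakeEnv might be stepped by a random policy; the frame_history helper it relies on is assumed to exist as in the snippet above.

# Hypothetical rollout for SnakeEnv with uniformly random actions.
import numpy as np

env = SnakeEnv(height=32, width=32)
state = env.reset()
done = False
while not done:
    action = np.random.randint(env.num_actions)        # index into allowed_actions
    state, reward, done, _ = env.take_action(action)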
def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load("model95000")
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []

    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print("Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                  .format(total_reward, i, agent.epsilon,
                          (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # make the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward

        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])
        state = next_state
        # time.sleep(0.3)

        # plot score
        if i % 1000 == 0:
            plot(data)
def play(file_name, number_of_games=1):
    game = FlappyBird(width=game_width, height=game_height, pipe_gap=game_pipe_gap)
    p = PLE(game, display_screen=True, force_fps=False, frame_skip=6)
    p.init()

    network = Network()
    network.load(file_name, rename=False)

    for i in range(number_of_games):
        if i > 0:
            p.reset_game()
        while not p.game_over():
            state = p.getGameState()
            actions_q_values = network.Q(state).tolist()
            action_taken_index = np.argmax(actions_q_values)
            p.act(None if action_taken_index == 0 else 119)
def evaluate_step(agent, seed, sess):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False, rng=np.random.RandomState(seed))
    env.reset_game()
    env.act(0)  # dummy input

    # grayscale input screens for this episode
    input_screens = [agent.preprocess(env.getScreenGrayscale())]
    t = 0
    while not env.game_over():
        # feed the four previous screens, select an action
        action = agent.select_action(input_screens, sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])  # reward = +1 when passing a pipe, -5 when dying

        # observe the result
        screen_plum = env.getScreenGrayscale()  # get next screen

        # append grayscale screen for this episode
        input_screens.append(agent.preprocess(screen_plum))
        t += 1
        if t >= 1000:  # maximum score, to prevent running forever
            break
    return t
class FlappyBirdEnv(gym.Env):
    def __init__(self):
        self.resize_factor = 0.125
        self.width = 288
        self.height = 512
        self.ple = PLE(game=FlappyBird(), fps=30, frame_skip=8)
        self.action_set = self.ple.getActionSet()
        self.action_space = spaces.Discrete(len(self.action_set))
        self.observation_space = spaces.Box(
            low=0.0,
            high=255.0,
            shape=(
                int(self.width * self.resize_factor),
                int(self.height * self.resize_factor),
                1,
            ),
            dtype=np.uint32,
        )
        self._steps = 0

    def reset(self):
        self._steps = 0
        self.ple.display_screen = False
        self.ple.reset_game()
        return self._get_state()

    def step(self, action):
        self._steps += 1
        reward = self.ple.act(self.action_set[action])
        next_state = self._get_state()
        terminal = self.ple.game_over()
        return next_state, reward, terminal, {}

    def render(self, mode="human"):
        self.ple.display_screen = True

    def _get_state(self):
        return np.expand_dims(
            imresize(self.ple.getScreenGrayscale(), self.resize_factor), axis=-1)
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()
    print "Testing model:", agent_file_name

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while env.game_over() == False:
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.05)
            episode_reward += reward

        print "Agent score {:0.1f} reward for episode.".format(episode_reward)
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward / test_rounds
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()
    print "Testing model:", agent_file_name

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while env.game_over() == False:
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward

        print "Agent score {:0.1f} reward for episode.".format(episode_reward)
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward / test_rounds
class PLEEnvRam(PLEEnv):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='FlappyBird', display_screen=True):
        # open up a game state to communicate with emulator
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game_state = PLE(game, fps=30, display_screen=display_screen)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=self._get_game_state().shape)
        self.viewer = None

    def _step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_game_state()
        terminal = self.game_state.game_over()
        return state, reward, terminal, self.game_state.game.getGameState()

    def _get_game_state(self):
        gs = self.game_state.game.getGameState()
        names = sorted(gs.keys())
        state = np.array([gs[n] for n in names], dtype=np.float64)
        return state

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        self.game_state.reset_game()
        state = self._get_game_state()
        return state
def play(self):
    print('Playing {} agent after training for {} episodes or {} frames'.format(
        self.name, self.num_of_episodes, self.num_of_frames))

    reward_values = {'positive': 1.0, 'negative': 0.0, 'tick': 0.0, 'loss': 0.0, 'win': 0.0}
    env = PLE(FlappyBird(), fps=30, display_screen=True, force_fps=False, rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    last_print = 0
    nb_episodes = 50
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        action = self.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        score += reward

        # reset the environment if the game is over
        if env.game_over():
            print('Score: {}'.format(score))
            env.reset_game()
            nb_episodes -= 1
            score = 0
def play(self, fast=True):
    """Use athlete to play.

    Args:
        fast <bool>: set to True if the screen should be hidden and speed enhanced
    """
    game = FlappyBird()
    env = PLE(game, fps=30, frame_skip=1, num_steps=1,
              force_fps=fast, display_screen=not fast)
    env.init()

    pipes = []
    i = 0
    while i < 100:
        env.reset_game()
        pipes.append(0)
        while not env.game_over():
            A = self.act(game.getGameState())
            r = env.act(ACTIONS[A])
            if r == 1.:
                pipes[-1] += 1

        if not fast:
            print('\n- Score: {} pipes'.format(pipes[-1]))
            print('- Played {} games'.format(len(pipes)))
            print('- Average score: {} pipes'.format(np.round(np.mean(pipes), decimals=1)))
        else:
            i += 1

    print('\n- Max score: {} pipes'.format(np.max(pipes)))
    print('- Games < 15 pipes: {}'.format(len(tuple(filter(lambda x: x < 15, pipes)))))
    print('- Played {} games'.format(100))
    print('- Average score: {} pipes'.format(np.round(np.mean(pipes), decimals=1)))
def main(w, seed=SEED, headless=False):
    """ Let an agent play flappy bird """
    if headless:
        display_screen = False
        force_fps = True
    else:
        display_screen = True
        force_fps = False

    game = PLE(FLAPPYBIRD, display_screen=display_screen, force_fps=force_fps, rng=seed)
    game.init()
    game.reset_game()
    FLAPPYBIRD.rng.seed(seed)

    agent_score = 0
    num_frames = 0

    while True:
        if game.game_over():
            break

        obs = game.getGameState()
        x = normalize(obs)
        action = agent(x, w)
        reward = game.act(ACTION_MAP[action])
        if reward > 0:
            agent_score += 1
        num_frames += 1

    print('Frames :', num_frames)
    print('Score :', agent_score)
class FlappyBirdWrapper(Env):
    # Pass display_screen=True if you want the screen to be rendered.
    def __init__(self, **kwargs):
        self.game = FlappyBird()
        self.p = PLE(self.game, **kwargs)
        self.action_set = self.p.getActionSet()

        # 3 input features: see self._get_obs
        self.observation_space = spaces.Discrete(3)
        # 2 output actions: flap or do nothing
        self.action_space = spaces.Discrete(2)

    def _get_obs(self):
        # get the game state
        state = self.game.getGameState()
        # horizontal distance from the bird to the next pair of pipes
        dist_to_pipe_horz = state["next_pipe_dist_to_player"]
        # vertical offset of the bird relative to the top of the next pipe gap
        dist_to_pipe_bottom = state["player_y"] - state["next_pipe_top_y"]
        # the bird's vertical velocity
        velocity = state['player_vel']
        # pack these values into an array and return it
        return np.array([dist_to_pipe_horz, dist_to_pipe_bottom, velocity])

    def reset(self):
        self.p.reset_game()
        return self._get_obs()

    def step(self, action):
        reward = self.p.act(self.action_set[action])
        obs = self._get_obs()
        done = self.p.game_over()
        return obs, reward, done, dict()

    def seed(self, *args, **kwargs):
        pass

    def render(self, *args, **kwargs):
        pass
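A short gym-style loop (a sketch, assuming the wrapper is built with display_screen=False) showing how FlappyBirdWrapper plugs into standard agent code.

# Hypothetical gym-style episode loop for FlappyBirdWrapper.
env = FlappyBirdWrapper(display_screen=False)
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()      # random index into the two PLE actions (flap / no-op)
    obs, reward, done, info = env.step(action)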
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """
    reward_values = {"positive": 1.0, "negative": 0.0, "tick": 0.0, "loss": 0.0, "win": 0.0}

    env = PLE(FlappyBird(), fps=30, display_screen=True, force_fps=False, rng=None,
              reward_values=reward_values)
    env.init()

    score = 0
    while nb_episodes > 0:
        # pick an action
        action = agent.policy(env.game.getGameState())

        # step the environment
        reward = env.act(env.getActionSet()[action])
        score += reward

        # reset the environment if the game is over
        if env.game_over():
            print(score, nb_episodes)
            env.reset_game()
            nb_episodes -= 1
            if score > agent.highestScore:
                agent.highestScore = score
            agent.totalScore += score
            score = 0
def test_model_G(nb_games, model):
    # use "fancy" for full background, random bird color and random pipe color,
    # use "fixed" (default) for black background and constant bird and pipe colors.
    game = FlappyBird(graphics="fixed")
    p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
    p.init()

    reward = 0.0
    cumulated = np.zeros((nb_games))
    list_actions = [0, 119]

    for i in range(nb_games):
        p.reset_game()
        while not p.game_over():
            state = game.getGameState()
            screen_x = process_screen(p.getScreenRGB())
            stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
            x = np.stack(stacked_x, axis=-1)
            action = list_actions[np.argmax(model.predict(np.expand_dims(x, axis=0)))]
            reward = p.act(action)
            cumulated[i] = cumulated[i] + reward

    avg_score = np.mean(cumulated)
    print('Average : ' + str(avg_score))
    mx_score = np.max(cumulated)
    print('Max : ' + str(mx_score))
    return avg_score, mx_score
def test():
    game2 = FlappyBird()
    p2 = PLE(game2, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
    p2.init()
    reward = 0.0
    nb_games = 10
    cumulated = np.zeros((nb_games))

    for i in range(nb_games):
        p2.reset_game()
        while not p2.game_over():
            state = game2.getGameState()
            screen = p2.getScreenRGB()
            action = FlappyPolicy(state, screen)
            reward = p2.act(action)
            cumulated[i] = cumulated[i] + reward

    return np.mean(cumulated)
def random_play(episodes=100):
    # Initialize game and agent
    game = FlappyBird()
    p = PLE(game, display_screen=True, state_preprocessor=process_state)
    p.init()
    agent = Agent(p)
    total_reward = []

    # Run given number of episodes
    for _ in range(episodes):
        # Initialize episode
        p.reset_game()
        total_episode_reward = 0

        # Episode loop
        while not p.game_over():
            action = agent.choose_action()
            reward = p.act(action)
            total_episode_reward += reward

        # Save episode reward and return
        total_reward.append(total_episode_reward)

    return total_reward
print("loaded") else: agent = RLAgent(.3, .9, actions) game = frogger_new.Frogger() fps = 30 p = PLE(game, fps=fps, force_fps=False) reward = 0.0 p.init() count = 0 tr = 0 try: while True: count += 1 if p.game_over(): #print("{} REWARD".format(tr)) tr = 0 p.reset_game() obs = game.getGameState() #print obs cur = State(obs) action = agent.pickAction(cur)[0] #action = None reward = p.act(action) newState = State(obs) #disincentivizing no move #if cur == newState: reward -= .1
import FlappyPolicy

game = FlappyBird()
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
import numpy as np
from ple import PLE
from ple.games.snake import Snake

game = Snake(width=256, height=256)  # the game instance (was misleadingly named `agent`)
env = PLE(game, fps=15, force_fps=False, display_screen=True)

env.init()
actions = env.getActionSet()

for i in range(1000):
    if env.game_over():
        env.reset_game()
    action = actions[np.random.randint(0, len(actions))]
    env.act(action)
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={"display_screen": True, "force_fps": True, "fps": 30}):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)

        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        action = self._actions[action]

        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break

        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)

        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(
            self._mode_score / self._mode_episode_count, self._mode_episode_count))

    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
class Agent:
    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    INITIAL_FEATURES = np.zeros((4, INPUT_SIZE))
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0

    # based on documentation, the game state has 8 dimensions
    # output is 2 dimensions: 0 = do nothing, 1 = jump
    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.actor = Actor('actor', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.actor_target = Actor('actor-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.critic = Critic('critic', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.critic_target = Critic('critic-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y)
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad)
        grads = zip(self.grad_actor, weights_actor)
        self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _assign(self, from_name, to_name):
        from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name)
        to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name)
        for i in range(len(from_w)):
            assign_op = to_w[i].assign(from_w[i])
            self.sess.run(assign_op)  # the session lives on self (the original called a bare sess.run)

    def _memorize(self, state, action, reward, new_state, dead, rnn_state):
        self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _construct_memories_and_train(self, replay):
        # replay entries are (state, action, reward, new_state, dead, rnn_state)
        # train actor
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        init_values = np.array([a[-1] for a in replay])
        Q = self.sess.run(self.actor.logits,
                          feed_dict={self.actor.X: states, self.actor.hidden_layer: init_values})
        Q_target = self.sess.run(self.actor_target.logits,
                                 feed_dict={self.actor_target.X: states,
                                            self.actor_target.hidden_layer: init_values})
        grads = self.sess.run(self.grad_critic,
                              feed_dict={self.critic.X: states,
                                         self.critic.hidden_layer: init_values,
                                         self.critic.Y: Q})
        self.sess.run(self.optimizer,
                      feed_dict={self.actor.X: states,
                                 self.actor.hidden_layer: init_values,
                                 self.actor_critic_grad: grads})

        # train critic
        rewards = np.array([a[2] for a in replay]).reshape((-1, 1))
        rewards_target = self.sess.run(self.critic_target.logits,
                                       feed_dict={self.critic_target.X: new_states,
                                                  self.critic_target.hidden_layer: init_values,
                                                  self.critic_target.Y: Q_target})
        for i in range(len(replay)):
            if not replay[i][4]:  # dead flag of the i-th transition (the original checked replay[0][-1])
                rewards[i, 0] += self.GAMMA * rewards_target[i, 0]
        cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer],
                                feed_dict={self.critic.X: states,
                                           self.critic.hidden_layer: init_values,
                                           self.critic.Y: Q,
                                           self.critic.REWARD: rewards})
        return cost

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            dead = False
            init_value = np.zeros((1, 2 * 512))
            state = self.get_state()
            for k in range(self.INITIAL_FEATURES.shape[0]):  # renamed from i so the epoch counter is not clobbered
                self.INITIAL_FEATURES[k, :] = state
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('actor', 'actor-target')
                    self._assign('critic', 'critic-target')
                if np.random.rand() < self.EPSILON:
                    action = np.random.randint(self.OUTPUT_SIZE)
                else:
                    # the original referenced self.model / init_values; this class defines self.actor and init_value
                    action, last_state = self.sess.run(
                        [self.actor.logits, self.actor.last_state],
                        feed_dict={self.actor.X: [self.INITIAL_FEATURES],
                                   self.actor.hidden_layer: init_value})
                    action, init_value = np.argmax(action[0]), last_state[0]
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                # stack the new observation on top of the three most recent ones
                new_state = np.append(self.get_state().reshape((1, -1)),
                                      self.INITIAL_FEATURES[:3, :], axis=0)
                dead = self.env.game_over()
                self._memorize(state, action, reward, new_state, dead, init_value)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                cost = self._construct_memories_and_train(replay)
                self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
                self.T_COPY += 1
            self.rewards.append(total_reward)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
memory = ReplayMemory(max_memory_size, min_memory_size)

env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
    while steps < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()

        while env.game_over() == False and steps < num_steps_train:
            state = env.getGameState()
            reward, action = agent.act(state, epsilon=epsilon)
            memory.add([state, action, reward, env.game_over()])

            if steps % update_frequency == 0:
                loss = memory.train_agent_batch(agent)

                if loss is not None:
                    losses.append(loss)
                    epsilon = max(epsilon_min, epsilon - epsilon_rate)  # plain max(); np.max would treat the second argument as an axis

            episode_reward += reward
            steps += 1

        if num_episodes % 5 == 0:
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='FlappyBird', display_screen=True):
        # open up a game state to communicate with emulator
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game_state = PLE(game, fps=30, display_screen=display_screen)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(self.screen_width, self.screen_height, 3))
        self.viewer = None

    def _step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}

    def _get_image(self):
        # Hack to fix the rotated image returned by ple
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        return image_rotated

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(self.screen_width, self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    # setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
               force_fps=force_fps, display_screen=display_screen)
    game.init()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print "NOOOO"
        game.reset_game()

    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    # saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"
    '''

    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            r_t = game.act(np.argmax(a_t))
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print "NOOO2"
                game.reset_game()
            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, \
            "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''
def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total=5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total / num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # percentage of time we perform a random action, helps exploration
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = init_agent(env)
    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w', level=logging.DEBUG,
                        format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: ' + str(num_steps_train_total) + '.\n')

    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)  # plain max(); np.max would treat the second argument as an axis

                episode_reward += reward
                steps += 1
                if steps < num_steps_train_epoch:
                    learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                # print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)
                episode_reward += reward
                testing_rewards.append(testing_rewards[-1] + reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)
    save_agent(my_agent, agent_file_path, agent_file_name)
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# use "fancy" for full background, random bird color and random pipe color,
# use "fixed" (default) for black background and constant bird and pipe colors.
game = FlappyBird(graphics="fixed")
# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
    epoch = step // params.EVALUATION_PERIOD
    mean_score, max_score, min_score = evaluation(p, network=dqn, epoch=epoch,
                                                  trials=20, logger=logger_eval)
    logger_eval.info('Score min/max ({}/{}) and mean ({})'.format(min_score, max_score, mean_score))

# action selection
if np.random.rand() < epsilon(step):
    a = np.random.randint(0, 2)
else:
    a = greedy_action(dqn, x)  # 0 or 1

# step
r = p.act(params.LIST_ACTIONS[a])
rr = clip_reward(r)
screen_y = process_screen(p.getScreenRGB())
d = p.game_over()
replay_memory.append(screen_x, a, rr, screen_y, d)

# print some info
if d:
    print("##################################################################################################")
    print("#################################################################### DEAD ########################")
    print("##################################################################################################")
    logger_train.info("Step {}, score before death: {}".format(step, training_score))
    training_score = 0  # Restart game

if r > 0:
    print("--------------------------------------------------------------------------------------------------")
    print("-------------------------------------------------------------------- Pipe passed ! ---------------")
    print("--------------------------------------------------------------------------------------------------")
    training_score += 1  # One pipe passed
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld

# lets adjust the rewards our agent receives
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use a lower fps so we can see whats happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
# we pass in the rewards and PLE will adjust the game for us
p = PLE(game, fps=15, force_fps=False, display_screen=True, reward_values=rewards)

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print "Score: {:0.3f} | Reward: {:0.3f} ".format(p.score(), reward)
from ple.games.flappybird import FlappyBird
from ple import PLE
import random

game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()

nb_frames = 1000
reward = 0.0

for f in range(nb_frames):
    if p.game_over():  # check if the game is over
        p.reset_game()

    obs = p.getScreenRGB()
    action = random.sample(p.getActionSet(), 1)[0]
    reward = p.act(action)
    print(action, reward)
memory = ReplayMemory(max_memory_size, min_memory_size)

env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
    while steps < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()

        while env.game_over() == False and steps < num_steps_train:
            state = env.getGameState()
            reward, action = agent.act(state, epsilon=epsilon)
            memory.add([state, action, reward, env.game_over()])

            if steps % update_frequency == 0:
                loss = memory.train_agent_batch(agent)

                if loss is not None:
                    losses.append(loss)
                    epsilon = max(epsilon_min, epsilon - epsilon_rate)  # plain max(); np.max would treat the second argument as an axis

            episode_reward += reward
            steps += 1
env.reset_game()
state = env.getGameState()
state = np.array([list(state[0])])
score = 0

for time_t in range(20000):
    action = agent.act(state)
    reward = env.act(action)  # take the selected action
    score += reward
    next_state = env.getGameState()
    next_state = np.array([list(next_state[0])])
    action = [K_a, None, K_d].index(action)
    agent.remember(state, action, reward, next_state, env.game_over())
    state = next_state

    if env.game_over() or time_t == 19999:  # print when the episode ends
        print("episode: {}/{}, score: {}, memory size: {}, e: {}".format(
            e, EPISODES, score, len(agent.memory), agent.epsilon))

        # code for plotting the rewards
        scores.append(score)
        time.append(e + 1)
        if e % 10 == 0:
            pylab.plot(time, scores, 'b')
            pylab.savefig("./save/catcher_dqn.png")
        break
""" def __init__(self, actions): self.actions = actions def pickAction(self, reward, obs): return self.actions[np.random.randint(0, len(self.actions))] ################################### game = Doom(scenario="take_cover") env = PLE(game) agent = NaiveAgent(env.getActionSet()) env.init() reward = 0.0 for f in range(15000): #if the game is over if env.game_over(): env.reset_game() action = agent.pickAction(reward, env.getScreenRGB()) reward = env.act(action) if f > 2000: env.display_screen = True env.force_fps = False if f > 2250: env.display_screen = True env.force_fps = True