def main():
    """Train a policy-gradient agent on Snake and checkpoint the best evaluation.

    Runs 50000 training episodes.  Every 10th episode the summed reward is
    logged; every 50th episode the agent is evaluated and saved whenever
    the evaluation reward beats the best one seen so far.
    """
    # Create the environment.
    snake = Snake(width=256, height=256, init_length=10)
    p = PLE(snake, fps=30, display_screen=True, force_fps=True)
    p.reset_game()

    action_set = p.getActionSet()
    print(action_set)
    act_dim = len(action_set)
    obs_dim = 256 * 256
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL stack: model -> algorithm -> agent.
    agent = Agent(
        PolicyGradient(Model(act_dim=act_dim), lr=LEARNING_RATE),
        obs_dim=obs_dim,
        act_dim=act_dim)

    # Restoring a previous checkpoint is currently disabled:
    # if os.path.exists('model_dir/pg_pong_episode_19.ckpt'):
    #     agent.restore('model_dir/pg_pong_episode_19.ckpt')

    best_total_reward = -float('inf')
    for episode in range(50000):
        obs_list, action_list, reward_list = run_episode(p, agent)
        if episode % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                episode, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        agent.learn(batch_obs, batch_action, batch_reward)

        # Only every 50th episode is followed by an evaluation run.
        if (episode + 1) % 50 != 0:
            continue
        total_reward = evaluate(p, agent, render=True)
        if total_reward > best_total_reward:
            best_total_reward = total_reward
            agent.save(
                'model_dir/pg_pong_episode_{}_reward_{}.ckpt'.format(
                    episode, total_reward))
        logger.info('Test reward: {}'.format(total_reward))
def run_game(nb_episodes, agent):
    """Run ``nb_episodes`` episodes of FlappyBird with ``agent`` picking the moves.

    An episode of FlappyBird ends with the bird crashing into a pipe or
    going off screen.  After every episode the requested episode count and
    the episode score are printed; at the end the average score is printed.

    Args:
        nb_episodes: number of episodes to play.
        agent: object providing ``state_binner(state)`` and ``policy(state)``;
            ``policy`` must return an index into the environment's action set.
    """
    reward_values = {"positive": 1.0, "negative": 0.0, "tick": 0.0,
                     "loss": 0.0, "win": 0.0}
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    # The action set does not change while playing, so look it up once.
    actions = env.getActionSet()

    totalscore = 0
    count = nb_episodes
    score = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training using agent.training_policy instead
        action = agent.policy(agent.state_binner(env.game.getGameState()))

        # step the environment
        reward = env.act(actions[action])

        # TODO: for training let the agent observe the current state transition
        score += reward

        # reset the environment if the game is over
        if env.game_over():
            totalscore += score
            print(count)
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    # With zero requested episodes there is nothing to average; report 0
    # instead of dividing by zero.
    average = totalscore / count if count else 0
    print("average for this run is :%d" % average)
def main():
    """Train a DQN agent on Snake, evaluating after every 20 training episodes.

    The agent is saved whenever an evaluation beats the best reward so far.
    Evaluation episodes are not counted towards ``max_episode``.
    """
    # Create the environment.
    snake = Snake(width=96, height=96, init_length=6)
    p = PLE(snake, fps=30, display_screen=False, force_fps=True)

    # Build the agent with the PARL stack.
    action_set = p.getActionSet()
    print(action_set)
    act_dim = len(action_set)

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay pool for DQN
    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    # e_greed: probability of picking a random action (exploration).
    agent = Agent(alg, act_dim=act_dim, e_greed_decrement=0, e_greed=0.1)

    # Load a previously saved model when one is present.
    checkpoint = './dqn_snake_60.ckpt'
    if os.path.exists(checkpoint):
        agent.restore(checkpoint)

    # Pre-fill the replay pool so early training has enough sample variety.
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 20000

    # Start training.
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:
        # train part
        for step in range(20):
            total_reward = run_episode(p, agent, rpm)
            if step % 5 == 0:
                logger.info('episode:{} train_reward:{}'.format(
                    episode, total_reward))
            episode += 1

        # test part; render=True shows the game while evaluating
        eval_reward = evaluate(p, agent, render=True)
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/dqn_snake_{}.ckpt'.format(episode))
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))
def __init__(self, config=None):
    """Set up the FlappyBird environment and its PLE wrapper.

    Args:
        config: optional mapping; when truthy its ``'display_screen'``
            entry decides whether the game window is shown.
    """
    EzPickle.__init__(self)

    # Aid options
    self.pre_play = True
    self.force_calm = False
    self.positive_counts = 0

    self.display_screen = config['display_screen'] if config else False

    self.observation_space = spaces.Box(0, 1, shape=(8, ), dtype=np.float32)
    self.action_space = weightedDiscrete(2)  # spaces.Discrete(2)

    # Bounds stored on the instance for velocity, distance and y position.
    self.vel_min, self.vel_max = -15, 15
    self.dist_min, self.dist_max = 0, 500
    self.y_min, self.y_max = 0, 500

    self.game = FlappyBird(graphics="fancy")
    self.p = PLE(
        self.game,
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=True,
        display_screen=self.display_screen,
        rng=0,
    )
    # Make the wrapper and the player use the game's random generator.
    shared_rng = self.game.rng
    self.p.rng = shared_rng
    self.game.player.rng = shared_rng
    self.p.init()

    self.current_t = 0
    self.max_t = 1000
def play(self):
    """Play 50 FlappyBird episodes on screen with ``self.policy``.

    Prints a banner built from ``self.name``, ``self.num_of_episodes`` and
    ``self.num_of_frames``, then the score of each finished episode.
    """
    print('Playing {} agent after training for {} episodes or {} frames'.
          format(self.name, self.num_of_episodes, self.num_of_frames))
    reward_values = {
        'positive': 1.0,
        'negative': 0.0,
        'tick': 0.0,
        'loss': 0.0,
        'win': 0.0
    }

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=True,
              force_fps=False,
              rng=None,
              reward_values=reward_values)
    env.init()

    # The action set is fixed for the lifetime of the environment, so it is
    # fetched once rather than on every frame.
    actions = env.getActionSet()

    score = 0
    nb_episodes = 50
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        action = self.policy(state)

        # step the environment
        score += env.act(actions[action])

        # reset the environment if the game is over
        if env.game_over():
            print('Score: {}'.format(score))
            env.reset_game()
            nb_episodes -= 1
            score = 0
def __init__(self, game_name, display_screen=True):
    """Wrap the PLE game called ``game_name`` behind gym-style spaces.

    Args:
        game_name: class name of the game; the lower-cased name is also the
            module looked up under ``ple.games``.
        display_screen: forwarded to ``PLE``.
    """
    # set headless mode
    os.environ['SDL_VIDEODRIVER'] = 'dummy'

    # open up a game state to communicate with emulator
    import importlib
    module_path = ('ple.games.%s' % game_name).lower()
    game_cls = getattr(importlib.import_module(module_path), game_name)

    self.game_state = PLE(game_cls(), fps=30, frame_skip=2,
                          display_screen=display_screen)
    self.game_state.init()

    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))

    dims = self.game_state.getScreenDims()
    self.screen_width, self.screen_height = dims
    self.observation_space = spaces.Box(
        low=0,
        high=255,
        shape=(self.screen_width, self.screen_height, 3))

    self.viewer = None
    self.count = 0
def __init__(self, game, display_screen=False):
    """Create the PLE environment for the game identified by ``game``.

    Args:
        game: one of ``'catcher'``, ``'monsterkong'``, ``'flappybird'``,
            ``'pixelcopter'``, ``'pong'``, ``'puckworld'``, ``'raycastmaze'``,
            ``'snake'`` or ``'waterworld'``.
        display_screen: forwarded to ``PLE``.
    """
    import importlib
    from ple import PLE

    # Game key -> class name; the key is also the module name under ple.games.
    game_classes = {
        'catcher': 'Catcher',
        'monsterkong': 'MonsterKong',
        'flappybird': 'FlappyBird',
        'pixelcopter': 'Pixelcopter',
        'pong': 'Pong',
        'puckworld': 'PuckWorld',
        'raycastmaze': 'RaycastMaze',
        'snake': 'Snake',
        'waterworld': 'WaterWorld',
    }
    assert game in list(game_classes)

    # Only the module of the requested game is imported.
    game_module = importlib.import_module('ple.games.' + game)
    env = getattr(game_module, game_classes[game])()

    self.p = PLE(env, fps=30, display_screen=display_screen)
    self.action_set = self.p.getActionSet()
    self.action_size = len(self.action_set)
    self.screen_dims = self.p.getScreenDims()
    self.p.init()
def play(self, fast=True):
    """Use athlete to play.

    Args:
        fast <bool>: set to True if the screen should be hidden
            and speed enhanced
    """
    game = FlappyBird()
    env = PLE(game, fps=30, frame_skip=1, num_steps=1,
              force_fps=fast, display_screen=not fast)
    env.init()

    pipes = []  # pipes passed in each game played so far
    finished = 0
    while finished < 100:
        env.reset_game()
        pipes.append(0)
        while not env.game_over():
            choice = self.act(game.getGameState())
            reward = env.act(ACTIONS[choice])
            if reward == 1.:
                pipes[-1] += 1

        if fast:
            finished += 1
            continue

        # Slow mode reports after every game and never advances the
        # counter, so it keeps playing until interrupted.
        print('\n- Score: {} pipes'.format(pipes[-1]))
        print('- Played {} games'.format(len(pipes)))
        print('- Average score: {} pipes'.format(
            np.round(np.mean(pipes), decimals=1)))

    below_15 = sum(1 for passed in pipes if passed < 15)
    average = np.round(np.mean(pipes), decimals=1)
    print('\n- Max score: {} pipes'.format(np.max(pipes)))
    print('- Games < 15 pipes: {}'.format(below_15))
    print('- Played {} games'.format(100))
    print('- Average score: {} pipes'.format(average))
def __init__(self, game_name='FlappyBird', display_screen=True):
    """Wrap a PLE game and expose its non-visual state as a numpy array.

    Args:
        game_name: class name of the game; the lower-cased name is also the
            module looked up under ``ple.games``.
        display_screen: forwarded to ``PLE``.
    """
    # set headless mode
    os.environ['SDL_VIDEODRIVER'] = 'dummy'

    # open up a game state to communicate with emulator
    import importlib
    game_module_name = ('ple.games.%s' % game_name).lower()
    game_module = importlib.import_module(game_module_name)
    game = getattr(game_module, game_name)()

    def process_state(state):
        """Convert the game-state dict into a ``(1, n)`` numpy array."""
        # On Python 3 ``state.values()`` is a view object; wrapping it
        # directly would produce an object array holding the view instead
        # of the numbers, so materialise it as a list first.
        return np.array([list(state.values())])

    self.game_state = PLE(game, fps=30, display_screen=display_screen,
                          state_preprocessor=process_state)
    self.game_state.init()

    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))
    # NOTE(review): this unpacks getScreenDims() as (height, width), while
    # other wrappers built the same way unpack it as (width, height);
    # confirm which order is intended before relying on these attributes.
    self.screen_height, self.screen_width = self.game_state.getScreenDims()
    # self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
    # self.observation_space = spaces.Box(self.low, self.high)
    self.viewer = None
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    """Load a saved agent and return its average reward over ``test_rounds``.

    Args:
        agent_file_path: passed to ``load_agent`` together with the file name.
        agent_file_name: name of the saved agent; also printed as a banner.
        test_rounds: number of episodes to average over.

    Returns:
        The summed episode rewards divided by ``test_rounds``.
    """
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()

    # Single pre-formatted argument: prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("Testing model: {}".format(agent_file_name))

    total_reward = 0.0
    for _ in range(test_rounds):
        # NOTE(review): the environment is not reset in this loop;
        # presumably start_episode() does it - confirm, otherwise every
        # round after the first ends immediately.
        my_agent.start_episode()
        episode_reward = 0.0
        while not env.game_over():
            state = env.getGameState()
            reward, _action = my_agent.act(state, epsilon=0.05)
            episode_reward += reward
        print("Agent score {:0.1f} reward for episode.".format(episode_reward))
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward / test_rounds
def play_flappy_bird(play_game=True, train_agent=True, agent_model_path='model.h5'):
    """Optionally train and/or play a FlappyBird agent, then save its experience.

    Args:
        play_game: when true, let the agent play after (optional) training.
        train_agent: when true, train the agent first.
        agent_model_path: file the agent experience is loaded from when it
            exists, and saved to at the end.
    """
    game = FlappyBird()
    environment = PLE(game, fps=30, display_screen=True)

    action_len = 2
    # One feature per entry of the game-state dict.
    states = list(game.getGameState().values())
    print(states)

    agent_explored_states = FlappyBirdAgent(len(states), action_len)
    if os.path.exists(agent_model_path):
        agent_explored_states.load_agent_experience(agent_model_path)

    # environment.init()
    stages = (
        (train_agent, agent_explored_states.train, "Trained"),
        (play_game, agent_explored_states.play, "Played"),
    )
    for enabled, run_stage, done_message in stages:
        if enabled:
            run_stage(environment, game)
            print(done_message)

    agent_explored_states.save_agent_experience(agent_model_path)
def __init__(self, task=None):
    """Create the ``customgame`` PLE environment with gym-style spaces.

    Args:
        task: optional task description stored as ``self._task``.  When
            omitted, a fresh empty dict is used for each instance (the
            previous ``{}`` default was one dict shared by all instances).
    """
    self._task = {} if task is None else task

    # set headless mode
    os.environ['SDL_VIDEODRIVER'] = 'dummy'

    import importlib
    game_module = importlib.import_module('ple.games.customgame')
    game = getattr(game_module, 'customgame')()

    self.game_state = PLE(game, fps=30, display_screen=False)
    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))
    self.screen_width, self.screen_height = self.game_state.getScreenDims()
    self.observation_space = spaces.Box(
        low=0, high=255,
        shape=(self.screen_width, self.screen_height, 3))
    self.num_actions = len(self._action_set)
    self.viewer = None

    # env tracking variables
    self.done_counter = 0
    self.curr_task = None
    self.t = 0
    self.reward_mult = 1.0
def main(w, seed=SEED, headless=False):
    """Let an agent play one game of flappy bird and print frames and score.

    Args:
        w: parameters handed to ``agent`` together with the normalized state.
        seed: seed given to PLE and to the game's random generator.
        headless: when true, hide the screen and run without the fps cap.
    """
    # Headless runs hide the window and drop the frame limiter; interactive
    # runs do the opposite.
    force_fps = bool(headless)
    display_screen = not force_fps

    game = PLE(FLAPPYBIRD, display_screen=display_screen,
               force_fps=force_fps, rng=seed)
    game.init()
    game.reset_game()
    FLAPPYBIRD.rng.seed(seed)

    agent_score = 0
    num_frames = 0
    while not game.game_over():
        features = normalize(game.getGameState())
        action = agent(features, w)
        if game.act(ACTION_MAP[action]) > 0:
            agent_score += 1
        num_frames += 1

    print('Frames :', num_frames)
    print('Score :', agent_score)
def __init__(self, screen=False, forcefps=True):
    """Create the FlappyBird environment and the recurrent TensorFlow graph.

    Args:
        screen: forwarded to ``PLE`` as ``display_screen``.
        forcefps: forwarded to ``PLE`` as ``force_fps``.
    """
    lstm_units = 512

    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen,
                   force_fps=forcefps)
    self.env.init()
    # Read the state straight from the game object.
    self.env.getGameState = self.game.getGameState

    self.X = tf.placeholder(tf.float32, (None, None, self.INPUT_SIZE))
    self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
    cell = tf.nn.rnn_cell.LSTMCell(lstm_units, state_is_tuple=False)
    # state_is_tuple=False: the state is fed as a single tensor whose
    # width is twice the number of units.
    self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * lstm_units))
    self.rnn, self.last_state = tf.nn.dynamic_rnn(
        inputs=self.X,
        cell=cell,
        dtype=tf.float32,
        initial_state=self.hidden_layer)

    # Split the output of the last time step into two equal halves.
    last_output = self.rnn[:, -1, :]
    self.tensor_action, self.tensor_validation = tf.split(last_output, 2, 1)

    # NOTE(review): `action_layer` is not defined inside this method; it
    # must be supplied by an enclosing scope. Both halves are multiplied
    # by the same matrix - confirm that is intended.
    self.feed_action = tf.matmul(self.tensor_action, action_layer)
    self.feed_validation = tf.matmul(self.tensor_validation, action_layer)

    mean_action = tf.reduce_mean(self.feed_action, axis=1, keep_dims=True)
    centered_action = tf.subtract(self.feed_action, mean_action)
    self.logits = self.feed_validation + centered_action

    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    adam = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE)
    self.optimizer = adam.minimize(self.cost)

    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []
def test_model_G(nb_games, model):
    """Play ``nb_games`` FlappyBird games with ``model`` and report the scores.

    Args:
        nb_games: number of games to play.
        model: object whose ``predict`` is called on a batch holding one
            stacked observation; the arg-max of the output picks the action.

    Returns:
        Tuple ``(average score, maximum score)`` over the games played.
    """
    # use "fancy" for full background, random bird color and random pipe
    # color, use "fixed" (default) for black background and constant bird
    # and pipe colors.
    game = FlappyBird(graphics="fixed")
    p = PLE(game,
            fps=30,
            frame_skip=1,
            num_steps=1,
            force_fps=True,
            display_screen=False)
    p.init()

    cumulated = np.zeros((nb_games))
    list_actions = [0, 119]

    for i in range(nb_games):
        p.reset_game()
        while not p.game_over():
            screen_x = process_screen(p.getScreenRGB())
            # NOTE(review): the current frame is repeated four times along
            # the last axis rather than stacking the last four frames;
            # confirm this matches how the model was trained.
            x = np.stack([screen_x] * 4, axis=-1)
            prediction = model.predict(np.expand_dims(x, axis=0))
            action = list_actions[np.argmax(prediction)]
            cumulated[i] += p.act(action)

    avg_score = np.mean(cumulated)
    print('Average : ' + str(avg_score))
    mx_score = np.max(cumulated)
    print('Max : ' + str(mx_score))
    return avg_score, mx_score
def __init__(self, map_config):
    """Create a MonsterKong environment configured by ``map_config``.

    Args:
        map_config: mapping passed to ``MonsterKong``.  The optional keys
            ``fps``, ``frame_skip``, ``force_fps``, ``display_screen``,
            ``episode_length`` and ``episode_end_sleep`` override the
            defaults used here.
    """
    self.map_config = map_config
    self.game = MonsterKong(self.map_config)

    # (attribute, map_config key, value used when the key is absent)
    tunables = (
        ('fps', 'fps', 30),
        ('frame_skip', 'frame_skip', 1),
        ('force_fps', 'force_fps', True),
        ('display_screen', 'display_screen', True),
        ('nb_frames', 'episode_length', 500),
        ('episode_end_sleep', 'episode_end_sleep', 0.2),
    )
    for attribute, key, fallback in tunables:
        value = map_config[key] if key in map_config else fallback
        setattr(self, attribute, value)

    self.num_steps = 1
    self.reward = 0.0
    self.current_step = 0

    # presumably _seed() is what provides self.rng used below - confirm
    self._seed()

    self.p = PLE(
        self.game,
        fps=self.fps,
        frame_skip=self.frame_skip,
        num_steps=self.num_steps,
        force_fps=self.force_fps,
        display_screen=self.display_screen,
        rng=self.rng,
    )
    self.p.init()

    # The first entry of the PLE action set is left out.
    self._action_set = self.p.getActionSet()[1:]
    self.action_space = spaces.Discrete(len(self._action_set))

    screen_width, screen_height = self.p.getScreenDims()
    self.observation_space = spaces.Box(
        low=0, high=255, shape=(screen_height, screen_width, 3))
def run_game(nb_episodes, agent):
    """Run ``nb_episodes`` FlappyBird episodes with ``agent`` picking the moves.

    An episode of FlappyBird ends with the bird crashing into a pipe or
    going off screen.  After each episode the score and the remaining
    episode count are printed and ``agent.highestScore`` /
    ``agent.totalScore`` are updated.
    """
    env = PLE(
        FlappyBird(),
        fps=30,
        display_screen=True,
        force_fps=False,
        rng=None,
        reward_values={
            "positive": 1.0,
            "negative": 0.0,
            "tick": 0.0,
            "loss": 0.0,
            "win": 0.0,
        },
    )
    env.init()

    score = 0
    while nb_episodes > 0:
        # pick an action and step the environment
        chosen = agent.policy(env.game.getGameState())
        score += env.act(env.getActionSet()[chosen])

        if not env.game_over():
            continue

        # the game is over: report, reset and update the agent's tallies
        print(score, nb_episodes)
        env.reset_game()
        nb_episodes -= 1
        if score > agent.highestScore:
            agent.highestScore = score
        agent.totalScore += score
        score = 0
def __init__(self, rng, game=None, frame_skip=4, ple_options=None):
    """Wrap a PLE game and initialise the bookkeeping state.

    Args:
        rng: random state stored as ``self._random_state``.
        game: PLE game instance; required.
        frame_skip: frames per action; values below 1 are replaced by 1.
        ple_options: keyword arguments forwarded to ``PLE``.  Defaults to
            ``{"display_screen": True, "force_fps": True, "fps": 30}``.

    Raises:
        ValueError: if ``game`` is None.
    """
    # Validate before touching any state.
    if game is None:
        raise ValueError("Game must be provided")

    # Build the default per call instead of sharing one mutable dict
    # object between every instance.
    if ple_options is None:
        ple_options = {"display_screen": True, "force_fps": True, "fps": 30}

    self._mode = -1
    self._mode_score = 0.0
    self._mode_episode_count = 0

    self._frame_skip = frame_skip if frame_skip >= 1 else 1
    self._random_state = rng
    self._hist_size = 1

    self._ple = PLE(game, **ple_options)
    self._ple.init()

    self._actions = self._ple.getActionSet()
    self._state_size = self._ple.getGameStateDims()[0]
    self._state_saved = np.zeros((self._state_size), dtype=np.float32)
    self.previous_score = 0.
    self.episode_scores = []
def __init__(self, duration, size=(48, 48)):
    """
    Create a new PuckWorld Environment

    :param duration: forwarded to the underlying ``ExtPuckWorld`` game
    :param size: Game window dimensions
    """
    super(PuckWorld, self).__init__()
    width, height = size
    self.width = width
    self.height = height

    game = ExtPuckWorld(width=width, height=height,
                        duration=duration, r_m=self._r_m)
    self.game = game
    # Give the game its display surface, clock and random generator.
    game.screen = pygame.display.set_mode(game.getScreenDims(), 0, 32)
    game.clock = pygame.time.Clock()
    game.rng = np.random.RandomState(24)

    self.ple = PLE(game)
    self.ple.init()

    # Size of epsilon-region around goal state
    self.epsilon = 2 * game.good_creep.radius
    self.terminal = False
    self.reset()
def __init__(self, screen=False, forcefps=True):
    """Create the FlappyBird environment and a two-layer TensorFlow network.

    Args:
        screen: forwarded to ``PLE`` as ``display_screen``.
        forcefps: forwarded to ``PLE`` as ``force_fps``.
    """
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen,
                   force_fps=forcefps)
    self.env.init()
    # Read the state straight from the game object.
    self.env.getGameState = self.game.getGameState

    in_size, hidden_size, out_size = (
        self.INPUT_SIZE, self.LAYER_SIZE, self.OUTPUT_SIZE)

    self.X = tf.placeholder(tf.float32, (None, in_size))
    self.Y = tf.placeholder(tf.float32, (None, out_size))

    # Weight matrices, randomly initialised; no bias terms are used.
    w_hidden = tf.Variable(tf.random_normal([in_size, hidden_size]))
    w_out = tf.Variable(tf.random_normal([hidden_size, out_size]))

    hidden = tf.nn.relu(tf.matmul(self.X, w_hidden))
    self.logits = tf.matmul(hidden, w_out)

    # Sum of squared differences between targets and outputs.
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    adam = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE)
    self.optimizer = adam.minimize(self.cost)

    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []
def __init__(self, game_name='FlappyBird', display_screen=True):
    """Wrap a PLE game behind gym-style spaces, keeping the game on ``self.game``.

    Args:
        game_name: class name of the game; the lower-cased name is also the
            module looked up under ``ple.games``.
        display_screen: forwarded to ``PLE``.
    """
    # open up a game state to communicate with emulator
    import importlib
    module = importlib.import_module(('ple.games.%s' % game_name).lower())
    self.game = getattr(module, game_name)()

    self.game_state = PLE(self.game, fps=30, display_screen=display_screen)
    self.game_state.init()

    # increase gap for checking
    # self.game.pipe_gap = 115
    # self.game.player.height = 14

    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))

    width, height = self.game_state.getScreenDims()
    self.screen_width = width
    self.screen_height = height
    # print(self.screen_width, self.screen_height)
    self.observation_space = spaces.Box(low=0, high=255,
                                        shape=(width, height, 3))
    self.viewer = None
def __init__(self, game_name='FlappyBird', display_screen=True, observe_state=False):
    """Wrap a PLE game; observations are either the game state or the screen.

    Args:
        game_name: class name of the game; the lower-cased name is also the
            module looked up under ``ple.games``.
        display_screen: forwarded to ``PLE``.
        observe_state: when true the observation space is shaped like the
            preprocessed game state, otherwise like the (scaled) screen.
    """
    # open up a game state to communicate with emulator
    import importlib
    module = importlib.import_module(('ple.games.%s' % game_name).lower())
    game = getattr(module, game_name)()

    self.game_state = PLE(game, fps=30, display_screen=display_screen,
                          state_preprocessor=state_preprocessor)
    self.game_state.init()

    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))

    width, height = self.game_state.getScreenDims()
    # Screens whose width and height sum to more than 500 are recorded at
    # a quarter of their size.
    img_scale = 0.25 if height + width > 500 else 1.0
    self.screen_width = int(width * img_scale)
    self.screen_height = int(height * img_scale)

    self.observe_state = observe_state
    if not self.observe_state:
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_height, self.screen_width, 3))
    else:
        # the bounds are typically not infinity
        self.observation_space = spaces.Box(
            low=-float('inf'), high=float('inf'),
            shape=self.game_state.state_dim)

    self.viewer = None
def test():
    """Play 10 FlappyBird games with ``FlappyPolicy`` and return the mean score."""
    nb_games = 10

    game2 = FlappyBird()
    p2 = PLE(
        game2,
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=True,
        display_screen=False,
    )
    p2.init()

    reward = 0.0
    cumulated = np.zeros((nb_games))
    for game_index in range(nb_games):
        p2.reset_game()
        while not p2.game_over():
            # The policy sees both the state dict and the RGB screen.
            state = game2.getGameState()
            screen = p2.getScreenRGB()
            reward = p2.act(FlappyPolicy(state, screen))
            cumulated[game_index] = cumulated[game_index] + reward

    return np.mean(cumulated)
def __init__(self, game_name='FlappyBird', display_screen=True, ple_game=True):
    """Wrap a PLE (or external) game and preprocess its state.

    Args:
        game_name: class name of the game.
        display_screen: forwarded to ``PLE``.
        ple_game: when true the module is ``ple.games.<game_name lower>``,
            otherwise the lower-cased ``game_name`` itself is the module.
    """
    # set headless mode
    os.environ['SDL_VIDEODRIVER'] = 'dummy'

    # open up a game state to communicate with emulator
    import importlib
    lowered = game_name.lower()
    module_path = ('ple.games.%s' % game_name).lower() if ple_game else lowered
    game = getattr(importlib.import_module(module_path), game_name)()

    # old one:
    # self.game_state = PLE(game, fps=30, display_screen=display_screen)
    self.game_state = PLE(game, fps=30, display_screen=display_screen,
                          state_preprocessor=self.process_state)
    self.game_state.init()

    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))

    dims = self.game_state.getScreenDims()
    self.screen_height, self.screen_width = dims
    self.observation_space = spaces.Box(
        low=0,
        high=255,
        shape=(self.screen_width, self.screen_height, 3),
        dtype=np.uint8)
    self.viewer = None
def __init__(self, game_name='FlappyBird', display_screen=True, ple_game=True, root_game_name=None, reward_type='sparse', obs_type=None, **kwargs):
    """Wrap a PLE (or external) game with a fixed 64x64x3 observation space.

    Args:
        game_name: class name of the game.
        display_screen: forwarded to ``PLE``.
        ple_game: when true the module is ``ple.games.<game_name lower>``,
            otherwise ``<root_game_name lower>.envs``.
        root_game_name: package name used when ``ple_game`` is false.
        reward_type: stored on the game as ``reward_type``.
        obs_type: stored as ``self.obs_type``; must not be None.
        **kwargs: forwarded to the game constructor.
    """
    # set headless mode
    for driver_variable in ('SDL_VIDEODRIVER', 'SDL_AUDIODRIVER'):
        os.environ[driver_variable] = 'dummy'

    # open up a game state to communicate with emulator
    import importlib
    if not ple_game:
        game_module_name = F"{root_game_name.lower()}.envs"
    else:
        game_module_name = ('ple.games.%s' % game_name).lower()
    game_cls = getattr(importlib.import_module(game_module_name), game_name)
    game = game_cls(**kwargs)

    self.ple_wrapper = PLE(game, fps=30, display_screen=display_screen)
    self.ple_wrapper.init()
    game.reward_type = reward_type

    self._action_set = self.ple_wrapper.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))
    dims = self.ple_wrapper.getScreenDims()
    self.screen_height, self.screen_width = dims

    # Assume observation space to be (64, 64, 3) due to procgen
    self.observation_space = spaces.Box(
        low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
    self.viewer = None

    assert obs_type is not None, obs_type
    self.obs_type = obs_type
    self.reward_range = game.rewards['win']
def watch():
    """Load ``checkpoint.pt`` and watch the agent play FlappyBird on screen.

    Each of the 10000 iterations plays one episode in the default mode of
    ``run_episode`` followed by one episode in ``'test'`` mode, and logs the
    summed reward of the test episode.  No learning takes place.
    """
    import torch

    game = FlappyBird()
    env = PLE(game, fps=30, frame_skip=4, display_screen=True,
              force_fps=False, reward_values={"tick": 0.00},
              state_preprocessor=None)
    env.init()

    use_cuda = torch.cuda.is_available()
    model = Model(obs_dim=OBS_DIM, act_dim=ACT_DIM)
    if use_cuda:
        model = model.cuda()
    # Remap the stored tensors to the device in use, so a checkpoint saved
    # on a GPU machine can still be loaded on a CPU-only one.
    state_dict = torch.load('checkpoint.pt',
                            map_location='cuda' if use_cuda else 'cpu')
    model.load_state_dict(state_dict)

    from parl.algorithms.torch import PolicyGradient
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = Agent(alg)

    for i in range(10000):  # 10000 iterations
        # Learning is disabled here, so the trajectory of this episode is
        # not turned into training batches:
        # agent.learn(batch_obs, batch_action, batch_reward)
        run_episode(env, agent)

        _, _, reward_list = run_episode(env, agent, train_or_test='test')
        total_reward = np.sum(reward_list)
        logger.info('Test reward: {}'.format(total_reward))
def random_play(episodes = 100):
    """Play ``episodes`` FlappyBird games with ``Agent.choose_action``.

    Returns:
        List with the summed reward of every episode, in play order.
    """
    # Initialize game and agent
    game = FlappyBird()
    p = PLE(game, display_screen=True, state_preprocessor=process_state)
    p.init()
    agent = Agent(p)

    def play_one_episode():
        """Reset the game, play until it is over, return the summed reward."""
        p.reset_game()
        collected = 0
        while not p.game_over():
            collected += p.act(agent.choose_action())
        return collected

    # Run given number of episodes and return their rewards
    return [play_one_episode() for _ in range(episodes)]
def main():
    """Train a DDPG agent on Pong, evaluating after every 50 training episodes.

    The agent is saved whenever an evaluation beats the best reward so far.
    Evaluation episodes are not counted towards ``max_episode``.
    """
    # Create the environment.
    pong = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(pong, fps=30, display_screen=True, force_fps=False)
    p.reset_game()

    # Build the agent with the PARL stack.
    action_set = p.getActionSet()
    print(action_set)
    act_dim = len(action_set)
    print("act_dim:", act_dim)
    obs_dim = 200 * 200

    # PARL nests three layers: model, algorithm and agent.
    algorithm = DDPG(
        PongModel(act_dim),
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = PongAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    max_episode = 20000

    # Start training.
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:
        # train part
        for _ in range(50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part; render=True shows the game while evaluating
        eval_reward = evaluate(p, agent, render=True)
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/ddpg_pong_{}.ckpt'.format(episode))
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
def init_flappy_bird(mode, graphics="fixed"):
    """Create, initialise and reset a FlappyBird game and its PLE environment.

    Args:
        mode: ``Mode.TRAIN`` removes the fps cap; ``Mode.PLAY`` shows the
            screen.  Any other value gives a capped, hidden game.
        graphics: use "Fancy" for full background, random bird color and
            random pipe color; use "Fixed" (default) for black background
            and constant bird and pipe colors.

    Returns:
        Tuple ``(game, env)``.
    """
    game = FlappyBird(graphics=graphics)

    # Note: if you want to see your agent act in real time, force_fps must be
    # False.  Don't use that setting for learning, just for display purposes.
    ple_settings = dict(
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=(mode == Mode.TRAIN),
        display_screen=(mode == Mode.PLAY),
    )
    env = PLE(game, **ple_settings)

    # Init the environment (settings, display...) and reset the game
    env.init()
    env.reset_game()
    return game, env
states = np.array(states) targets = np.array(targets) self.model.fit(states, targets, nb_epoch=1, verbose=0) # 학습하기 if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay def load(self, name): # 학습된 네트워크 로드 self.model.load_weights(name) def save(self, name): # 네트워크 저장 self.model.save_weights(name) if __name__ == "__main__": game = Catcher(width=320, height=320) env = PLE(game, display_screen=True, state_preprocessor=process_state) agent = DQNAgent(env) agent.load("./save/catcher.h5") #초기화 #pylab.title("reward") #pylab.xlabel("episodes") #pylab.ylabel("rewards") env.init() scores, time = [], [] for e in range(EPISODES): env.reset_game() state = env.getGameState() state = np.array([list(state[0])]) score = 0