def _test_ple():
    from time import time, sleep

    from ple.games.pong import Pong
    from ple.games.flappybird import FlappyBird
    from ple import PLE

    # os.environ['SDL_VIDEODRIVER'] = 'dummy'
    game = Pong()
    game = FlappyBird()  # overrides the Pong instance above
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()

    ALLOWED_ACTIONS = ple_game.getActionSet()
    print(ALLOWED_ACTIONS)

    action = 0
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            if t % 15 == 5:
                action = 0
            else:
                action = 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            # print(reward)
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))
def test_movement_up():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["up"])
    newState = p.getGameState()
    assert oldState["player_velocity"] > newState["player_velocity"]
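# Pong's action map also exposes a "down" key; a symmetric check could look like
# the sketch below. This test is illustrative only (not part of the original file),
# and the assertion direction follows from the "up" test above: screen y grows
# downward, so moving down should increase the paddle's velocity.
def test_movement_down():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["down"])
    newState = p.getGameState()
    assert oldState["player_velocity"] < newState["player_velocity"]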
def main(train=False):
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)
    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)
    p.init()

    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')
            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break
            p.reset_game()
        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])
        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')
        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames:', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU total score:', cpu_score)
def __init__(self):
    self.resize_factor = 0.5
    self.width = 64
    self.height = 48
    self.ple = PLE(game=Pong(), fps=30, frame_skip=8)
    self.action_set = self.ple.getActionSet()
    self.action_space = spaces.Discrete(len(self.action_set))
    self.observation_space = spaces.Box(
        low=0.0,
        high=255.0,
        shape=(
            int(self.width * self.resize_factor),
            int(self.height * self.resize_factor),
            1,
        ),
        dtype=np.uint32,
    )
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=False, force_fps=False)
    p.reset_game()

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs = get_obs(p)
    obs_dim = 200 * 200

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN replay buffer
    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(alg,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed_decrement=1e-6,
                  e_greed=0.1)  # e_greed: probability of picking a random action, for exploration

    # # Load a saved model
    # if os.path.exists('./water_world_dqn.ckpt'):
    #     agent.restore('./water_world_dqn.ckpt')

    # Warm up the replay buffer so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000

    # Start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=False)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/dqn_pong_{}_reward_{}.ckpt'.format(episode, best_reward))
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=False, force_fps=True)
    p.reset_game()
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs_dim = 200 * 200
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # # Load a saved model
    # if os.path.exists('model_dir/pg_pong_episode_19.ckpt'):
    #     agent.restore('model_dir/pg_pong_episode_19.ckpt')

    best_total_reward = -float('inf')
    for i in range(500000):
        obs_list, action_list, reward_list = run_episode(p, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 50 == 0:
            total_reward = evaluate(p, agent, render=False)
            if total_reward > best_total_reward:
                best_total_reward = total_reward
                agent.save(
                    'model_dir/pg_pong_episode_{}_reward_{}.ckpt'.format(
                        i, total_reward))
            logger.info('Test reward: {}'.format(total_reward))
def __init__(self, game, display_screen=False):
    from ple import PLE
    assert game in [
        'catcher', 'monsterkong', 'flappybird', 'pixelcopter', 'pong',
        'puckworld', 'raycastmaze', 'snake', 'waterworld'
    ]
    if game == 'catcher':
        from ple.games.catcher import Catcher
        env = Catcher()
    elif game == 'monsterkong':
        from ple.games.monsterkong import MonsterKong
        env = MonsterKong()
    elif game == 'flappybird':
        from ple.games.flappybird import FlappyBird
        env = FlappyBird()
    elif game == 'pixelcopter':
        from ple.games.pixelcopter import Pixelcopter
        env = Pixelcopter()
    elif game == 'pong':
        from ple.games.pong import Pong
        env = Pong()
    elif game == 'puckworld':
        from ple.games.puckworld import PuckWorld
        env = PuckWorld()
    elif game == 'raycastmaze':
        from ple.games.raycastmaze import RaycastMaze
        env = RaycastMaze()
    elif game == 'snake':
        from ple.games.snake import Snake
        env = Snake()
    elif game == 'waterworld':
        from ple.games.waterworld import WaterWorld
        env = WaterWorld()

    self.p = PLE(env, fps=30, display_screen=display_screen)
    self.action_set = self.p.getActionSet()
    self.action_size = len(self.action_set)
    self.screen_dims = self.p.getScreenDims()
    self.p.init()
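# A minimal usage sketch of the wrapper above. Only __init__ is shown in the
# snippet, so the class name PLEWrapper and the random-action loop below are
# assumptions for illustration, built only on the attributes set in __init__.
import numpy as np

env = PLEWrapper('pong', display_screen=False)  # hypothetical class name
env.p.reset_game()
while not env.p.game_over():
    idx = np.random.randint(env.action_size)   # random index into the legal action set
    reward = env.p.act(env.action_set[idx])    # PLE returns the reward for that frame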
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=False)
    p.reset_game()

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    print("act_dim:", act_dim)
    obs_dim = 200 * 200

    # Nest the three PARL components: PongModel, DDPG, PongAgent
    model = PongModel(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU,
                     actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = PongAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    max_episode = 20000

    # Start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=True)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/ddpg_pong_{}.ckpt'.format(episode))
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
# coding:utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np


def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(np.float64).ravel()


if __name__ == '__main__':
    game = Pong(width=128, height=96, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    p.getScreenGrayscale()
    game_state = p.getGameState()
    print(game_state)
def init_main(save_path, model, train=True, display=False):
    """The application's entry point.

    If someone executes this module (instead of importing it, for
    example), this function is called.
    """
    push_to_memory, select_action, perform_action, optimize, save_model = model

    fps = 30  # fps we want to run at
    frame_skip = 2
    num_steps = 1
    force_fps = False  # slower speed

    game = Pong(width=256, height=256)
    p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display)
    p.init()

    def p_action(action):
        # reward, action
        return p.act(action)

    def main(steps):
        x_t = extract_image(p.getScreenRGB(), (80, 80))
        stack_x = np.stack((x_t, x_t, x_t, x_t), axis=0)
        try:
            while p.game_over() == False and steps > 0:
                steps -= 1
                x_t = extract_image(p.getScreenRGB(), (80, 80))
                x_t = np.reshape(x_t, (1, 80, 80))
                st = np.append(stack_x[1:4, :, :], x_t, axis=0)

                if train:
                    reward, action, _, _, _ = train_and_play(
                        p_action, st, select_action, perform_action,
                        possible_actions, optimize, None, {})
                    push_to_memory(stack_x, action, st, reward)
                else:
                    play(p_action, st, select_action, perform_action,
                         possible_actions, None, {})

                stack_x = st
        except KeyboardInterrupt as e:
            print("KeyboardInterrupt >>", e)
            print("Saving model")
            if train:
                save_model(save_path)
                print("Model saved")
            sys.exit()

        score = p.score()
        p.reset_game()
        if train:
            save_model(save_path)
        return score

    return main
def pickAction(self, reward, obs):
    return self.actions[np.random.randint(0, len(self.actions))]


fps = 30  # fps we want to run at
frame_skip = 2
num_steps = 1
force_fps = False  # slower speed
display_screen = True

reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance.
env = PLE(Pong(), fps=fps, frame_skip=frame_skip, num_steps=num_steps,
          force_fps=force_fps, display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(env.getActionSet())

# init agent and game.
env.init()

# lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
    reward = env.act(env.NOOP)
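# The pickAction method above presumably belongs to the NaiveAgent constructed in
# the script. A minimal version of that class, mirroring the random agent from
# PLE's own examples, might look like the sketch below (not the original file's
# definition).
import numpy as np


class NaiveAgent:
    """Picks a random legal action every frame, ignoring reward and observation."""

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        return self.actions[np.random.randint(0, len(self.actions))]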
def test_pong(self):
    from ple.games.pong import Pong
    game = Pong()
    self.run_a_game(game)
def a3c_main(save_path, shared_model, model, select_action, perform_action,
             save_model, optimizer=None, train=True, display=False,
             gamma=.99, tau=1.):
    fps = 30  # fps we want to run at
    frame_skip = 2
    num_steps = 1
    force_fps = False  # slower speed

    game = Pong(width=256, height=256)
    p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display)
    p.init()

    def p_action(action):
        # reward, action
        return p.act(action)

    def main(lstm_shape, steps):
        reward_alive = 0
        values = []
        log_probs = []
        rewards = []
        entropies = []

        x_t = extract_image(p.getScreenRGB(), (80, 80))
        stack_x = np.stack((x_t, x_t, x_t, x_t), axis=0)

        model.load_state_dict(shared_model.state_dict())
        cx = Variable(torch.zeros(1, lstm_shape[-1]))
        hx = Variable(torch.zeros(1, lstm_shape[-1]))

        try:
            while p.game_over() == False and steps > 0:
                steps -= 1
                x_t = extract_image(p.getScreenRGB(), (80, 80))
                x_t = np.reshape(x_t, (1, 80, 80))
                st = np.append(stack_x[1:4, :, :], x_t, axis=0)

                if train:
                    reward, action, hx, cx, info_dict = train_and_play(
                        p_action, st, select_action, perform_action,
                        possible_actions, opt_nothing, model,
                        {"isTrain": True, "hx": hx, "cx": cx})
                    reward_alive += 0.1
                    reward += reward_alive
                    rewards.append(reward)
                    entropies.append(info_dict["entropies"])
                    values.append(info_dict["values"])
                    log_probs.append(info_dict["log_probs"])
                else:
                    _, _, hx, cx, _ = play(
                        p_action, st, select_action, perform_action,
                        possible_actions, model,
                        {"hx": hx, "cx": cx, "isTrain": False})

                stack_x = st

            if train:
                state = torch.from_numpy(stack_x)
                R = torch.zeros(1, 1)
                if steps > 0:
                    # bootstrap the return from the critic if the rollout was cut short
                    value, _, _ = model(
                        (Variable(state.unsqueeze(0).float()), (hx, cx)))
                    R = value.data
                values.append(Variable(R))

                policy_loss = 0
                value_loss = 0
                R = Variable(R)
                gae = torch.zeros(1, 1)
                for i in reversed(range(len(rewards))):
                    R = gamma * R + rewards[i]
                    advantage = R - values[i]
                    value_loss = value_loss + 0.5 * advantage.pow(2)

                    # Generalized Advantage Estimation
                    delta_t = rewards[i] + gamma * \
                        values[i + 1].data - values[i].data
                    gae = gae * gamma * tau + delta_t

                    policy_loss = policy_loss - \
                        log_probs[i] * Variable(gae) - 0.01 * entropies[i]

                optimizer.zero_grad()
                (policy_loss + 0.5 * value_loss).backward()
                torch.nn.utils.clip_grad_norm(model.parameters(), 40)
                ensure_shared_grads(model, shared_model)
                optimizer.step()
        except KeyboardInterrupt as e:
            print("KeyboardInterrupt >>", e)
            print("Saving model")
            if train:
                save_model(shared_model, save_path)
                print("Model saved")
            sys.exit()

        score = p.score()
        p.reset_game()
        if train:
            save_model(shared_model, save_path)
        return score

    return main
def test_invalid_max_score():
    with pytest.raises(Exception):
        game = Pong(MAX_SCORE=-1)


def test_invalid_game_size():
    with pytest.raises(Exception):
        game = Pong(width=-200, height=-200)


def test_negative_ball_speed():
    with pytest.raises(Exception):
        game = Pong(ball_speed_ratio=-1)


def test_negative_player_speed():
    with pytest.raises(Exception):
        game = Pong(players_speed_ratio=-1)
while True:
    action_index = agent.predict(obs)  # pick the greedy (best) action
    action = ple_env.getActionSet()[action_index]
    reward = ple_env.act(action)
    obs = list(ple_env.getGameState().values())
    episode_reward += reward
    # if render:
    #     ple_env.getScreenRGB()
    if ple_env.game_over():
        break
eval_reward.append(episode_reward)
return np.mean(eval_reward)


# Create the environment
game = Pong(cpu_speed_ratio=0.3)
# game = Pong()
pong = PLE(game, display_screen=True, force_fps=True)

# Build the agent with the PARL framework
print(pong.getActionSet())
action_dim = len(pong.getActionSet())
obs_shape = len(pong.getGameState())
print(pong.getGameState())

# Create the experience pool
rpm = ReplayMemory(MEMORY_SIZE)  # DQN replay buffer

model = Model(act_dim=action_dim)
algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_shape,
import numpy as np
import time
import gym

from ple.games.pong import Pong
from ple import PLE

NET_SIZE = [7]
ACTIVATION = relu
MAX_STEPS = 800
ITERATIONS = 1

# ENV_NAME = 'BipedalWalker-v2'
env = Pong(250, 250)
## Environment initialization
# env = gym.make(ENV_NAME)

## Initialize UMDAc
umdac = UMDAc(1,
              NET_SIZE,
              ACTIVATION,
              env,
              max_steps=MAX_STEPS,
              action_mode='raw',
              iterations=ITERATIONS)

new = umdac.load_specimen('resultPLE.txt')
from ple import PLE
from ple.games.pong import Pong
import pygame
import time
import sys

game = Pong(width=300, height=200)
p = PLE(game, fps=30, display_screen=True, force_fps=True)
p.init()
print(p.getActionSet())

nb_frames = 1000
action = None

for f in range(nb_frames):
    if p.game_over():
        p.reset_game()

    obs = p.getScreenRGB()

    events = pygame.event.get()
    for event in events:
        if event.type == pygame.QUIT:
            sys.exit()
        elif event.type == pygame.KEYDOWN:
            if event.key:
                action = event.key
                print(action)
        elif event.type == pygame.KEYUP:
            action = None

    p.act(action)
GENERATIONS = 900
GEN_SIZE = 100

N_SURV = 35
N_RAND_SURV = 15

ITERATIONS = 1
MAX_STEPS = None

LOG_NOTES = 'gensize:', str(GEN_SIZE), ' , nsurv:', str(
    N_SURV), ' nrandsurv:', str(N_RAND_SURV)

## Environment initialization
# env = FlappyBird()
# env = Snake()
# env = WaterWorld(250, 250)
env = Pong(150, 150)

## Initialize UMDAc
umdac = UMDAc(GEN_SIZE,
              NET_SIZE,
              ACTIVATION,
              env,
              max_steps=MAX_STEPS,
              iterations=ITERATIONS,
              display_info=True)

## Reset training data loggers
avg_reward_log = []
max_rewards = []
min_rewards = []

last_avg_reward = 0

for i in range(GENERATIONS):
    ## Reset reward logger
def __init__(self, game_name, rewards, state_as_image=True, fps=30, force_fps=True,
             frame_skip=2, hold_action=2, visualize=False, width=84, height=84, lives=1):
    """
    Initialize Pygame Learning Environment
    https://github.com/ntasfi/PyGame-Learning-Environment

    Args:
        game_name: PLE environment name
        rewards: dict of reward values passed through to PLE
        state_as_image: if True, observations are the raw game screen;
            otherwise the game-state dict is flattened into a numpy array
        fps: frames per second
        force_fps: False for slower speeds
        frame_skip: number of env frames to skip
        hold_action: number of env frames to hold each action for
        visualize: if True, render the game window (slows down training)
        width, height: width and height of the environment
        lives: number of lives in game. Game resets on game over (i.e. lives = 0).
            Only used in Catcher and Pong (as the score limit).
    """
    self.env_name = game_name
    self.rewards = rewards
    self.lives = lives
    self.state_as_image = state_as_image
    self.fps = fps  # frames per second
    self.force_fps = force_fps  # False for slower speeds
    self.frame_skip = frame_skip  # frames to skip
    self.ple_num_steps = hold_action  # frames to continue an action for
    # self.isRGB = isRGB  # always returns color; let TensorForce do the processing
    self.visualize = visualize
    self.width = width
    self.height = height

    # testing
    self.reached_terminal = 0
    self.episode_time_steps = 0
    self.episode_reward = 0
    self.total_time_steps = 0

    if self.env_name == 'catcher':
        self.game = Catcher(width=self.width, height=self.height, init_lives=self.lives)
    elif self.env_name == 'pixelcopter':
        self.game = Pixelcopter(width=self.width, height=self.height)
    elif self.env_name == 'pong':
        self.game = Pong(width=self.width, height=self.height, MAX_SCORE=self.lives)
    elif self.env_name == 'puckworld':
        self.game = PuckWorld(width=self.width, height=self.height)
    elif self.env_name == 'raycastmaze':
        self.game = RaycastMaze(width=self.width, height=self.height)
    elif self.env_name == 'snake':
        self.game = Snake(width=self.width, height=self.height)
    elif self.env_name == 'waterworld':
        self.game = WaterWorld(width=self.width, height=self.height)
    elif self.env_name == 'monsterkong':
        self.game = MonsterKong()
    elif self.env_name == 'flappybird':
        # FlappyBird has fixed limits on height and width
        self.game = FlappyBird(width=144, height=256)
    else:
        raise TensorForceError('Unknown Game Environment.')

    if self.state_as_image:
        process_state = None
    else:
        # create a preprocessor that reads the state dictionary as a numpy array
        def process_state(state):
            # ret_value = np.fromiter(state.values(), dtype=float, count=len(state))
            ret_value = np.array(list(state.values()), dtype=np.float32)
            return ret_value

    # make a PLE instance
    self.env = PLE(self.game, reward_values=self.rewards, fps=self.fps,
                   frame_skip=self.frame_skip, num_steps=self.ple_num_steps,
                   force_fps=self.force_fps, display_screen=self.visualize,
                   state_preprocessor=process_state)
    # self.env.init()
    # self.env.act(self.env.NOOP)  # game starts on a black screen
    # self.env.reset_game()
    # self.env.act(self.env.NOOP)
    # self.env.act(self.env.NOOP)
    # self.env.act(self.env.NOOP)
    # self.env.act(self.env.NOOP)
    # self.env.reset_game()

    # setup gamescreen object
    if state_as_image:
        w, h = self.env.getScreenDims()
        self.gamescreen = np.empty((h, w, 3), dtype=np.uint8)
    else:
        self.gamescreen = np.empty(self.env.getGameStateDims(), dtype=np.float32)
    # if isRGB:
    #     self.gamescreen = np.empty((h, w, 3), dtype=np.uint8)
    # else:
    #     self.gamescreen = np.empty((h, w), dtype=np.uint8)

    # setup action converter
    # PLE returns legal action keys; convert these to plain indexes
    self.action_list = self.env.getActionSet()
    # PLE's action set can contain None (the no-op); the key sorts None last,
    # since None cannot be compared with the integer key codes in Python 3
    self.action_list = sorted(self.action_list, key=lambda x: (x is None, x))
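# A minimal construction sketch for the wrapper above. The class name
# PLEEnvironment is assumed (the snippet only shows __init__), and the reward
# values are illustrative; the dict keys follow PLE's standard reward_values keys.
rewards = {
    "positive": 1.0,   # point scored
    "negative": -1.0,  # point conceded
    "tick": 0.0,       # per-frame reward
    "loss": -5.0,
    "win": 5.0,
}
env = PLEEnvironment('pong', rewards, state_as_image=True,
                     fps=30, force_fps=True, frame_skip=2,
                     hold_action=2, visualize=False, lives=3)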