class MonsterKongEnv(object):

    def __init__(self):
        self.game = MonsterKong()
        self.p = PLE(self.game, fps=30, display_screen=True)
        # self.actions = self.p.getActionSet()
        # self._action_space = list(range(self.actions[0]))
        # self._action_space.append(self.actions[-1])
        self.action_space = self.p.getActionSet()

    def reset(self):
        self.p.init()
        self.p.act(None)
        # return self.p.getScreenRGB()
        return self.p.getScreenGrayscale()

    def step(self, action):
        reward = self.p.act(self.action_space[action])
        # reward = self.p.act(119)
        # print(self.action_space[action], reward)
        # return self.p.getScreenRGB(), reward, self.p.game_over()
        return self.p.getScreenGrayscale(), reward, self.p.game_over()

    @property
    def action_space(self):
        return self._action_space

    @action_space.setter
    def action_space(self, action_space):
        self._action_space = action_space
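# A minimal driver sketch (not part of the original snippet) showing how the MonsterKongEnv
# wrapper above can be exercised with random actions. It assumes the class definition above
# together with the standard PLE imports it relies on.
import random

from ple import PLE
from ple.games.monsterkong import MonsterKong

if __name__ == '__main__':
    env = MonsterKongEnv()
    obs = env.reset()                              # grayscale screen returned by the wrapper
    for _ in range(200):
        action = random.randrange(len(env.action_space))
        obs, reward, done = env.step(action)       # (screen, reward, game_over)
        if done:
            obs = env.reset()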
class PLEEnv(gym.Env):

    def __init__(self, env_config):
        game = Catcher(width=screen_wh, height=screen_wh)
        fps = 30  # fps we want to run at
        frame_skip = 2
        num_steps = 2
        force_fps = False  # False for slower speed
        display_screen = True

        # make a PLE instance.
        self.env = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
                       force_fps=force_fps, display_screen=display_screen)
        self.env.init()
        self.action_dict = {0: None, 1: 97, 2: 100}

        # PLE env starts with black screen
        self.env.act(self.env.NOOP)

        self.action_space = Discrete(3)
        self.k = 4
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(screen_wh, screen_wh, 1 * self.k))
        self.frames = deque([], maxlen=self.k)

    def reset(self):
        self.env.reset_game()
        # PLE env starts with black screen, NOOP step to get initial screen
        self.env.act(self.env.NOOP)
        ob = np.reshape(self.env.getScreenGrayscale(), (screen_wh, screen_wh, 1))
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        # traditional gym env step:
        # _obs, _rew, done, _info = env.step(env.action_space.sample())
        action_value = self.action_dict[action]
        _rew = self.env.act(action_value)
        # _obs = self.env.getScreenGrayscale()
        _obs = np.reshape(self.env.getScreenGrayscale(), (screen_wh, screen_wh, 1))
        self.frames.append(_obs)
        _done = self.env.game_over()
        _info = {}
        return self._get_ob(), _rew, _done, _info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)
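# A short usage sketch (not from the original source) for the PLEEnv wrapper above. It assumes
# the class and its gym/spaces/deque/numpy imports, plus the module-level screen_wh constant
# the class references; the value 64 used here is purely hypothetical.
screen_wh = 64  # hypothetical screen size; the real constant is defined elsewhere

env = PLEEnv(env_config={})
obs = env.reset()
assert obs.shape == (screen_wh, screen_wh, env.k)   # stack of k grayscale frames
obs, rew, done, info = env.step(env.action_space.sample())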
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    # penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())
    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,           # explore 0.1
        e_greed_decrement=1e-6  # 1e-6
    )  # the probability of exploring decreases during training

    # load a previously saved model if one exists
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")

    eval_reward = evaluate(agent, penv)
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    # penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())
    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,           # explore 0.1
        e_greed_decrement=1e-6  # 1e-6
    )  # the probability of exploring decreases during training

    # load a previously saved model if one exists
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")

    # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, penv, rpm)

    max_episode = 1000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, penv, rpm)
            episode += 1

        eval_reward = evaluate(agent, penv)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))

        # save a checkpoint tagged with this evaluation result
        save_path = './model/dqn_model_{}_{}.ckpt'.format(episode, eval_reward)
        agent.save(save_path)

    # training finished, save the final model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def evaluate_step(agent, seed, sess):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False,
              rng=np.random.RandomState(seed))
    env.reset_game()
    env.act(0)  # dummy input

    # grayscale input screens for this episode
    input_screens = [agent.preprocess(env.getScreenGrayscale())]
    t = 0
    while not env.game_over():
        # feed the previous screens, select an action
        action = agent.select_action(input_screens, sess)
        # execute the action and get the reward
        reward = env.act(env.getActionSet()[action])  # reward = +1 when passing a pipe, -5 when dying
        # observe the result
        screen_plum = env.getScreenGrayscale()  # get the next screen
        # append the grayscale screen for this episode
        input_screens.append(agent.preprocess(screen_plum))
        t += 1
        if t >= 1000:  # cap the score to prevent running forever
            break
    return t
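# evaluate_step() above depends on an agent that provides preprocess() and select_action(),
# neither of which is shown in this snippet. Below is a rough sketch of what a grayscale
# preprocess step commonly looks like (resize + scale to [0, 1]); the 80x80 target size is an
# assumption, not something stated in the original code.
import cv2
import numpy as np

def preprocess_screen(screen_gray, size=(80, 80)):
    """Resize a PLE grayscale screen and scale its pixels to [0, 1]."""
    resized = cv2.resize(screen_gray, size, interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0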
class FlappyBirdEnv(gym.Env):

    def __init__(self):
        self.resize_factor = 0.125
        self.width = 288
        self.height = 512
        self.ple = PLE(game=FlappyBird(), fps=30, frame_skip=8)
        self.action_set = self.ple.getActionSet()
        self.action_space = spaces.Discrete(len(self.action_set))
        self.observation_space = spaces.Box(
            low=0.0,
            high=255.0,
            shape=(
                int(self.width * self.resize_factor),
                int(self.height * self.resize_factor),
                1,
            ),
            dtype=np.uint32,
        )
        self._steps = 0

    def reset(self):
        self._steps = 0
        self.ple.display_screen = False
        self.ple.reset_game()
        return self._get_state()

    def step(self, action):
        self._steps += 1
        reward = self.ple.act(self.action_set[action])
        next_state = self._get_state()
        terminal = self.ple.game_over()
        return next_state, reward, terminal, {}

    def render(self, mode="human"):
        self.ple.display_screen = True

    def _get_state(self):
        return np.expand_dims(
            imresize(self.ple.getScreenGrayscale(), self.resize_factor), axis=-1)
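# Minimal random-rollout sketch (not in the original) for the gym-style FlappyBirdEnv defined
# above. It assumes the class and its imports, including the scipy imresize helper it calls
# internally; the explicit ple.init() call is only a precaution for PLE versions that need it.
env = FlappyBirdEnv()
env.ple.init()                      # harmless if PLE was already initialised
state = env.reset()
done = False
total_reward = 0.0
while not done:
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    total_reward += reward
print("episode reward:", total_reward)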
def train(self): """Train.""" logs_path = self.args.logs_path video_path = self.args.video_path restore = self.args.restore train = self.args.train # Initial PLE environment os.putenv('SDL_VIDEODRIVER', 'fbcon') os.environ["SDL_VIDEODRIVER"] = "dummy" # Design reward reward_values = { "positive": 1, "tick": 0.1, "loss": -1, } # Create FlappyBird game env env = PLE(FlappyBird(), display_screen=False, reward_values=reward_values) # Gets the actions FlappyBird supports action_set = env.getActionSet() replay_buffer = ReplayBuffer(self.hparams.replay_buffer_size) agent = Agent(action_set, self.hparams) # restore model if restore: agent.restore(restore) reward_logs = [] loss_logs = [] for episode in range(1, self.hparams.total_episode + 1): # reset env env.reset_game() env.act(0) obs = convert(env.getScreenGrayscale()) state = np.stack([[obs for _ in range(4)]], axis=0) t_alive = 0 total_reward = 0 if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode: agent.stop_epsilon() frames = [env.getScreenRGB()] while not env.game_over(): action = agent.take_action(state) reward = env.act(action_set[action]) if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode: frames.append(env.getScreenRGB()) obs = convert(env.getScreenGrayscale()) obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]]) state_new = np.append(state[:, 1:, ...], obs, axis=1) action_onehot = np.zeros(len(action_set)) action_onehot[action] = 1 t_alive += 1 total_reward += reward replay_buffer.append( (state, action_onehot, reward, state_new, env.game_over())) state = state_new # save video if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode: os.makedirs(video_path, exist_ok=True) clip = make_video(frames, fps=60).rotate(-90) clip.write_videofile(os.path.join( video_path, 'env_{}.mp4'.format(episode)), fps=60) agent.restore_epsilon() print('Episode: {} t: {} Reward: {:.3f}'.format( episode, t_alive, total_reward)) # danger mp4list = glob.glob('./video_XXX/*.mp4') if len(mp4list) > 0: latest = mp4list[0] latest_timestamp = os.path.getmtime(mp4list[0]) for mp4 in mp4list: ts = os.path.getmtime(mp4) if (ts > latest_timestamp): latest_timestamp = ts latest = mp4 video = io.open(latest, 'r+b').read() encoded = base64.b64encode(video) ipythondisplay.display( HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;"> <source src="data:video/mp4;base64,{0}" type="video/mp4" /> </video>'''.format(encoded.decode('ascii')))) #end danger else: print("Could not find video") if episode > self.hparams.initial_observe_episode and train: # save model if episode % self.hparams.save_logs_frequency == 0: agent.save(episode, logs_path) np.save(os.path.join(logs_path, 'loss.npy'), np.array(loss_logs)) np.save(os.path.join(logs_path, 'reward.npy'), np.array(reward_logs)) # update target network if episode % self.hparams.update_target_frequency == 0: agent.update_target_network() # sample batch from replay buffer batch_state, batch_action, batch_reward, batch_state_new, batch_over = replay_buffer.sample( self.hparams.batch_size) # update policy network loss = agent.update_Q_network(batch_state, batch_action, batch_reward, batch_state_new, batch_over) loss_logs.extend([[episode, loss]]) reward_logs.extend([[episode, total_reward]]) # print reward and loss if episode % self.hparams.show_loss_frequency == 0: print( 'Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}'.format( episode, t_alive, 
total_reward, loss)) agent.update_epsilon()
class DdqnBirdSyr(): def __init__(self, playback_mode, mod=None): self._playback_mode = playback_mode env = FlappyBird(pipe_gap=200) self._ple = PLE(env, fps=30, display_screen=DISPLAY) self._ple.init() self._sess = tf.Session() self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='cnn_bird') self._sess.run(tf.global_variables_initializer()) self._agent.update_target_paras() self._saver = tf.train.Saver() self._replay_buffer = ReplayBuffer(BUFFER_SIZE) self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode) self.summary = Summary(self._sess, DIR_SUM) self.summary.add_variable(tf.Variable(0.), 'reward') self.summary.add_variable(tf.Variable(0.), 'loss') self.summary.add_variable(tf.Variable(0.), 'maxq') self.summary.build() self.summary.write_variables(FLAGS) self._steps = 0 if mod and os.path.exists(FLAGS.dir_mod.format(mod)): checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod)) self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path) print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path)) def start(self): for ep in range(MAX_EP): sum_reward = 0 last_state = [] for _ in range(STATE_FRAMES): last_state.append(self._ple.getScreenGrayscale()) last_state = np.dstack(last_state) last_max_qvalue = 0 for step in range(EP_STEPS): time.sleep(0.01) if not step % STATE_FRAMES: q_value = self._agent.predict([last_state])[0] last_max_qvalue = np.max(q_value) act_1_hot = self._explorer.get_action(q_value) act_index = np.argmax(act_1_hot) else: # do nothing act_index = 1 act_1_hot = np.zeros(DIM_ACTION) act_1_hot[act_index] = 1 reward = self._ple.act(self._ple.getActionSet()[act_index]) if reward == 0: reward = 0.1 elif reward == -5: reward = -1 state = np.reshape(self._ple.getScreenGrayscale(), (SCREEN_WIDTH, SCREEN_HEIGHT, 1)) state = np.append(state, last_state[:, :, :3], axis=2) done = False if self._ple.game_over(): done = True self._replay_buffer.add(last_state, act_1_hot, reward, state, done) loss = None if not self._playback_mode and len(self._replay_buffer) > OBV_STEPS: loss = self._train() last_state = state sum_reward += reward self._steps += 1 if done or step == EP_STEPS - 1: print('| Step: %i' % self._steps, '| Episode: %i' % ep, '| Epoch: %i' % step, '| qvalue: %.5f' % last_max_qvalue, '| Sum_Reward: %i' % sum_reward) if loss != None: self.summary.run(feed_dict={ 'loss': loss, 'reward': sum_reward, 'maxq': last_max_qvalue}) self._ple.reset_game() break def _train(self): batch_state, batch_action, batch_reward, batch_state_next, batch_done = \ self._replay_buffer.sample_batch(MINI_BATCH) q_value = self._agent.predict(batch_state_next) max_q_value_index = np.argmax(q_value, axis=1) target_q_value = self._agent.predict_target(batch_state_next) double_q = target_q_value[range(len(target_q_value)), max_q_value_index] batch_y = [] for r, q, d in zip(batch_reward, double_q, batch_done): if d: batch_y.append(r) else: batch_y.append(r + GAMMA * q) opt, loss = self._agent.train(batch_state, batch_action, batch_y) self._agent.update_target_paras() if not self._steps % CKP_STEP: self._saver.save(self._sess, DIR_MOD + '/net', global_step=self._steps) print('Mod saved!') return loss
def main(args): logs_path = args.logs_path video_path = args.video_path restore = args.restore train = args.train # Initial PLE environment os.putenv('SDL_VIDEODRIVER', 'fbcon') os.environ["SDL_VIDEODRIVER"] = "dummy" # Design reward reward_values = { "positive": 1, "tick": 0.1, "loss": -1, } env = PLE(FlappyBird(), fps=30, display_screen=False, reward_values=reward_values) action_set = env.getActionSet() reply_buffer = Reply_Buffer(Config.reply_buffer_size) agent = Agent(action_set) reward_logs = [] loss_logs = [] # restore model if restore: agent.restore(restore) for episode in range(1, Config.total_episode+1): # reset env env.reset_game() env.act(0) obs = convert(env.getScreenGrayscale()) state = np.stack([[obs for _ in range(4)]], axis=0) t_alive = 0 total_reward = 0 if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: agent.stop_epsilon() frames = [env.getScreenRGB()] while not env.game_over(): action = agent.take_action(state) reward = env.act(action_set[action]) if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: frames.append(env.getScreenRGB()) obs = convert(env.getScreenGrayscale()) obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]]) state_new = np.append(state[:, 1:,...], obs, axis=1) action_onehot = np.zeros(len(action_set)) action_onehot[action] = 1 t_alive += 1 total_reward += reward reply_buffer.append((state, action_onehot, reward, state_new, env.game_over())) state = state_new # save video # if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: # os.makedirs(video_path, exist_ok=True) # clip = make_video(frames, fps=60).rotate(-90) # clip.write_videofile(os.path.join(video_path, 'env_{}.mp4'.format(episode)), fps=60) # agent.restore_epsilon() # print('Episode: {} t: {} Reward: {:.3f}' .format(episode, t_alive, total_reward)) if episode > Config.initial_observe_episode and train: # save model if episode % Config.save_logs_frequency == 0: agent.save(episode, logs_path) np.save(os.path.join(logs_path, 'loss.npy'), np.array(loss_logs)) np.save(os.path.join(logs_path, 'reward.npy'), np.array(reward_logs)) # update target network if episode % Config.update_target_frequency == 0: agent.update_target_network() # sample batch from reply buffer batch_state, batch_action, batch_reward, batch_state_new, batch_over = reply_buffer.sample(Config.batch_size) # update policy network loss = agent.update_Q_network(batch_state, batch_action, batch_reward, batch_state_new, batch_over) loss_logs.extend([[episode, loss]]) reward_logs.extend([[episode, total_reward]]) # print reward and loss if episode % Config.show_loss_frequency == 0: print('Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}' .format(episode, t_alive, total_reward, loss)) agent.update_epsilon()
env.init()
for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = True

    # training loop
    while num_episodes < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()

        while env.game_over() == False:  # and steps < num_steps_train:
            state = env.getGameState()
            screen = env.getScreenGrayscale()
            screen = preprocess(state, screen)
            # screen = np.reshape(screen, (200, 200, 1))
            # print((screen[0]))
            # screen = screen[:, np.newaxis]
            reward, action = agent.act(screen, epsilon=epsilon)
            memory.add([screen, action, reward, env.game_over()])

            if steps % update_frequency == 0:
                loss = memory.train_agent_batch(agent)
                if loss is not None:
                    losses.append(loss)
                    epsilon = np.max([epsilon_min, epsilon - epsilon_rate])

            episode_reward += reward
            steps += 1
class Environment():

    def __init__(self, device, display=True):
        # Design reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }
        self.env = PLE(FlappyBird(), display_screen=display,
                       reward_values=reward_values)
        self.device = device
        self.action_set = self.env.getActionSet()
        self.frames = []

    def reset(self):
        self.env.reset_game()

    def start(self):
        self.env.act(0)
        obs = convert(self.env.getScreenGrayscale())
        self.state = np.stack([[obs for _ in range(4)]], axis=0)
        self.t_alive = 0
        self.total_reward = 0
        return self.state

    def game_over(self):
        return self.env.game_over()

    def getScore(self):
        return self.env.score()

    def step(self, action):
        reward = self.env.act(self.action_set[action])
        # make next state
        obs = convert(self.env.getScreenGrayscale())
        obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
        next_state = np.append(self.state[:, 1:, ...], obs, axis=1)
        self.t_alive += 1
        self.total_reward += reward
        self.state = next_state
        return self.state, reward, self.env.game_over()

    def get_screen(self):
        return self.env.getScreenRGB()

    def record(self):
        self.frames.append(self.env.getScreenRGB())

    def saveVideo(self, episode, video_path):
        os.makedirs(video_path, exist_ok=True)
        clip = make_video(self.frames, fps=60).rotate(-90)
        clip.write_videofile(os.path.join(video_path, 'env_{}.mp4'.format(episode)), fps=60)
        print('Episode: {} t: {} Reward: {:.3f}'.format(
            episode, self.t_alive, self.total_reward))
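# A minimal episode loop (not in the original) for the Environment wrapper above. It assumes
# the class definition plus the convert() helper it calls, which is not shown in this snippet,
# and it treats the device argument as a plain string.
import random

env = Environment(device="cpu", display=False)
env.reset()
state = env.start()
done = False
while not done:
    action = random.randrange(len(env.action_set))
    state, reward, done = env.step(action)
print("score:", env.getScore(), "total reward:", env.total_reward)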
def DeepQLearning(mode, fname='', epsilon=1, discount=0.99):
    game = FlappyBird()
    rewards = {
        'positive': 10,
        'tick': 0,
        'loss': -10
    }
    env = PLE(game, fps=30, display_screen=False, reward_values=rewards)
    env.init()

    if mode == 'new':
        model = build_model(env)
    elif mode == 'retrain':
        model = load_model(fname)

    # parameters
    actions = env.getActionSet()
    print(actions)
    nA = len(env.getActionSet())
    final_epsilon = 0.1
    epsilon_decay = nth_root(NUMBER_EPISODES, final_epsilon / epsilon)

    print("=========== Start Training ===========\n")
    avg_score = []
    score = 0
    for i in range(1, NUMBER_EPISODES):
        epsilon = epsilon * epsilon_decay
        action_reward = []
        if (i % 10000 == 0):
            avg = mean(avg_score)
            model.save("/content/drive/My Drive/" + 'episode_{}_AvgScore_{}.h5'.format(i, avg))
            avg_score.clear()
            print("\nEpisode_{}_AvgScore_{}.hdf5 Saved !".format(i, avg))

        for t in itertools.count():
            # approximate the next action
            state = img_as_float(resize(env.getScreenGrayscale(), (80, 80)))
            state = state.reshape((80, 80, 1))
            action_index = epsilon_greedy_policy(model, state, nA, epsilon)
            action = actions[action_index]
            reward = env.act(action)
            next_state = img_as_float(resize(env.getScreenGrayscale(), (80, 80)))
            next_state = next_state.reshape((80, 80, 1))
            score += reward
            done = env.game_over()
            # action_reward.append((action, reward))

            if len(MEMORY_BUFFER) == MEMORY_BUFFER_SIZE:
                MEMORY_BUFFER.pop(0)
            MEMORY_BUFFER.append((state, action_index, reward, next_state, done))
            experience_replay(env, model, discount)

            if env.game_over():
                break

        env.reset_game()
        avg_score.append(score)
        # print(action_reward)
        print("\nEpisode {}/{} ---- Score : {}".format(i, NUMBER_EPISODES, score))
        score = 0

    with open("/content/drive/My Drive/MEMORY_BUFFER.txt", "wb") as fp:
        pickle.dump(MEMORY_BUFFER, fp)

    return model
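# DeepQLearning() above calls nth_root() and epsilon_greedy_policy(), which are not included
# in this snippet. A plausible sketch of both, assuming a Keras-style model that maps a single
# (80, 80, 1) state to one Q-value per action:
import numpy as np

def nth_root(n, value):
    """n-th root used to derive the per-episode epsilon decay factor."""
    return value ** (1.0 / n)

def epsilon_greedy_policy(model, state, nA, epsilon):
    """Pick a random action with probability epsilon, otherwise the greedy action."""
    if np.random.rand() < epsilon:
        return np.random.randint(nA)
    q_values = model.predict(state[np.newaxis, ...])[0]
    return int(np.argmax(q_values))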
game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=False)
agent = Agent(allowed_actions=p.getActionSet(), channels=1, learning_rate=0.0085)

try:
    agent.model.load_state_dict(load('memento_movement.pt'))
except (FileNotFoundError, EOFError):
    print("Error loading the saved model state")

p.init()

nb_frames = 10000000
rewards = []
episode = []
old_observation = preprocessing(p.getScreenGrayscale())
movement_captioning = 0

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()

    preprocessed_observation = preprocessing(p.getScreenGrayscale())

    # Forward
    action, log_action = agent.pickAction(preprocessed_observation - old_observation)

    if movement_captioning < 2:
        old_observation = preprocessed_observation
        movement_captioning += 1
    elif movement_captioning < 15:
        movement_captioning += 1
    else:
        movement_captioning = 0

    reward_action = p.act(action)
def DeepQLearning(epsilon=1, discount=0.99): game = FlappyBird() rewards = {'positive': 1, 'stick': 0, 'loss': 0} env = PLE(game, fps=30, display_screen=True, reward_values=rewards) env.init() model_selection = build_model(env) model_evaluation = bulid_model(env) MEMORY_BUFFERS = [MEMORY_BUFFER_EVALUATION, MEMORY_BUFFER_SELECTION] models = [model_evaluation, model_selection] #parameters actions = env.getActionSet() print(actions) nA = len(env.getActionSet()) final_epsilon = 0.001 epsilon_decay = nth_root(NUMBER_EPISODES, final_epsilon / epsilon) print("=========== Start Training ===========\n") for i in range(1, NUMBER_EPISODES): epsilon = epsilon * epsilon_decay score = 0 avg_score = [] #cdaction_reward = [] if (i % 1000 == 0): avg = np.mean(np.asarray(avg_score)) model.save_weights('episode_{}_AvgScore_{}.hdf5'.format(i, avg)) avg_score.clear() print("\nEpisode_{}_AvgScore_{}.hdf5 Saved !".format(i, avg)) for t in itertools.count(): #appro next action state = img_as_float(resize(env.getScreenGrayscale(), (64, 64))) state = state.reshape((64, 64, 1)) action_index = epsilon_greedy_policy(model, state, nA, epsilon) action = actions[action_index] reward = env.act(action) next_state = img_as_float( resize(env.getScreenGrayscale(), (64, 64))) next_state = next_state.reshape((64, 64, 1)) score += reward done = env.game_over() avg_score.append(score) #action_reward.append((action, reward)) if not env.game_over(): reward += discount * np.max(approximation(model, next_state)) if len(MEMORY_BUFFER) == MEMORY_BUFFER_SIZE: MEMORY_BUFFER.pop(0) model_choice = np.random.choice(np.array([0, 1])) MEMORY_BUFFERS[model_choice].append( (state, action_index, reward, next_state, done)) if env.game_over(): break env.reset_game() experience_replay(env, models[model_choice], discount) #print(action_reward) if i % 100 == 0: print("\nEpisode {}/{} ---- Score : {}".format( i, NUMBER_EPISODES, score)) return model
            env.act(119)
            new_action = 119
        else:
            env.act(None)
            new_action = None
    else:
        action = model.predict([prev_obs.reshape(80, 80, 1)])[0]
        new_action = np.argmax(action)
        if new_action == 0:
            new_action = 119
            env.act(119)
        else:
            new_action = None
            env.act(None)

    choices.append(new_action)
    new_observation = cv2.resize(env.getScreenGrayscale(), (80, 80))
    prev_obs = new_observation
    game_memory.append([new_observation, new_action])
    score = env.score()
    if env.game_over():
        break

env.reset_game()
scores.append(score)
if score >= score_requirement:
    accepted_scores.append(score)

print('Average Score:', sum(scores) / len(scores))
print('Success rate:', len(accepted_scores) / len(scores))
print(score_requirement)
from ple import PLE
from ple.games.pong import Pong
import numpy as np


def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    """
    Preprocess a 210x160x3 uint8 frame into a 6400-element (80x80) 1-D float vector.
    """
    # image = env.getScreenRGB()
    # image = image[35:195]       # crop
    # image = image[::2, ::2, 0]  # downsample by a factor of 2
    # image[image == 144] = 0     # erase background (background type 1)
    # image[image == 109] = 0     # erase background (background type 2)
    # image[image != 0] = 1       # binarize: everything except black becomes white
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(np.float).ravel()


if __name__ == '__main__':
    game = Pong(width=128, height=128, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)

    # build the agent on top of the PARL framework
    print(p.getActionSet())
    # obs = p.getScreenRGB()
    obs = p.getScreenGrayscale()
    print(obs)
    print(obs.shape)
    act_dim = len(p.getActionSet())
    game_state = p.getGameState()
    print(game_state)
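# A small sanity check (not in the original) for get_obs() above: for the 128x128 Pong screen
# it returns a flat float vector with one entry per pixel, scaled to [0, 1]. It assumes the
# script above has already created the PLE instance p.
obs_vec = get_obs(p)
assert obs_vec.ndim == 1
assert obs_vec.size == 128 * 128      # 16384 grayscale values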
class MyEnv(Environment): VALIDATION_MODE = 0 memSize = 4 # original size is 288x512 so dividing dividing_factor = 8 width = 288 // dividing_factor height = 512 // dividing_factor def __init__(self, rng, game=None, frame_skip=4, ple_options={ "display_screen": True, "force_fps": True, "fps": 30 }): self._mode = -1 self._mode_score = 0.0 self._mode_episode_count = 0 self._frame_skip = frame_skip if frame_skip >= 1 else 1 self._random_state = rng if game is None: raise ValueError("Game must be provided") self._ple = PLE(game, **ple_options) self._ple.init() w, h = self._ple.getScreenDims() self._screen = np.empty((w, h), dtype=np.uint8) self._reduced_screen = np.empty((self.width, self.height), dtype=np.uint8) self._actions = self._ple.getActionSet() def reset(self, mode): if mode == MyEnv.VALIDATION_MODE: if self._mode != MyEnv.VALIDATION_MODE: self._mode = MyEnv.VALIDATION_MODE self._mode_score = 0.0 self._mode_episode_count = 0 else: self._mode_episode_count += 1 elif self._mode != -1: # and thus mode == -1 self._mode = -1 print("Dead at score {}".format(self._ple.game.getScore())) self._ple.reset_game() # for _ in range(self._random_state.randint(15)): # self._ple.act(self._ple.NOOP) # self._screen = self._ple.getScreenGrayscale() # cv2.resize(self._screen, (48, 48), # self._reduced_screen, # interpolation=cv2.INTER_NEAREST) return [self.memSize * [self.width * [self.height * [0]]]] def act(self, action): action = self._actions[action] reward = 0 for _ in range(self._frame_skip): reward += self._ple.act(action) if self.inTerminalState(): break self._screen = self._ple.getScreenGrayscale() self._reduced_screen = cv2.resize(self._screen, (self.height, self.width), interpolation=cv2.INTER_NEAREST) cv2.imshow("debug", self._reduced_screen.T) cv2.waitKey(1) self._mode_score += reward return np.sign(reward) def summarizePerformance(self, test_data_set): if self.inTerminalState() == False: self._mode_episode_count += 1 mean = (self._mode_score / self._mode_episode_count if self._mode_episode_count else "N/A") print("== Mean score per episode is {} over {} episodes ==".format( mean, self._mode_episode_count)) def inputDimensions(self): return [(self.memSize, self.width, self.height)] def observationType(self, subject): return np.float32 def nActions(self): return len(self._actions) def observe(self): return [np.array(self._reduced_screen) / 256.] def inTerminalState(self): return self._ple.game_over()
podium = 0
podium_index = 0
background = 0
bird_pixels = None
ok = False
city_pixels = None
city_index = None
ok2 = False

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
        ok = True

    img = p.getScreenGrayscale().transpose()
    if ok == True:
        ok2 = True
        ok = False
    if i == 1 or ok2 == True:
        ok2 = False
        bincount_vector = np.bincount(img[0])
        background = np.argmax(bincount_vector)
        podium, podium_index = compute_podium(img)
        # bird_pixels = get_bird_pixels(img, background)
        city_pixels, city_index = get_city_pixels(img, background, podium_index)

    transform_image(img, background, podium, podium_index + 1, city_index, city_pixels)
class Bot(): """ This is our Test agent. It's gonna pick some actions after training! """ def __init__(self, lr): self.lr = lr self.game = Pixelcopter(width=480, height=480) self.p = PLE(self.game, fps=60, display_screen=True) self.actions = self.p.getActionSet() #def pickAction(self, reward, obs): # return random.choice(self.actions) def frame_step(self, act_inp): terminal = False reward = self.p.act(act_inp) if self.p.game_over(): self.p.reset_game() terminal = True reward = -1 else: reward = 1 self.score = self.p.score() img = self.p.getScreenGrayscale() img = transform.resize(img, (80, 80)) img = exposure.rescale_intensity(img, out_range=(0, 255)) img = img / 255.0 return img, reward, terminal def build_model(self): print("Building the model..") model = Sequential() model.add( Convolution2D(32, 8, 8, subsample=(4, 4), border_mode='same', input_shape=(img_rows, img_cols, img_channels))) #80*80*4 model.add(Activation('relu')) model.add(Convolution2D(64, 4, 4, subsample=(2, 2), border_mode='same')) model.add(Activation('relu')) model.add(Convolution2D(64, 3, 3, subsample=(1, 1), border_mode='same')) model.add(Activation('relu')) model.add(Flatten()) model.add(Dense(512)) model.add(Activation('relu')) model.add(Dense(2)) adam = Adam(lr=self.lr) model.compile(loss='mse', optimizer=adam) self.model = model print("Finished building the model..") def trainNetwork(self, mode): D = deque() x_t, r_0, terminal = self.frame_step(self.actions[1]) s_t = np.stack((x_t, x_t, x_t, x_t), axis=2) #print (s_t.shape) #need to reshape for keras s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2]) #1*80*80*4 if mode == 'Run': OBSERVE = 999999999 #We keep observe, never train epsilon = FINAL_EPSILON print("Now we load weight") self.model.load_weights("model.h5") adam = Adam(lr=self.lr) self.model.compile(loss='mse', optimizer=adam) print("Weight load successfully") else: #We go to training mode OBSERVE = OBSERVATION epsilon = INITIAL_EPSILON t = 0 while (True): loss = 0 Q_sa = 0 action_index = 0 r_t = 0 #choose an action epsilon greedy if t % FRAME_PER_ACTION == 0: if random.random() <= epsilon: print("----------Random Action----------") action_index = random.randrange(num_actions) chosen_act = self.actions[action_index] else: q = self.model.predict( s_t) #input a stack of 4 images, get the prediction max_Q = np.argmax(q) action_index = max_Q chosen_act = self.actions[action_index] #We reduced the epsilon gradually if epsilon > FINAL_EPSILON and t > OBSERVE: epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE #run the selected action and observed next state and reward x_t1, r_t, terminal = self.frame_step(chosen_act) x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1) #1x80x80x1 s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3) # store the transition in D D.append((s_t, action_index, r_t, s_t1, terminal)) if len(D) > REPLAY_MEMORY: D.popleft() #only train if done observing if t > OBSERVE: #sample a minibatch to train on minibatch = random.sample(D, BATCH) #Now we do the experience replay state_t, action_t, reward_t, state_t1, terminal = zip( *minibatch) state_t = np.concatenate(state_t) state_t1 = np.concatenate(state_t1) targets = self.model.predict(state_t) Q_sa = self.model.predict(state_t1) targets[range(BATCH), action_t] = reward_t + GAMMA * np.max( Q_sa, axis=1) * np.invert(terminal) loss += self.model.train_on_batch(state_t, targets) s_t = s_t1 t = t + 1 # save progress every 10000 iterations if t % 1000 == 0: print("Now we save model") self.model.save_weights("model.h5", overwrite=True) 
with open("model.json", "w") as outfile: json.dump(self.model.to_json(), outfile) # print info state = "" if t <= OBSERVE: state = "observe" elif t > OBSERVE and t <= OBSERVE + EXPLORE: state = "explore" else: state = "train" print("TIMESTEP", t, "/ STATE", state, \ "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \ "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss) print("Episode finished!") print("************************") def playGame(self, mode): self.build_model() self.trainNetwork(mode) def main(self): modes = ["Train", "Run"] mode = modes[input("Do you wanna Train(0) or Run(1): ")] self.playGame(mode)
class Catcher3: """ Environment Specifications: Short summary: Player controls paddle and gains points for catching apples that fall from the sky; loses points and lives otherwise. Number of Actions = 3 (move left, do nothing, move right) Observation Dimension = 4 (paddle x-position, paddle velocity, apple x-position, apple y-position) Observation Dtype = np.float64 Reward = 1, if paddle touches apple -1, if apple touches floor (and -1 life) -5, if out of lives 0, on any other transition Summary Name: steps_per_episode, reward_per_step """ def __init__(self, config, summary=None): assert isinstance(config, Config) """ Parameters: Name: Type Default: Description(omitted when self-explanatory): max_episode_length int 500000 The max number of steps executed in an episoe before forcing a time out norm_state bool True Normalize the state to [-1,1] display bool False Whether to display the screen of the game init_lives int 3 Number of lives at the start of the game store_summary bool False Whether to store the summary of the environment number_of_steps int 500000 Total number of environment steps """ check_attribute(config, 'current_step', 0) self.config = config # environment parameters self.max_episode_length = check_attribute(config, 'max_episode_length', default_value=500000) self.norm_state = check_attribute(config, 'norm_state', default_value=True) self.display = False self.init_lives = 3 # self.display = check_attribute(config, 'display', default_value=False) # self.init_lives = check_attribute(config, 'init_lives', default_value=3) # summary parameters self.store_summary = check_attribute(config, 'store_summary', default_value=False) self.summary = summary self.number_of_steps = check_attribute(config, 'number_of_steps', 500000) if self.store_summary: assert isinstance(self.summary, dict) self.reward_per_step = np.zeros(self.number_of_steps, dtype=np.float64) check_dict_else_default(self.summary, "steps_per_episode", []) check_dict_else_default(self.summary, "reward_per_step", self.reward_per_step) # setting up original catcher environment with the specified parameters self.catcherOb = Catcher(init_lives=self.init_lives) if not self.display: # do not open a pygame window os.putenv('SDL_VIDEODRIVER', 'fbcon') os.environ["SDL_VIDEODRIVER"] = "dummy" if self.norm_state: self.pOb = PLE(self.catcherOb, fps=30, state_preprocessor=get_ob_normalize, display_screen=self.display) else: self.pOb = PLE(self.catcherOb, fps=30, state_preprocessor=get_ob, display_screen=self.display) self.pOb.init() # environment internal state self.actions = [ 97, None, 100 ] # self.pOb.getActionSet() (left = 97, do nothing = None, right = 100) self.num_action = 3 self.num_state = 4 self.episode_step_count = 0 self.pOb.reset_game() self.current_state = self.pOb.getGameState() def _get_image(self): """return a np array with shape = [64, 64, 3]""" return self.pOb.getScreenGrayscale() def setseed(self, value): self.pOb.rng.seed(value) return 0 def reset(self): if self.store_summary: self.summary["steps_per_episode"].append(self.episode_step_count) self.pOb.reset_game() self.episode_step_count = 0 self.current_state = self.pOb.getGameState() return self.current_state def step(self, a): self.config.current_step += 1 self.episode_step_count += 1 reward = self.pOb.act(self.actions[a]) if self.store_summary: self.reward_per_step[self.config.current_step - 1] += reward terminate = self.pOb.game_over() self.current_state = self.pOb.getGameState() timeout = bool(self.episode_step_count >= self.max_episode_length or 
self.config.current_step >= self.number_of_steps) return self.current_state, reward, terminate, timeout def get_current_state(self): return self.current_state def close(self): return
OBSERVATIONS = 300
# reward_discount = 0.99
time_per_episode = 1000

game = Pixelcopter(img_size, img_size)
env = PLE(game)
action_size = 2

score_mean = np.zeros(EPISODES // 10)
score_std = np.zeros(EPISODES // 10)
score_last10 = []
training_count = 0
plt.figure()
max_score = 0

for e in range(EPISODES):
    env.init()
    state = process(env.getScreenGrayscale())
    for time in range(time_per_episode):
        # Set actions
        if time < 3:
            action = act_dict_decode[0]
        else:
            action_input = np.concatenate(
                (state, memory[-1][0], memory[-2][0], memory[-3][0]), axis=3)
            action = act(action_input)

        reward = env.act(action)                       # get reward from action
        next_state = process(env.getScreenGrayscale())  # next state
        done = env.game_over()
        # check game over and reassign reward
        if reward >= 0:
            reward = 1
class MyEnv(Environment): VALIDATION_MODE = 0 def __init__(self, rng, game=None, frame_skip=4, ple_options={"display_screen": True, "force_fps":True, "fps":30}): self._mode = -1 self._mode_score = 0.0 self._mode_episode_count = 0 self._frameSkip = frame_skip if frame_skip >= 1 else 1 self._random_state = rng if game is None: raise ValueError("Game must be provided") self._ple = PLE(game, **ple_options) self._ple.init() w, h = self._ple.getScreenDims() self._screen = np.empty((h, w), dtype=np.uint8) self._reducedScreen = np.empty((48, 48), dtype=np.uint8) self._actions = self._ple.getActionSet() def reset(self, mode): if mode == MyEnv.VALIDATION_MODE: if self._mode != MyEnv.VALIDATION_MODE: self._mode = MyEnv.VALIDATION_MODE self._mode_score = 0.0 self._mode_episode_count = 0 else: self._mode_episode_count += 1 elif self._mode != -1: # and thus mode == -1 self._mode = -1 self._ple.reset_game() for _ in range(self._random_state.randint(15)): self._ple.act(self._ple.NOOP) self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST) return [4 * [48 * [48 * [0]]]] def act(self, action): action = self._actions[action] reward = 0 for _ in range(self._frameSkip): reward += self._ple.act(action) if self.inTerminalState(): break self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST) self._mode_score += reward return np.sign(reward) def summarizePerformance(self, test_data_set): if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) def inputDimensions(self): return [(4, 48, 48)] def observationType(self, subject): return np.uint8 def nActions(self): return len(self._actions) def observe(self): return [np.array(self._reducedScreen)] def inTerminalState(self): return self._ple.game_over()
def trainNetwork(s, readout, h_fc1, sess): # define the cost function a = tf.placeholder("float", [None, ACTIONS]) y = tf.placeholder("float", [None]) readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1) cost = tf.reduce_mean(tf.square(y - readout_action)) train_step = tf.train.AdamOptimizer(1e-6).minimize(cost) # open up a game state to communicate with emulator #setupGame() gameClass = FlappyBird(width=288, height=512, pipe_gap=100) fps = 30 frame_skip = 2 num_steps = 1 force_fps = False display_screen = True reward = 0.0 nb_frames = 15000 game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps, force_fps=force_fps, display_screen=display_screen) game.init() # store the previous observations in replay memory D = deque() # printing logdir = "logs_" + GAME if not os.path.exists(logdir): os.makedirs(logdir) a_file = open(logdir + "/readout.txt", 'w') h_file = open(logdir + "/hidden.txt", 'w') # get the first state by doing nothing and preprocess the image to 80x80x4 r_0 = game.act(game.NOOP) x_t = game.getScreenGrayscale() terminal = game.game_over() if terminal: print "NOOOO" game.reset_game() x_t = cv2.resize(x_t, (80, 80)) ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY) s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2) # saving and loading networks #saver = tf.train.Saver() sess.run(tf.initialize_all_variables()) ''' checkpoint = tf.train.get_checkpoint_state("saved_networks") if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path else: print "Could not find old network weights" ''' epsilon = INITIAL_EPSILON t = 0 while True: # choose an action epsilon greedily readout_t = readout.eval(feed_dict = {s : [s_t]})[0] a_t = np.zeros([ACTIONS]) action_index = 0 if random.random() <= epsilon or t <= OBSERVE: action_index = random.randrange(ACTIONS) a_t[random.randrange(ACTIONS)] = 1 else: action_index = np.argmax(readout_t) a_t[action_index] = 1 # scale down epsilon if epsilon > FINAL_EPSILON and t > OBSERVE: epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE for i in range(0, K): # run the selected action and observe next state and reward r_t = game.act(np.argmax(a_t)) x_t1 = game.getScreenGrayscale() terminal = game.game_over() if terminal: print "NOOO2" game.reset_game() x_t1 = cv2.resize(x_t1, (80, 80)) ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY) x_t1 = np.reshape(x_t1, (80, 80, 1)) s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2) # store the transition in D D.append((s_t, a_t, r_t, s_t1, terminal)) if len(D) > REPLAY_MEMORY: D.popleft() # only train if done observing if t > OBSERVE: # sample a minibatch to train on minibatch = random.sample(D, BATCH) # get the batch variables s_j_batch = [d[0] for d in minibatch] a_batch = [d[1] for d in minibatch] r_batch = [d[2] for d in minibatch] s_j1_batch = [d[3] for d in minibatch] y_batch = [] readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch}) for i in range(0, len(minibatch)): # if terminal only equals reward if minibatch[i][4]: y_batch.append(r_batch[i]) else: y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i])) # perform gradient step train_step.run(feed_dict = { y : y_batch, a : a_batch, s : s_j_batch}) # update the old values s_t = s_t1 t += 1 # save progress every 10000 iterations if t % 10000 == 0: saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t) # print info state = "" if t <= OBSERVE: state = "observe" elif t > OBSERVE and t <= 
OBSERVE + EXPLORE: state = "explore" else: state = "train" print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t) # write info to files '''
class SnakeQNetwork: def __init__(self, food_reward=10, dead_reward=-10, alive_reward=2, discount_factor=0.95, batch_size=10, train_epochs=100, history_size=1000, history_sample_size=50): self.food_reward = food_reward self.dead_reward = dead_reward self.alive_reward = alive_reward self.discount_factor = discount_factor self.batch_size = batch_size self.train_epochs = train_epochs self.history_size = history_size self.history_sample_size = history_sample_size self.q_learning_history = QLearningHistory(history_size) self.exploration_factor = 0.2 self.next_move_prediction = None self.is_neural_network_initialized = False pygame.init() self.game = Snake(width=64, height=64) self.env = PLE(self.game, display_screen=True) self.env.init() self.LOG = gym.logger def run(self, maximum_number_of_iterations=10000, learning_rate=0.5, training=False): for iteration in range(0, maximum_number_of_iterations): if not self.is_neural_network_initialized: self.___initialize_neural_newtork() self.is_neural_network_initialized = True observation = self.env.getScreenGrayscale() observation_width = self.env.getScreenDims()[0] observation_height = self.env.getScreenDims()[1] self.game.init() # exit the while loop only if it's GAME OVER while True: q_values = self.next_move_prediction.predict( x=observation.reshape( 1, observation_width * observation_height), batch_size=1) best_snake_action = np.argmax(q_values) reward = self.__take_snake_action(best_snake_action) previous_observation = copy.deepcopy(observation) observation = self.env.getScreenGrayscale() is_game_over = self.env.game_over() self.LOG.info( "Current action reward: {r}. Is game over: {d}".format( r=reward, d=is_game_over)) if training: reward = self.__get_custom_reward(reward) self.q_learning_history.record_event( state=previous_observation, action=best_snake_action, reward=reward, new_state=observation) last_event = self.q_learning_history.get_last_event() self.LOG.info( "Added event #{n} to history. 
Action: {a}; Reward: {r}" .format(a=last_event[1], r=reward, n=self.q_learning_history.size)) if self.q_learning_history.is_full(): history_batch = random.sample( self.q_learning_history.get_events(), self.history_sample_size) self.LOG.info( "Sampling {n} events from history.".format( n=self.history_sample_size)) training_batch_data = [] training_batch_labels = [] for history_event in history_batch: old_state, action, reward, new_state = history_event q_values_before_action = self.next_move_prediction.predict( x=old_state.reshape( 1, observation_width * observation_height), batch_size=1) q_values_after_action = self.next_move_prediction.predict( x=new_state.reshape( 1, observation_width * observation_height), batch_size=1) best_q_value_after_action = np.argmax( q_values_after_action) training_q_values = np.zeros((1, 4)) for value_idx in range( 0, len(q_values_before_action)): training_q_values[ value_idx] = q_values_before_action[ value_idx] output_update = learning_rate * ( reward + (self.discount_factor * best_q_value_after_action)) training_q_values[0][:] = 0 training_q_values[0][action] = output_update training_batch_data.append( old_state.reshape( observation_width * observation_height, )) training_batch_labels.append( training_q_values.reshape(4, )) training_batch_data = np.array(training_batch_data) training_batch_labels = np.array(training_batch_labels) self.next_move_prediction.fit( x=training_batch_data, y=training_batch_labels, epochs=self.train_epochs, batch_size=self.batch_size) if is_game_over: break if self.exploration_factor > 0.1: self.exploration_factor -= (1.0 / maximum_number_of_iterations) self.LOG.info( "Exploration factor updated! New value: {v}".format( v=self.exploration_factor)) def ___initialize_neural_newtork(self): input_layer_size = self.env.getScreenDims( )[0] * self.env.getScreenDims()[1] hidden_layer_size = 100 output_layer_size = 4 input_layer = Dense(kernel_initializer='lecun_uniform', units=hidden_layer_size, input_shape=(input_layer_size, ), activation='sigmoid') hidden_layer = Dense(kernel_initializer='lecun_uniform', units=output_layer_size, activation='linear') self.next_move_prediction = Sequential() self.next_move_prediction.add(input_layer) self.next_move_prediction.add(hidden_layer) self.next_move_prediction.compile(optimizer='rmsprop', loss='mean_squared_error') def __take_snake_action(self, snake_action): random_number = np.random.random_sample() if not self.q_learning_history.is_full(): snake_action = random.choice(self.env.getActionSet()) self.LOG.info("Snake chose to do a random move - add to qHistory!") return self.env.act(snake_action) elif random_number < self.exploration_factor: snake_action = random.choice(self.env.getActionSet()) self.LOG.info( "Random number is smaller than exploration factor, {r} < {ef}! Snake chose random move!" .format(r=random_number, ef=self.exploration_factor)) return self.env.act(snake_action) elif snake_action == 0: self.LOG.info("Snake chose to go up") return self.env.act(115) elif snake_action == 1: self.LOG.info("Snake chose to go left") return self.env.act(97) elif snake_action == 2: self.LOG.info("Snake chose to go down") return self.env.act(119) elif snake_action == 3: self.LOG.info("Snake chose to go right") return self.env.act(100) def __get_custom_reward(self, reward): if reward >= 1: self.LOG.info( "Has eaten food! Reward is {r}".format(r=self.food_reward)) return self.food_reward elif reward >= 0: self.LOG.info( "Stayed alive! 
Reward is {r}".format(r=self.alive_reward)) return self.alive_reward else: self.LOG.info("Crashed! Reward is {r}".format(r=self.dead_reward)) return self.dead_reward
class MyEnv(Environment): VALIDATION_MODE = 0 def __init__(self, rng, game=None, frame_skip=4, ple_options={ "display_screen": True, "force_fps": True, "fps": 30 }): self._mode = -1 self._mode_score = 0.0 self._mode_episode_count = 0 self._frame_skip = frame_skip if frame_skip >= 1 else 1 self._random_state = rng if game is None: raise ValueError("Game must be provided") self._ple = PLE(game, **ple_options) self._ple.init() w, h = self._ple.getScreenDims() self._screen = np.empty((h, w), dtype=np.uint8) self._reduced_screen = np.empty((48, 48), dtype=np.uint8) self._actions = self._ple.getActionSet() def reset(self, mode): if mode == MyEnv.VALIDATION_MODE: if self._mode != MyEnv.VALIDATION_MODE: self._mode = MyEnv.VALIDATION_MODE self._mode_score = 0.0 self._mode_episode_count = 0 else: self._mode_episode_count += 1 elif self._mode != -1: # and thus mode == -1 self._mode = -1 self._ple.reset_game() for _ in range(self._random_state.randint(15)): self._ple.act(self._ple.NOOP) self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) return [4 * [48 * [48 * [0]]]] def act(self, action): action = self._actions[action] reward = 0 for _ in range(self._frame_skip): reward += self._ple.act(action) if self.inTerminalState(): break self._screen = self._ple.getScreenGrayscale() cv2.resize(self._screen, (48, 48), self._reduced_screen, interpolation=cv2.INTER_NEAREST) self._mode_score += reward return np.sign(reward) def summarizePerformance(self, test_data_set): if self.inTerminalState() == False: self._mode_episode_count += 1 print("== Mean score per episode is {} over {} episodes ==".format( self._mode_score / self._mode_episode_count, self._mode_episode_count)) def inputDimensions(self): return [(4, 48, 48)] def observationType(self, subject): return np.float32 def nActions(self): return len(self._actions) def observe(self): return [np.array(self._reduced_screen) / 256.] def inTerminalState(self): return self._ple.game_over()
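# A minimal construction sketch (not in the original) for the deer-style MyEnv wrapper above.
# It assumes the class definition, the deer Environment base class it extends, and the cv2 and
# numpy imports it uses; Catcher is just an example game choice here.
import numpy as np
from ple.games.catcher import Catcher

rng = np.random.RandomState(123)
env = MyEnv(rng, game=Catcher(width=64, height=64),
            ple_options={"display_screen": False, "force_fps": True, "fps": 30})
print(env.nActions(), env.inputDimensions())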
class FlappyBirdEnv: def __init__(self): self.fps = 30 self.game = flappyBird() self.env = PLE(self.game, fps=self.fps, display_screen=False) # environment interface to game self.env.reset_game() def reset(self, is_show = False): self.env = PLE(self.game, fps=self.fps, display_screen=is_show) # environment interface to game self.env.reset_game() state = self.get_state() return state def act(self, action): # return state_prime, reward, done, info reward = self.env.act(self.env.getActionSet()[action]) # print(reward) # Survive reward +1 # reward += 1 # Get closer to the middle of top and bottom pipe and get more reward # state = self.game.getGameState() # next_dis_to_mid = abs((state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2 - state['player_y']) # print('State') # print(state) # print('Mid') # print((state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2) # print('next_dis_to_mid') # print(abs((state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2 - state['player_y'])) # next_next_dis_to_mid = abs((state['next_next_pipe_top_y'] + state['next_next_pipe_bottom_y']) / 2 - state['player_y']) # dis_reward_coef = 0.01 # reward += dis_reward_coef * ((-next_dis_to_mid) + 0.5 * (-next_next_dis_to_mid)) state_prime = self.get_state() is_done = self.is_over() info = "" return state_prime, reward, is_done, info def get_num_actions(self): return len(self.env.getActionSet()) def get_action_set(self): return self.env.getActionSet() def get_screen_rgb(self): return self.env.getScreenRGB() def get_screen_gray(self): return self.env.getScreenGrayscale() def get_num_state_features(self): return len(self.game.getGameState()) def get_state(self): # dict # * player y position. # * players velocity. # * next pipe distance to player # * next pipe top y position # * next pipe bottom y position # * next next pipe distance to player # * next next pipe top y position # * next next pipe bottom y position # state = { # "player_y": self.player.pos_y, # "player_vel": self.player.vel, # "next_pipe_dist_to_player": next_pipe.x - self.player.pos_x, # "next_pipe_top_y": next_pipe.gap_start, # "next_pipe_bottom_y": next_pipe.gap_start+self.pipe_gap, # "next_next_pipe_dist_to_player": next_next_pipe.x - self.player.pos_x, # "next_next_pipe_top_y": next_next_pipe.gap_start, # "next_next_pipe_bottom_y": next_next_pipe.gap_start+self.pipe_gap # } state = self.game.getGameState() state['next_pipe_top_y'] -= state['player_y'] state['next_pipe_bottom_y'] -= state['player_y'] state['next_next_pipe_top_y'] -= state['player_y'] state['next_next_pipe_bottom_y'] -= state['player_y'] return list(state.values()) def is_over(self): return self.env.game_over()
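# Minimal random-rollout sketch (not part of the original) for the state-vector FlappyBirdEnv
# above; it assumes the class definition and the flappyBird import alias it relies on. Actions
# are indices into PLE's action set, as in the wrapper's act() method.
import random

env = FlappyBirdEnv()
state = env.reset(is_show=False)
done = False
while not done:
    action = random.randrange(env.get_num_actions())
    state, reward, done, info = env.act(action)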
class FlappyBirdEnv: """ This is the Reinforcement Learning Environment that wraps the PLE Flappy Bird Game. The RL agent interacts with the environment by providing which action it wants to take in the current state. The environment in turn provides the reward and the next state to agent after executing the provided action. """ def __init__(self, display=False): """ Initializes a new environment for FlappyBird game. """ game = game = FlappyBird() self._game = PLE(game, fps=30, display_screen=display) # _display_game flag controls whether or not to render the state that is being provided by the # environment. self._display_game = display if self._display_game: self._display = self.show_img() # display sets up a cv2 window where the current state is displayed. self._display.__next__() # iterate over the display generator. self.NUM_ACTIONS = len(self._game.getActionSet()) # defines the number of action agent can take in the environment. self._ACTION_MAP = {} for i, action in enumerate(self._game.getActionSet()): self._ACTION_MAP[i] = action # Number contiguous images the environment provides as state. Basically at any time, the # environment provides a stack of last 4 (including the current) images as the state to the agent. self._IMAGE_STACK_SIZE = 4 # Dimension of the (greyscale) image provided as state. self._PROCESSED_IMAGE_SIZE = 84 # Determines the number of times the provided action is executed before returning the next # state. self._SKIP_FRAMES = 4 # Used by the RL agent to set up it's CNN model. self.STATE_SPACE = (self._PROCESSED_IMAGE_SIZE, self._PROCESSED_IMAGE_SIZE, self._IMAGE_STACK_SIZE) self._init_states() def _init_states(self): """ Initializes/Resets the states for the environment. """ self._image_stack = None # holds the current state, i.e., stack of 4 images. self._score = 0 def step(self, action): """ Provides the next state and rewards after executing the provided action. Args ------ `action` (int): Action to be taken from the current state. """ reward = 0 for i in range(self._SKIP_FRAMES): reward += self._game.act(self._ACTION_MAP[action]) done = self._game.game_over() self._score += reward clipped_reward = self._clip_reward(reward) self.grab_screen() if self._display_game: self._display.send(self._image_stack) # display image on the screen return (self._image_stack.copy(), clipped_reward, done, self._score) def _clip_reward(self, reward): """ Clips the provided reward between [-1, 1] Args ---- `reward` (float): The reward that is to be clipped. Returns ------- A float represent the clipped reward. """ if reward > 1.0: reward = 1.0 elif reward < -1.0: reward = -1.0 return reward def reset(self): """ Resets the game and provides the starting state. Returns ------- A numpy `_IMAGE_STACK_SIZE`-d numpy array (or greyscale image) representing the current state of the environment """ self._game.reset_game() self._init_states() self.grab_screen() if self._display_game: self._display.send(self._image_stack) return self._image_stack.copy() def show_img(self): ''' Show current state (`_IMAGE_STACK_SIZE` greyscale images) in an opencv window. Returns ------- A generator that to which the images can be sent for displaying. ''' return utils.show_image('Model Input (4 images)') def grab_screen(self): """ Grabs 1 to _IMAGE_STACK_SIZE images (depending upon whether called after reseting or not) and adds it to the image_stack in chronological order, i.e., most recent image is the last. 
""" if self._image_stack is None: self._image_stack = np.zeros(self.STATE_SPACE, dtype=np.uint8) for i in range(self._IMAGE_STACK_SIZE): self._game.act(None) self._image_stack[:, :, i] = self.get_processed_image() else: self._image_stack[:, :, :self._IMAGE_STACK_SIZE-1] = self._image_stack[:, :, 1:] self._image_stack[:, :, self._IMAGE_STACK_SIZE-1] = self.get_processed_image() def get_processed_image(self): """ Fetches the current gameplay screenshot and processes it. Returns ------- A processed greyscale image (as numpy array) representing the current gameplay state. """ screen = self._game.getScreenGrayscale() image = self.process_image(screen) return image def process_image(self, image): """ Processes the input image by performing following steps: i. Cropping and transposing the image to obtain the Region Of Interest (ROI) ii. Resizing the ROI to (`_PROCESSED_IMAGE_SIZE`, `_PROCESSED_IMAGE_SIZE`) dimension. Args ---- `image` (numpy array): The image which is to be processed. Returns ------- A processed greyscale image (as numpy array). """ # Step 1. image = image[:, :410] image = np.transpose(image, (1, 0)) # Step 2. image = cv2.resize(image, (self._PROCESSED_IMAGE_SIZE, self._PROCESSED_IMAGE_SIZE), interpolation=cv2.INTER_AREA) return image
# use a lower fps so we can see what's happening a little easier
game = WaterWorld(width=100, height=100, num_creeps=15)
# p = PLE(game, reward_values=rewards)
p = PLE(game, fps=30, force_fps=False, display_screen=True, reward_values=rewards)
p.init()

actions = p.getActionSet()[:-1]
agent = Agent(len(actions))

epochs = 10000000
game_duration = 1000

for epoch in range(epochs):
    p.reset_game()
    for it in range(1000):
        if p.game_over():
            p.reset_game()
            print("Finished with score: " + str(p.score()))

        current_state = np.array(p.getScreenGrayscale()).reshape((10000,))
        action = agent.act(np.array([current_state]))
        # action = actions[np.random.randint(0, len(actions))]
        reward = p.act(actions[action])
        print("Current score: " + str(p.score()))

    print("Finished with score: " + str(p.score()))
# coding:utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np


def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(np.float).ravel()


if __name__ == '__main__':
    game = Pong(width=128, height=96, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)

    # build the agent on top of the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    p.getScreenGrayscale()
    game_state = p.getGameState()
    print(game_state)
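# A short follow-up sketch (not in the original) wiring get_obs() above into a random-action
# loop; p.init() is called first so the game screen exists before it is read.
import random

p.init()
p.reset_game()
for _ in range(100):
    obs = get_obs(p)                        # flat vector of 128*96 grayscale values in [0, 1]
    action = random.choice(p.getActionSet())
    reward = p.act(action)
    if p.game_over():
        p.reset_game()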
class Bot(): """ This is our Test agent. It's gonna pick some actions after training! """ def __init__(self, lr): self.lr = lr self.game = Pixelcopter(width=480, height=480) self.p = PLE(self.game, fps=60, display_screen=True) self.actions = self.p.getActionSet() #def pickAction(self, reward, obs): # return random.choice(self.actions) def frame_step(act_inp): terminal = False reward = self.p.act(act_inp) if self.p.game_over(): self.p.reset_game() terminal = True reward = -1 else: reward = 1 self.score = self.p.getScore() img = self.p.getScreenGrayscale() img = transform.resize(img, (80, 80)) img = np.ravel(exposure.rescale_intensity(img, out_range=(0, 255))) return img, reward, terminal def build_model(self): print("Building the model..") model = Sequential() model.add( Convolution2D(32, 8, 8, subsample=(4, 4), border_mode='same', input_shape=(img_rows, img_cols, img_channels))) #80*80*4 model.add(Activation('relu')) model.add(Convolution2D(64, 4, 4, subsample=(2, 2), border_mode='same')) model.add(Activation('relu')) model.add(Convolution2D(64, 3, 3, subsample=(1, 1), border_mode='same')) model.add(Activation('relu')) model.add(Flatten()) model.add(Dense(512)) model.add(Activation('relu')) model.add(Dense(2)) adam = Adam(lr=self.lr) model.compile(loss='mse', optimizer=adam) self.model = model print("Finished building the model..") def trainNetwork(self, mode): D = deque() x_t, r_0, terminal = self.frame_step(self.actions[0]) x_t = x_t / 255.0 s_t = np.stack((x_t, x_t, x_t, x_t), axis=2) #print (s_t.shape) #need to reshape for keras s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2]) #1*80*80*4 if mode == 'Run': OBSERVE = 999999999 #We keep observe, never train epsilon = FINAL_EPSILON print("Now we load weight") self.model.load_weights("model.h5") adam = Adam(lr=self.lr) self.model.compile(loss='mse', optimizer=adam) print("Weight load successfully") else: #We go to training mode OBSERVE = OBSERVATION epsilon = INITIAL_EPSILON