def test():
    game = Snake(600, 600)
    p = PLE(game, fps=60, state_preprocessor=process_state, force_fps=True,
            display_screen=True, frame_skip=2,
            reward_values={"positive": 100.0, "negative": -50.0, "tick": -0.1,
                           "loss": -70.0, "win": 5.0})
    agent = Agent(alpha=float(sys.argv[1]), gamma=float(sys.argv[2]), n_actions=3,
                  epsilon=0.01, batch_size=100, input_shape=6, epsilon_dec=0.99999,
                  epsilon_end=0.001, memory_size=500000, file_name=sys.argv[3],
                  activations=[str(sys.argv[4]), str(sys.argv[5])])
    p.init()
    agent.load_game()
    scores = []
    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(vision(list(p.getGameState()[0]), initial_direction))
            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]
            reward = p.act(direction[0])
            if reward > 50.0:
                apples += reward
        scores.append(apples)
    return scores
def __init__(self, *args, **kwargs):
    super(PleEnvAdapter, self).__init__(*args, **kwargs)
    if not self.render:
        # run headless: point SDL at a dummy video driver so pygame needs no display
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    Game = envs_lookup_table[self.env_name]
    self.env = PLE(Game(), display_screen=self.render, force_fps=not self.render)
    self.env.init()
def eval_genomes(genomes, config):
    done = [False] * len(genomes)
    pl = []
    for i in range(len(genomes)):
        pl.append(PLE(game, fps=30, display_screen=True, force_fps=False))
        pl[i].init()

    while sum(done) != len(done):
        if len(pl) < len(genomes):
            pl.append(PLE(game, fps=30, display_screen=True, force_fps=False))
            done = done + [False]
            pl[-1].init()

        m = 0
        nets = []
        gid = []
        for i, (genome_id, genome) in enumerate(genomes):
            net = neat.nn.recurrent.RecurrentNetwork.create(genome, config)
            nets.append(net)
            gid.append(genome_id)

        nnOutput = [0] * len(genomes)
        rew = [0] * len(genomes)
        current_max_fitness = [0] * len(genomes)
        fitness_current = [0] * len(genomes)
        frame = [0] * len(genomes)
        counter = [0] * len(genomes)

        for i in range(len(genomes)):
            ob = list(np.zeros([288, 512, 3]) * len(genomes))
            ob.append(pl[i].getScreenRGB())
            frame[i] += 1
            ob[i] = cv2.resize(ob[i], (int(ob[i].shape[0] / 8), int(ob[i].shape[1] / 8)))
            ob[i] = cv2.cvtColor(ob[i], cv2.COLOR_BGR2GRAY)
            ob[i] = np.reshape(ob[i], (int(ob[i].shape[0]), int(ob[i].shape[1])))
            imgarray = np.ndarray.flatten(ob[i])
            nnOutput.append(np.argmax(nets[i].activate(imgarray)))
            rew[i] = pl[i].act(119 * np.argmax(nnOutput[i]))
            done[i] = pl[i].game_over()  # check if the game is over
            fitness_current[i] += float(rew[i])

            if fitness_current[i] > current_max_fitness[i]:
                current_max_fitness[i] = float(fitness_current[i])
                counter[i] = 0
            else:
                counter[i] += 1

            if sum(done) == len(done):
                m += config.pop_size - 1
                # print(gid[i], fitness_current)
                for k in range(len(pl)):
                    pl[k].reset_game()

            print(len(p.population), i + 1)
            p.population[i + 1].fitness = float(fitness_current[i])
def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.model = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE,
                       self.LEARNING_RATE)
    self.model_negative = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE,
                                self.LEARNING_RATE)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.trainable = tf.trainable_variables()
    self.rewards = []
def __init__(self):
    env = FlappyBird()
    self.p = PLE(env, add_noop_action=True)
    self.p.init()
    self.win_score = 10.
    action_space = len(self.p.getActionSet())
    state_space = len(self.p.getGameState())
    actions = ["up", "nothing"]
    state_names = list(self.p.getGameState().keys())
    Environment.__init__(self, env, action_space, state_space, actions, state_names)
def __init__(self, model, screen=False, forcefps=True):
    self.model = model
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.es = Deep_Evolution_Strategy(self.model.get_weights(), self.get_reward,
                                      self.POPULATION_SIZE, self.SIGMA,
                                      self.LEARNING_RATE)
def __init__(self, model, screen=False, forcefps=True):
    self.model = model
    self.game = Pixelcopter(width=int(48 * 5), height=int(48 * 5))
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.es = Deep_Evolution_Strategy(self.model.get_weights(), self.get_reward,
                                      self.POPULATION_SIZE, self.SIGMA,
                                      self.LEARNING_RATE)
class Env:
    def __init__(self):
        # initializing the instance of FlappyBird class
        self.game = FlappyBird(pipe_gap=100)
        # then pass this object into the PLE constructor and create an instance of that
        self.env = PLE(self.game, fps=30, display_screen=False)
        # init does some necessary things under the hood
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        self.action_map = self.env.getActionSet()

    # function which takes an action
    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
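# A minimal usage sketch for the wrapper above (not part of the original
# snippet). It assumes `from ple.games.flappybird import FlappyBird`,
# `from ple import PLE`, and `import numpy as np` are already in scope, and it
# drives one episode with random actions just to illustrate the reset()/step()
# contract of this gym-style class.
def random_episode(env):
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        # pick a random index into the PLE action set (order comes from getActionSet())
        action = np.random.randint(len(env.action_map))
        obs, reward, done = env.step(action)
        total_reward += reward
    return total_reward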
def run(number_of_episodes):
    game = FlappyBird(pipe_gap=150)
    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    env = PLE(game=game, fps=30, display_screen=True, reward_values=rewards,
              force_fps=False)

    # Reset environment at the beginning
    env.reset_game()

    score = 0
    max_score = 0
    episode_number = 1
    while number_of_episodes > 0:
        # Get current state
        state = BasicQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = basic_q_agent.max_q(state)

        """
        After choosing action, get reward. PLE environment method act() returns
        the reward that the agent has accumulated while performing the action.
        """
        reward = env.act(env.getActionSet()[action])
        score += reward
        max_score = max(score, max_score)

        game_over = env.game_over()
        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Score: " + str(score))
            print("Max. score: " + str(max_score))
            print("===========================\n")
            # f.write("Score: " + str(score) + "|Max. score: " + str(max_score) + "\n")
            episode_number += 1
            number_of_episodes -= 1
            score = 0
            env.reset_game()
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        # by convention we want to use (0,1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        # don't bother returning an info dictionary like gym
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        # by convention we want to use (0,1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
def __init__(self, device, display=True):
    # Design reward
    reward_values = {
        "positive": 1,
        "tick": 0.1,
        "loss": -1,
    }
    self.env = PLE(FlappyBird(), display_screen=display,
                   reward_values=reward_values)
    self.device = device
    self.action_set = self.env.getActionSet()
    self.frames = []
def __init__(self, game_name='FlappyBird', display_screen=True):
    # open up a game state to communicate with emulator
    import importlib
    game_module_name = ('ple.games.%s' % game_name).lower()
    game_module = importlib.import_module(game_module_name)
    game = getattr(game_module, game_name)()
    self.game_state = PLE(game, fps=30, display_screen=display_screen)
    self.game_state.init()
    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))
    self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                        shape=self._get_game_state().shape)
    self.viewer = None
def test_agent(policy, file_writer=None, test_games=10, step=0):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()

    test_rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)

        game_rew = 0
        while not env.game_over():
            state = flappy_game_state(env)
            action = 119 if policy(state) == 1 else None
            for _ in range(2):
                game_rew += env.act(action)

        test_rewards.append(game_rew)

        if file_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='test_performance', simple_value=game_rew)
            file_writer.add_summary(summary, step)
            file_writer.flush()

    return test_rewards
class PLEEnv(Env):
    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        if not self.env_instance:
            self.env_instance = PLE(self.game, fps=30, display_screen=self.render)
            self.env_instance.init()

    def step(self, action):
        reward = self.env_instance.act(action)
        obs = self.env_instance.getGameState()
        done = self.env_instance.game_over()
        return obs, reward, done

    def reset(self):
        self.env_instance.reset_game()
        obs = self.env_instance.getGameState()
        return obs

    def close(self):
        pass

    def restart(self):
        self.close()
        self.reset()
def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState

    def conv_layer(x, conv, stride=1):
        return tf.nn.conv2d(x, conv, [1, stride, stride, 1], padding='SAME')

    def pooling(x, k=2, stride=2):
        return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                              strides=[1, stride, stride, 1], padding='SAME')

    self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
    self.Y = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
    w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
    b_conv1 = tf.Variable(tf.truncated_normal([32], stddev=0.01))
    conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4) + b_conv1)
    pooling1 = pooling(conv1)
    w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
    b_conv2 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
    conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2) + b_conv2)
    w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
    b_conv3 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
    conv3 = tf.nn.relu(conv_layer(conv2, w_conv3) + b_conv3)
    pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(conv3.shape[3])
    conv3 = tf.reshape(conv3, [-1, pulling_size])
    w_fc1 = tf.Variable(tf.truncated_normal([pulling_size, 256], stddev=0.1))
    b_fc1 = tf.Variable(tf.truncated_normal([256], stddev=0.01))
    w_fc2 = tf.Variable(tf.truncated_normal([256, 2], stddev=0.1))
    b_fc2 = tf.Variable(tf.truncated_normal([2], stddev=0.01))
    fc_1 = tf.nn.relu(tf.matmul(conv3, w_fc1) + b_fc1)
    self.logits = tf.matmul(fc_1, w_fc2) + b_fc2
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []
def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState

    def conv_layer(x, conv, stride=1):
        return tf.nn.conv2d(x, conv, [1, stride, stride, 1], padding='SAME')

    def pooling(x, k=2, stride=2):
        return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                              strides=[1, stride, stride, 1], padding='SAME')

    self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
    self.REWARDS = tf.placeholder(tf.float32, (None))
    self.ACTIONS = tf.placeholder(tf.int32, (None))
    w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
    conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4))
    pooling1 = pooling(conv1)
    w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
    conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2))
    w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
    conv3 = tf.nn.relu(conv_layer(conv2, w_conv3))
    pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(conv3.shape[3])
    conv3 = tf.reshape(conv3, [-1, pulling_size])
    w_fc1 = tf.Variable(tf.truncated_normal([pulling_size, 256], stddev=0.1))
    w_fc2 = tf.Variable(tf.truncated_normal([256, 2], stddev=0.1))
    fc_1 = tf.nn.relu(tf.matmul(conv3, w_fc1))
    self.logits = tf.nn.softmax(tf.matmul(fc_1, w_fc2))
    indexes = tf.range(0, tf.shape(self.logits)[0]) * tf.shape(self.logits)[1] + self.ACTIONS
    responsible_outputs = tf.gather(tf.reshape(self.logits, [-1]), indexes)
    self.cost = -tf.reduce_mean(tf.log(responsible_outputs) * self.REWARDS)
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []
def main():
    env = PLE(Pixelcopter(), fps=30, display_screen=True, state_preprocessor=None)
    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # probability of taking a random action, for exploration
        e_greed_decrement=1e-6)  # decay exploration gradually as training converges

    # load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # pre-fill the replay buffer so the first training batches are diverse enough
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 30000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, max_reward = evaluate(env, agent, render=False)  # render=True to watch the agent
        logger.info('episode:{} e_greed:{} test_reward:{} max_reward:{}'.format(
            episode, agent.e_greed, eval_reward, max_reward))

    # training finished, save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def train(self):
    """ Runs episodes of the game with the agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """
    if not os.path.exists(self.name):
        os.mkdir(self.name)

    t = threading.Thread(target=self.draw_plots)
    t.daemon = True
    t.start()

    reward_values = self.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    while self.frame_count <= 1000000:
        # pick an action
        state1 = env.game.getGameState()
        action = self.training_policy(state1)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)
        state2 = env.game.getGameState()

        end = env.game_over() or score >= 100  # Stop after reaching 100 pipes
        self.observe(state1, action, reward, state2, end)

        # reset the environment if the game is over
        if end:
            env.reset_game()
            score = 0

        if self.frame_count % 25000 == 0:
            print("==========================")
            print("episodes done: {}".format(self.episode_count))
            print("frames done: {}".format(self.frame_count))
            self.score()
            with open("{}/agent.pkl".format(self.name), "wb") as f:
                pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
            print("==========================")
def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState

    def conv_layer(x, conv, stride=1):
        return tf.nn.conv2d(x, conv, [1, stride, stride, 1], padding='SAME')

    def pooling(x, k=2, stride=2):
        return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                              strides=[1, stride, stride, 1], padding='SAME')

    self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
    self.Y = tf.placeholder(tf.float32, [None, output_size])
    w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
    conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4))
    pooling1 = pooling(conv1)
    w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
    conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2))
    w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
    conv3 = tf.nn.relu(conv_layer(conv2, w_conv3))
    pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(conv3.shape[3])
    conv3 = tf.reshape(tf.reshape(conv3, [-1, pulling_size]),
                       [batch_size, 8, 512])
    cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
    self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
    self.rnn, self.last_state = tf.nn.dynamic_rnn(inputs=conv3, cell=cell,
                                                  dtype=tf.float32,
                                                  initial_state=self.hidden_layer)
    w = tf.Variable(tf.random_normal([512, output_size]))
    self.logits = tf.matmul(self.rnn[:, -1], w)
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []
def __init__(self, reward_values={}, reward_discount=0.99, pip_gap=100,
             display_screen=True, fps=30, force_fps=True):
    self.game = PLE(FlappyBird(pipe_gap=pip_gap), reward_values=reward_values,
                    fps=fps, force_fps=force_fps, display_screen=display_screen)
    self.game.init()
    self.actions = self.game.getActionSet()
    self.reward_discount = reward_discount
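# Illustrative sketch only (not from the original snippet): the wrapper above
# stores reward_discount but the code shown never uses it. A common way such a
# factor is applied is to turn an episode's reward list into discounted
# returns, computed backwards from the last step.
def discounted_returns(rewards, discount=0.99):
    """Return G_t = r_t + discount * G_{t+1} for every step of an episode."""
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# e.g. discounted_returns([0, 0, 1], discount=0.99) -> [0.9801, 0.99, 1.0]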
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """
    reward_values = {"positive": 1.0, "negative": 0.0, "tick": 0.0,
                     "loss": 0.0, "win": 0.0}
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    highscore = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training use agent.training_policy instead
        action = agent.policy(agent.state_binner(env.game.getGameState()))

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition
        score += reward

        # reset the environment if the game is over
        if env.game_over():
            average += score
            if score > highscore:
                highscore = score
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    print("Average over {} runs: {}".format(tot_nb_episodes, average / tot_nb_episodes))
    return highscore
def determine_fitness(self, individual):
    """
    determine the fitness of the given individual by running a simulation
    of the game with its encoded agent
    :param individual:
    :return:
    """
    if individual.fitness is None:
        game = FlappyBird()
        p = PLE(game, fps=30, display_screen=False)
        p.init()
        raise NotImplementedError(
            "add your code to determine the fitness of the individual, "
            "you can change the signature of this function")
    else:
        return individual.fitness
def setup_env_agent(display_screen, frame_skip, force_fps, reward_shaping,
                    frame_stack, train):
    game = FlappyBird()
    ple_flappy = PLE(game, fps=30, display_screen=display_screen,
                     frame_skip=frame_skip, force_fps=force_fps)
    if reward_shaping and train:
        z = ple_flappy.game.rewards
        z['tick'] = 0.1
        ple_flappy.game.adjustRewards(z)
    ple_flappy.init()
    agent = DQNAgent(ple_flappy.getActionSet(), frame_stack=frame_stack)
    return ple_flappy, agent
def __init__(self, display_screen=True):
    self.game_state = PLE(AngryBird(render=display_screen), fps=30,
                          display_screen=display_screen)
    # self.game_state.init()
    self.display_screen = display_screen
    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))
    self.screen_height, self.screen_width = self.game_state.getScreenDims()
    self.observation_space = spaces.Box(low=0, high=255,
                                        shape=(self.screen_width, self.screen_height, 3),
                                        dtype=np.uint8)
    self.viewer = None
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())
    env.init()

    reward = 0.0
    nb_frames = 10000
    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()
        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
def play_flappy_bird(play_game=True, train_agent=False,
                     agent_model_path='model_backup.h5'):
    game = FlappyBird()
    environment = PLE(game, fps=30, display_screen=True)
    # agent_explored_states = FlappyBirdAgent()

    action_len = 2
    states = []
    for key, value in game.getGameState().items():
        states.append(value)
    print(states)
    state_len = len(states)

    agent_explored_states = FlappyBirdAgent(state_len, action_len)
    if os.path.exists(agent_model_path):
        agent_explored_states.load_agent_experience(agent_model_path)
        agent_explored_states.model_loaded = True
        print("Weights loaded")

    # environment.init()
    if train_agent:
        agent_explored_states.train(environment, game)
        print("Trained")
    if play_game:
        agent_explored_states.play(environment, game)
        print("Played")
def __init__(self, game_name='FlappyBird', display_screen=True):
    # set headless mode
    os.environ['SDL_VIDEODRIVER'] = 'dummy'

    # open up a game state to communicate with emulator
    import importlib
    game_module_name = ('ple.games.%s' % game_name).lower()
    game_module = importlib.import_module(game_module_name)
    game = getattr(game_module, game_name)()
    self.game_state = PLE(game, fps=30, display_screen=display_screen)
    self.game_state.init()
    self._action_set = self.game_state.getActionSet()
    self.action_space = spaces.Discrete(len(self._action_set))
    self.screen_width, self.screen_height = self.game_state.getScreenDims()
    self.observation_space = spaces.Box(low=0, high=255,
                                        shape=(self.screen_width, self.screen_height, 3))
    self.viewer = None
def __init__(self, game="pixelcopter", fps=30): os.environ['SDL_VIDEODRIVER'] = 'dummy' self.game_name = game if game == "flappy": engine = FlappyBird() elif game == "pixelcopter": engine = Pixelcopter() else: assert False, "This game is not available" engine.rewards["loss"] = -5 # reward at terminal state self.reward_terminal = -5 self.game = PLE(engine, fps=fps, display_screen=False) self.game.init() self.game.act(0) # Start the game by providing arbitrary key as input self.key_input = self.game.getActionSet() self.reward = 0
def run_a_game(self, game):
    from ple import PLE
    p = PLE(game, display_screen=True)
    agent = NaiveAgent(p.getActionSet())
    p.init()
    reward = p.act(p.NOOP)
    for i in range(NUM_STEPS):
        obs = p.getScreenRGB()
        reward = p.act(agent.pickAction(reward, obs))
def test_movement_up():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["up"])
    newState = p.getGameState()
    assert oldState["player_velocity"] > newState["player_velocity"]
def __init__(self):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=True)
    self.env.init()
    self.env.getGameState = self.game.getGameState  # maybe not necessary
    # by convention we want to use (0,1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet()  # [None, 119]
def run_a_game(self, game):
    from ple import PLE
    p = PLE(game, display_screen=True)
    agent = NaiveAgent(p.getActionSet())
    p.init()
    reward = p.act(p.NOOP)
    for i in range(NUM_STEPS):
        obs = p.getScreenRGB()
        reward = p.act(agent.pickAction(reward, obs))
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()
    print("Testing model:", agent_file_name)

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while not env.game_over():
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward
        print("Agent score {:0.1f} reward for episode.".format(episode_reward))
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward / test_rounds
def __init__(self, rng, game=None, frame_skip=4,
             ple_options={"display_screen": True, "force_fps": True, "fps": 30}):
    self._mode = -1
    self._mode_score = 0.0
    self._mode_episode_count = 0
    self._frameSkip = frame_skip if frame_skip >= 1 else 1
    self._random_state = rng

    if game is None:
        raise ValueError("Game must be provided")

    self._ple = PLE(game, **ple_options)
    self._ple.init()

    w, h = self._ple.getScreenDims()
    self._screen = np.empty((h, w), dtype=np.uint8)
    self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
    self._actions = self._ple.getActionSet()
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={"display_screen": True, "force_fps": True, "fps": 30}):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)
        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        action = self._actions[action]
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(
            self._mode_score / self._mode_episode_count, self._mode_episode_count))

    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
import numpy as np
import pygame
from pygame.locals import *


class TestAgent():
    def __init__(self, actions):
        self.actions = actions

    def doAction(self, reward, obs):
        # print 'hello'
        for event in pygame.event.get():
            if event.type == KEYDOWN:
                return self.actions[0]
        return None


game = RunningMinion()
# game = WaterWorld()
p = PLE(game, fps=30, display_screen=True)
agent = TestAgent(p.getActionSet())
p.init()
reward = 0.0
nb_frames = 2000

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    if i % 1 == 0:
        obser = p.getScreenRGB()
        action = agent.doAction(reward, obser)
        reward = p.act(action)
epsilon = 0.15  # percentage of time we perform a random action, helps exploration.
epsilon_steps = 30000  # decay steps
epsilon_min = 0.1
lr = 0.01
discount = 0.95  # discount factor
rng = np.random.RandomState(24)

# memory settings
max_memory_size = 100000
min_memory_size = 1000  # number needed before model training starts

epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

# PLE takes our game and the state_preprocessor. It will process the state for our agent.
game = Catcher(width=128, height=128)
env = PLE(game, fps=60, state_preprocessor=nv_state_preprocessor)

agent = Agent(env, batch_size, num_frames, frame_skip, lr, discount, rng,
              optimizer="sgd_nesterov")
agent.build_model()

memory = ReplayMemory(max_memory_size, min_memory_size)

env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
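# Illustrative sketch only (not part of the original snippet, whose training
# loop is truncated above): with the schedule defined there, a typical per-step
# update anneals epsilon linearly and clips it at epsilon_min, e.g.
def decay_epsilon(epsilon, epsilon_rate, epsilon_min):
    """Anneal epsilon by one linear step, never dropping below epsilon_min."""
    return max(epsilon_min, epsilon - epsilon_rate)

# After roughly epsilon_steps calls, epsilon reaches epsilon_min (0.15 -> 0.1 here).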
import numpy as np

from ple import PLE
from ple.games.waterworld import WaterWorld


# lets adjust the rewards our agent receives
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use lower fps so we can see whats happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
p = PLE(game, fps=15, force_fps=False, display_screen=True,
        reward_values=rewards)
# we pass in the rewards and PLE will adjust the game for us

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print("Score: {:0.3f} | Reward: {:0.3f}".format(p.score(), reward))
def agent_training(agent_file_path, agent_file_name, fig_path,
                   num_steps_train_total=5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total // num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # percentage of time we perform a random action, helps exploration.
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = init_agent(env)
    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: ' +
                 str(num_steps_train_total) + '.\n')

    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)
                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1
                if steps < num_steps_train_epoch:
                    learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)
                episode_reward += reward
                testing_rewards.append(testing_rewards[-1] + reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)
    save_agent(my_agent, agent_file_path, agent_file_name)
game = RaycastMaze(map_size=6)  # create our game

fps = 30  # fps we want to run at
frame_skip = 2
num_steps = 1
force_fps = False  # slower speed
display_screen = True

reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance.
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
        force_fps=force_fps, display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(p.getActionSet())

# init agent and game.
p.init()

# lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
    reward = p.act(p.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if p.game_over():
        p.reset_game()

    obs = p.getScreenRGB()
    action = agent.pickAction(reward, obs)
    reward = p.act(action)
# env1.reset()
# for _ in range(1000):
#     env.render()
#     env.step(env.action_space.sample())  # take a random action
#     env1.render()
#     env1.step(env1.action_space.sample())  # take a random action

# from ple.games.pong import Pong
# from ple import PLE
# game = Pong()
# p = PLE(game, fps=30, display_screen=True, force_fps=False)
# p.init()

from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
p = PLE(game, fps=30, display_screen=True)
p.init()

# assumes `agent` (with a pickAction method) and `nb_frames` are defined earlier in the script
reward = 0.0
for i in range(nb_frames):
    if p.game_over():
        p.reset_game()

    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
# You're not allowed to change this file

from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# use "fancy" for full background, random bird color and random pipe color,
# use "fixed" (default) for black background and constant bird and pipe colors.
game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)
    rewards = {}

    try:
        module = importlib.import_module("ple.games.%s" % parameters.game.lower())
        game = getattr(module, parameters.game)
        if parameters.game == "FlappyBird":
            game = game()
        elif parameters.game == "WaterWorld":
            game = game(width=84, height=84, num_creeps=6)
        else:
            game = game(width=84, height=84)
    except:
        raise ValueError("The game %s could not be found. Try using the classname, it is case sensitive." % parameters.game)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    env = PLE(game, fps=60, force_fps=parameters.force_fps,
              display_screen=parameters.display_screen,
              reward_values=rewards, rng=rng)
    num_actions = len(env.getActionSet())

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(defaults.RESIZED_WIDTH,
                                         defaults.RESIZED_HEIGHT,
                                         num_actions,
                                         parameters.phi_length,
                                         parameters.discount,
                                         parameters.learning_rate,
                                         parameters.rms_decay,
                                         parameters.rms_epsilon,
                                         parameters.momentum,
                                         parameters.clip_delta,
                                         parameters.freeze_interval,
                                         parameters.batch_size,
                                         parameters.network_type,
                                         parameters.update_rule,
                                         parameters.batch_accumulator,
                                         rng)
    else:
        handle = open(parameters.nn_file, 'r')
        network = cPickle.load(handle)

    agent = ple_agent.NeuralAgent(network,
                                  parameters.epsilon_start,
                                  parameters.epsilon_min,
                                  parameters.epsilon_decay,
                                  parameters.replay_memory_size,
                                  parameters.experiment_prefix,
                                  parameters.replay_start_size,
                                  parameters.update_frequency,
                                  rng)

    experiment = ple_experiment.PLEExperiment(env, agent,
                                              defaults.RESIZED_WIDTH,
                                              defaults.RESIZED_HEIGHT,
                                              parameters.resize_method,
                                              parameters.epochs,
                                              parameters.steps_per_epoch,
                                              parameters.steps_per_test,
                                              parameters.frame_skip,
                                              parameters.death_ends_episode,
                                              parameters.max_start_nullops,
                                              rng)

    env.init()
    experiment.run()
from ple.games import Doom


class NaiveAgent():
    """
    This is our naive agent. It picks actions at random!
    """

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        return self.actions[np.random.randint(0, len(self.actions))]


###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
    # if the game is over
    if env.game_over():
        env.reset_game()

    action = agent.pickAction(reward, env.getScreenRGB())
    reward = env.act(action)

    if f > 2000:
        env.display_screen = True
        env.force_fps = False
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    # setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
               force_fps=force_fps, display_screen=display_screen)
    game.init()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print("NOOOO")
        game.reset_game()
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    '''

    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            r_t = game.act(np.argmax(a_t))
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print("NOOO2")
                game.reset_game()
            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))

        # write info to files