Example 1
class WaterWorld:
    def __init__(self, fps=30, display_screen=False):
        game = PyGameWaterWorld()
        self.game = PLE(game, fps=fps, display_screen=display_screen)
        action_set = self.game.getActionSet()
        self.action_map = {i: a for (i, a) in enumerate(action_set)}
        self.action_space = spaces.Discrete(len(self.action_map))
        self.metadata = {'render.modes': ['human', 'rgb_array']}

        box = np.ones((48, 48, 3), dtype='float32')
        self.observation_space = spaces.Box(low=box * 0, high=box * 255)

    def reset(self):
        self.game.reset_game()
        return self.game.getScreenRGB()

    def step(self, action):
        a = self.action_map[action]
        r = self.game.act(a)
        done = self.game.game_over()
        info = {}
        return self.game.getScreenRGB(), r, done, info

    def close(self):
        pass
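
# A minimal usage sketch for the wrapper above, assuming the imports it relies
# on (PLE, PyGameWaterWorld, gym.spaces as spaces, numpy as np) are in scope.
env = WaterWorld(fps=30, display_screen=False)
obs = env.reset()
total_reward = 0.0
for _ in range(200):
    action = env.action_space.sample()  # random index into the Discrete action space
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        obs = env.reset()
env.close()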
Example 2
class SnakeEnv(object):

    def __init__(self):
        self.game = Snake()
        self.p = PLE(self.game, fps=30, display_screen=True)

        # self.actions = self.p.getActionSet()
        # self._action_space = list(range(self.actions[0]))
        # self._action_space.append(self.actions[-1])
        self.action_space = self.p.getActionSet()

    def reset(self):
        self.p.init()
        self.p.act(None)
        return self.p.getScreenRGB()
        # return self.p.getScreenGrayscale()

    def step(self, action):
        reward = self.p.act(self.action_space[action])
        # reward = self.p.act(119)
        # print(self.action_space[action], reward)
        return self.p.getScreenRGB(), reward, self.p.game_over()
        # return self.p.getScreenGrayscale(), reward, self.p.game_over()

    @property
    def action_space(self):
        return self._action_space

    @action_space.setter
    def action_space(self, action_space):
        self._action_space = action_space
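
# A minimal rollout sketch for the SnakeEnv above, assuming numpy is imported
# as np; step() indexes into the PLE action set returned by getActionSet().
env = SnakeEnv()
screen = env.reset()
done = False
while not done:
    action = np.random.randint(len(env.action_space))  # random action index
    screen, reward, done = env.step(action)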
Example 3
class WrappedFlappyBird():
    def __init__(self):
        self.score_counter = 0
        self.game = FlappyBird()
        self.env = PLE(self.game, fps=30, display_screen=True)

    def frame_step(self, action_vector):
        if action_vector[0] == 1:
            self.env.act(119)
        elif action_vector[1] == 1:
            self.env.act(1)

        frame = self.env.getScreenRGB()
        reward = self.get_action_reward()
        game_over = self.game.game_over()

        if game_over:
            self.game.reset()

        return frame, reward, game_over

    def get_action_reward(self):
        if self.game.game_over():
            self.score_counter = 0
            return -1
        elif self.score_counter < self.game.getScore():
            self.score_counter = self.game.getScore()
            return 1
        else:
            return 0.1
Example 4
def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            observation = env.getScreenRGB()
            score = env.score()
            #action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)  # predict the action, picking only the best one
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
    return np.mean(eval_reward)
Example 5
def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)

    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()

    out = 1

    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = list(st.values())
                gstate = np.array([np.array(gstate)])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates),
                 np.array(rewards),
                 batch_size=10,
                 epochs=10)
        if record:
            out.release()
Example 6
class Game:
    def __init__(self, game="pixelcopter", fps=30):
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
        self.game_name = game
        if game == "flappy":
            engine = FlappyBird()
        elif game == "pixelcopter":
            engine = Pixelcopter()
        else:
            assert False, "This game is not available"
        engine.rewards["loss"] = -5  # reward at terminal state
        self.reward_terminal = -5
        self.game = PLE(engine, fps=fps, display_screen=False)
        self.game.init()
        self.game.act(0)  # Start the game by providing arbitrary key as input
        self.key_input = self.game.getActionSet()
        self.reward = 0

    def game_over(self):
        return self.game.game_over()

    def reset_game(self):
        self.game.reset_game()
        self.game.act(0)  # Start the game

    def get_image(self):
        return self.game.getScreenRGB()

    def get_torch_image(self):
        image = self.game.getScreenRGB()
        if self.game_name == "flappy":
            image = image[:, :-96, :]  # Remove ground
            image = cv2.cvtColor(cv2.resize(image, (84, 84)),
                                 cv2.COLOR_BGR2GRAY)
            image = np.reshape(image, (84, 84, 1))
        elif self.game_name == "pixelcopter":
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            image = np.reshape(image, (48, 48, 1))
        image[image > 0] = 1
        image = image.transpose(2, 0, 1)  #CHW
        image = image.astype(np.float32)
        image = torch.from_numpy(image)
        return image

    def act(self, action_idx):
        self.reward = self.game.act(self.key_input[action_idx])
        return self.reward
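
# A minimal usage sketch for the Game wrapper above, assuming numpy as np, cv2
# and torch are imported as in the class; act() takes an index into the PLE
# action set stored in key_input.
env = Game(game="pixelcopter", fps=30)
frame = env.get_torch_image()  # binarised CHW float tensor
while not env.game_over():
    idx = np.random.randint(len(env.key_input))  # random action index
    reward = env.act(idx)
    frame = env.get_torch_image()
env.reset_game()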
Example 7
class SnakeEnv():
	def __init__(self, height=32, width=32, fps=15, frame_history_size=4):
		# create the game environment and initialize the attribute values

		self.game = Snake(height=height,width=width,init_length=4)
		reward_dict = {"positive": 1.0, "negative": -1.0, "tick": 0.0, "loss": -1.0, "win": 1.0}
		self.environment = PLE(self.game, fps=fps, reward_values=reward_dict, num_steps=2)
		self.init_env() # initialize the game
		self.allowed_actions = self.environment.getActionSet() # the list of allowed actions to be taken by an agent
		self.num_actions = len(self.allowed_actions) - 1  # number of actions that are allowed in this env
		self.frame_hist = frame_history(height=height,width=width,frame_history_size=frame_history_size,num_channels=3);
		self.input_shape = self.frame_hist.get_history().shape  # shape of the game input screen

	def init_env(self):
		# initialize the variables and screen of the game
		self.environment.init()


	def get_current_state(self):
		# get the current state of the game. Returns the current screen (snake and food
		# positions) stacked with a short history of past frames.
		cur_frame = np.transpose(self.environment.getScreenRGB(),(2,0,1))
		#cur_frame = np.transpose(np.expand_dims(self.environment.getScreenGrayscale(),axis=0), (2, 0, 1))
		self.frame_hist.push(cur_frame)
		return self.frame_hist.get_history()


	def check_game_over(self):
		# check if the game has terminated
		return self.environment.game_over()

	def reset(self):
		# resets the game to initial values and refreshes the screen with a new small snake and random food position.
		self.environment.reset_game()
		_ = self.environment.act(None)
		self.frame_hist.reset(np.transpose(self.environment.getScreenRGB(),(2,0,1)))
		#self.frame_hist.reset(np.transpose(np.expand_dims(self.environment.getScreenGrayscale(),axis=0), (2, 0, 1)))
		return self.frame_hist.get_history()

	def take_action(self, action):
		# lets the snake take the chosen action of moving in some direction
		reward = self.environment.act(self.allowed_actions[action])
		next_state = self.get_current_state()
		done = self.check_game_over()
		return next_state, reward, done, 0
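
# A minimal interaction sketch for the SnakeEnv above, assuming numpy as np and
# the external frame_history helper referenced in __init__ are available.
env = SnakeEnv(height=32, width=32, fps=15, frame_history_size=4)
state = env.reset()  # stacked history of transposed RGB frames
done = False
while not done:
    action = np.random.randint(env.num_actions)  # random index into allowed_actions
    state, reward, done, _ = env.take_action(action)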
 def run_a_game(self, game):
     from ple import PLE
     p = PLE(game, display_screen=True)
     agent = NaiveAgent(p.getActionSet())
     p.init()
     reward = p.act(p.NOOP)
     for i in range(NUM_STEPS):
         obs = p.getScreenRGB()
         reward = p.act(agent.pickAction(reward, obs))
class GameEnv(object):
    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT

        self.count = 0
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0

    def pre_process_image(self, image):
        self.count += 1
        image = color.rgb2gray(image)
        image = transform.resize(image, (self.width, self.height))
        image = exposure.rescale_intensity(image, out_range=(0, 255))
        image = image.astype('float')
        image = image / 255.0
        return image.reshape(1, self.width, self.height, 1)

    def _update_state(self):
        image = self.p.getScreenRGB()
        # TODO: convert to float
        image = self.pre_process_image(image)
        state = getattr(self, 'state', None)
        if state is None:
            self.state = np.concatenate([image] * 4, axis=3)
        else:
            # drop the oldest frame and append the newest one as the last channel
            self.state = np.append(self.state[:, :, :, 1:], image, axis=3)

    def get_state(self):
        return self.state

    def step(self, action):
        if action == 1:
            _ = self.p.act(119)
        else:
            _ = self.p.act(None)

        self._update_state()

        done = False
        if self.p.game_over():
            done = True
            self.p.reset_game()
            reward = -1
        else:
            reward = 0.1

        return_score = self.score + reward
        self.score = 0 if done else self.score + reward

        return self.state, reward, done, return_score

    def get_score(self):
        return self.score
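
# A minimal usage sketch for GameEnv above, assuming numpy as np, the skimage
# helpers (color, transform, exposure) and the module-level IMAGE_WIDTH /
# IMAGE_HEIGHT constants used in __init__ are available.
env = GameEnv(display_screen=False)
state = env.get_state()  # (1, W, H, 4) stack of preprocessed frames
for _ in range(100):
    action = np.random.randint(2)  # 1 = flap (key 119), 0 = no-op (None)
    state, reward, done, score = env.step(action)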
Example 11
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())

    env.init()
    reward = 0.0
    nb_frames = 10000

    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()

        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
def run():
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    #agent = myAgentHere(allowed_actions=p.getActionSet())

    p.init()
    reward = 0.0

    for i in range(150):
        if p.game_over():
            p.reset_game()

        observation = p.getScreenRGB()
        new_image = convert_image(observation)
        cv.imwrite("Imagenes/Gray_Image" + str(i) + ".jpg", new_image)
        action = None
        reward = p.act(action)
Example 14
def test_model_G(nb_games, model):
    game = FlappyBird(
        graphics="fixed"
    )  # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
    p = PLE(game,
            fps=30,
            frame_skip=1,
            num_steps=1,
            force_fps=True,
            display_screen=False)
    p.init()
    reward = 0.0

    cumulated = np.zeros((nb_games))
    list_actions = [0, 119]

    for i in range(nb_games):
        p.reset_game()

        while (not p.game_over()):
            state = game.getGameState()

            screen_x = process_screen(p.getScreenRGB())
            stacked_x = deque([screen_x, screen_x, screen_x, screen_x],
                              maxlen=4)
            x = np.stack(stacked_x, axis=-1)
            action = list_actions[np.argmax(
                model.predict(np.expand_dims(x, axis=0)))]

            reward = p.act(action)

            cumulated[i] = cumulated[i] + reward

    avg_score = np.mean(cumulated)
    print('Average : ' + str(avg_score))
    mx_score = np.max(cumulated)
    print('Max : ' + str(mx_score))
    return avg_score, mx_score
Example 15
def test():
    game2 = FlappyBird()
    p2 = PLE(game2,
             fps=30,
             frame_skip=1,
             num_steps=1,
             force_fps=True,
             display_screen=False)
    p2.init()
    reward = 0.0

    nb_games = 10
    cumulated = np.zeros((nb_games))
    for i in range(nb_games):
        p2.reset_game()

        while (not p2.game_over()):
            state = game2.getGameState()
            screen = p2.getScreenRGB()
            action = FlappyPolicy(state, screen)

            reward = p2.act(action)
            cumulated[i] = cumulated[i] + reward
    return np.mean(cumulated)
Example 16
def main(args):
    logs_path = args.logs_path
    video_path = args.video_path
    restore = args.restore
    train = args.train

    # Initial PLE environment
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"
    # Design reward
    reward_values = {
        "positive": 1, 
        "tick": 0.1, 
        "loss": -1,  
    }
    env = PLE(FlappyBird(), fps=30, display_screen=False, reward_values=reward_values)
    action_set = env.getActionSet()
    
    reply_buffer = Reply_Buffer(Config.reply_buffer_size)
    agent = Agent(action_set)

    reward_logs = []
    loss_logs = []

    # restore model
    if restore:
        agent.restore(restore)

    for episode in range(1, Config.total_episode+1):
        # reset env
        env.reset_game()
        env.act(0)
        obs = convert(env.getScreenGrayscale())
        state = np.stack([[obs for _ in range(4)]], axis=0)
        t_alive = 0
        total_reward = 0
        
        if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: 
            agent.stop_epsilon()
            frames = [env.getScreenRGB()] 
    
        while not env.game_over():
            action = agent.take_action(state)
            reward = env.act(action_set[action])
            if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: 
                frames.append(env.getScreenRGB()) 
            obs = convert(env.getScreenGrayscale())
            obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
            state_new = np.append(state[:, 1:,...], obs, axis=1)
            action_onehot = np.zeros(len(action_set))
            action_onehot[action] = 1
            t_alive += 1
            total_reward += reward
            reply_buffer.append((state, action_onehot, reward, state_new, env.game_over()))
            state = state_new
        
        # save video
        # if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode:
        #     os.makedirs(video_path, exist_ok=True)
        #     clip = make_video(frames, fps=60).rotate(-90)
        #     clip.write_videofile(os.path.join(video_path, 'env_{}.mp4'.format(episode)), fps=60)
        #     agent.restore_epsilon()
        #     print('Episode: {} t: {} Reward: {:.3f}' .format(episode, t_alive, total_reward))
  
        if episode > Config.initial_observe_episode and train:
            # save model
            if episode % Config.save_logs_frequency == 0:
                agent.save(episode, logs_path)
                np.save(os.path.join(logs_path, 'loss.npy'), np.array(loss_logs))
                np.save(os.path.join(logs_path, 'reward.npy'), np.array(reward_logs))
        
            # update target network
            if episode % Config.update_target_frequency == 0:
                agent.update_target_network()
            
            # sample batch from reply buffer 
            batch_state, batch_action, batch_reward, batch_state_new, batch_over = reply_buffer.sample(Config.batch_size)
            
            # update policy network
            loss = agent.update_Q_network(batch_state, batch_action, batch_reward, batch_state_new, batch_over)
        
            loss_logs.extend([[episode, loss]]) 
            reward_logs.extend([[episode, total_reward]]) 
        
            # print reward and loss
            if episode % Config.show_loss_frequency == 0: 
                print('Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}' .format(episode, t_alive, total_reward, loss))
        
            agent.update_epsilon()
Example 17
class Environment(object):
    def __init__(self,
                 env_name,
                 args,
                 atari_wrapper=False,
                 test=False,
                 seed=595):
        game = FlappyBird(width=144, height=256, pipe_gap=80)
        self.test = test
        #define reward
        reward_func = {
            "positive": 1,
            "negative": -1.0,
            "tick": 1,
            "loss": -5.0,
            "win": 1.0
        }

        self.p = PLE(game,
                     fps=30,
                     display_screen=False,
                     force_fps=True,
                     reward_values=reward_func,
                     rng=seed)
        self.observation = np.zeros((144, 256, 4, 3))
        # if atari_wrapper:
        #     clip_rewards = not test
        #     self.env = make_wrap_atari(env_name, clip_rewards)
        # else:
        #     self.env = gym.make(env_name)

        self.action_space = self.p.getActionSet()
        # self.observation_space = self.env.observation_space

    def reset(self):
        '''
        When running dqn:
            observation: np.array
                stack 4 last frames, shape: (84, 84, 4)

        When running pg:
            observation: np.array
                current RGB screen of game, shape: (210, 160, 3)
        '''

        self.p.reset_game()
        observation = self.p.getScreenRGB()
        self.observation[:, :, 0:-1, :] = self.observation[:, :, 1:, :]
        self.observation[:, :, -1, :] = observation
        return self.observation.reshape(144, 256, 12)

    def step(self, action):
        reward = self.p.act(action)

        observation = self.p.getScreenRGB()
        if self.p.game_over():
            done = True
        else:
            done = False
        self.observation[:, :, 0:-1, :] = self.observation[:, :, 1:, :]
        self.observation[:, :, -1, :] = observation

        return self.observation.reshape(144, 256, 12), reward, done, None

    def get_action_space(self):
        return self.action_space

    # def get_observation_space(self):
    #     return self.observation_space

    def get_random_action(self):
        # getActionSet() returns a plain list of keys, so sample from it directly
        return self.action_space[np.random.randint(len(self.action_space))]
Example 18
    dqn_target = load_model(filepath=path_model)

    # Init game
    game = FlappyBird(graphics="fixed")
    # TODO: considering to change frame_skip ?
    # frame_skip=4 for some atari games, 3 for others.... To change?
    if params.DISPLAY_GAME:
        p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
    else:
        p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)

    p.init()

    # Training
    p.reset_game()
    screen_x = process_screen(p.getScreenRGB())
    stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
    x = np.stack(stacked_x, axis=-1)
    replay_memory = MemoryBuffer(params.REPLAY_MEMORY_SIZE, screen_x.shape, (1,))

    # Evaluation barrier
    mean_score = 0
    training_score = 0

    # Deep Q-learning with experience replay
    for step in range(params.TOTAL_STEPS):
        logger_train.debug("Step {} / {} ----> epsilon={}".format(step, params.TOTAL_STEPS, epsilon(step)))
        print("Step {} / {} ----> epsilon={}".format(step, params.TOTAL_STEPS, epsilon(step)))

        if step % params.EVALUATION_PERIOD == 0 and step > 0 and params.EVALUATION and mean_score < 120:
            logger_train.info("Evaluating...")
Example 19
def experiment(device,
               reward_system,
               PIPEGAP,
               BATCH_SIZE,
               learning_rate,
               MEMORY_SIZE,
               GAMMA,
               EPS_START,
               EPS_END,
               EPS_DECAY,
               OBSERVE,
               FRAME_PER_ACTION,
               TARGET_UPDATE,
               num_episodes,
               save_model=False,
               load_model=False,
               load_model_path_prefix=None):
    expected_q_value = 0

    policy_net = RL.DQN().to(device)
    target_net = RL.DQN().to(device)
    if load_model:
        policy_net.load_state_dict(
            torch.load(load_model_path_prefix + "_policy_net.mdl"))
        target_net.load_state_dict(
            torch.load(load_model_path_prefix + "_target_net.mdl"))
    else:
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = RL.ReplayMemory(MEMORY_SIZE)

    #Setup Game environment
    game = FlappyBird.FlappyBird(pipe_gap=PIPEGAP)
    env = PLE(game,
              fps=30,
              display_screen=True,
              force_fps=True,
              reward_values=reward_system)

    #Setup plot
    RLplot.plot_init()
    episode_durations = []

    # Main part with game execution

    env.init()
    steps_done = 0
    infinity = False

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset_game()
        state = env.getScreenRGB()
        state = RLip.BCHW_format(state)
        frames = (state, state, state, state)
        state = RLip.last_4_frames(state, frames[1], frames[2], frames[3])

        for t in count():
            # Select an action
            action, steps_done = RL.select_action(state, policy_net,
                                                  steps_done, device,
                                                  EPS_START, EPS_END,
                                                  EPS_DECAY, OBSERVE)
            if steps_done % FRAME_PER_ACTION != 0:
                action = torch.tensor([[1]], device=device, dtype=torch.long)

            # Perform an action
            reward = env.act(env.getActionSet()[action[0, 0]])
            next_state = env.getScreenRGB()
            done = env.game_over()
            reward = torch.tensor([reward], device=device)

            # Formatting next state for network
            if not done:
                next_state = RLip.BCHW_format(next_state)
                frames = (next_state, frames[0], frames[1], frames[2])
                next_state = RLip.last_4_frames(next_state, frames[1],
                                                frames[2], frames[3])
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)  # edit

            # Move to the next state
            state = next_state

            # Print Log of training info
            if steps_done <= OBSERVE:
                state_of_training = "observe"
            elif steps_done > OBSERVE and steps_done <= OBSERVE + EPS_DECAY:
                state_of_training = "explore"
            else:
                state_of_training = "train"
            print("TIMESTEP", steps_done, "/ STATE", state_of_training,\
                 "/ ACTION", action[0,0].data,"/ REWARD", reward[0].data,"/ Expected_Q",expected_q_value)

            # Perform one step of the optimization (on the target network)
            if steps_done > OBSERVE:
                RL.optimize_model(policy_net, target_net, memory, optimizer,
                                  device, BATCH_SIZE, GAMMA)
                if done:
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
                if t > 10000:
                    infinity = True
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
            else:
                if done:
                    break

        # Update the target network
        if i_episode % TARGET_UPDATE == 0 and steps_done > OBSERVE:
            target_net.load_state_dict(policy_net.state_dict())
        if infinity:
            break
    # End training process
    # Save experiment result
    data = {
        "data": episode_durations,
        'pipe_gap': PIPEGAP,
        'reward_values': reward_system,
        'BATCH_SIZE': BATCH_SIZE,
        'learning_rate': learning_rate,
        'MEMORY_SIZE': MEMORY_SIZE,
        'GAMMA': GAMMA,
        'EPS_START': EPS_START,
        'EPS_END': EPS_END,
        'EPS_DECAY': EPS_DECAY,
        'OBSERVE': OBSERVE,
        'FRAME_PER_ACTION': FRAME_PER_ACTION,
        'TARGET_UPDATE': TARGET_UPDATE,
        'num_episodes': num_episodes
    }
    filenameprefix = './result/Expe_' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S')
    filename = filenameprefix + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    # Save model if said so
    if save_model:
        torch.save(policy_net.state_dict(), filenameprefix + '_policy_net.mdl')
        torch.save(target_net.state_dict(), filenameprefix + '_target_net.mdl')

    # Save plot figure
    plotname = filenameprefix + '.png'
    RLplot.plot_end(plotname)
Example 20
class PLEFlappyBird():
    """
    PyGame Learning Environment for use only with FlappyBird.
    Does pre-processing specific to FlappyBird game.
    """
    def __init__(self, render=False, seed=0, pipe_gap=100):
        self.seed = seed
        print('SEED: {}'.format(self.seed))
        game = FlappyBird(pipe_gap=pipe_gap)
        self.env = PLE(game, fps=30, display_screen=render, rng=seed)
        self.env.init()
        self.full_state = np.zeros((1, 4, 80, 80), dtype=np.uint8)
        self.frame_sleep = 0.02

    def _prepro(self, frame):
        """Pre-process 288x512x3 uint8 frame into 80x80 uint8 frame."""
        frame = frame[:, :, 2]  # drop to one color channel
        frame = frame.T  # rotate 90 degrees
        frame[frame == 140] = 0  # filter out background
        frame[frame == 147] = 0
        frame[frame == 160] = 0
        frame[frame == 194] = 0
        frame[frame == 210] = 0
        frame[frame != 0] = 255  # set everything else to 255
        frame = cv2.resize(frame, (80, 80))  # downsample
        #show_frame(frame)  # DEBUG
        return frame

    def _add_frame(self, frame):
        """ Add single frame to state.  Used for processing multiple states over time."""
        self.full_state[:, 3, :, ::] = self.full_state[:, 2, :, ::]
        self.full_state[:, 2, :, ::] = self.full_state[:, 1, :, ::]
        self.full_state[:, 1, :, ::] = self.full_state[:, 0, :, ::]
        self.full_state[:, 0, :, ::] = frame

    def reset(self):
        """Reset the environment."""
        self.env.reset_game()
        frame = self.env.getScreenRGB()
        #print('reset() frame from environment:  {}'.format(frame.shape))  # DEBUG
        frame = self._prepro(frame)
        #print('reset() frame after _prepro():  {}'.format(frame.shape))  # DEBUG
        frame = np.expand_dims(frame, axis=0)
        #print('reset() frame after reshape:  {}'.format(frame.shape))  # DEBUG
        self._add_frame(frame)
        self._add_frame(frame)
        self._add_frame(frame)
        self._add_frame(frame)
        #print('reset():  {}'.format(self.full_state.shape))  # DEBUG
        #show_frames_2d(self.full_state)  # DEBUG
        return self.full_state.copy()

    def step(self, action):
        """Take a step in the environment."""
        reward = self.env.act(action)
        frame = self.env.getScreenRGB()
        done = True if self.env.game_over() else False
        #print('step() frame from environment:  {}'.format(frame))  # DEBUG
        frame = self._prepro(frame)
        #print('step() frame after _prepro():  {}'.format(frame))  # DEBUG
        frame = np.expand_dims(frame, axis=0)
        #print('step() frame after reshape:  {}'.format(frame))  # DEBUG
        self._add_frame(frame)
        #print('step():  {}'.format(self.full_state))  # DEBUG
        #show_frames_2d(self.full_state)  # DEBUG
        return self.full_state.copy(), reward, done

    def render(self):
        """
        Render the environment to visualize the agent interacting.
        Does nothing because rendering is handled by setting display_screen=True
        when creating the PLE() object.
        """
        pass
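
# A minimal rollout sketch for PLEFlappyBird above, assuming numpy as np and
# cv2 are imported as in the class; 119 ('w') flaps and None is PLE's no-op.
env = PLEFlappyBird(render=False, seed=0, pipe_gap=100)
state = env.reset()  # (1, 4, 80, 80) uint8 frame stack
done = False
while not done:
    action = 119 if np.random.rand() < 0.1 else None
    state, reward, done = env.step(action)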
class OriginalGameEnv(gym.Env):
    def __init__(self, task={}):
        self._task = task
        os.environ['SDL_VIDEODRIVER'] = 'dummy'

        import importlib
        game_module = importlib.import_module('ple.games.originalgame')
        game = getattr(game_module, 'originalGame')()

        self.game_state = PLE(game, fps=30, display_screen=False)
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))

        self.num_actions = len(self._action_set)
        self.viewer = None

    def seed(self, seed=None):
        if not seed:
            seed = np.random.randint(2**31 - 1)
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
        return [seed]

    def reset_task(self, task):
        pass

    def render(self, mode='human'):
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def reset(self):
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _get_image(self):
        image_rotated = np.fliplr(
            np.rot90(self.game_state.getScreenRGB(),
                     3))  # Hack to fix the rotated image returned by ple
        return image_rotated

    def step(self, action):
        reward = self.game_state.act(self._action_set[action])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}
Example 22
from ple.games.flappybird import FlappyBird
from ple import PLE
import random

game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()

nb_frames = 1000
reward = 0.0

for f in range(nb_frames):
    if p.game_over():  #check if the game is over
        p.reset_game()

    obs = p.getScreenRGB()
    action = random.sample(p.getActionSet(), 1)[0]
    reward = p.act(action)
    print(action, reward)
Example 23
# env1.reset()
# for _ in range(1000):
#     env.render()
#     env.step(env.action_space.sample())  # take a random action
#     env1.render()
#     env1.step(env1.action_space.sample())  # take a random action

# from ple.games.pong import Pong
# from ple import PLE

# game = Pong()
# p = PLE(game, fps=30, display_screen=True, force_fps=False)
# p.init()
# 
from ple.games.flappybird import FlappyBird
from ple import PLE


game = FlappyBird()
p = PLE(game, fps=30, display_screen=True)

p.init()
reward = 0.0

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()

    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
	"""
	def __init__(self, actions):
		self.actions = actions

	def pickAction(self, reward, obs):
		return self.actions[np.random.randint(0, len(self.actions))]

###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
    # if the game is over
    if env.game_over():
        env.reset_game()

    action = agent.pickAction(reward, env.getScreenRGB())
    reward = env.act(action)

    if f > 2000:
        env.display_screen = True
        env.force_fps = False

    if f > 2250:
        env.display_screen = True
        env.force_fps = True
Example 25
import numpy as np
import pygame
from pygame.locals import *


class TestAgent():
	def __init__(self, actions):
		self.actions = actions
	def doAction(self, reward, obs):
		# take the first action on any key press, otherwise do nothing
		for event in pygame.event.get():
			if event.type == KEYDOWN:
				return self.actions[0]
		return None

game = RunningMinion()
#game = WaterWorld()
p = PLE(game, fps=30, display_screen=True)
agent = TestAgent(p.getActionSet())

p.init()
reward = 0.0
nb_frames = 2000

for i in range(nb_frames):
	if p.game_over():
		p.reset_game()
	if i%1==0:
		obser = p.getScreenRGB()
		action = agent.doAction(reward,obser)
		reward = p.act(action)
Example 26
p = PLE(game,
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=True,
        display_screen=True)
p.init()
episode_counter = 0
counter = 0  # counter to control the reduction of epsilon

# store the previous observations in replay memory
D = deque()

# First action, don't flap.
p.act(ACTIONS[0])
x_t = p.getScreenRGB()
terminal = p.game_over()

x_t = skimage.color.rgb2gray(x_t)
x_t = skimage.transform.resize(x_t, (80, 80))
x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

#In Keras, need to reshape
s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  #1*80*80*4

#We go to training mode
epsilon = INITIAL_EPSILON
t = 0
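
# A sketch of how the next stacked state is typically built from a new frame,
# reusing the same skimage preprocessing and the p / s_t names defined above.
x_t1 = p.getScreenRGB()
x_t1 = skimage.color.rgb2gray(x_t1)
x_t1 = skimage.transform.resize(x_t1, (80, 80))
x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
x_t1 = x_t1.reshape(1, 80, 80, 1)
s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)  # keep the 3 most recent frames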
Example 27
# Load the model
save_path = '.\model_dir\model_6700_2823.0.ckpt'  #episode_reward: 1785.0
agent.restore(save_path)

obs = list(env.getGameState().values())
# # preprocess obs
# obs = preprocess(obs)
episode_reward = 0
while True:
    # Predict the action, picking only the best one
    action = agent.predict(obs)
    # Sleep if the image refresh is too fast
    # time.sleep(0.02)  # delay in seconds
    # # Show the score in a new window
    observation = env.getScreenRGB()
    score = env.score()
    # Convert the color format
    observation = cv2.cvtColor(observation, cv2.COLOR_RGB2BGR)
    # Rotate 90 degrees
    observation = cv2.transpose(observation)
    font = cv2.FONT_HERSHEY_SIMPLEX
    observation = cv2.putText(observation, "score:" + str(int(score)), (0, 30),
                              font, 0.6, (0, 0, 255), 2)
    cv2.imshow("flappybird", observation)
    cv2.waitKey(5)

    reward = env.act(actionset[action])
    obs = list(env.getGameState().values())
    # # preprocess obs
    # obs = preprocess(obs)
Example 28
                if next_pipe_bottom_y - 8 < player_pos_y:
                    return True

            return False


agent = NaiveAgent(p.getActionSet())
print(p.getActionSet())
reward = 0.0

for i in range(nb_frames):

    if p.game_over():
        p.reset_game()

    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
    state = game.getGameState()
    player_y = state["player_y"]
    distance = state["next_pipe_dist_to_player"]
    width = state["next_pipe_width"]
    #player_x = state["player_x"]
    pipe_x = state["next_pipe_x"]
    #dist = previousState["next_pipe_dist_to_player"]

    print(distance)
    print(width)
    #print(player_x)
    print(pipe_x)
    #print(player_y)
Example 29
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    
    while(not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action=FlappyPolicy(state, screen) ### Your job is to define this function.
        
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
class Agent:

    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0
    INITIAL_IMAGES = np.zeros((80, 80, 4))

    # based on the documentation, the feature vector has 8 dimensions
    # the output has 2 dimensions: 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.model = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.model_negative = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.trainable = tf.trainable_variables()
        self.rewards = []

    def _assign(self):
        for i in range(len(self.trainable) // 2):
            assign_op = self.trainable[i + len(self.trainable) // 2].assign(
                self.trainable[i])
            self.sess.run(assign_op)

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        Q_new_negative = self.sess.run(
            self.model_negative.logits,
            feed_dict={self.model_negative.X: new_states})
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * Q_new_negative[
                    i, np.argmax(Q_new[i])]
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.model.logits,
                             feed_dict={self.model.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess,
                        os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess,
                           os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:, :, k] = state
            dead = False
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign()
                action = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]),
                                      self.INITIAL_IMAGES[:, :, :3],
                                      axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward, new_state,
                               dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                # the cost/optimizer ops and their placeholders are assumed to
                # live on the Model object, as in predict() above
                cost, _ = self.sess.run([self.model.cost, self.model.optimizer],
                                        feed_dict={
                                            self.model.X: X,
                                            self.model.Y: Y
                                        })
                self.INITIAL_IMAGES = new_state
                self.T_COPY += 1
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (
                1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
Example 31
def evaluate(agent1, agent2, agent3):
    input("Start the match")
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')

    frame_number = 0
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []

    for i in range(5):
        output_movie = cv2.VideoWriter(videoname + '_' + str(i) + '.mp4',
                                       fourcc, 20, (288, 512))
        env.init()
        env.reset_game()
        dstate = env.getGameState()
        # print(dstate)
        obs = list(dstate.values())

        last_obs = np.zeros_like(obs[0:8])
        episode_reward = 0
        while True:
            obs1 = obs[0:8]
            obs2 = obs[8:16]
            obs3 = obs[16:24]
            action1 = agent1.predict(obs1)
            action2 = agent2.predict(obs2)
            action3 = agent3.predict(last_obs, obs3)

            finalaction = 0
            if action1 == 0:
                finalaction += 1
            if action2 == 0:
                finalaction += 2
            if action3 == 0:
                finalaction += 4
            # print("action1: ", action1)
            # print("action2: ", action2)
            # print("action3: ", action3)
            # print("action: ", finalaction)
            # print(obs)
            # print(obs1)
            # print(obs2)
            # print(obs3)
            if finalaction == 0:
                finalaction = None
            score = env.score()

            observation = env.getScreenRGB()
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            ss = observation.shape
            observation = cv2.resize(observation, (ss[1] * 2, ss[0] * 2))
            output_movie.write(observation)
            cv2.imshow("ss", observation)
            cv2.waitKey(30)  # predict the action, picking only the best one

            reward = env.act(finalaction)
            last_obs = obs3
            dstate = env.getGameState()
            # print(dstate)
            obs = list(dstate.values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
            # input()
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
        output_movie.release()
        input()
    return np.mean(eval_reward)
Example 32
import FlappyPolicy

game = FlappyBird()
p = PLE(game,
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=False,
        display_screen=True)

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while (not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state,
                              screen)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
          display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(env.getActionSet())

# init agent and game.
env.init()

# let's do a random number of NOOPs
for i in range(np.random.randint(0, max_noops)):
    reward = env.act(env.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if env.game_over():
        env.reset_game()

    obs = env.getScreenRGB()
    action = agent.pickAction(reward, obs)
    reward = env.act(action)

    # if f % 50 == 0:
    #     p.saveScreen("tmp/screen_capture.png")

    print(f)

    if f > 50:
        env.display_screen = True
        env.force_fps = True
class Agent:

    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    INITIAL_IMAGES = np.zeros((80, 80, 4))

    # based on the documentation, the feature vector has 8 dimensions
    # the output has 2 dimensions: 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState

        def conv_layer(x, conv, stride=1):
            return tf.nn.conv2d(x,
                                conv, [1, stride, stride, 1],
                                padding='SAME')

        def pooling(x, k=2, stride=2):
            return tf.nn.max_pool(x,
                                  ksize=[1, k, k, 1],
                                  strides=[1, stride, stride, 1],
                                  padding='SAME')

        self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
        self.Y = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
        b_conv1 = tf.Variable(tf.truncated_normal([32], stddev=0.01))
        conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4) + b_conv1)
        pooling1 = pooling(conv1)
        w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
        b_conv2 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
        conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2) + b_conv2)
        w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
        b_conv3 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
        conv3 = tf.nn.relu(conv_layer(conv2, w_conv3) + b_conv3)
        pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(
            conv3.shape[3])
        conv3 = tf.reshape(conv3, [-1, pulling_size])
        self.tensor_action, self.tensor_validation = tf.split(conv3, 2, 1)
        w_action1 = tf.Variable(
            tf.truncated_normal([pulling_size // 2, 256], stddev=0.1))
        w_action2 = tf.Variable(
            tf.truncated_normal([256, self.OUTPUT_SIZE], stddev=0.1))
        w_validation1 = tf.Variable(
            tf.truncated_normal([pulling_size // 2, 256], stddev=0.1))
        w_validation2 = tf.Variable(tf.truncated_normal([256, 1], stddev=0.1))
        fc_action1 = tf.nn.relu(tf.matmul(self.tensor_action, w_action1))
        fc_action2 = tf.matmul(fc_action1, w_action2)
        fc_validation1 = tf.nn.relu(
            tf.matmul(self.tensor_validation, w_validation1))
        fc_validation2 = tf.matmul(fc_validation1, w_validation2)
        self.logits = fc_validation2 + tf.subtract(
            fc_action2, tf.reduce_mean(fc_action2, axis=1, keep_dims=True))
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.logits, feed_dict={self.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess,
                        os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess,
                           os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:, :, k] = state
            dead = False
            while not dead:
                action = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]),
                                      self.INITIAL_IMAGES[:, :, :3],
                                      axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward, new_state,
                               dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                cost, _ = self.sess.run([self.cost, self.optimizer],
                                        feed_dict={
                                            self.X: X,
                                            self.Y: Y
                                        })
                self.INITIAL_IMAGES = new_state
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (
                1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
reward = 0.0
max_noops = 20
nb_frames = 15000

#make a PLE instance.
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps, 
	force_fps=force_fps, display_screen=display_screen)

#our Naive agent!
agent = NaiveAgent(p.getActionSet())

#init agent and game.
p.init()

# let's do a random number of NOOPs
for i in range(np.random.randint(0, max_noops)):
	reward = p.act(p.NOOP)

#start our training loop
for f in range(nb_frames):
	#if the game is over
        if p.game_over():
            p.reset_game()
            
        obs = p.getScreenRGB()
        action = agent.pickAction(reward, obs)
        reward = p.act(action)

	if f % 50 == 0:
		p.saveScreen("screen_capture.png")
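
The script above relies on a NaiveAgent class that is not included in this excerpt. A minimal sketch of what such an agent could look like, assuming it simply picks a random action from the action set on every frame (hypothetical, in the spirit of the PLE quickstart):

import numpy as np


class NaiveAgent:
    """Picks a random action from the available action set each frame."""

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        # reward and obs are ignored; the agent acts uniformly at random
        return self.actions[np.random.randint(0, len(self.actions))]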
Esempio n. 36
0
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='FlappyBird', display_screen=True):
        # open up a game state to communicate with emulator
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game_state = PLE(game, fps=30, display_screen=display_screen)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.viewer = None

    def _step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}

    def _get_image(self):
        image_rotated = np.fliplr(
            np.rot90(self.game_state.getScreenRGB(),
                     3))  # Hack to fix the rotated image returned by ple
        return image_rotated

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(self.screen_width,
                                                   self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng

        self.game_state.init()
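
A hypothetical usage sketch (not part of the original example) that rolls out a few random episodes with PLEEnv; it assumes the ple package and a game such as FlappyBird are installed, and it calls the class's own _reset/_step methods directly:

env = PLEEnv(game_name='FlappyBird', display_screen=False)
for episode in range(3):
    obs = env._reset()
    done = False
    episode_reward = 0.0
    while not done:
        action = env.action_space.sample()  # uniform random action index
        obs, reward, done, info = env._step(action)
        episode_reward += reward
    print('episode', episode, 'reward:', episode_reward)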
Esempio n. 37
0
class Agent:

    MEMORY_SIZE = 300
    BATCH = 32
    POPULATION_SIZE = 15
    SIGMA = 0.1
    LEARNING_RATE = 0.03
    EPSILON = 1
    MIN_EPSILON = 0.1
    WATCHING = 10000
    FEATURES = 8
    GAMMA = 0.99
    MEMORIES = deque()
    INITIAL_IMAGES = np.zeros((80, 80, 4))
    # based on the documentation, the game state has 8 feature dimensions

    def __init__(self, model, screen=False, forcefps=True):
        self.model = model
        self.game = MonsterKong()
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.es = Deep_Evolution_Strategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)

    def _get_image(self, image):
        r, g, b = image[:,:,0], image[:,:,1], image[:,:,2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size = (80, 80))

    def _map_action(self, action):
        if action == 0:
            return 97
        if action == 1:
            return 100
        if action == 2:
            return 119
        if action == 3:
            return 115
        if action == 4:
            return 32

    def _memorize(self, state, action, reward, new_state, done):
        self.MEMORIES.append((state, action, reward, new_state, done))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.model.predict(states)
        Q_new = self.model.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, 5))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, done_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not done_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def get_predicted_action(self, sequence):
        if random.random() > self.EPSILON:
            prediction = np.argmax(self.model.predict(np.array(sequence))[0])
        else:
            prediction = np.random.randint(5)
        self.EPSILON -= (self.EPSILON / self.WATCHING)
        return prediction

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def save(self, checkpoint_name):
        with open('%s-weight.p'%(checkpoint_name), 'wb') as fopen:
            pickle.dump(self.model.get_weights(), fopen)

    def load(self, checkpoint_name):
        with open('%s-weight.p'%(checkpoint_name), 'rb') as fopen:
            self.model.set_weights(pickle.load(fopen))

    def get_reward(self, weights):
        self.model.set_weights(weights)
        self.env.reset_game()
        state = self._get_image(self.env.getScreenRGB())
        for i in range(self.INITIAL_IMAGES.shape[2]):
            self.INITIAL_IMAGES[:,:,i] = state
        dead = False
        while not dead:
            action = self.get_predicted_action([self.INITIAL_IMAGES])
            real_action = self._map_action(action)
            reward = self.env.act(real_action)
            # tiny random jitter on the reward (presumably to break ties between identical rollouts)
            reward += random.choice([0.0001, -0.0001])
            state = self._get_image(self.env.getScreenRGB())
            new_state = np.append(state.reshape([80, 80, 1]), self.INITIAL_IMAGES[:, :, :3], axis = 2)
            dead = self.env.game_over()
            self._memorize(self.INITIAL_IMAGES, action, reward, new_state, dead)
            self.INITIAL_IMAGES = new_state
        batch_size = min(len(self.MEMORIES), self.BATCH)
        replay = random.sample(self.MEMORIES, batch_size)
        X, Y = self._construct_memories(replay)
        actions = self.model.predict(X)
        return -np.mean(np.square(Y - actions))

    def fit(self, iterations, checkpoint):
        self.es.train(iterations,print_every=checkpoint)

    def play(self, debug=False, not_realtime=False):
        total_reward = 0.0
        current_reward = 0.0
        self.env.force_fps = not_realtime
        self.env.reset_game()
        state = self._get_image(self.env.getScreenRGB())
        for k in range(self.INITIAL_IMAGES.shape[2]):
            self.INITIAL_IMAGES[:, :, k] = state
        done = False
        while not done:
            action = np.argmax(self.model.predict(np.array([self.INITIAL_IMAGES]))[0])
            real_action = self._map_action(action)
            if debug and total_reward > current_reward:
                print('action:', action, 'total rewards:', total_reward)
            current_reward = total_reward
            total_reward += self.env.act(real_action)
            # keep the four-frame stack in sync with the screen while playing
            state = self._get_image(self.env.getScreenRGB())
            self.INITIAL_IMAGES = np.append(state.reshape([80, 80, 1]),
                                            self.INITIAL_IMAGES[:, :, :3], axis=2)
            done = self.env.game_over()
        print('game over!')
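
The Agent above delegates weight updates to a Deep_Evolution_Strategy object that is not shown in this excerpt. A minimal sketch of a compatible natural-evolution-strategies trainer, assuming the weights are a list of NumPy arrays and get_reward returns a scalar fitness; it is hypothetical and matches only the constructor and train(epoch, print_every) interface used above:

import numpy as np


class Deep_Evolution_Strategy:

    def __init__(self, weights, reward_function, population_size, sigma, learning_rate):
        self.weights = weights
        self.reward_function = reward_function
        self.population_size = population_size
        self.sigma = sigma
        self.learning_rate = learning_rate

    def _get_weight_from_population(self, weights, population):
        # perturb every weight tensor with scaled Gaussian noise
        return [w + self.sigma * p for w, p in zip(weights, population)]

    def get_weights(self):
        return self.weights

    def train(self, epoch=100, print_every=1):
        for i in range(epoch):
            # sample one set of noise tensors per candidate in the population
            population = [[np.random.randn(*w.shape) for w in self.weights]
                          for _ in range(self.population_size)]
            rewards = np.array([
                self.reward_function(self._get_weight_from_population(self.weights, p))
                for p in population
            ])
            # standardise rewards so the update step is scale-invariant
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
            for index, w in enumerate(self.weights):
                A = np.array([p[index] for p in population])
                gradient = np.dot(A.T, rewards).T
                self.weights[index] = w + self.learning_rate / (
                    self.population_size * self.sigma) * gradient
            if (i + 1) % print_every == 0:
                print('iter', i + 1, 'reward:', self.reward_function(self.weights))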