Example #1
class PLEEnv(Env):
    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        if not self.env_instance:
            self.env_instance = PLE(self.game,
                                    fps=30,
                                    display_screen=self.render)
            self.env_instance.init()

    def step(self, action):
        reward = self.env_instance.act(action)
        obs = self.env_instance.getGameState()
        done = self.env_instance.game_over()
        return obs, reward, done

    def reset(self):
        self.env_instance.reset_game()
        obs = self.env_instance.getGameState()
        return obs

    def close(self):
        pass

    def restart(self):
        self.close()
        self.reset()
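A minimal sketch of how this wrapper might be driven; the Env base class (assumed to store _id/render/reset_done and to start with env_instance set to None) is not shown above, and FlappyBird is only an illustrative game choice:

from ple.games.flappybird import FlappyBird

env = PLEEnv(FlappyBird(), _id=0, render=False)
obs = env.reset()
done = False
while not done:
    # drive the loop with the first action in PLE's action set
    action = env.env_instance.getActionSet()[0]
    obs, reward, done = env.step(action)
env.restart()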
Example #2
def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # predict the action (greedy: pick only the best action)
            observation = env.getScreenRGB()
            score = env.score()
            #action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
    return np.mean(eval_reward)
Example #3
class PleEnvAdapter(EnvAdapter):
    """Pygame learning env adapter"""
    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)

        if not self.render:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        Game = envs_lookup_table[self.env_name]
        self.env = PLE(Game(),
                       display_screen=self.render,
                       force_fps=not self.render)
        self.env.init()

    def get_input_shape(self):
        return (len(self.env.getGameState()), )

    def reset(self):
        self.env.reset_game()

    def step(self, action) -> (object, float, bool):
        reward = self.env.act(self.env.getActionSet()[action])
        observation = self.env.getGameState()
        observation = [val for key, val in observation.items()]
        done = self.env.game_over()
        return observation, reward, done

    def get_n_actions(self) -> int:
        return len(self.env.getActionSet())

    def get_random_action(self):
        return random.randint(0, len(self.env.getActionSet()) - 1)
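A hypothetical rollout using the adapter above; the EnvAdapter constructor arguments shown here (env_name, render) and the contents of envs_lookup_table are assumptions, not taken from the snippet:

adapter = PleEnvAdapter(env_name='flappybird', render=False)  # hypothetical kwargs
adapter.reset()
done, total_reward = False, 0.0
while not done:
    obs, reward, done = adapter.step(adapter.get_random_action())
    total_reward += reward
print('input shape:', adapter.get_input_shape(), 'episode reward:', total_reward)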
Example #4
def test_movement_up():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["up"])
    newState = p.getGameState()
    assert oldState["player_velocity"] > newState["player_velocity"]
Example #5
class Game(gym.Env):
    def __init__(self, display_screen=False, force_fps=True):
        os.environ["SDL_VIDEODRIVER"] = "dummy"
        game = FlappyBird()  # define and initiate the environment
        self.env = PLE(game,
                       fps=30,
                       display_screen=display_screen,
                       force_fps=force_fps)
        self.env.init()
        # list of actions in the environment
        self.actions = self.env.getActionSet()
        # length of actions
        self.action_space = spaces.Discrete(len(self.actions))

    def step(self, action):
        """Take the action chosen and update the reward"""
        reward = self.env.act(self.actions[action])
        state = self.getGameState()
        terminal = self.env.game_over()

        # If the bird crashes, the game is over and the reward is -1000;
        # otherwise it survives the step and gets +1.
        if terminal:
            reward = -1000
        else:
            reward = 1

        return state, reward, terminal, {}

    def getGameState(self):
        '''
        PLE returns the game state as a dictionary. This returns a reduced
        form of that state with only the information needed to define the
        agent's state.
        '''
        state = self.env.getGameState()
        h_dist = state['next_pipe_dist_to_player']
        v_dist = state['next_pipe_bottom_y'] - state['player_y']
        vel = state['player_vel']

        return ' '.join([str(vel), str(h_dist), str(v_dist)])

    def reset(self):
        """Resets the game to start a new game"""
        self.env.reset_game()
        state = self.env.getGameState()
        return state

    def seed(self, seed):
        rng = np.random.RandomState(seed)
        self.env.rng = rng
        self.env.game.rng = self.env.rng

        self.env.init()
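A short driving loop for the wrapper above, using random actions; it assumes the class is importable as Game and that any fixed seed is acceptable:

env = Game(display_screen=False)
env.seed(0)
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # 0 or 1, sampled uniformly
    state, reward, done, info = env.step(action)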
Example #6
def test():
    # create the environment
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # load the model
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if (steps == 1):
            continue
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # predict the action (greedy: best action only)
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48, (255, 0, 0),
                 (255, 255, 255))
    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
Example #7
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    #penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())

    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore  0.1
        e_greed_decrement=1e-6  #1e-6
    )  # probability of exploring is decreasing during training

    # load the model
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("model loaded successfully")
    eval_reward = evaluate(agent, penv)
Example #8
class Env:
    def __init__(self):
        # initializing the instance of FlappyBird class
        self.game = FlappyBird(pipe_gap=100)
        # then pass this object into PLE constructor and create an instance of that
        self.env = PLE(self.game, fps=30, display_screen=False)
        # init does some necessary things under the hood
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        self.action_map = self.env.getActionSet()

    # function which takes an action
    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
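A plain episode loop is the usual way to drive this wrapper; the random policy below is illustrative and not part of the original example:

env = Env()
obs = env.reset()
done, episode_reward = False, 0.0
while not done:
    action = np.random.randint(len(env.action_map))  # random index: no-op or flap
    obs, reward, done = env.step(action)
    episode_reward += reward
print('episode reward:', episode_reward)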
Example #9
class Env:
  def __init__(self):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=True)
    self.env.init()
    self.env.getGameState = self.game.getGameState # maybe not necessary

    # by convention we want to use (0,1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet() #[None, 119]

  def step(self, action):
    action = self.action_map[action]
    reward = self.env.act(action)
    done = self.env.game_over()
    obs = self.get_observation()
    # don't bother returning an info dictionary like gym
    return obs, reward, done

  def reset(self):
    self.env.reset_game()
    return self.get_observation()

  def get_observation(self):
    # game state returns a dictionary which describes
    # the meaning of each value
    # we only want the values
    obs = self.env.getGameState()
    return np.array(list(obs.values()))

  def set_display(self, boolean_value):
    self.env.display_screen = boolean_value
Example #10
def main_test():
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    model = load_model("model.h5")
    env.init()
    passed = 0
    old_y = 0
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()

        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        time.sleep(0.05)
        env_reward = env.act(env.getActionSet()[a_star])
        if env_reward == 1:
            final_score += 1
Example #11
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0,1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
Example #12
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    #penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())


    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore  0.1
        e_greed_decrement=1e-6   #1e-6
    )  # probability of exploring is decreasing during training




    # load the model
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("model loaded successfully")

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, penv, rpm)

    max_episode = 1000

    # start train
    episode = 0
    while episode < max_episode:

        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, penv, rpm)
            episode += 1

        eval_reward = evaluate(agent, penv)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))
        # save a model checkpoint
        save_path = './model/dqn_model_{}_{}.ckpt'.format(episode, eval_reward)
        agent.save(save_path)

    # training finished, save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
Example #13
def test():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus a no-op, so 2 actions
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6)
    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')
    # test part, run 5 episodes and average
    eval_reward = []
    for i in range(5):
        env.init()
        episode_reward = 0
        isOver = False
        step = 0
        while not isOver:
            if step == 0:
                reward = env.act(None)
                done = False
            else:
                time.sleep(0.01)  # Windows runs too fast; slow it down
                obs = list(env.getGameState().values())
                action = agent.predict(obs)
                if action == 1:
                    act = actions["up"]
                else:
                    act = None
                reward = env.act(act)
                isOver = env.game_over()
                episode_reward += reward
            step += 1
            if step > MAX_STEP:
                break
        # record the total reward once per episode
        eval_reward.append(episode_reward)
        env.reset_game()
    return np.mean(eval_reward)
Example #14
def train():
    # create the environment
    game = FlappyBird()
    env_1 = PLE(game, fps=30, display_screen=False)
    env_2 = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env_1.getGameState())
    act_dim = len(env_1.getActionSet())
    print('action set:', env_1.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # load the model
    save_path = './flappybird.ckpt'
    if os.path.exists(save_path):
        agent.restore(save_path)

    # pre-fill the replay memory so early training has enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env_1, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test runs are not counted
        # train
        for i in range(0, 100):
            total_reward, steps = run_episode(env_1, agent, rpm)
            episode += 1

        # test
        eval_reward, steps = evaluate(env_2, agent)
        logger.info(
            '[episode:{}], e_greed:{:.6f}, steps:{}, test_reward:{}'.format(
                episode, agent.e_greed, steps, eval_reward))
        # save the model
        ckpt = './models/episode_{}.ckpt'.format(episode)
        agent.save(ckpt)

    # training finished, save the model
    save_path = './flappybird.ckpt'
    agent.save(save_path)
Example #15
def test():
    game = Snake(600, 600)
    p = PLE(game,
            fps=60,
            state_preprocessor=process_state,
            force_fps=True,
            display_screen=True,
            frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -70.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.01,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]),
                               str(sys.argv[5])])
    p.init()
    agent.load_game()
    scores = []

    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))

            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]

            reward = p.act(direction[0])
            if reward > 50.0:
                apples += reward

        scores.append(apples)
    return scores
Example #16
def main(train=False):
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)

    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)

    p.init()

    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')

            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break

            p.reset_game()

        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])

        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')

        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames      :', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU   rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU   total score:', cpu_score)
Example #17
def main():

    env = PLE(Pixelcopter(),
              fps=30,
              display_screen=True,
              state_preprocessor=None)
    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # take a random action with some probability (exploration)
        e_greed_decrement=1e-6)  # gradually reduce exploration as training converges

    # load the model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # pre-fill the replay memory so early training has enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 30000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test runs are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, max_reward = evaluate(env, agent,
                                           render=False)  # set render=True to watch the game
        logger.info(
            'episode:{}    e_greed:{}   test_reward:{}   max_reward:{}'.format(
                episode, agent.e_greed, eval_reward, max_reward))

    # training finished, save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
Example #18
def view_agent(agent):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()

    for i in range(200000):
        if p.game_over():
            p.reset_game()

        time.sleep(0.03)
        action = agent.pick_action(p.getGameState())
        p.act(action)

        if p.game_over():
            break
Example #19
def main():

    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()

    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)

    model = FlappyBirdModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = FlappyBirdAgent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.2,
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 50000

    # start train
    episode = 0
    while episode < max_episode:

        # train part
        for i in range(0, 100):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
        # evaluation part
        eval_reward = evaluate(agent, env)
        logger.info('episode:{}    test_reward:{}'.format(episode, eval_reward))
        # learning rate adjustment
        if episode % 100 == 0:
            if algorithm.lr >= 5e-4:
                algorithm.lr *= 0.995
            if algorithm.lr <= 5e-4 and algorithm.lr >= 1e-4:
                algorithm.lr *= 0.99
            print('learning rate:', algorithm.lr)

    # save model
    save_path = './fb_dqn_model.ckpt'
    agent.save(save_path)
Example #20
    def __init__(self, game, size=10, mutationrate=0.1, crossoverrate=0.1):
        p = PLE(game, fps=30, display_screen=False)
        p.init()

        self.population = np.array([
            NNAgent(inputs=p.getGameState().keys(), actions=p.getActionSet())
            for i in range(size)
        ])
        for agent in self.population:
            agent.fitness = None

        self.fitness = [] * size
        self.generation = 0
        self.popsize = size
        self.mutationrate = mutationrate
        self.crossoverrate = crossoverrate
Example #21
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus a no-op, so 2 actions

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))

    agent.save('./model_dir')
Example #22
def play(file_name, number_of_games=1):
    game = FlappyBird(width=game_width,
                      height=game_height,
                      pipe_gap=game_pipe_gap)

    p = PLE(game, display_screen=True, force_fps=False, frame_skip=6)
    p.init()

    network = Network()
    network.load(file_name, rename=False)

    for i in range(number_of_games):
        if i > 0:
            p.reset_game()
        while not p.game_over():
            state = p.getGameState()
            actions_q_values = network.Q(state).tolist()
            action_taken_index = np.argmax(actions_q_values)

            p.act(None if action_taken_index == 0 else 119)
Example #23
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()

    print "Testing model:", agent_file_name

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while env.game_over() == False:
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward

        print "Agent score {:0.1f} reward for episode.".format(episode_reward)
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward/test_rounds
Example #24
def main(w, seed=SEED, headless=False):
    """
    Let an agent play flappy bird
    """
    if headless:
        display_screen = False
        force_fps = True
    else:
        display_screen = True
        force_fps = False

    game = PLE(FLAPPYBIRD,
               display_screen=display_screen,
               force_fps=force_fps,
               rng=seed)
    game.init()
    game.reset_game()
    FLAPPYBIRD.rng.seed(seed)

    agent_score = 0
    num_frames = 0

    while True:
        if game.game_over():
            break

        obs = game.getGameState()
        x = normalize(obs)
        action = agent(x, w)

        reward = game.act(ACTION_MAP[action])

        if reward > 0:
            agent_score += 1

        num_frames += 1

    print('Frames  :', num_frames)
    print('Score   :', agent_score)
Example #25
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()

    print "Testing model:", agent_file_name

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while env.game_over() == False:
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.05)
            episode_reward += reward

        print "Agent score {:0.1f} reward for episode.".format(episode_reward)
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward/test_rounds
Example #26
#coding:utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np
def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale()/255.0
    return obs.astype(float).ravel()  # builtin float (np.float is deprecated)


if __name__ == '__main__':
    game = Pong(width=128, height=96, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    # build the agent with the PARL framework
    print(p.getActionSet())

    act_dim = len(p.getActionSet())
    p.getScreenGrayscale()
    game_state = p.getGameState()
    print(game_state)
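    # Illustrative addition (not part of the original snippet): a short random
    # rollout that feeds get_obs() above; p.init() is assumed to be required
    # before the screen can be read.
    p.init()
    for _ in range(200):
        if p.game_over():
            p.reset_game()
        obs = get_obs(p)  # flattened grayscale frame scaled to [0, 1]
        p.act(p.getActionSet()[np.random.randint(act_dim)])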
Example #27
if __name__ == "__main__":
    game = Catcher(width=320, height=320)
    env = PLE(game, display_screen=True, state_preprocessor=process_state)
    agent = DQNAgent(env)
    agent.load("./save/catcher.h5")

    # initialization
    #pylab.title("reward")
    #pylab.xlabel("episodes")
    #pylab.ylabel("rewards")
    env.init()
    scores, time = [], []
    for e in range(EPISODES):

        env.reset_game()
        state = env.getGameState()
        state = np.array([list(state[0])])
        score = 0
        for time_t in range(20000):
            action = agent.act(state)

            reward = env.act(action)  # take the selected action
            score += reward

            next_state = env.getGameState()
            next_state = np.array([list(next_state[0])])

            action = [K_a, None, K_d].index(action)

            agent.remember(state, action, reward, next_state, env.game_over())
            state = next_state
Example #28
def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total = 5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total // num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # percentage of time we perform a random action, help exploration.
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = init_agent(env)

    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: '+str(num_steps_train_total)+'.\n')
    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            if steps < num_steps_train_epoch:
                learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                # print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)

                episode_reward += reward
                testing_rewards.append(testing_rewards[-1]+reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)

    save_agent(my_agent, agent_file_path, agent_file_name)
Example #29
    memory = ReplayMemory(max_memory_size, min_memory_size)

    env.init()
    
    for epoch in range(1, num_epochs+1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False
       
        #training loop
        while steps < num_steps_train:
            episode_reward = 0.0
            agent.start_episode()

            while env.game_over() == False and steps < num_steps_train:
                state = env.getGameState()
                reward, action = agent.act(state, epsilon=epsilon)
                memory.add([ state, action, reward, env.game_over() ])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(agent)
                    
                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1


            if num_episodes % 5 == 0:
Example #30
    memory = ReplayMemory(max_memory_size, min_memory_size)

    env.init()

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train:
            episode_reward = 0.0
            agent.start_episode()

            while env.game_over() == False and steps < num_steps_train:
                state = env.getGameState()
                reward, action = agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            if num_episodes % 5 == 0:
                print "Episode {:01d}: Reward {:0.1f}".format(
Example #31
import numpy as np

from flappy_bird_model import FlappyBirdModel
from flappy_bird_agent import FlappyBirdAgent

from ple.games.flappybird import FlappyBird
from ple import PLE

LEARNING_RATE = 0.001
GAMMA = 0.99

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)

action_dim = len(env.getActionSet())
obs_shape = len(env.getGameState())

model = FlappyBirdModel(act_dim=action_dim)
algorithm = parl.algorithms.DQN(model,
                                act_dim=action_dim,
                                gamma=GAMMA,
                                lr=LEARNING_RATE)
agent = FlappyBirdAgent(algorithm,
                        obs_dim=obs_shape,
                        act_dim=action_dim,
                        e_greed=0.2,
                        e_greed_decrement=1e-6)

# load model
save_path = './fb_dqn_model.ckpt'
agent.restore(save_path)
Example #32
class Agent:

    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0
    INITIAL_IMAGES = np.zeros((80, 80, 4))

    # based on documentation, features got 8 dimensions
    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.model = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.model_negative = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.trainable = tf.trainable_variables()
        self.rewards = []

    def _assign(self):
        for i in range(len(self.trainable) // 2):
            assign_op = self.trainable[i + len(self.trainable) // 2].assign(
                self.trainable[i])
            self.sess.run(assign_op)

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        Q_new_negative = self.sess.run(
            self.model_negative.logits,
            feed_dict={self.model_negative.X: new_states})
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * Q_new_negative[
                    i, np.argmax(Q_new[i])]
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.model.logits,
                             feed_dict={self.model.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess,
                        os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess,
                           os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:, :, k] = state
            dead = False
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign()
                action = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]),
                                      self.INITIAL_IMAGES[:, :, :3],
                                      axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward, new_state,
                               dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                cost, _ = self.sess.run([self.model.cost, self.model.optimizer],
                                        feed_dict={
                                            self.model.X: X,
                                            self.model.Y: Y
                                        })
                self.INITIAL_IMAGES = new_state
                self.T_COPY += 1
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (
                1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
Example #33
class Agent:

    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    INITIAL_FEATURES = np.zeros((4, INPUT_SIZE))
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0
    # based on documentation, features got 8 dimensions
    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.actor = Actor('actor', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.actor_target = Actor('actor-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.critic = Critic('critic', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.critic_target = Critic('critic-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
        self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y)
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad)
        grads = zip(self.grad_actor, weights_actor)
        self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _assign(self, from_name, to_name):
        from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name)
        to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name)
        for i in range(len(from_w)):
            assign_op = to_w[i].assign(from_w[i])
            self.sess.run(assign_op)

    def _memorize(self, state, action, reward, new_state, dead, rnn_state):
        self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _construct_memories_and_train(self, replay):
        # state_r, action_r, reward_r, new_state_r, dead_r = replay
        # train actor
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        init_values = np.array([a[-1] for a in replay])
        Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states, self.actor.hidden_layer:init_values})
        Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states, self.actor_target.hidden_layer:init_values})
        grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.hidden_layer: init_values, self.critic.Y:Q})
        self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor.hidden_layer:init_values, self.actor_critic_grad:grads})

        # train critic
        rewards = np.array([a[2] for a in replay]).reshape((-1, 1))
        rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states, self.critic_target.hidden_layer: init_values,
                                                                             self.critic_target.Y:Q_target})
        for i in range(len(replay)):
            if not replay[i][4]:  # index 4 is the "dead" flag
                rewards[i, 0] += self.GAMMA * rewards_target[i, 0]
        cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer],
                                feed_dict={self.critic.X: states, self.critic.hidden_layer: init_values,
                                           self.critic.Y: Q, self.critic.REWARD: rewards})
        return cost

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            dead = False
            init_value = np.zeros((1, 2 * 512))
            state = self.get_state()
            for k in range(self.INITIAL_FEATURES.shape[0]):
                self.INITIAL_FEATURES[k, :] = state
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('actor', 'actor-target')
                    self._assign('critic', 'critic-target')
                if np.random.rand() < self.EPSILON:
                    action = np.random.randint(self.OUTPUT_SIZE)
                else:
                    action, last_state = self.sess.run(
                        [self.actor.logits, self.actor.last_state],
                        feed_dict={self.actor.X: [self.INITIAL_FEATURES],
                                   self.actor.hidden_layer: init_value})
                    action, init_value = np.argmax(action[0]), last_state[0]
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = np.append(self.get_state(), self.INITIAL_FEATURES[:3, :], axis = 0)
                dead = self.env.game_over()
                self._memorize(state, action, reward, new_state, dead, init_value)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                cost = self._construct_memories_and_train(replay)
                self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
                self.T_COPY += 1
            self.rewards.append(total_reward)
            if (i+1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
Example #34
class Agent:

    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    # based on documentation, features got 8 dimensions
    # output is 2 dimensions, 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
        self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
        input_layer = tf.Variable(tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
        bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
        output_layer = tf.Variable(tf.random_normal([self.LAYER_SIZE, self.OUTPUT_SIZE]))
        feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
        self.logits = tf.matmul(feed_forward, output_layer)
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _memorize(self, state, action, reward, new_state, done):
        self.MEMORIES.append((state, action, reward, new_state, done))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, self.INPUT_SIZE))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, done_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not done_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.logits, feed_dict={self.X:inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name))
        with open('%s-acc.p'%(checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            done = False
            while not done:
                state = self.get_state()
                action  = self._select_action(state)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                done = self.env.game_over()
                self._memorize(state, action, reward, new_state, done)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict={self.X: X, self.Y:Y})
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i+1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
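A hypothetical entry point for the agent above, using only the methods it defines (fit and save); the iteration counts are arbitrary:

agent = Agent(screen=False, forcefps=True)
agent.fit(iterations=500, checkpoint=50)  # train, printing progress every 50 episodes
agent.save('flappy-dqn')                  # writes flappy-dqn.ckpt and flappy-dqn-acc.p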