Code Example #1
def test():
    game = Snake(600, 600)
    p = PLE(game,
            fps=60,
            state_preprocessor=process_state,
            force_fps=True,
            display_screen=True,
            frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -70.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.01,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]),
                               str(sys.argv[5])])
    p.init()
    agent.load_game()
    scores = []

    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))

            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]

            reward = p.act(direction[0])
            if reward > 50.0:
                apples += reward

        scores.append(apples)
    return scores
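The function above takes its hyperparameters from the command line: sys.argv[1] is alpha, sys.argv[2] is gamma, sys.argv[3] the model file name, and sys.argv[4]/sys.argv[5] the two activation names. A hypothetical driver (script name and values are placeholders only) could look like:

if __name__ == "__main__":
    # Hypothetical invocation, e.g.:
    #   python train_snake.py 0.0005 0.99 snake_model.h5 relu relu
    episode_scores = test()
    print("mean score over 200 episodes:", sum(episode_scores) / len(episode_scores))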
Code Example #2
    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)

        if not self.render:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        Game = envs_lookup_table[self.env_name]
        self.env = PLE(Game(),
                       display_screen=self.render,
                       force_fps=not self.render)
        self.env.init()
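The adapter resolves the game class through an external envs_lookup_table that is not part of this excerpt. A plausible sketch of such a table, purely as an assumption, is:

from ple.games.flappybird import FlappyBird
from ple.games.pixelcopter import Pixelcopter

# Hypothetical name-to-class lookup table; the real one in the source project may differ.
envs_lookup_table = {
    "FlappyBird": FlappyBird,
    "Pixelcopter": Pixelcopter,
}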
Code Example #3
def eval_genomes(genomes, config):
    done = [False] * len(genomes)
    pl = []
    for i in range(len(genomes)):
        pl.append(PLE(game, fps=30, display_screen=True, force_fps=False))
        pl[i].init()
    while sum(done) != len(done):
        if len(pl) < len(genomes):
            pl.append(PLE(game, fps=30, display_screen=True, force_fps=False))
            done = done + [False]
            pl[-1].init()
        m = 0
        nets = []
        gid = []

        for i, (genome_id, genome) in enumerate(genomes):
            net = neat.nn.recurrent.RecurrentNetwork.create(genome, config)
            nets.append(net)
            gid.append(genome_id)

        nnOutput = [0] * len(genomes)
        rew = [0] * len(genomes)
        current_max_fitness = [0] * len(genomes)
        fitness_current = [0] * len(genomes)
        frame = [0] * len(genomes)
        counter = [0] * len(genomes)
        for i in range(len(genomes)):
            ob = [np.zeros([288, 512, 3])] * len(genomes)
            ob[i] = pl[i].getScreenRGB()  # replace the placeholder with this genome's screen
            frame[i] += 1
            ob[i] = cv2.resize(ob[i], (int(ob[i].shape[0]/8), int(ob[i].shape[1]/8)))
            ob[i] = cv2.cvtColor(ob[i], cv2.COLOR_BGR2GRAY)
            ob[i] = np.reshape(ob[i], (int(ob[i].shape[0]), int(ob[i].shape[1])))

            imgarray = np.ndarray.flatten(ob[i])

            nnOutput[i] = np.argmax(nets[i].activate(imgarray))
            rew[i] = pl[i].act(119 * nnOutput[i])  # key 119 flaps; any other value does nothing
            done[i] = pl[i].game_over()  # check if the game is over

            fitness_current[i] += float(rew[i])
            if fitness_current[i] > current_max_fitness[i]:
                current_max_fitness[i] = float(fitness_current[i])
                counter[i] = 0
            else:
                counter[i] += 1
            if sum(done) == len(done):
                m += config.pop_size - 1
                # print(gid[i], fitness_current)
                for k in range(len(pl)):
                    pl[k].reset_game()
            # write the evaluated fitness back to the corresponding genome
            genomes[i][1].fitness = float(fitness_current[i])
Code Example #4
 def __init__(self, screen=False, forcefps=True):
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.model = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
     self.model_negative = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE)
     self.sess = tf.InteractiveSession()
     self.sess.run(tf.global_variables_initializer())
     self.saver = tf.train.Saver(tf.global_variables())
     self.trainable = tf.trainable_variables()
     self.rewards = []
Code Example #5
    def __init__(self):
        env = FlappyBird()
        self.p = PLE(env, add_noop_action=True)
        self.p.init()
        self.win_score = 10.
        action_space = len(self.p.getActionSet())
        state_space = len(self.p.getGameState())
        actions = ["up", "nothing"]
        state_names = list(self.p.getGameState().keys())

        Environment.__init__(self, env, action_space, state_space, actions,
                             state_names)
Code Example #6
 def __init__(self, model, screen=False, forcefps=True):
     self.model = model
     self.game = FlappyBird(pipe_gap=125)
     self.env = PLE(self.game,
                    fps=30,
                    display_screen=screen,
                    force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.es = Deep_Evolution_Strategy(self.model.get_weights(),
                                       self.get_reward,
                                       self.POPULATION_SIZE, self.SIGMA,
                                       self.LEARNING_RATE)
Code Example #7
 def __init__(self, model, screen=False, forcefps=True):
     self.model = model
     self.game = Pixelcopter(width=int(48 * 5), height=int(48 * 5))
     self.env = PLE(self.game,
                    fps=30,
                    display_screen=screen,
                    force_fps=forcefps)
     self.env.init()
     self.env.getGameState = self.game.getGameState
     self.es = Deep_Evolution_Strategy(self.model.get_weights(),
                                       self.get_reward,
                                       self.POPULATION_SIZE, self.SIGMA,
                                       self.LEARNING_RATE)
Code Example #8
class Env:
    def __init__(self):
        # initializing the instance of FlappyBird class
        self.game = FlappyBird(pipe_gap=100)
        # then pass this object into PLE constructor and create an instance of that
        self.env = PLE(self.game, fps=30, display_screen=False)
        # init does some necessary things under the hood
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        self.action_map = self.env.getActionSet()

    # function which takes an action
    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
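For reference, a minimal random-policy rollout against the wrapper above could look like the following sketch; the driver itself is hypothetical and only uses the methods defined in the class.

import numpy as np

env = Env()
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    # pick a random index into the action map; step() translates it to a PLE action
    action = np.random.randint(len(env.action_map))
    obs, reward, done = env.step(action)
    total_reward += reward
print("episode reward:", total_reward)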
Code Example #9
def run(number_of_episodes):
    game = FlappyBird(pipe_gap=150)

    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }

    env = PLE(game=game,
              fps=30,
              display_screen=True,
              reward_values=rewards,
              force_fps=False)

    # Reset environment at the beginning
    env.reset_game()

    score = 0
    max_score = 0
    episode_number = 1

    while number_of_episodes > 0:

        # Get current state
        state = BasicQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = basic_q_agent.max_q(state)

        # After choosing the action, act() returns the reward the agent
        # accumulated while performing it.
        reward = env.act(env.getActionSet()[action])
        score += reward

        max_score = max(score, max_score)

        game_over = env.game_over()

        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Score: " + str(score))
            print("Max. score: " + str(max_score))
            print("===========================\n")
            # f.write("Score: " + str(score) + "|Max. score: " + str(max_score) + "\n")
            episode_number += 1
            number_of_episodes -= 1
            score = 0
            env.reset_game()
Code Example #10
class Env:
  def __init__(self):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=True)
    self.env.init()
    self.env.getGameState = self.game.getGameState # maybe not necessary

    # by convention we want to use (0,1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet() #[None, 119]

  def step(self, action):
    action = self.action_map[action]
    reward = self.env.act(action)
    done = self.env.game_over()
    obs = self.get_observation()
    # don't bother returning an info dictionary like gym
    return obs, reward, done

  def reset(self):
    self.env.reset_game()
    return self.get_observation()

  def get_observation(self):
    # game state returns a dictionary which describes
    # the meaning of each value
    # we only want the values
    obs = self.env.getGameState()
    return np.array(list(obs.values()))

  def set_display(self, boolean_value):
    self.env.display_screen = boolean_value
Code Example #11
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0,1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
Code Example #12
File: env.py Project: miNept/Intro2AI-HW
    def __init__(self, device, display=True):
        # Design reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }
        self.env = PLE(FlappyBird(),
                       display_screen=display,
                       reward_values=reward_values)
        self.device = device
        self.action_set = self.env.getActionSet()

        self.frames = []
Code Example #13
 def __init__(self, game_name='FlappyBird', display_screen=True):
     # open up a game state to communicate with emulator
     import importlib
     game_module_name = ('ple.games.%s' % game_name).lower()
     game_module = importlib.import_module(game_module_name)
     game = getattr(game_module, game_name)()
     self.game_state = PLE(game, fps=30, display_screen=display_screen)
     self.game_state.init()
     self._action_set = self.game_state.getActionSet()
     self.action_space = spaces.Discrete(len(self._action_set))
     self.observation_space = spaces.Box(low=-np.inf,
                                         high=np.inf,
                                         shape=self._get_game_state().shape)
     self.viewer = None
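The constructor calls self._get_game_state(), which is not shown in this excerpt; presumably it flattens PLE's state dictionary into a vector. A sketch under that assumption:

    def _get_game_state(self):
        # Assumed helper: flatten PLE's state dict into a 1-D float array.
        state = self.game_state.getGameState()
        return np.array(list(state.values()), dtype=np.float32)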
Code Example #14
def test_agent(policy, file_writer=None, test_games=10, step=0):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()

    test_rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)

        game_rew = 0

        while not env.game_over():

            state = flappy_game_state(env)

            action = 119 if policy(state) == 1 else None

            for _ in range(2):
                game_rew += env.act(action)

        test_rewards.append(game_rew)

        if file_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='test_performance', simple_value=game_rew)
            file_writer.add_summary(summary, step)
            file_writer.flush()

    return test_rewards
Code Example #15
class PLEEnv(Env):
    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        if not self.env_instance:
            self.env_instance = PLE(self.game,
                                    fps=30,
                                    display_screen=self.render)
            self.env_instance.init()

    def step(self, action):
        reward = self.env_instance.act(action)
        obs = self.env_instance.getGameState()
        done = self.env_instance.game_over()
        return obs, reward, done

    def reset(self):
        self.env_instance.reset_game()
        obs = self.env_instance.getGameState()
        return obs

    def close(self):
        pass

    def restart(self):
        self.close()
        self.reset()
Code Example #16
    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState

        def conv_layer(x, conv, stride=1):
            return tf.nn.conv2d(x,
                                conv, [1, stride, stride, 1],
                                padding='SAME')

        def pooling(x, k=2, stride=2):
            return tf.nn.max_pool(x,
                                  ksize=[1, k, k, 1],
                                  strides=[1, stride, stride, 1],
                                  padding='SAME')

        self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
        self.Y = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
        b_conv1 = tf.Variable(tf.truncated_normal([32], stddev=0.01))
        conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4) + b_conv1)
        pooling1 = pooling(conv1)
        w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
        b_conv2 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
        conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2) + b_conv2)
        w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
        b_conv3 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
        conv3 = tf.nn.relu(conv_layer(conv2, w_conv3) + b_conv3)
        pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(
            conv3.shape[3])
        conv3 = tf.reshape(conv3, [-1, pulling_size])
        w_fc1 = tf.Variable(
            tf.truncated_normal([pulling_size, 256], stddev=0.1))
        b_fc1 = tf.Variable(tf.truncated_normal([256], stddev=0.01))
        w_fc2 = tf.Variable(tf.truncated_normal([256, 2], stddev=0.1))
        b_fc2 = tf.Variable(tf.truncated_normal([2], stddev=0.01))
        fc_1 = tf.nn.relu(tf.matmul(conv3, w_fc1) + b_fc1)
        self.logits = tf.matmul(fc_1, w_fc2) + b_fc2
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []
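The constructor above only defines the Q-network; action selection is not part of this excerpt. A typical epsilon-greedy choice against self.logits, sketched here as an assumption, might be:

import numpy as np

def select_action(model, state, epsilon):
    # Hypothetical helper: `state` is a single 80x80x4 stack of preprocessed frames.
    if np.random.rand() < epsilon:
        return np.random.randint(model.OUTPUT_SIZE)
    q_values = model.sess.run(model.logits, feed_dict={model.X: state[np.newaxis, ...]})
    return int(np.argmax(q_values[0]))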
Code Example #17
    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState

        def conv_layer(x, conv, stride=1):
            return tf.nn.conv2d(x,
                                conv, [1, stride, stride, 1],
                                padding='SAME')

        def pooling(x, k=2, stride=2):
            return tf.nn.max_pool(x,
                                  ksize=[1, k, k, 1],
                                  strides=[1, stride, stride, 1],
                                  padding='SAME')

        self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
        self.REWARDS = tf.placeholder(tf.float32, (None))
        self.ACTIONS = tf.placeholder(tf.int32, (None))
        w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
        conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4))
        pooling1 = pooling(conv1)
        w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
        conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2))
        w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
        conv3 = tf.nn.relu(conv_layer(conv2, w_conv3))
        pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(
            conv3.shape[3])
        conv3 = tf.reshape(conv3, [-1, pulling_size])
        w_fc1 = tf.Variable(
            tf.truncated_normal([pulling_size, 256], stddev=0.1))
        w_fc2 = tf.Variable(tf.truncated_normal([256, 2], stddev=0.1))
        fc_1 = tf.nn.relu(tf.matmul(conv3, w_fc1))
        self.logits = tf.nn.softmax(tf.matmul(fc_1, w_fc2))
        indexes = tf.range(0,
                           tf.shape(self.logits)[0]) * tf.shape(
                               self.logits)[1] + self.ACTIONS
        responsible_outputs = tf.gather(tf.reshape(self.logits, [-1]), indexes)
        self.cost = -tf.reduce_mean(tf.log(responsible_outputs) * self.REWARDS)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []
Code Example #18
def main():

    env = PLE(Pixelcopter(),
              fps=30,
              display_screen=True,
              state_preprocessor=None)
    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # build the agent using the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # probability of picking a random action, for exploration
        e_greed_decrement=1e-6)  # gradually reduce exploration as training converges

    # load a previously saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # pre-fill the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 30000

    # start training
    episode = 0
    while episode < max_episode:  # run max_episode training episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, max_reward = evaluate(env, agent,
                                           render=False)  # set render=True to watch the agent play
        logger.info(
            'episode:{}    e_greed:{}   test_reward:{}   max_reward:{}'.format(
                episode, agent.e_greed, eval_reward, max_reward))

    # training finished, save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
Code Example #19
File: flappy_agent.py Project: Miscon/Flappy
    def train(self):
        """ Runs nb_episodes episodes of the game with agent picking the moves.
            An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
        """

        if not os.path.exists(self.name):
            os.mkdir(self.name)

        t = threading.Thread(target=self.draw_plots)
        t.daemon = True
        t.start()

        reward_values = self.reward_values()
        env = PLE(FlappyBird(),
                  fps=30,
                  display_screen=False,
                  force_fps=True,
                  rng=None,
                  reward_values=reward_values)
        env.init()

        score = 0
        while self.frame_count <= 1000000:
            # pick an action
            state1 = env.game.getGameState()
            action = self.training_policy(state1)

            # step the environment
            reward = env.act(env.getActionSet()[action])
            # print("reward=%d" % reward)

            state2 = env.game.getGameState()

            end = env.game_over() or score >= 100  # stop after reaching 100 pipes
            self.observe(state1, action, reward, state2, end)

            # reset the environment if the game is over
            if end:
                env.reset_game()
                score = 0

            if self.frame_count % 25000 == 0:
                print("==========================")

                print("episodes done: {}".format(self.episode_count))
                print("frames done: {}".format(self.frame_count))

                self.score()

                with open("{}/agent.pkl".format(self.name), "wb") as f:
                    pickle.dump((self), f, pickle.HIGHEST_PROTOCOL)

                print("==========================")
Code Example #20
    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game,
                       fps=30,
                       display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState

        def conv_layer(x, conv, stride=1):
            return tf.nn.conv2d(x,
                                conv, [1, stride, stride, 1],
                                padding='SAME')

        def pooling(x, k=2, stride=2):
            return tf.nn.max_pool(x,
                                  ksize=[1, k, k, 1],
                                  strides=[1, stride, stride, 1],
                                  padding='SAME')

        self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
        self.Y = tf.placeholder(tf.float32, [None, output_size])
        w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
        conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4))
        pooling1 = pooling(conv1)
        w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
        conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2))
        w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
        conv3 = tf.nn.relu(conv_layer(conv2, w_conv3))
        pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(
            conv3.shape[3])
        conv3 = tf.reshape(tf.reshape(conv3, [-1, pulling_size]),
                           [batch_size, 8, 512])
        cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
        self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
        self.rnn, self.last_state = tf.nn.dynamic_rnn(
            inputs=conv3,
            cell=cell,
            dtype=tf.float32,
            initial_state=self.hidden_layer)
        w = tf.Variable(tf.random_normal([512, output_size]))
        self.logits = tf.matmul(self.rnn[:, -1], w)
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []
Code Example #21
 def __init__(self,
              reward_values={},
              reward_discount=0.99,
              pip_gap=100,
              display_screen=True,
              fps=30,
              force_fps=True):
     self.game = PLE(FlappyBird(pipe_gap=pip_gap),
                     reward_values=reward_values,
                     fps=fps,
                     force_fps=force_fps,
                     display_screen=display_screen)
     self.game.init()
     self.actions = self.game.getActionSet()
     self.reward_discount = reward_discount
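The wrapper stores reward_discount but this excerpt does not show how it is applied; the usual discounted-return computation it implies is sketched below for illustration.

def discounted_returns(rewards, gamma=0.99):
    # Standard backward pass: G_t = r_t + gamma * G_{t+1}.
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))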
Code Example #22
def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(),
              fps=30,
              display_screen=False,
              force_fps=True,
              rng=None,
              reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    highscore = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training, use agent.training_policy instead
        action = agent.policy(agent.state_binner(env.game.getGameState()))

        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition

        score += reward

        # reset the environment if the game is over
        if env.game_over():
            average += score
            if score > highscore:
                highscore = score
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    print("Average for 100 runs {}".format(average / tot_nb_episodes))
    return highscore
Code Example #23
    def determine_fitness(self, individual):
        """ determine the fitness of the given individual by running a simulation of the game with its encoded agent

        :param individual:
        :return:
        """
        if individual.fitness is None:
            game = FlappyBird()
            p = PLE(game, fps=30, display_screen=False)
            p.init()

            raise NotImplementedError(
                "add your code to determine the fitness of the individual, "
                "you can change the signature of this function")
        else:
            return individual.fitness
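As a rough illustration of what could replace the NotImplementedError above, a single-episode rollout whose cumulative reward serves as the fitness is sketched below; the individual.act(state) policy interface is purely an assumption.

            # Hypothetical fitness rollout: play one episode and use the
            # accumulated reward as the fitness value.
            total_reward = 0.0
            while not p.game_over():
                state = game.getGameState()
                action = individual.act(state)  # assumed policy interface on the individual
                total_reward += p.act(action)
            individual.fitness = total_reward
            return individual.fitness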
Code Example #24
def setup_env_agent(display_screen, frame_skip, force_fps, reward_shaping,
                    frame_stack, train):
    game = FlappyBird()
    ple_flappy = PLE(game,
                     fps=30,
                     display_screen=display_screen,
                     frame_skip=frame_skip,
                     force_fps=force_fps)
    if reward_shaping and train:
        z = ple_flappy.game.rewards
        z['tick'] = 0.1
        ple_flappy.game.adjustRewards(z)
    ple_flappy.init()
    agent = DQNAgent(ple_flappy.getActionSet(), frame_stack=frame_stack)

    return ple_flappy, agent
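A hypothetical call with placeholder arguments might look like:

# Placeholder arguments, for illustration only.
ple_flappy, agent = setup_env_agent(display_screen=False,
                                    frame_skip=4,
                                    force_fps=True,
                                    reward_shaping=True,
                                    frame_stack=4,
                                    train=True)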
Code Example #25
    def __init__(self, display_screen=True):

        self.game_state = PLE(AngryBird(render=display_screen),
                              fps=30,
                              display_screen=display_screen)
        #self.game_state.init()

        self.display_screen = display_screen
        self._action_set = self.game_state.getActionSet()

        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_height, self.screen_width = self.game_state.getScreenDims()

        self.observation_space = spaces.Box(low=0, high=255, \
                                            shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
        self.viewer = None
Code Example #26
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())

    env.init()
    reward = 0.0
    nb_frames = 10000

    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()

        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
Code Example #27
def play_flappy_bird(play_game=True,
                     train_agent=False,
                     agent_model_path='model_backup.h5'):
    game = FlappyBird()
    environment = PLE(game, fps=30, display_screen=True)
    # agent_explored_states = FlappyBirdAgent()
    action_len = 2
    states = []
    for key, value in game.getGameState().items():
        states.append(value)
    print(states)
    state_len = len(states)

    agent_explored_states = FlappyBirdAgent(state_len, action_len)

    if os.path.exists(agent_model_path):
        agent_explored_states.load_agent_experience(agent_model_path)
        agent_explored_states.model_loaded = True
        print("WEights loaded")
    # environment.init()
    if train_agent:
        agent_explored_states.train(environment, game)
        print("Trained")
    if play_game:
        agent_explored_states.play(environment, game)
        print("Played")
Code Example #28
 def __init__(self, game_name='FlappyBird', display_screen=True):
     # set headless mode
     os.environ['SDL_VIDEODRIVER'] = 'dummy'
     
     # open up a game state to communicate with emulator
     import importlib
     game_module_name = ('ple.games.%s' % game_name).lower()
     game_module = importlib.import_module(game_module_name)
     game = getattr(game_module, game_name)()
     self.game_state = PLE(game, fps=30, display_screen=display_screen)
     self.game_state.init()
     self._action_set = self.game_state.getActionSet()
     self.action_space = spaces.Discrete(len(self._action_set))
     self.screen_width, self.screen_height = self.game_state.getScreenDims()
     self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3))
     self.viewer = None
Code Example #29
File: game.py Project: thexl74/FlappyBirdReinforce
 def __init__(self, game="pixelcopter", fps=30):
     os.environ['SDL_VIDEODRIVER'] = 'dummy'
     self.game_name = game
     if game == "flappy":
         engine = FlappyBird()
     elif game == "pixelcopter":
         engine = Pixelcopter()
     else:
         assert False, "This game is not available"
     engine.rewards["loss"] = -5  # reward at terminal state
     self.reward_terminal = -5
     self.game = PLE(engine, fps=fps, display_screen=False)
     self.game.init()
     self.game.act(0)  # Start the game by providing arbitrary key as input
     self.key_input = self.game.getActionSet()
     self.reward = 0
Code Example #30
 def run_a_game(self, game):
     from ple import PLE
     p = PLE(game, display_screen=True)
     agent = NaiveAgent(p.getActionSet())
     p.init()
     reward = p.act(p.NOOP)
     for i in range(NUM_STEPS):
         obs = p.getScreenRGB()
         reward = p.act(agent.pickAction(reward, obs))
Code Example #31
def test_movement_up():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["up"])
    newState = p.getGameState()
    assert oldState["player_velocity"] > newState["player_velocity"]
Code Example #32
  def __init__(self):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=True)
    self.env.init()
    self.env.getGameState = self.game.getGameState # maybe not necessary

    # by convention we want to use (0,1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet() #[None, 119]
Code Example #34
 def run_a_game(self, game):
     from ple import PLE
     p = PLE(game, display_screen=True)
     agent = NaiveAgent(p.getActionSet())
     p.init()
     reward = p.act(p.NOOP)
     for i in range(NUM_STEPS):
         obs = p.getScreenRGB()
         reward = p.act(agent.pickAction(reward, obs))
Code Example #35
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()

    print "Testing model:", agent_file_name

    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while env.game_over() == False:
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward

        print "Agent score {:0.1f} reward for episode.".format(episode_reward)
        total_reward += episode_reward
        my_agent.end_episode()

    return total_reward/test_rounds
Code Example #36
File: PLE_env.py Project: halofanx/deer
    def __init__(self, rng, game=None, frame_skip=4, 
            ple_options={"display_screen": True, "force_fps":True, "fps":30}):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng
       
        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()
Code Example #37
File: PLE_env.py Project: halofanx/deer
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4, 
            ple_options={"display_screen": True, "force_fps":True, "fps":30}):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng
       
        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

                
    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1: # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
        
        return [4 * [48 * [48 * [0]]]]
        
        
    def act(self, action):
        action = self._actions[action]
        
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
            
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
  
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count))


    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
Code Example #38
File: test.py Project: SiyuanQi/RunningMinion
import numpy as np
import pygame
from pygame.locals import *


class TestAgent():
	def __init__(self, actions):
		self.actions = actions
	def doAction(self, reward, obs):
		# return the first action on any key press, otherwise do nothing
		for event in pygame.event.get():
			if event.type == KEYDOWN:
				return self.actions[0]
		return None

game = RunningMinion()
#game = WaterWorld()
p = PLE(game, fps=30, display_screen=True)
agent = TestAgent(p.getActionSet())

p.init()
reward = 0.0
nb_frames = 2000

for i in range(nb_frames):
	if p.game_over():
		p.reset_game()
	if i%1==0:
		obser = p.getScreenRGB()
		action = agent.doAction(reward,obser)
		reward = p.act(action)
Code Example #39
    epsilon = 0.15  # fraction of the time we perform a random action, to help exploration
    epsilon_steps = 30000 #decay steps
    epsilon_min = 0.1
    lr = 0.01
    discount = 0.95 #discount factor
    rng = np.random.RandomState(24)

    #memory settings
    max_memory_size = 100000 
    min_memory_size = 1000 #number needed before model training starts
   
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    #PLE takes our game and the state_preprocessor. It will process the state for our agent.
    game = Catcher(width=128, height=128) 
    env = PLE(game, fps=60, state_preprocessor=nv_state_preprocessor)

    agent = Agent(env, batch_size, num_frames, frame_skip, lr, 
            discount, rng, optimizer="sgd_nesterov")
    agent.build_model()

    memory = ReplayMemory(max_memory_size, min_memory_size)

    env.init()
    
    for epoch in range(1, num_epochs+1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False
       
        #training loop
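The excerpt stops at the training loop; inside it, the epsilon_rate computed above is typically applied once per training step, for example:

        # Hypothetical per-step decay using the epsilon_rate computed above.
        epsilon = max(epsilon_min, epsilon - epsilon_rate)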
Code Example #40
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld


# let's adjust the rewards our agent receives
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use a lower fps so we can see what's happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
p = PLE(game, fps=15, force_fps=False, display_screen=True,
        reward_values=rewards)
# we pass in the rewards and PLE will adjust the game for us

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print "Score: {:0.3f} | Reward: {:0.3f} ".format(p.score(), reward)
Code Example #41
def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total = 5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total/num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # fraction of the time we perform a random action, to help exploration
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = init_agent(env)

    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: '+str(num_steps_train_total)+'.\n')
    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            if steps < num_steps_train_epoch:
                learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                # print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)

                episode_reward += reward
                testing_rewards.append(testing_rewards[-1]+reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)

    save_agent(my_agent, agent_file_path, agent_file_name)
Code Example #42
game = RaycastMaze(
        map_size=6
        ) #create our game

fps = 30 #fps we want to run at
frame_skip = 2
num_steps = 1
force_fps = False #slower speed
display_screen = True

reward = 0.0
max_noops = 20
nb_frames = 15000

#make a PLE instance.
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps, 
	force_fps=force_fps, display_screen=display_screen)

#our Naive agent!
agent = NaiveAgent(p.getActionSet())

#init agent and game.
p.init()

#lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
	reward = p.act(p.NOOP)

#start our training loop
for f in range(nb_frames):
        # if the game is over
        if p.game_over():
Code Example #43
# env1.reset()
# for _ in range(1000):
#     env.render()
#     env.step(env.action_space.sample())  # take a random action
#     env1.render()
#     env1.step(env1.action_space.sample())  # take a random action

# from ple.games.pong import Pong
# from ple import PLE

# game = Pong()
# p = PLE(game, fps=30, display_screen=True, force_fps=False)
# p.init()
# 
from ple.games.flappybird import FlappyBird
from ple import PLE


game = FlappyBird()
p = PLE(game, fps=30, display_screen=True)

p.init()
reward = 0.0

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()

    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
Code Example #44
File: run.py Project: williamjussiau/RLchallenge
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    
    while(not p.game_over()):
        state = game.getGameState()
        screen = p.getScreenRGB()
        action=FlappyPolicy(state, screen) ### Your job is to define this function.
        
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
Code Example #45
File: launcher.py Project: ntasfi/deep_q_rl
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """

    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    rewards = {}
    
    try:
        module = importlib.import_module("ple.games.%s" % parameters.game.lower())
        game = getattr(module, parameters.game)
        if parameters.game == "FlappyBird":
            game = game()
        elif parameters.game == "WaterWorld":
            game = game(width=84, height=84, num_creeps=6)
        else:
            game = game(width=84, height=84)
    except:
        raise ValueError("The game %s could not be found. Try using the classname, it is case sensitive." % parameters.game)
    
    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    env = PLE(
            game,
            fps=60,
            force_fps=parameters.force_fps, 
            display_screen=parameters.display_screen,
            reward_values=rewards,
            rng=rng
    )

    num_actions = len(env.getActionSet())

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(defaults.RESIZED_WIDTH,
                                         defaults.RESIZED_HEIGHT,
                                         num_actions,
                                         parameters.phi_length,
                                         parameters.discount,
                                         parameters.learning_rate,
                                         parameters.rms_decay,
                                         parameters.rms_epsilon,
                                         parameters.momentum,
                                         parameters.clip_delta,
                                         parameters.freeze_interval,
                                         parameters.batch_size,
                                         parameters.network_type,
                                         parameters.update_rule,
                                         parameters.batch_accumulator,
                                         rng)
    else:
        handle = open(parameters.nn_file, 'r')
        network = cPickle.load(handle)

    agent = ple_agent.NeuralAgent(network,
                                  parameters.epsilon_start,
                                  parameters.epsilon_min,
                                  parameters.epsilon_decay,
                                  parameters.replay_memory_size,
                                  parameters.experiment_prefix,
                                  parameters.replay_start_size,
                                  parameters.update_frequency,
                                  rng)

    experiment = ple_experiment.PLEExperiment(env, agent,
                                              defaults.RESIZED_WIDTH,
                                              defaults.RESIZED_HEIGHT,
                                              parameters.resize_method,
                                              parameters.epochs,
                                              parameters.steps_per_epoch,
                                              parameters.steps_per_test,
                                              parameters.frame_skip,
                                              parameters.death_ends_episode,
                                              parameters.max_start_nullops,
                                              rng)

    
    env.init()
    experiment.run()
Code Example #46
from ple.games import Doom

class NaiveAgent():
	"""
		This is our naive agent. It picks actions at random!
	"""
	def __init__(self, actions):
		self.actions = actions

	def pickAction(self, reward, obs):
		return self.actions[np.random.randint(0, len(self.actions))]

###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
        # if the game is over
        if env.game_over():
            env.reset_game()
            
        action = agent.pickAction(reward, env.getScreenRGB())
        reward = env.act(action)

        if f > 2000:
            env.display_screen = True 
            env.force_fps = False
Code Example #47
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    #setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display_screen)

    game.init()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print "NOOOO"
        game.reset_game()
    
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    # saving and loading networks
    saver = tf.train.Saver()  # needed for the periodic checkpoint save below
    sess.run(tf.initialize_all_variables())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"
    '''
    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            r_t = game.act(np.argmax(a_t))
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print "NOOO2"
                game.reset_game()

            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''