Example #1
def main():
    FLAGS(sys.argv)
    # Choose which RL algorithm to train.

    print("env : %s" % FLAGS.env)

    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
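# The RandomAgent used above is not shown in this snippet; a minimal sketch,
# assuming the agent only needs an act() method over the wrapped action space:
class RandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        # Ignore the observation and sample a random action from the wrapped space.
        return self.action_space.sample()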
Example #2
def main():
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False
    # for i in range(episode_count):
    #   ob = env.reset()
    #   while True:
    #     action = agent.act(ob, reward, done)
    #     ob, reward, done, _ = env.step(1)
    #     if done:
    #       break

    for i in range(episode_count):
        ob = env.reset()
        while True:
            key = readchar.readkey()
            # Choose an action from keyboard
            if key not in arrow_keys:
                print("Game aborted!")
                break
            action = arrow_keys[key]
            state, reward, done, info = env.step(action)

            if done:
                print("Finished with reward", reward)
                break
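# The arrow_keys mapping used above is not shown; a minimal sketch, assuming
# readchar's arrow-key constants map onto indices of the wrapped (Discrete)
# action space. The specific indices below are illustrative, not the repo's:
import readchar

arrow_keys = {
    readchar.key.UP: 2,
    readchar.key.DOWN: 5,
    readchar.key.LEFT: 3,
    readchar.key.RIGHT: 1,
}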
Example #3
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    sess = tf.Session()
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(var_list=dqn_var_list)
    saver.restore(sess, os.path.join(checkpoint_dir, save_file_name))

    for eps in range(MAX_EPISODES):
        done = False
        step_count = 0
        state = env.reset()

        state_queue = deque(maxlen=4)
        state_queue.append(state)

        while not done:
            step_count += 1

            # accumulate 4 frames before predicting
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                continue

            action = np.argmax(
                targetDQN.predict(
                    np.reshape(np.array(state_queue), [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            state_queue.append(next_state)
Example #4
def main():
    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    if (FLAGS.algorithm == "deepq"):

        act = deepq.load("models/deepq/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = act(history)[0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))

            print("Episode reward", episode_rew)

    elif (FLAGS.algorithm == "acktr"):

        policy_fn = CnnPolicy
        model = acktr_disc.load(policy_fn,
                                env,
                                seed=0,
                                total_timesteps=1,
                                nprocs=4,
                                filename="models/acktr/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = model.step(history)[0][0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))
            print("Episode reward", episode_rew)
Example #5
def train_dqn(env_id, num_timesteps):
    """Train a dqn model.

      Parameters
      -------
      env_id: environment to train on
      num_timesteps: int
          number of env steps to optimizer for

      """

    # 1. Create gym environment
    env = gym.make(FLAGS.env)

    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)

    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    # 4. Create a CNN model for Q-Function
    model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=FLAGS.dueling
    )

    # 5. Train the model
    act = deepq.learn(
        env,
        q_func=model,
        lr=FLAGS.lr,
        max_timesteps=FLAGS.timesteps,
        buffer_size=10000,
        exploration_fraction=FLAGS.exploration_fraction,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=FLAGS.prioritized,
        callback=deepq_callback
    )
    act.save("mario_model.pkl")
    env.close()
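# deepq_callback is passed to deepq.learn above but is not shown; a hypothetical
# stand-in, assuming the old-baselines contract where the callback receives
# locals()/globals() each step and returning True stops training early:
import numpy as np

def deepq_callback(lcl, _glb):
    rewards = lcl['episode_rewards']
    # Stop once the mean reward over the last 100 completed episodes clears an
    # assumed threshold; 2000 is illustrative, not a value from this repo.
    return len(rewards) > 100 and np.mean(rewards[-101:-1]) >= 2000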
Example #6
        def _thunk():
            # 1. Create gym environment
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            # 2. Apply action space wrapper
            env = MarioActionSpaceWrapper(env)
            # 3. Apply observation space wrapper to reduce input size
            env = ProcessFrame84(env)

            return env
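# A thunk factory like this is normally collected into a list and handed to a
# vectorized environment; a usage sketch, assuming the enclosing factory is a
# make_env(env_id, seed, rank)-style function returning _thunk, and that
# baselines' SubprocVecEnv is available:
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_mario_vec_env(env_id, num_env, seed):
    # One worker process per environment copy, each seeded differently via rank.
    return SubprocVecEnv([make_env(env_id, seed, rank) for rank in range(num_env)])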
Example #7
        summary_placeholders = [
            tf.placeholder(tf.float32) for _ in range(len(summary_vars))
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op


if __name__ == "__main__":
    # Create the environment and the DQN agent
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    n_action = 5
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()  # [224, 256, 3] -> [84, 84, 1]
Example #8
        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in
                                range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
                      range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

if __name__ == "__main__":
    # Create the environment and the DQN agent
    # Environment variants: SuperMarioBros-1-1-v0 ([224, 256, 3]), SuperMarioBros-1-1-Tiles-v0 ([13, 16, 1])
    #                       meta-SuperMarioBros-Tiles-v0 (advances to the next stage when a level is cleared)
    env = gym.make("ppaquette/SuperMarioBros-1-1-Tiles-v0")
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size [224, 256, 3] -> [84, 84, 1]
    # env = ProcessFrame84(env)

    n_action = 7
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset() # [13, 16, 1]
Example #9
import gym
import gym_pull
import super_mario_bros

from wrappers import ProcessFrame84, MarioActionSpaceWrapper
#gym_pull.pull('github.com/ppaquette/gym-super-mario')
env = gym.make('ppaquette/meta-SuperMarioBros-v0')
print(env.observation_space, env.action_space)
env = MarioActionSpaceWrapper(env)
print(env.observation_space, env.action_space)
env = ProcessFrame84(env)
print(env.observation_space, env.action_space)

env.reset()
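# A short random rollout to sanity-check the wrapped environment (a hypothetical
# continuation of the snippet above, not part of the repo):
for _ in range(200):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()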
Example #10
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)
    sess = tf.Session()

    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    copy_ops = get_copy_var_ops(dest_scope_name="target",
                                src_scope_name="main")
    sess.run(copy_ops)

    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon greedy
        e = 1. / ((eps / 10) + 1)
        done = False
        step_count = 0
        state = env.reset()
        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)

        state_queue.append(state)
        next_state_queue.append(state)

        prev_100 = 0
        curr_100 = 0

        while not done:
            step_count += 1

            # accumulate 4 frames before acting
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue

            # training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue),
                                   [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)

            if done:  # Penalty
                reward = -100

            curr_100 += reward

            next_state_queue.append(next_state)

            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)

            if step_count % TRAIN_EPISODE == 0:
                states, actions, rewards, next_states, _ = replay_buffer.sample(
                    batch_size)
                states, next_states = np.reshape(
                    states, [batch_size, n_size, n_size, 4]), np.reshape(
                        next_states, [batch_size, n_size, n_size, 4])

                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)

                estimates = rewards + discount * Q_t
                Q_m[np.arange(batch_size), actions] = estimates

                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(
                    eps, step_count, loss))

                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)

                prev_100 = curr_100
                curr_100 = 0

            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)

            state_queue.append(next_state)
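# get_copy_var_ops is used above to sync the target network but is not shown; a
# minimal sketch, assuming the "main" and "target" DQNs build their trainable
# variables under those TF1 variable scopes:
import tensorflow as tf

def get_copy_var_ops(dest_scope_name, src_scope_name):
    # Build assign ops that copy every trainable variable from src to dest.
    src_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    return [dest_var.assign(src_var.value())
            for src_var, dest_var in zip(src_vars, dest_vars)]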