Example #1
def main():
    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    if FLAGS.algorithm == "deepq":

        act = deepq.load("models/deepq/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = act(history)[0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))

            print("Episode reward", episode_rew)

    elif FLAGS.algorithm == "acktr":

        policy_fn = CnnPolicy
        model = acktr_disc.load(policy_fn,
                                env,
                                seed=0,
                                total_timesteps=1,
                                nprocs=4,
                                filename="models/acktr/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = model.step(history)[0][0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))
            print("Episode reward", episode_rew)
Example #2
def main():
    FLAGS(sys.argv)
    # Choose which RL algorithm to train.

    print("env : %s" % FLAGS.env)

    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
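RandomAgent is not defined in this snippet either; a minimal sketch consistent with the agent.act(ob, reward, done) calls above, which simply samples a random action:

class RandomAgent(object):
    """Minimal agent that ignores its inputs and samples a random action."""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()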
Example #3
def main():
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False
    # for i in range(episode_count):
    #   ob = env.reset()
    #   while True:
    #     action = agent.act(ob, reward, done)
    #     ob, reward, done, _ = env.step(1)
    #     if done:
    #       break

    for i in range(episode_count):
        ob = env.reset()
        while True:
            key = readchar.readkey()
            # Choose an action from keyboard
            if key not in arrow_keys.keys():
                print("Game aborted!")
                break
            action = arrow_keys[key]
            state, reward, done, info = env.step(action)

            if done:
                print("Finished with reward", reward)
                break
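The arrow_keys dictionary used for keyboard control is not shown. A hypothetical mapping from readchar key codes to action indices could look like the following; the concrete indices depend on the discrete actions exposed by MarioActionSpaceWrapper and are placeholders here:

import readchar

# Hypothetical key-to-action mapping; adjust the indices to match the
# discrete actions provided by MarioActionSpaceWrapper.
arrow_keys = {
    readchar.key.UP: 2,
    readchar.key.DOWN: 5,
    readchar.key.LEFT: 6,
    readchar.key.RIGHT: 1,
}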
Example #4
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    sess = tf.Session()
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(var_list=dqn_var_list)
    saver.restore(sess, os.path.join(checkpoint_dir, save_file_name))

    for eps in range(MAX_EPISODES):
        done = False
        step_count = 0
        state = env.reset()

        state_queue = deque(maxlen=4)
        state_queue.append(state)

        while not done:
            step_count += 1

            # accumulate 4 frames
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                continue

            action = np.argmax(
                targetDQN.predict(
                    np.reshape(np.array(state_queue), [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            state_queue.append(next_state)
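The DQN class is defined elsewhere; this example only relies on predict() returning Q-values and var_list exposing the network's variables for checkpointing. A minimal sketch of such an interface, assuming a small TF1-style convolutional network (layer sizes and the default number of actions are assumptions):

import tensorflow as tf

class DQN(object):
    """Sketch of the interface used above: predict(), update(), var_list."""

    def __init__(self, sess, name="main", n_size=84, n_action=6, lr=1e-4):
        self.sess = sess
        with tf.variable_scope(name):
            self.X = tf.placeholder(tf.float32, [None, n_size, n_size, 4])
            net = tf.layers.conv2d(self.X, 32, 8, strides=4, activation=tf.nn.relu)
            net = tf.layers.conv2d(net, 64, 4, strides=2, activation=tf.nn.relu)
            net = tf.layers.flatten(net)
            net = tf.layers.dense(net, 256, activation=tf.nn.relu)
            self.Q = tf.layers.dense(net, n_action)
            self.Y = tf.placeholder(tf.float32, [None, n_action])
            self.loss = tf.reduce_mean(tf.square(self.Y - self.Q))
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)

    def predict(self, state):
        # Returns Q-values of shape (batch, n_action).
        return self.sess.run(self.Q, feed_dict={self.X: state})

    def update(self, states, targets):
        loss, _ = self.sess.run([self.loss, self.train_op],
                                feed_dict={self.X: states, self.Y: targets})
        return loss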
Example #5
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    n_action = 5
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()  # [224, 256, 3] -> [84, 84, 1]

        # In the Breakout example this prevents the agent from being stuck in a corner at the start
        #for _ in range(random.randint(1, agent.no_op_steps)):  # pick a number from 1 to no_op_steps (30), then loop that many times
        #    observe, _, _, _ = env.step(1)

        # state = pre_processing(observe)
        history = np.stack((observe, observe, observe, observe),
                           axis=2)  # at the start, use four copies of the same frame as the history
        history = np.reshape([history], (1, 84, 84, 4))

        reward_count = 0  # tracks whether the agent keeps staying in the same spot

        while not done:
            if agent.render:
                env.render()
Example #6
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size [224, 256, 3] -> [84, 84, 1]
    # env = ProcessFrame84(env)

    n_action = 7
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset() # [13, 16, 1]

        # In the Breakout example this prevents the agent from being stuck in a corner at the start
        #for _ in range(random.randint(1, agent.no_op_steps)):  # pick a number from 1 to no_op_steps (30), then loop that many times
        #    observe, _, _, _ = env.step(1)

        # state = pre_processing(observe)
        history = np.stack((observe, observe, observe, observe), axis=2) # at the start, use four copies of the same frame as the history
        history = np.reshape([history], (1, 13, 16, 4))

        reward_count = 0 # tracks whether the agent keeps staying in the same spot

        while not done:
            if agent.render:
                env.render()
            global_step += 1
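            # NOTE: the excerpt is cut off here. A hypothetical continuation of
            # the loop body (get_action() is an assumed DQNAgent method, not
            # shown in this snippet): pick an action from the stacked history,
            # step the environment, and roll the newest frame into the stack.
            action = agent.get_action(history)
            observe, reward, done, info = env.step(action)
            next_state = np.reshape([observe], (1, 13, 16, 1))
            history = np.append(next_state, history[:, :, :, :3], axis=3)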
Example #7
import gym
import gym_pull
import super_mario_bros

from wrappers import ProcessFrame84, MarioActionSpaceWrapper
#gym_pull.pull('github.com/ppaquette/gym-super-mario')
env = gym.make('ppaquette/meta-SuperMarioBros-v0')
print(env.observation_space, env.action_space)
env = MarioActionSpaceWrapper(env)
print(env.observation_space, env.action_space)
env = ProcessFrame84(env)
print(env.observation_space, env.action_space)

env.reset()
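ProcessFrame84 is imported from a local wrappers module. A minimal sketch of such an observation wrapper (grayscale plus resize to 84x84, as in standard DQN preprocessing), assuming OpenCV is available; the actual wrapper in the repository may differ:

import cv2
import gym
import numpy as np

class ProcessFrame84(gym.ObservationWrapper):
    """Sketch: convert RGB frames to 84x84x1 grayscale observations."""

    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1))

    def observation(self, obs):
        frame = cv2.cvtColor(obs.astype(np.uint8), cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
        return frame.reshape(84, 84, 1)

    # Older gym releases (the gym_pull era) call _observation() instead.
    _observation = observation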
Example #8
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)
    sess = tf.Session()

    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    copy_ops = get_copy_var_ops(dest_scope_name="target",
                                src_scope_name="main")
    sess.run(copy_ops)

    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon greedy
        e = 1. / ((eps / 10) + 1)
        done = False
        step_count = 0
        state = env.reset()
        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)

        state_queue.append(state)
        next_state_queue.append(state)

        prev_100 = 0
        curr_100 = 0

        while not done:
            step_count += 1

            # accumulate 4 frames
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue

            # training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue),
                                   [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)

            if done:  # Penalty
                reward = -100

            curr_100 += reward

            next_state_queue.append(next_state)

            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)

            if step_count % TRAIN_EPISODE == 0:
                states, actions, rewards, next_states, _ = replay_buffer.sample(
                    batch_size)
                states, next_states = np.reshape(
                    states, [batch_size, n_size, n_size, 4]), np.reshape(
                        next_states, [batch_size, n_size, n_size, 4])

                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)

                estimates = rewards + discount * Q_t
                Q_m[np.arange(batch_size), actions] = estimates

                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(
                    eps, step_count, loss))

                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)

                prev_100 = curr_100
                curr_100 = 0

            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)

            state_queue.append(next_state)
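get_copy_var_ops is not shown in this snippet; a minimal sketch of the usual main-to-target weight copy, matching variables by scope (an assumption about what the helper does):

import tensorflow as tf

def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    # Build ops that overwrite the target network's weights with the
    # main network's weights, pairing variables by their scope order.
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder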