Example #1
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    sess = tf.Session()
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(var_list=dqn_var_list)
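    # Restore previously trained weights into the target network before playing.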
    saver.restore(sess, os.path.join(checkpoint_dir, save_file_name))

    for eps in range(MAX_EPISODES):
        done = False
        step_count = 0
        state = env.reset()

        state_queue = deque(maxlen=4)
        state_queue.append(state)

        while not done:
            step_count += 1

            # accumulate the first 4 frames with random actions
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                continue

            action = np.argmax(
                targetDQN.predict(
                    np.reshape(np.array(state_queue), [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            state_queue.append(next_state)
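
The ProcessFrame84 wrapper used in these examples is not shown in this listing. A minimal sketch of such an 84x84 grayscale observation wrapper, assuming OpenCV is available (the project's own wrapper may differ in detail):

import cv2
import gym
import numpy as np

class ProcessFrame84(gym.ObservationWrapper):
    # Reduce each RGB frame to a single 84x84 grayscale channel.
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1))

    def observation(self, obs):
        # Luminance-weighted grayscale conversion, then resize to 84x84.
        gray = np.dot(obs.astype(np.float32), [0.299, 0.587, 0.114])
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
        return resized.reshape(84, 84, 1).astype(np.uint8)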
Example #2
def main():
    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    if FLAGS.algorithm == "deepq":

        act = deepq.load("models/deepq/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = act(history)[0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))

            print("Episode reward", episode_rew)

    elif FLAGS.algorithm == "acktr":

        policy_fn = CnnPolicy
        model = acktr_disc.load(policy_fn,
                                env,
                                seed=0,
                                total_timesteps=1,
                                nprocs=4,
                                filename="models/acktr/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = model.step(history)[0][0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))
            print("Episode reward", episode_rew)
Example #3
def main():
    FLAGS(sys.argv)
    # Choose which RL algorithm to train.

    print("env : %s" % FLAGS.env)

    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
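
The RandomAgent used above is not shown here. A minimal sketch matching the act(ob, reward, done) interface (the project's own agent may differ):

class RandomAgent(object):
    # Ignores the observation and reward and samples a uniformly random action.
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()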
Example #4
def main():
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False
    # for i in range(episode_count):
    #   ob = env.reset()
    #   while True:
    #     action = agent.act(ob, reward, done)
    #     ob, reward, done, _ = env.step(1)
    #     if done:
    #       break

    for i in range(episode_count):
        ob = env.reset()
        while True:
            key = readchar.readkey()
            # Choose an action from keyboard
            if key not in arrow_keys.keys():
                print("Game aborted!")
                break
            action = arrow_keys[key]
            state, reward, done, info = env.step(action)

            if done:
                print("Finished with reward", reward)
                break
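
The arrow_keys mapping (and the readchar import) are not part of this snippet. A hypothetical mapping, assuming readchar.readkey() returns ANSI escape sequences; the action indices here are illustrative only and would need to match the wrapped action space:

# Hypothetical mapping from arrow-key escape sequences to discrete actions.
arrow_keys = {
    '\x1b[A': 0,  # up
    '\x1b[B': 1,  # down
    '\x1b[C': 2,  # right
    '\x1b[D': 3,  # left
}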
Example #5
            action = agent.get_action(history)

            # Map the discrete action index to the environment's 6-button action array
            if action == 0:
                real_action = [0, 0, 0, 1, 1, 1]  # Right + A + B
            elif action == 1:
                real_action = [0, 0, 0, 0, 1, 0]  # A
            elif action == 2:
                real_action = [0, 0, 0, 1, 1, 0]  # Right + A
            elif action == 3:
                real_action = [0, 0, 0, 1, 0, 1]  # Right + B
            elif action == 4:
                real_action = [0, 0, 0, 0, 1, 1]  # A + B

            # Advance the environment one timestep with the selected action
            next_observe, reward, done, info = env.step(real_action)

            # Preprocess the state at each timestep
            # next_state = pre_processing(next_observe)
            next_state = np.reshape([next_observe], (1, 84, 84, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history))[0])

            reward = np.clip(reward, -1., 1.)  # clip the reward to the range [-1, 1]

            # Store the sample <s, a, r, s'> in replay memory, then train
            agent.append_sample(history, action, reward, next_history, dead)
            # print ("global_step : " ,global_step)
Example #6
def main():

    MAX_BUFFER_SIZE = 100000   # replay buffer capacity
    MAX_EPISODES = 10000       # number of training episodes
    TRAIN_EPISODE = 100        # train the main network every 100 steps
    TARGET_UPDATE_EPS = 1000   # sync the target network every 1000 steps

    batch_size = 32
    n_size = 84                # side length of the preprocessed frame
    discount = 0.99            # discount factor

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)
    sess = tf.Session()

    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    copy_ops = get_copy_var_ops(dest_scope_name="target",
                                src_scope_name="main")
    sess.run(copy_ops)

    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon greedy
        e = 1. / ((eps / 10) + 1)
        done = False
        step_count = 0
        state = env.reset()
        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)

        state_queue.append(state)
        next_state_queue.append(state)

        prev_100 = 0
        curr_100 = 0

        while not done:
            step_count += 1

            # accumulate the first 4 frames with random actions
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue

            # training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action by greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue),
                                   [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)

            if done:  # Penalty
                reward = -100

            curr_100 += reward

            next_state_queue.append(next_state)

            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)

            if step_count % TRAIN_EPISODE == 0:
                states, actions, rewards, next_states, _ = replay_buffer.sample(
                    batch_size)
                states, next_states = np.reshape(
                    states, [batch_size, n_size, n_size, 4]), np.reshape(
                        next_states, [batch_size, n_size, n_size, 4])

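                # Bellman target: y = r + discount * max_a' Q_target(s', a').
                # The main network's Q-values for the taken actions are then
                # overwritten with these targets before the gradient update.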
                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)

                estimates = rewards + discount * Q_t
                Q_m[np.arange(batch_size), actions] = estimates

                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(
                    eps, step_count, loss))

                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)

                prev_100 = curr_100
                curr_100 = 0

            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)

            state_queue.append(next_state)
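
The get_copy_var_ops helper used above to sync the target network is not shown in this listing. A common sketch using TF1 variable collections, assuming both DQNs create their variables under the "main" and "target" scopes:

import tensorflow as tf

def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    # Build assign ops that copy every trainable variable from the source
    # (main) network scope into the destination (target) network scope.
    op_holder = []
    src_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder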