def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'field':
        import gym_fieldedmove
        env = gym.make('FieldedMove-v0')
        # env = FrameStack(env, 4)
    elif args["env_kind"] == "ple":
        import gym_ple
        env = gym.make(args['env'])
        env._max_episode_steps = args['max_episode_steps']
        # env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
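A minimal usage sketch (not from the source) of how such a factory is typically invoked; the dict keys mirror the ones read above, everything else is an assumption:

from functools import partial

# Hypothetical hyperparameters; only keys used by the factory above are set.
hps = {
    "env_kind": "atari",
    "env": "BreakoutNoFrameskip-v4",
    "noop_max": 30,
    "max_episode_steps": 4500,
}
make_env = partial(make_env_all_params, add_monitor=True, args=hps)
envs = [make_env(rank) for rank in range(4)]  # one wrapped env per parallel worker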
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "my_games":
        env = gym.make(args['env'])
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env)
        env = FrameStack(env, 4)

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #3
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == "atari":
        env = gym.make(args["env"])
        assert "NoFrameskip" in env.spec.id
        # from self-supervised exploration via disagreement
        if args["stickyAtari"] == "true":
            env = StickyActionEnv(env)
        env._max_episode_steps = args["max_episode_steps"] * 4
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args["max_episode_steps"])
        if "Montezuma" in args["env"]:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapper(env)
        # assert env.action_space == spaces.Discrete(7)
    elif args["env_kind"] == "mario":
        env = make_mario_env()
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapperMario(env)
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == "robopong":
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), "%.2i" % rank))
    return env
def main():
    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False
    # for i in range(episode_count):
    #   ob = env.reset()
    #   while True:
    #     action = agent.act(ob, reward, done)
    #     ob, reward, done, _ = env.step(action)
    #     if done:
    #       break

    for i in range(episode_count):
        ob = env.reset()
        while True:
            key = readchar.readkey()
            # Choose an action from keyboard
            if key not in arrow_keys.keys():
                print("Game aborted!")
                break
            action = arrow_keys[key]
            state, reward, done, info = env.step(action)

            if done:
                print("Finished with reward", reward)
                break
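The `arrow_keys` mapping used above is not defined in this snippet; a hypothetical sketch, assuming readchar.readkey() returns the ANSI arrow-key escape sequences and that the action indices match MarioActionSpaceWrapper (both assumptions):

# Hypothetical mapping: arrow-key escape sequences -> discrete action indices.
arrow_keys = {
    "\x1b[A": 2,  # up
    "\x1b[B": 5,  # down
    "\x1b[C": 3,  # right
    "\x1b[D": 4,  # left
}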
Example #5
def make_env_all_params(rank, add_monitor, args, sleep_multiple=2):
    if args["env_kind"] == 'ObstacleTowerEnv':
        env = _make_obs_env(rank, add_monitor, args, sleep_multiple)
    elif args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if rank == 2:
            env = RenderWrapper(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        logdir = osp.join('summaries', args["exp_name"])
        logger.configure(logdir)
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #6
def main():
    FLAGS(sys.argv)
    # Choose which RL algorithm to train.

    print("env : %s" % FLAGS.env)

    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if args["env_kind"] == 'atari' and add_monitor:
        #env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
        env = Monitor(env,
                      os.path.join(os.getcwd(), 'test_video'),
                      force=True,
                      video_callable=lambda episode_id: episode_id % 20 == 0)
        #env = Monitor(env, os.path.join(os.getcwd(), 'test_video'),video_callable=lambda episode_id: True )#,force=True)

    return env
Example #8
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        if args["stickyAtari"]:  # 在智能体执行动作时增加随机性
            env._max_episode_steps = args['max_episode_steps'] * 4
            env = StickyActionEnv(env)
        else:
            env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)  # execute each action for 4 consecutive steps
        env = ProcessFrame84(env, crop=False)  # preprocess the observation
        env = FrameStack(env, 4)  # stack the last 4 consecutive frames as input
        if not args["stickyAtari"]:
            env = ExtraTimeLimit(env,
                                 args['max_episode_steps'])  # cap the maximum time steps per episode
        if 'Montezuma' in args['env']:  # record the agent's position, current room, and visited rooms
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':  # 超级马里奥
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":  # 多智能体游戏, Multi-Pong
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
def make_env_all_params(rank, add_monitor, args, logdir):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "dm_suite":
        env = make_dm_suite(task=args["env"],
                            logdir=logdir,
                            to_record=args["to_record"])

    if add_monitor:
        env = TempMonitor(env)

    return env
Example #10
def make_env_all_params(rank, add_monitor=True):
    env = gym.make("MontezumaRevengeNoFrameskip-v4")
    assert 'NoFrameskip' in env.spec.id
    env._max_episode_steps = 4500 * 4
    env = StickyActionEnv(env)
    env = MaxAndSkipEnv(env, skip=4)  # execute each action for 4 consecutive steps
    env = ProcessFrame84(env, crop=False)  # preprocess the observation
    env = FrameStack(env, 4)  # stack the last 4 consecutive frames as input
    return env
Example #11
def _thunk():
    env = gym.make(env_id)
    env = ToDiscreteWrapper(env)
    env = ProcessFrame84(env)
    env.seed(seed + rank)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)
    return env
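A usage sketch (an assumption, not shown in this snippet) of how such a thunk is usually produced and consumed: a factory closes over env_id, seed, and rank, and a vectorized env constructor calls each thunk inside its own worker process.

import gym

def make_env(env_id, seed, rank):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _thunk

# e.g. with baselines' SubprocVecEnv:
# venv = SubprocVecEnv([make_env(env_id, seed, i) for i in range(8)])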
Example #12
def main():
    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym.make(FLAGS.env)
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    if (FLAGS.algorithm == "deepq"):

        act = deepq.load("models/deepq/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = act(history)[0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))

            print("Episode reward", episode_rew)

    elif (FLAGS.algorithm == "acktr"):

        policy_fn = CnnPolicy
        model = acktr_disc.load(policy_fn,
                                env,
                                seed=0,
                                total_timesteps=1,
                                nprocs=4,
                                filename="models/acktr/%s" % FLAGS.file)
        nstack = 4
        nh, nw, nc = env.observation_space.shape
        history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

        while True:
            obs, done = env.reset(), False
            history = update_history(history, obs)
            episode_rew = 0
            while not done:
                env.render()
                action = model.step(history)[0][0]
                obs, rew, done, _ = env.step(action)
                history = update_history(history, obs)
                episode_rew += rew
                print("action : %s reward : %s" % (action, rew))
            print("Episode reward", episode_rew)
Example #13
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    sess = tf.Session()
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(var_list=dqn_var_list)
    saver.restore(sess, os.path.join(checkpoint_dir, save_file_name))

    for eps in range(MAX_EPISODES):
        done = False
        step_count = 0
        state = env.reset()

        state_queue = deque(maxlen=4)
        state_queue.append(state)

        while not done:
            step_count += 1

            # cumulate 4 frames
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                continue

            action = np.argmax(
                targetDQN.predict(
                    np.reshape(np.array(state_queue), [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)
            state_queue.append(next_state)
Example #14
 def __init__(self, env_name, num_episodes, exp_name, policy):
     self.exp_name = exp_name
     self.env = gym.make(env_name)
     self.env = ProcessFrame84(self.env, crop=False)
     self.env = FrameStack(self.env, 4)
     self.num_episodes = 1
     self.policy = policy
     if not os.path.exists('images'):
         os.mkdir('images')
     self.image_folder = os.path.join(
         os.path.abspath(os.path.dirname(__file__)), 'images')
     print('Image folder', self.image_folder)
Example #15
def make_specific_env(rank, add_monitor, args):
    from baselines import logger

    multi_train_envs = args['multi_train_envs']
    env_index = rank // (args['envs_per_process'] // len(multi_train_envs))
    env = gym.make(args['multi_train_envs'][env_index])
    env = ProcessFrame84(env, crop=False)
    env = FrameStack(env, 4)
    #env = DeepmindLabInfo(env, args['multi_train_envs'][env_index])
    print("Made env {}".format(args['multi_train_envs'][env_index]))
    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #16
def make_env_all_params(rank, args):
    env = gym.make(GAME_NAME)
    env = NoopResetEnv(env, noop_max=NOOP_MAX)
    env = MaxAndSkipEnv(env, skip=4)
    env = ProcessFrame84(env, crop=False)
    env = FrameStack(env, 4)
    # env = ExtraTimeLimit(env,10000)
    env = AddRandomStateToInfo(env)
    env = Monitor(
        env,
        os.path.join(
            'C:/Users/Elias/OneDrive/Winfo Studium/SS19/Masterarbeit/logs',
            '%.2i' % rank))
    return env
Example #17
        def _thunk():
            # 1. Create gym environment
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            # 2. Apply action space wrapper
            env = MarioActionSpaceWrapper(env)
            # 3. Apply observation space wrapper to reduce input size
            env = ProcessFrame84(env)

            return env
Example #18
def run_random_agent(env_id, episodes):
    done = False
    agent = RandomAgent()

    for i in range(episodes):
        env = gym.make(env_id)
        env = ToDiscreteWrapper(env)
        env = ProcessFrame84(env)

        env.reset()
        while True:
            action = agent.act(env.action_space)
            _, reward, done, _ = env.step(action)
            print('reward: {}'.format(reward))
            if done:
                break
Example #19
def train_dqn(env_id, num_timesteps):
    """Train a dqn model.

      Parameters
      -------
      env_id: environment to train on
      num_timesteps: int
          number of env steps to optimizer for

      """

    # 1. Create gym environment
    env = gym.make(FLAGS.env)

    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)

    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    # 4. Create a CNN model for Q-Function
    model = cnn_to_mlp(
      convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
      hiddens=[256],
      dueling=FLAGS.dueling
    )

    # 5. Train the model
    act = deepq.learn(
        env,
        q_func=model,
        lr=FLAGS.lr,
        max_timesteps=FLAGS.timesteps,
        buffer_size=10000,
        exploration_fraction=FLAGS.exploration_fraction,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=FLAGS.prioritized,
        callback=deepq_callback
    )
    act.save("mario_model.pkl")
    env.close()
Example #20
def run_a2c_agent(env_id, seed):
    env = gym.make(env_id)
    env = ToDiscreteWrapper(env)
    env = ProcessFrame84(env)
    env.seed(seed)
    env = SubprocVecEnv([make_env(0, env_id, seed)])
    run(LstmPolicy,
        env,
        seed,
        nsteps=5,
        nstack=4,
        total_timesteps=int(5000),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='constant',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100)
Example #21
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        #print(osp.join(logger.get_dir(), '%.2i' % rank + '.monitor.csv'))

        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
        """
        env = DummyVecEnv([lambda: env])
        
        env = VecVideoRecorder(env, directory = './vid',
                       record_video_trigger=lambda step: step == 0,
                       video_length= 100,)
        
        env.reset()
        """
        #env = wrappers.Monitor(env,'./vid/',force = True,write_upon_reset = True, video_callable=lambda episode: True)
        #print(osp.join(logger.get_dir()))
        #env = Monitor(env, osp.join(logger.get_dir()))
        #env = Monitor(env,  "./vid", video_callable=lambda episode_id: True,force=True)
    return env
    def make_atari_env(self, args):
        """
        duplicated code hack due to
        relative import errors
        """

        env = gym.make(args["env"])
        assert "NoFrameskip" in env.spec.id
        # from self-supervised exploration via disagreement
        if args["stickyAtari"] == "true":
            env = StickyActionEnv(env)
        env._max_episode_steps = args["max_episode_steps"] * 4
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStackNoLazy(env, 4)
        env = ExtraTimeLimit(env, args["max_episode_steps"])
        if "Montezuma" in args["env"]:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapper(env)
        return env
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        if args["stickyAtari"]:
            env._max_episode_steps = args['max_episode_steps'] * 4
            env = StickyActionEnv(env)
        else:
            env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        if not args["stickyAtari"]:
            env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'unity':
        env = make_unity_maze(args["env"],
                              seed=args["seed"],
                              rank=rank,
                              ext_coeff=args["ext_coeff"],
                              recordUnityVid=args['recordUnityVid'],
                              expID=args["unityExpID"],
                              startLoc=args["startLoc"],
                              door=args["door"],
                              tv=args["tv"],
                              testenv=args["testenv"],
                              logdir=logger.get_dir())

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #24
def main():

    MAX_BUFFER_SIZE = 100000
    MAX_EPISODES = 10000
    TRAIN_EPISODE = 100
    TARGET_UPDATE_EPS = 1000

    batch_size = 32
    n_size = 84
    discount = 0.99

    checkpoint_dir = './checkpoints'
    save_file_name = 'mario_weight_2.ckpt'

    # 1. Create gym environment
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # 2. Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # 3. Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha)
    replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE)
    sess = tf.Session()

    mainDQN = DQN(sess, name="main")
    targetDQN = DQN(sess, name="target")
    dqn_var_list = targetDQN.var_list

    sess.run(tf.global_variables_initializer())

    copy_ops = get_copy_var_ops(dest_scope_name="target",
                                src_scope_name="main")
    sess.run(copy_ops)

    saver = tf.train.Saver(var_list=dqn_var_list)

    for eps in range(MAX_EPISODES):
        # decaying epsilon greedy
        e = 1. / ((eps / 10) + 1)
        done = False
        step_count = 0
        state = env.reset()
        state_queue = deque(maxlen=4)
        next_state_queue = deque(maxlen=4)

        state_queue.append(state)
        next_state_queue.append(state)

        prev_100 = 0
        curr_100 = 0

        while not done:
            step_count += 1

            # cumulate 4 frames
            if step_count < 4:
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                state_queue.append(next_state)
                next_state_queue.append(next_state)
                continue

            # training starts
            if np.random.rand() < e:
                action = env.action_space.sample()
            else:
                # Choose an action by greedily from the Q-network
                action = np.argmax(
                    mainDQN.predict(
                        np.reshape(np.array(state_queue),
                                   [1, n_size, n_size, 4])))

            # Get new state and reward from environment
            next_state, reward, done, _ = env.step(action)

            if done:  # Penalty
                reward = -100

            curr_100 += reward

            next_state_queue.append(next_state)

            replay_buffer.add(np.array(state_queue), action, reward,
                              np.array(next_state_queue), done)

            if step_count % TRAIN_EPISODE == 0:
                states, actions, rewards, next_states, _ = replay_buffer.sample(
                    batch_size)
                states, next_states = np.reshape(
                    states, [batch_size, n_size, n_size, 4]), np.reshape(
                        next_states, [batch_size, n_size, n_size, 4])

                Q_t = targetDQN.predict(next_states)
                Q_m = mainDQN.predict(states)
                Q_t = np.max(Q_t, axis=1)

                estimates = rewards + discount * Q_t
                Q_m[np.arange(batch_size), actions] = estimates

                loss = mainDQN.update(states, Q_m)
                print("eps: {} step: {} loss: {}".format(
                    eps, step_count, loss))

                if curr_100 > prev_100:
                    save_path = saver.save(
                        sess, os.path.join(checkpoint_dir, save_file_name))
                    print("Model saved in file: %s" % save_path)

                prev_100 = curr_100
                curr_100 = 0

            if step_count % TARGET_UPDATE_EPS == 0:
                sess.run(copy_ops)

            state_queue.append(next_state)
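The `get_copy_var_ops` helper used above is defined elsewhere; a minimal sketch of the usual TF1-style implementation, building assign ops that copy the "main" network's trainable variables into the "target" scope (an assumption about the original):

import tensorflow as tf  # TF1-style graph API assumed, as in the example above

def get_copy_var_ops(dest_scope_name, src_scope_name):
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    return [dest_var.assign(src_var)
            for src_var, dest_var in zip(src_vars, dest_vars)]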
    ActionLoggingWrapper,
)


def get_car_mask(frames, car_color=np.array([223, 183, 85])):
    mask = np.zeros(shape=frames[0].shape)
    for a_frame in frames:
        for i in range(a_frame.shape[0]):
            for j in range(a_frame.shape[1]):
                if np.array_equal(a_frame[i][j], car_color):
                    mask[i][j] += 1
    return mask
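For reference, a vectorized sketch of the same mask computation (not from the source); it returns an (H, W) count map rather than the per-channel array built above:

import numpy as np

def get_car_mask_vectorized(frames, car_color=np.array([223, 183, 85])):
    mask = np.zeros(frames[0].shape[:2])
    for a_frame in frames:
        # True where every channel matches the car color exactly
        mask += np.all(a_frame == car_color, axis=-1)
    return mask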


env = gym.make("BankHeistNoFrameskip-v4")
env = gym.wrappers.Monitor(env, "./video/", force=True)
env._max_episode_steps = 4000 * 4
env = MaxAndSkipEnv(env, skip=4)
env = ProcessFrame84(env, crop=False)
env = FrameStack(env, 4)
env = ExtraTimeLimit(env, 4000)
env = AddRandomStateToInfo(env)

obs = env.reset()

for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())
    import pdb

    pdb.set_trace()
Example #26
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op


if __name__ == "__main__":
    # Create the environment and the DQN agent
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")
    # Apply action space wrapper
    env = MarioActionSpaceWrapper(env)
    # Apply observation space wrapper to reduce input size
    env = ProcessFrame84(env)

    n_action = 5
    agent = DQNAgent(n_action=n_action)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()  # [224, 256, 3] -> [84, 84, 1]

        # As in the Breakout example, this prevents the agent from being pushed into a corner at the start
        # for _ in range(random.randint(1, agent.no_op_steps)):  # pick a number from 1 to no_op_steps (30), then loop that many times