Example #1
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "my_games":
        env = gym.make(args['env'])
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env)
        env = FrameStack(env, 4)

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
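A minimal usage sketch for this factory function (the hyperparameter dict and env id below are illustrative assumptions, not taken from the project's own config):

import functools

# hypothetical args dict; the real project builds this from CLI flags
hps = {"env_kind": "atari", "env": "BreakoutNoFrameskip-v4",
       "noop_max": 30, "max_episode_steps": 4500}

make_env = functools.partial(make_env_all_params, add_monitor=False, args=hps)
env = make_env(0)  # one wrapped env per worker rank
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())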
Example #2
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == "atari":
        env = gym.make(args["env"])
        assert "NoFrameskip" in env.spec.id
        # from self-supervised exploration via disagreement
        if args["stickyAtari"] == "true":
            env = StickyActionEnv(env)
        env._max_episode_steps = args["max_episode_steps"] * 4
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args["max_episode_steps"])
        if "Montezuma" in args["env"]:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapper(env)
        # assert env.action_space == spaces.Discrete(7)
    elif args["env_kind"] == "mario":
        env = make_mario_env()
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapperMario(env)
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == "robopong":
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), "%.2i" % rank))
    return env
Example #3
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        if args["stickyAtari"]:  # 在智能体执行动作时增加随机性
            env._max_episode_steps = args['max_episode_steps'] * 4
            env = StickyActionEnv(env)
        else:
            env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)  # repeat each action for 4 consecutive frames
        env = ProcessFrame84(env, crop=False)  # preprocess the observation to 84x84
        env = FrameStack(env, 4)  # stack 4 consecutive frames as the input
        env = ExtraTimeLimit(env, args['max_episode_steps'])  # limit the maximum time steps per episode
        if 'Montezuma' in args['env']:  # record the agent's position, current room, and visited rooms
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':  # 超级马里奥
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":  # 多智能体游戏, Multi-Pong
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #4
def make_env_all_params(rank, add_monitor, args, logdir):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "dm_suite":
        env = make_dm_suite(task=args["env"],
                            logdir=logdir,
                            to_record=args["to_record"])

    if add_monitor:
        env = TempMonitor(env)

    return env
Example #5
def make_env_all_params(rank, add_monitor, args, sleep_multiple=2):
    if args["env_kind"] == 'ObstacleTowerEnv':
        env = _make_obs_env(rank, add_monitor, args, sleep_multiple)
    elif args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if rank == 2:
            env = RenderWrapper(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        logdir = osp.join('summaries', args["exp_name"])
        logger.configure(logdir)
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #6
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if args["env_kind"] == 'atari' and add_monitor:
        #env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
        env = Monitor(env,
                      os.path.join(os.getcwd(), 'test_video'),
                      force=True,
                      video_callable=lambda episode_id: episode_id % 20 == 0)
        #env = Monitor(env, os.path.join(os.getcwd(), 'test_video'),video_callable=lambda episode_id: True )#,force=True)

    return env
Example #7
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'field':
        import gym_fieldedmove
        env = gym.make('FieldedMove-v0')
        # env = FrameStack(env, 4)
    elif args["env_kind"] == "ple":
        import gym_ple
        env = gym.make(args['env'])
        env._max_episode_steps = args['max_episode_steps']
        # env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #8
def evaluate_in_environment(net):
    env = gym.make("Pong-v0")
    env = MaxAndSkipEnv(env, skip=4)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=True)
    evaluate(net, env=env)
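The evaluate() helper is not shown in this example; below is a rough sketch of what a greedy evaluation loop over the wrapped env could look like, assuming net maps a batch of (80, 80, 1) observations to per-action values (that interface is an assumption, not the source's):

import numpy as np

def evaluate(net, env, num_episodes=5):
    # hypothetical greedy rollout; the real evaluate() may track extra statistics
    returns = []
    for _ in range(num_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action_values = net(np.expand_dims(obs, 0))  # assumed: one value per action
            obs, reward, done, _ = env.step(int(np.argmax(action_values)))
            total += reward
        returns.append(total)
    print("mean return over %d episodes: %.1f" % (num_episodes, np.mean(returns)))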
Example #9
def make_env_all_params(rank, add_monitor=True):
    env = gym.make("MontezumaRevengeNoFrameskip-v4")
    assert 'NoFrameskip' in env.spec.id
    env._max_episode_steps = 4500 * 4
    env = StickyActionEnv(env)
    env = MaxAndSkipEnv(env, skip=4)  # repeat each action for 4 consecutive frames
    env = ProcessFrame84(env, crop=False)  # preprocess the observation to 84x84
    env = FrameStack(env, 4)  # stack 4 consecutive frames as the input
    return env
Example #10
def make_env_all_params(rank, args):
    env = gym.make(GAME_NAME)
    env = NoopResetEnv(env, noop_max=NOOP_MAX)
    env = MaxAndSkipEnv(env, skip=4)
    env = ProcessFrame84(env, crop=False)
    env = FrameStack(env, 4)
    # env = ExtraTimeLimit(env,10000)
    env = AddRandomStateToInfo(env)
    env = Monitor(
        env,
        os.path.join(
            'C:/Users/Elias/OneDrive/Winfo Studium/SS19/Masterarbeit/logs',
            '%.2i' % rank))
    return env
Example #11
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()

    if add_monitor:
        #print(osp.join(logger.get_dir(), '%.2i' % rank + '.monitor.csv'))

        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
        """
        env = DummyVecEnv([lambda: env])
        
        env = VecVideoRecorder(env, directory = './vid',
                       record_video_trigger=lambda step: step == 0,
                       video_length= 100,)
        
        env.reset()
        """
        #env = wrappers.Monitor(env,'./vid/',force = True,write_upon_reset = True, video_callable=lambda episode: True)
        #print(osp.join(logger.get_dir()))
        #env = Monitor(env, osp.join(logger.get_dir()))
        #env = Monitor(env,  "./vid", video_callable=lambda episode_id: True,force=True)
    return env
Example #12
    def make_atari_env(self, args):
        """
        duplicated code hack due to
        relative import errors
        """

        env = gym.make(args["env"])
        assert "NoFrameskip" in env.spec.id
        # from self-supervised exploration via disagreement
        if args["stickyAtari"] == "true":
            env = StickyActionEnv(env)
        env._max_episode_steps = args["max_episode_steps"] * 4
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStackNoLazy(env, 4)
        env = ExtraTimeLimit(env, args["max_episode_steps"])
        if "Montezuma" in args["env"]:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapper(env)
        return env
Example #13
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        if args["stickyAtari"]:
            env._max_episode_steps = args['max_episode_steps'] * 4
            env = StickyActionEnv(env)
        else:
            env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        if not args["stickyAtari"]:
            env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'unity':
        env = make_unity_maze(args["env"],
                              seed=args["seed"],
                              rank=rank,
                              ext_coeff=args["ext_coeff"],
                              recordUnityVid=args['recordUnityVid'],
                              expID=args["unityExpID"],
                              startLoc=args["startLoc"],
                              door=args["door"],
                              tv=args["tv"],
                              testenv=args["testenv"],
                              logdir=logger.get_dir())

    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
Example #14
def main():
    #Make OpenAI gym environment + wrappers
    date_time = now.strftime("_%H:%M:%S_%m-%d-%Y")
    env = gym.make("PongNoFrameskip-v4")
    env = gym.wrappers.Monitor(env, './data_dqn_ataripong' + date_time)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)  #skip 4 frames & max over last_obs
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)  #obs shape = num_channels x width x height
    obs_space_shape = env.observation_space.shape[0]
    action_space_shape = env.action_space.n

    #Set random seeds
    seed = 6582
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    #Initialize Replay Memory (Line 1)
    replay_memory = ReplayMemory(max_size=100000)

    #Make Q-Network and Target Q-Network (Lines 2 & 3)
    qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet = Atari_Dueling_DQN(obs_space_shape,
                                    action_space_shape).to(device)
    target_qnet.load_state_dict(qnet.state_dict())

    #Training Parameters (Changes from Mnih et al. outlined in README.md)
    optimizer = optim.Adam(qnet.parameters())
    num_frames = 1400000
    gamma = 0.99
    replay_start_size = 50000
    target_network_update_freq = 10000

    #Train
    obs = env.reset()
    num_episodes = 0
    for t in range(1, num_frames + 1):
        epsilon = epsilon_at_t(t)

        #-------------------------------------------------------------------
        #Take one step in the environment & add to Replay Memory (Line 7-11)
        #-------------------------------------------------------------------
        torch.set_grad_enabled(False)
        #Select action with epsilon-greedy exploration (Line 7,8)
        if random.random() > epsilon:
            ts_obs = torch.from_numpy(obs.astype(
                np.float32)).unsqueeze(0).to(device)
            ts_qvals = qnet(ts_obs)
            action = ts_qvals.max(-1)[1].item()
        else:
            action = random.randrange(action_space_shape)
        torch.set_grad_enabled(True)

        #Execute action and get reward + next_obs (Line 9, 10)
        next_obs, reward, done, _ = env.step(action)

        #Store transition in Replay Memory
        replay_memory.add(obs, next_obs, action, reward, done)

        obs = next_obs

        if done:
            obs = env.reset()
            num_episodes += 1

        #Populate Replay Memory with <replay_start_size> experiences before learning
        if t > replay_start_size:
            #---------------------------------------------
            #Sample batch & compute loss & update network (Lines 12 - 15)
            #---------------------------------------------
            obs_minibatch, next_obs_minibatch, actions_minibatch, rewards_minibatch, done_minibatch = replay_memory.sample(
            )

            ts_obs, ts_rewards, ts_next_obs, ts_done = map(
                lambda x: torch.FloatTensor(x).to(device), [
                    obs_minibatch, rewards_minibatch, next_obs_minibatch,
                    done_minibatch
                ])
            ts_actions = torch.LongTensor(actions_minibatch).to(device)

            torch.set_grad_enabled(False)
            # Compute Target Values (as per Double-DQN update rule)
            ts_next_qvals_outer = qnet(
                ts_next_obs)  #(32, 2) (outer Qnet, evaluates value)
            ts_next_qvals_inner = target_qnet(
                ts_next_obs)  #(32, 2) (inner Qnet, evaluates action)
            ts_next_action_inner = ts_next_qvals_inner.argmax(
                -1, keepdim=True)  #(32, 1)
            ts_next_action_qvals_outer = ts_next_qvals_outer.gather(
                -1, ts_next_action_inner).view(
                    -1)  #(32, ) (use inner actions to evaluate outer Q values)
            ts_target_q = ts_rewards + gamma * ts_next_action_qvals_outer * (
                1 - ts_done)
            torch.set_grad_enabled(True)

            #Compute predicted
            ts_pred_q = qnet(ts_obs).gather(-1, ts_actions).view(-1)  #(32,)

            #Calculate Loss & Perform gradient descent (Line 14)
            loss = F.smooth_l1_loss(ts_pred_q, ts_target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #Update target network ever <target_network_update_freq> steps (Line 15)
            if t % target_network_update_freq == 0:
                target_qnet.load_state_dict(qnet.state_dict())

        #Log to terminal (unwrap the nested wrappers to reach the gym Monitor's episode rewards)
        episode_rewards = env.env.env.env.env.env.env.env.get_episode_rewards()
        print('Timesteps', t, 'Episode', num_episodes, 'Mean Reward',
              np.mean(episode_rewards[-100:]))
    env.env.close()
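# epsilon_at_t is referenced above but not shown in this snippet; a plausible
# linear-annealing schedule it might implement (an assumption, not the original):
def epsilon_at_t(t, eps_start=1.0, eps_final=0.02, anneal_frames=100000):
    # decay epsilon linearly from eps_start to eps_final over anneal_frames steps
    fraction = min(1.0, t / anneal_frames)
    return eps_start + fraction * (eps_final - eps_start)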
Example #15
ActionLoggingWrapper,
)


def get_car_mask(frames, car_color=np.array([223, 183, 85])):
    mask = np.zeros(shape=frames[0].shape)
    for a_frame in frames:
        for i in range(a_frame.shape[0]):
            for j in range(a_frame.shape[1]):
                if np.array_equal(a_frame[i][j], car_color):
                    mask[i][j] += 1
    return mask
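

# Usage sketch (illustrative, not part of the original script): get_car_mask counts,
# per pixel position, how often the car colour appears across raw RGB frames; the
# preprocessed 84x84 grayscale observations built below no longer contain that colour.
raw_env = gym.make("BankHeistNoFrameskip-v4")
frame = raw_env.reset()
raw_frames = [frame]
for _ in range(20):
    frame, _, done, _ = raw_env.step(raw_env.action_space.sample())
    raw_frames.append(frame)
    if done:
        frame = raw_env.reset()
car_mask = get_car_mask(raw_frames)  # same shape as one frame; car-pixel positions accumulate counts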


env = gym.make("BankHeistNoFrameskip-v4")
env = gym.wrappers.Monitor(env, "./video/", force=True)
env._max_episode_steps = 4000 * 4
env = MaxAndSkipEnv(env, skip=4)
env = ProcessFrame84(env, crop=False)
env = FrameStack(env, 4)
env = ExtraTimeLimit(env, 4000)
env = AddRandomStateToInfo(env)

obs = env.reset()

for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())
    import pdb

    pdb.set_trace()