Example #1
File: adfq.py Project: coco66/ADFQ
    def __init__(self, env_name, discount, TH, memory_size=None):
        """A base class.

        Parameters
        ----------
        env_name : experimental domain name in models.py
        discount : the discount factor in MDP
        TH : finite-time horizon (maximum learning steps)
        memory_size : Experience Replay memory size
        """
        self.env = envs.make(env_name, type='classic_mdp')
        self.discount = discount
        self.states = []
        self.actions = []
        self.rewards = []
        self.np_random, _ = seeding.np_random(None)
        self.test_counts = []
        self.test_rewards = []
        self.Q_err = []
        self.Q_target = np.array(self.env.optQ(self.discount)).astype(
            np.float16)
        self.visits = np.zeros((self.env.snum, self.env.anum))
        self.memory_size = memory_size
        self.replayMem = {(i, j): []
                          for i in range(self.env.snum)
                          for j in range(self.env.anum)}
        if TH is not None:
            self.env.set_time(TH)
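A notable detail in this constructor is that the experience replay is a dictionary keyed by (state, action) pairs rather than a single flat buffer. The snippet below is a minimal, self-contained sketch of that layout; the sizes, the capacity handling, and the stored tuple format are illustrative assumptions, not code from the ADFQ project.

snum, anum = 5, 2          # placeholder state/action counts
memory_size = 50           # per-pair capacity; None would mean unbounded

replayMem = {(i, j): [] for i in range(snum) for j in range(anum)}

def store_transition(s, a, transition):
    """Append a transition and evict the oldest one once the pair's list is full."""
    mem = replayMem[(s, a)]
    mem.append(transition)
    if memory_size is not None and len(mem) > memory_size:
        mem.pop(0)

store_transition(0, 1, (0, 1, 0.5, 3, False))   # (s, a, reward, s_next, done)
print(len(replayMem[(0, 1)]))                    # -> 1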
Example #2
def test_model():
    env_name = 'DartHopperPT-v1'
    env = make_parallel(1, env_name, num=2)

    env2 = make(env_name, num=2, stochastic=False)
    batch_size = 30
    horizon = 100

    s = []
    for i in range(batch_size):
        env2.reset()
        s.append(get_state(env2))

    param = get_params(env2)
    params = np.array([param for i in range(batch_size)])
    env2.env.noisy_input = False

    s = np.array(s)
    a = [[env2.action_space.sample() for j in range(horizon)]
         for i in range(batch_size)]
    a = np.array(a)

    for i in range(3):
        obs, _, done, _ = env2.step(a[-1][i])
        if done:
            break

    for i in tqdm.trange(1):
        r, obs, mask = env(params, s, a)
    print(obs[-1][:3])
Example #3
    def __init__(self, make, env_name, num, stochastic_obs, done=True):
        self.env = make(env_name, num)
        self.env.reset()

        # TODO: turn off noise
        self.env.env.noisy_input = stochastic_obs
        self.done = done
Example #4
def test():
    env = envs.make(args.env,
                    'atari',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir)
    learning_prop = json.load(
        open(os.path.join(args.log_dir, '../learning_prop.json'), 'r'))
    act_params = {
        'scope':
        "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = deepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while (not done):
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, info = env.step(act(obs[None])[0])
            # Reset only the environment but not the recorder
            if args.record and done:
                obs, done = env.env.reset(), False
            episode_rew += rew
            t += 1
        if info['ale.lives'] == 0:
            print("Episode reward %.2f after %d steps" % (episode_rew, t))
            episode_rew = 0
            t = 0
Example #5
def test():
    import json
    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = envs.make(
        args.env,
        render=bool(args.render),
        record=bool(args.record),
        ros=bool(args.ros),
        map_name=learning_prop['map'],
        num_targets=learning_prop['nb_targets'],
        is_training=False,
    )
    act_params = {'scope': learning_prop['scope'], 'eps': args.test_eps}
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #6
def test(seed):
    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = envs.make(
        args.env,
        'ma_target_tracking',
        render=bool(args.render),
        record=bool(args.record),
        directory=args.log_dir,
        ros=bool(args.ros),
        map_name=args.map,
        num_agents=args.nb_agents,  #learning_prop['nb_agents'],
        num_targets=learning_prop['nb_targets'],
        is_training=False,
    )

    act_params = {
        'scope':
        "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = madeepq.load(os.path.join(args.log_dir, args.log_fname), act_params)

    from baselines0.evaluation import Test
    Eval = Test()
    Eval.test(args, env, act)
Example #7
def run():
    env = envs.make(args.env_name)

    flag_is_train = args.flag_is_train  # flag_is_train = 1: one agent trains while the other only acts; flag_is_train = 0: both only act (whose info is printed depends on the train_agent setting --- flag_train_blue)
    flag_focus_blue = args.flag_focus_blue  # flag_focus_blue = 1: train agent_blue; flag_focus_blue = 0: train agent_red

    if flag_focus_blue:
        train_agent_name = 'blue'
        red_agent = DQN(env.state_dim,
                        env.action_dim,
                        is_train=False,
                        scope='red')
        blue_agent = DQN(env.state_dim,
                         env.action_dim,
                         is_train=flag_is_train,
                         scope='blue')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train,
                           train_agent_name)
        run_AirCombat_selfPlay(env, blue_agent, red_agent, train_agent_name)
    else:
        train_agent_name = 'red'
        blue_agent = DQN(env.state_dim,
                         env.action_dim,
                         is_train=False,
                         scope='blue')
        red_agent = DQN(env.state_dim,
                        env.action_dim,
                        is_train=flag_is_train,
                        scope='red')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train,
                           train_agent_name)
        run_AirCombat_selfPlay(env, red_agent, blue_agent, train_agent_name)
Example #8
def run():
    env = envs.make(args.env_name)
    train_agent = DQN2013(env.state_dim,
                          env.action_dim,
                          is_train=True,
                          is_based=False,
                          scope="guidence")
    run_GuidenceEnv(env, train_agent)
Example #9
def test_env():
    env_name = 'DartHopperPT-v1'
    env = make(env_name, num=2)
    #env = gym.make('Walker2d-v2')
    #env.reset()

    for i in tqdm.trange(10000):
        env.step(env.action_space.sample())
Example #10
def main():
    env = envs.make("airCobate")

    flag_is_train = args.flag_is_train  # flag_is_train = 1: one agent trains while the other only acts; flag_is_train = 0: both only act (whose info is printed depends on the train_agent setting --- flag_train_blue)
    flag_train_blue = args.flag_train_blue  # flag_train_blue = 1: train agent_blue; flag_train_blue = 0: train agent_red

    # TODO: create multiple agents and pass them into the interactor's NANU
    raise NotImplementedError
Example #11
def train(seed, save_dir):
    logger.configure()
    set_global_seeds(seed)

    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)

    env = envs.make(args.env,
                    'atari',
                    record=bool(args.record),
                    directory=save_dir_0)

    nb_test_steps = args.nb_test_steps if args.nb_test_steps > 0 else None
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            model = deepq.models.cnn_to_mlp(
                convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                hiddens=[256],
                dueling=bool(args.dueling),
            )

            act = deepq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                train_freq=4,
                print_freq=1000,
                checkpoint_freq=int(args.nb_train_steps / 10),
                learning_starts=args.nb_warmup_steps,
                target_network_update_freq=args.target_update_freq,
                gamma=0.99,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                scope=args.scope,
                double_q=args.double_q,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   'atari',
                                   nb_test_steps=nb_test_steps,
                                   save_dir=save_dir_0,
                                   render=bool(args.render)),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=bool(args.render),
            )
            print("Saving model to model.pkl")
            act.save(os.path.join(save_dir_0, "model.pkl"))
    env.close()
    if args.record == 1:
        env.moviewriter.finish()
Example #12
def train():
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env,
                  datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        ValueError("The directory already exists...", directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = envs.make(args.env,
                    render=bool(args.render),
                    record=bool(args.record),
                    dirname=directory)

    with tf.device(args.device):
        model = deepq.models.mlp([args.num_units] * args.num_layers)
        act, records = deepq.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.learning_rate_decay_factor,
            lr_growth_factor=args.learning_rate_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            batch_size=args.batch_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            target_network_update_freq=args.target_update_freq,
            print_freq=10,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            gamma=args.gamma,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            callback=None,  #callback,
            epoch_steps=args.nb_epoch_steps,
            gpu_memory=args.gpu_memory,
            save_dir=directory,
            double_q=args.double_q,
            nb_test_steps=args.nb_test_steps,
            test_eps=args.test_eps,
            render=bool(args.render),
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
    plot(records, directory)
    memo = input("Memo for this experiment?: ")
    f = open(os.path.join(directory, "memo.txt"), 'w')
    f.write(memo)
    f.close()
    if args.record == 1:
        env.moviewriter.finish()
Example #13
def evaluation_maTTenv(act,
                       env_id,
                       eval_type='random',
                       nb_itrs=5,
                       render=False,
                       **kwargs):
    """
    Evaluation for the ttenv environments in a given set of different sampling
    zones. The set of different sampling zones is defined in MATTENV_EVAL_SET.
    """
    if eval_type == 'random':
        params_set = [{}]
    elif eval_type == 'random_zone':
        params_set = MATTENV_EVAL_SET
    elif eval_type == 'fixed':
        params_set = [{'init_pose_list': kwargs['init_pose_list']}]
    elif eval_type == 'fixed_nb':
        if env_id == 'maTracking-v4':
            params_set = MA_EVAL
        else:
            raise ValueError("eval_type 'fixed_nb' is only supported for maTracking-v4.")
    else:
        raise ValueError("Wrong evaluation type for ttenv.")

    env = envs.make(env_id,
                    'ma_target_tracking',
                    render=render,
                    is_training=False,
                    **kwargs)
    total_rewards, total_nlogdetcov = [], []
    action_dict = {}
    for params in params_set:
        total_rewards_k, total_nlogdetcov_k = [], []
        for _ in range(nb_itrs):
            obs = env.reset(**params)
            done = {}
            episode_reward, episode_nlogdetcov, t = 0, 0, 0
            while (type(done) is dict):
                if render:
                    env.render()
                for agent_id, a_obs in obs.items():
                    action_dict[agent_id] = act(np.array(a_obs)[None])[0]
                obs, rew, done, info = env.step(action_dict)
                episode_reward += rew['__all__']
                episode_nlogdetcov += info['mean_nlogdetcov']
                t += 1
            total_rewards_k.append(episode_reward)
            total_nlogdetcov_k.append(episode_nlogdetcov)
        total_rewards.append(total_rewards_k)
        total_nlogdetcov.append(total_nlogdetcov_k)
    if render:
        env.close()
    if len(total_rewards) == 1:
        total_rewards = total_rewards[0]
        total_nlogdetcov = total_nlogdetcov[0]
    return np.array(total_rewards,
                    dtype=np.float32), np.array(total_nlogdetcov,
                                                dtype=np.float32)
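For reference, a call to this evaluator might look like the sketch below. The policy stub, the environment id, and the keyword arguments (forwarded to envs.make) are assumptions chosen to mirror the other examples on this page, not a verified configuration.

def my_act(obs_batch):          # hypothetical stand-in for a trained act function
    return [0]                  # always picks action 0

rewards, nlogdetcov = evaluation_maTTenv(my_act, 'maTracking-v4',
                                         eval_type='random', nb_itrs=3,
                                         num_agents=2, num_targets=1)
print(rewards.shape, nlogdetcov.shape)   # (3,) each when a single params set is used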
Example #14
def test(env_id,
         isAtari,
         act_greedy,
         nb_itrs=3,
         nb_test_steps=10000,
         render=False):
    total_rewards = []
    for _ in range(nb_itrs):
        if isAtari:
            from baselines0.common.atari_wrappers import make_atari
            env_new = make_atari(env_id)
            env_new = deepq.wrap_atari_dqn(env_new)
        else:
            env_new = envs.make(env_id, render, figID=1)
        obs = env_new.reset()

        if nb_test_steps is None:
            done_test = False
            episode_reward = 0
            t = 0
            while not done_test:
                action = act_greedy(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                if render:
                    env_new.render(mode='test')
                episode_reward += rew
                t += 1
                if done:
                    obs = env_new.reset()
                    if (isAtari and (info['ale.lives'] == 0)) or (not isAtari):
                        done_test = done
            if render:
                env_new.close()
            total_rewards.append(episode_reward)
        else:
            t = 0
            episodes = []
            episode_reward = 0
            while (t < nb_test_steps):
                action = act_greedy(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                episode_reward += rew
                t += 1
                if done:
                    obs = env_new.reset()
                    if (isAtari and (info['ale.lives'] == 0)) or (not isAtari):
                        episodes.append(episode_reward)
                        episode_reward = 0
            if not (episodes):
                episodes.append(episode_reward)
            total_rewards.append(np.mean(episodes))

    return np.array(total_rewards, dtype=np.float32)
Example #15
def env_creator(env_config):
    env = envs.make(
        'TargetTracking-v0',
        # render = bool(args.render),
        # record = bool(args.record),
        # ros = bool(args.ros),
        # dirname=directory,
        map_name=env_config["map_name"],
        # num_targets=env_config["num_targets"],
        # im_size=args.im_size,
    )
    return env  # return an env instance
Example #16
def evaluation_ttenv(act,
                     env_id,
                     eval_type='random',
                     nb_itrs=5,
                     render=False,
                     **kwargs):
    """
    Evaluation for the ttenv environments in a given set of different sampling
    zones. The set of different sampling zones is defined in TTENV_EVAL_SET.
    """
    from ttenv.metadata import TTENV_EVAL_SET, TTENV_EVAL_MULTI_SET
    if eval_type == 'random':
        params_set = [{}]
    elif eval_type == 'random_zone':
        params_set = TTENV_EVAL_SET if kwargs.get('num_targets', 1) == 1 else TTENV_EVAL_MULTI_SET
    elif eval_type == 'fixed':
        params_set = [{'init_pose_list': kwargs['init_pose_list']}]
    else:
        raise ValueError("Wrong evaluation type for ttenv.")

    env = envs.make(env_id,
                    'target_tracking',
                    render=render,
                    is_training=False,
                    **kwargs)
    total_rewards, total_nlogdetcov = [], []
    for params in params_set:
        total_rewards_k, total_nlogdetcov_k = [], []
        for _ in range(nb_itrs):
            obs = env.reset(**params)
            done = False
            episode_reward, episode_nlogdetcov, t = 0, 0, 0
            while not done:
                if render:
                    env.render()
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                episode_nlogdetcov += info['mean_nlogdetcov']
                t += 1
            total_rewards_k.append(episode_reward)
            total_nlogdetcov_k.append(episode_nlogdetcov)
        total_rewards.append(total_rewards_k)
        total_nlogdetcov.append(total_nlogdetcov_k)
    if render:
        env.close()
    if len(total_rewards) == 1:
        total_rewards = total_rewards[0]
        total_nlogdetcov = total_nlogdetcov[0]
    return np.array(total_rewards,
                    dtype=np.float32), np.array(total_nlogdetcov,
                                                dtype=np.float32)
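As with the multi-agent variant, a minimal invocation might look like the sketch below; the policy stub and the forwarded keyword argument are illustrative assumptions.

def my_act(obs_batch):          # hypothetical greedy policy stub
    return [0]

rewards, nlogdetcov = evaluation_ttenv(my_act, 'TargetTracking-v0',
                                       eval_type='random', nb_itrs=5,
                                       num_targets=1)
print(rewards)       # one cumulative reward per iteration
print(nlogdetcov)    # cumulative -log det(cov) per iteration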
Example #17
def main():
    env = envs.make(args.env,
                    'target_tracking',
                    render=True,
                    directory=args.log_dir,
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    is_training=False,
                    )
    env_core = env
    while( not hasattr(env_core, '_elapsed_steps')):
        env_core = env_core.env
    env_core = env_core.env

    from logger import TTENV_TEST_SET_PUB
    for eval_num in range(len(TTENV_TEST_SET_PUB)):
        print("TTENV_TEST_SET_PUB: Eval Num %d ..."%eval_num)
        init_pose = []
        target_paths = []
        map_info = []
        while(len(init_pose) < args.nb_paths): # test episode
            _, done = env.reset(**TTENV_TEST_SET_PUB[eval_num]), False
            env_core.has_discovered = [1] * args.nb_targets
            proceed = False
            if args.manual_check:
                env.render()
                proceed = ("y" == input("%d, Init Pose Pass? (y/n) "%len(init_pose)))
            if proceed or not(args.manual_check):
                init_pose_k = {'agent':env_core.agent.state,
                                'targets':[env_core.targets[i].state for i in range(args.nb_targets)],
                                'belief_targets':[env_core.belief_targets[i].state for i in range(args.nb_targets)]}
                target_path_t = [[] for _ in range(args.nb_targets)]
                while not done:
                    _, _, done, _ = env.step(env.action_space.sample())
                    if args.render:
                        env.render()
                    for i in range(args.nb_targets):
                        target_path_t[i].append(env_core.targets[i].state)
                proceed = False
                if args.manual_check:
                    env.render()
                    proceed = ("y" == input("%d, Pass? (y/n) "%len(init_pose)))
                if proceed or not(args.manual_check):
                    init_pose.append(init_pose_k)
                    target_paths.append(target_path_t)
                    if args.map == 'dynamic_map':
                        map_info.append({'chosen_idx': env_core.MAP.chosen_idx, 'rot_angs': env_core.MAP.rot_angs })

        np.save(open(os.path.join(args.log_dir,'path_%d.npy'%eval_num), 'wb'), target_paths)
        pickle.dump(init_pose, open(os.path.join(args.log_dir,'init_eval_%d.pkl'%eval_num), 'wb'))
        if args.map == 'dynamic_map':
            pickle.dump(map_info, open(os.path.join(args.log_dir, 'map_info_%d.pkl'%eval_num), 'wb'))
Example #18
    def __init__(self, env_name, num, total=20, num_train=15, max_horizon=15):
        # pass
        self.env_name = env_name
        self.num = num
        self.total = total
        self.num_train = num_train
        self.max_horizon = max_horizon

        self.policy = get_up_network(env_name, num)

        path = f"{env_name}_{num}"

        self.eval_env = make(env_name, num)
        self.data = self._get_data(path)
Example #19
def test():
    env = envs.make(args.env, render = bool(args.render), record = bool(args.record))
    act = simple.load(os.path.join(args.log_dir, args.log_fname))        
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #20
def test(env_id, act, nb_itrs=5, nb_test_steps=10000, render=False, map_name=None, num_targets=1):
    total_rewards, total_eval = [], []
    for _ in range(nb_itrs):
        env_new = envs.make(env_id, render, figID=1, is_training=False, 
                                        map_name=map_name, num_targets=num_targets)
        obs = env_new.reset()
        if nb_test_steps is None:
            done_test = False
            episode_reward, episode_eval = 0, 0
            t = 0
            while not done_test:
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                if render:
                    env_new.render()
                episode_reward += rew
                episode_eval += info['test_reward']
                t += 1
                if done:
                    obs = env_new.reset()
                    done_test = done
            if render:
                env_new.close() 
            total_rewards.append(episode_reward)
            total_eval.append(episode_eval)
        else:
            t = 0
            rewards, evals = [], []
            episode_reward, episode_eval = 0, 0
            while(t < nb_test_steps):
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                episode_reward += rew
                episode_eval += info['test_reward']
                t += 1
                if done:
                    obs = env_new.reset()
                    rewards.append(episode_reward)
                    evals.append(episode_eval)
                    episode_reward, episode_eval = 0, 0
            if not rewards:
                rewards.append(episode_reward)
                evals.append(episode_eval)
            total_rewards.append(np.mean(rewards))
            total_eval.append(np.mean(evals))

    return np.array(total_rewards, dtype=np.float32), np.array(total_eval, dtype=np.float32)
Example #21
def test():
    env = make('DartHopperPT-v1', num=5)
    """
    env.reset()
    for i in tqdm.trange(10000):
        env.step(env.action_space.sample())
        """

    env.reset()
    state = get_state(env)
    for i in tqdm.trange(10000):
        env.reset()
        set_state(env, state)
        state = state + np.random.normal(size=state.shape)
        env.step(env.action_space.sample())
        state = get_state(env)
Example #22
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)

    env = envs.make(args.env, 'classic_control')
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            model = models.mlp([args.num_units] * args.num_layers,
                               init_mean=args.init_mean,
                               init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                print_freq=args.nb_epoch_steps,
                checkpoint_freq=int(args.nb_train_steps / 5),
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,  #callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   'classic_control',
                                   save_dir=save_dir_0,
                                   render=bool(args.render)),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=bool(args.render),
            )
    if args.record == 1:
        env.moviewriter.finish()
Example #23
def test():
    set_global_seeds(args.seed)
    import json
    if args.env == 'TargetTracking-v5':
        import simple_imtracking as simple
    else:
        import simple_tracking as simple

    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = envs.make(
        args.env,
        render=bool(args.render),
        record=bool(args.record),
        ros=bool(args.ros),
        map_name=args.map,
        num_targets=learning_prop['nb_targets'],
        dirname=args.log_dir,
        is_training=True,
        im_size=args.im_size,
    )
    act_params = {'scope': learning_prop['scope'], 'eps': args.test_eps}
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)

    if args.ros_log:
        from envs.target_tracking.ros_wrapper import RosLog
        log = RosLog(num_targets=args.nb_targets,
                     wrapped_num=args.ros + args.render + args.record + 1)
    t = 0
    while (t < args.nb_test_steps):  # test episode
        t += 1
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
            if args.ros_log:
                log.log(env)
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
    if args.record:
        env.moviewriter.finish()
    if args.ros_log:
        log.save(args.log_dir)
Example #24
def main():
    env = envs.make(
        args.env,
        'target_tracking',
        render=bool(args.render),
        directory=args.log_dir,
        map_name=args.map,
        num_targets=args.nb_targets,
        is_training=False,
    )
    timelimit_env = env
    while (not hasattr(timelimit_env, '_elapsed_steps')):
        timelimit_env = timelimit_env.env
    init_pose = []

    params = {}
    # This is an example. Please change this if necessary.
    # from logger import TTENV_EVAL_SET
    # params = TTENV_EVAL_SET[0]

    while (len(init_pose) < args.nb_init_pose):  # test episode
        _, done = env.reset(**params), False
        if args.render:
            env.render()
        notes = input("%d, Pass? y/n" % len(init_pose))
        if notes == "y":
            init_pose.append({
                'agent':
                timelimit_env.env.agent.state,
                'targets': [
                    timelimit_env.env.targets[i].state
                    for i in range(args.nb_targets)
                ],
                'belief_targets': [
                    timelimit_env.env.belief_targets[i].state
                    for i in range(args.nb_targets)
                ]
            })

    pickle.dump(
        init_pose,
        open(os.path.join(args.log_dir, 'init_pose_random_1015.pkl'), 'wb'))
Example #25
def main():
    env = envs.make(
        args.env,
        'ma_target_tracking',
        render=bool(args.render),
        directory=args.log_dir,
        map_name=args.map,
        num_agents=args.nb_agents,
        num_targets=args.nb_targets,
        is_training=False,
    )
    timelimit_env = env
    while (not hasattr(timelimit_env, '_elapsed_steps')):
        timelimit_env = timelimit_env.env
    init_pose = []
    while (len(init_pose) < args.nb_init_pose):  # test episode
        obs, done = env.reset(), False
        if args.render:
            env.render()
        notes = input("%d, Pass? y/n" % len(init_pose))
        if notes == "y":
            init_pose.append({
                'agents': [
                    timelimit_env.env.agents[i].state
                    for i in range(args.nb_agents)
                ],
                'targets': [
                    timelimit_env.env.targets[i].state
                    for i in range(args.nb_targets)
                ],
                'belief_targets': [
                    timelimit_env.env.belief_targets[i].state
                    for i in range(args.nb_targets)
                ]
            })

    pickle.dump(
        init_pose,
        open(os.path.join(args.log_dir, 'init_pose_random_1015.pkl'), 'wb'))
Example #26
def test():
    env = envs.make(args.env,
                    'classic_control',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir)
    learning_prop = json.load(
        open(os.path.join(args.log_dir, '../learning_prop.json'), 'r'))
    act_params = {
        'scope':
        "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = deepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #27
    def __init__(self, scene, discount, initQ, TH, memory_size):
        """Tabular RL
		Parameters
		----------
		scene : A name of a task you want to test. (See models.py)
		alpha : learning rate of Q-learning
		discount : discount factor in MDP
		initQ : initial Q value (initialize all Q values with the same number)
		TH : finite-time horizon (maximum learning steps)
		memory_size : Experience Replay memory size
		"""
        self.env = envs.make(scene)
        self.discount = discount
        self.states, self.actions, self.rewards = [], [], []
        self.visits = np.zeros((self.env.snum, self.env.anum), dtype=int)
        self.np_random, _ = seeding.np_random(None)
        self.test_counts = []
        self.test_rewards = []
        self.dim = (self.env.snum, self.env.anum)
        if initQ is None:
            self.init_params()
        else:
            self.Q = initQ * np.ones(self.dim, dtype=float)
        if hasattr(self.env, 'terminal_states'):
            for ts in self.env.terminal_states:
                self.Q[ts, :] = 0.0
        self.Q_err = []
        self.Q_target = np.array(self.env.optQ(self.discount)).astype(
            np.float16)
        self.memory_size = memory_size
        self.replayMem = {(i, j): []
                          for i in range(self.env.snum)
                          for j in range(self.env.anum)}

        if TH is not None:
            self.env.set_time(TH)
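The constructor above only allocates the tables (Q, visits, replay memory). For context, a single tabular Q-learning update over those fields would look roughly like the sketch below; the 1/N(s, a) learning-rate schedule and the table sizes are illustrative assumptions, not the ADFQ update rule itself.

import numpy as np

def q_update(Q, visits, s, a, r, s_next, done, discount):
    """One tabular Q-learning step with an assumed 1/N(s, a) learning rate."""
    visits[s, a] += 1
    alpha = 1.0 / visits[s, a]
    target = r if done else r + discount * np.max(Q[s_next])
    Q[s, a] += alpha * (target - Q[s, a])

Q = np.zeros((5, 2))        # placeholder snum=5, anum=2
visits = np.zeros((5, 2))
q_update(Q, visits, s=0, a=1, r=1.0, s_next=3, done=False, discount=0.95)
print(Q[0, 1])              # -> 1.0 after the first visit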
Example #28
def evaluation(act, env_id, env_type, nb_test_steps=None, nb_itrs=5,
                render=False, **kwargs):
    """Evaluate the current model with a semi-greedy action policy.
    Parameters
    -------
    act: ActWrapper
        Wrapper over act function. Action policy for the evaluation.
    env_id: str
        name of an environment. (e.g. CartPole-v0)
    env_type: str
        type of an environment. (e.g. 'atari', 'classic_control', 'target_tracking')
    nb_test_steps: int
        the number of steps for the evaluation at each iteration. If None, it
        evaluates until an episode ends.
    nb_itrs: int
        the number of test iterations.
    render: bool
        display if True.

    Returns
    -------
    total_rewards: np.array with shape=(nb_itrs,)
        cumulative rewards.
    total_nlogdetcov : np.array with shape=(nb_itrs,)
        cumulative negative mean of logdetcov only for a target tracking env.
    """
    total_rewards = []
    env = envs.make(env_id, env_type, render=render, is_training=False, **kwargs)
    for _ in range(nb_itrs):
        obs = env.reset()
        if nb_test_steps is None: # Evaluate until an episode ends.
            done = False
            episode_reward, t = 0, 0
            while not done:
                if render:
                    env.render()
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                t += 1
                if done and (env_type=='atari') and (info['ale.lives'] != 0):
                    done = False
            total_rewards.append(episode_reward)
        else:
            t, episode_reward = 0, 0
            episodes = []
            while(t < nb_test_steps):
                if render:
                    env.render()
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                t += 1
                if done:
                    obs = env.reset()
                    if ((env_type=='atari') and (info['ale.lives'] == 0)) or not(env_type=='atari'):
                        episodes.append(episode_reward)
                        episode_reward = 0
            if not(episodes):
                episodes.append(episode_reward)
            total_rewards.append(np.mean(episodes))

    if render:
        env.close()
    return np.array(total_rewards, dtype=np.float32), None
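A call matching the docstring above could look like the following sketch; my_act is a stand-in for a trained ActWrapper and the argument values are only illustrative.

def my_act(obs_batch):          # hypothetical greedy policy stub
    return [0]

rewards, nlogdetcov = evaluation(my_act, 'CartPole-v0', 'classic_control',
                                 nb_test_steps=None, nb_itrs=3, render=False)
print(rewards)       # np.array of shape (3,): one cumulative reward per iteration
print(nlogdetcov)    # None here; only the target-tracking evaluators fill this in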
Example #29
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d'%seed)
    os.makedirs(save_dir_0)
    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=save_dir_0,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    im_size=args.im_size,
                    )
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d'%seed):
            hiddens = args.hiddens.split(':')
            hiddens = [int(h) for h in hiddens]
            if args.env == 'TargetTracking-v5':
                model = models.cnn_plus_mlp(
                                convs=[(4, 8, 4), (8, 4, 2)],
                                hiddens= hiddens,
                                dueling=bool(args.dueling),
                                init_mean = args.init_mean,
                                init_sd = args.init_sd,
                                inpt_dim = (args.im_size, args.im_size),
                )
            else:
                model = models.mlp(hiddens, init_mean=args.init_mean, init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                checkpoint_freq=args.checkpoint_freq,
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,#callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                env_type='target_tracking',
                                save_dir=save_dir_0,
                                render=bool(args.render),
                                figID=1,
                                ros=bool(args.ros),
                                map_name=args.map,
                                num_targets=args.nb_targets,
                                im_size=args.im_size,
                                eval_type=args.eval_type,
                                init_file_path=args.init_file_path,
                                ),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=(bool(args.render) or bool(args.ros)),
            )
            print("Saving model to model.pkl")
            act.save(os.path.join(save_dir_0, "model.pkl"))
    if args.record == 1:
        env.moviewriter.finish()
Example #30
def test():
    learning_prop = json.load(open(os.path.join(args.log_dir, '../learning_prop.json'),'r'))
    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=learning_prop['nb_targets'],
                    im_size=learning_prop['im_size'],
                    is_training=False,
                    )
    timelimit_env = env
    while( not hasattr(timelimit_env, '_elapsed_steps')):
        timelimit_env = timelimit_env.env
    act_params = {'scope': "seed_%d"%learning_prop['seed']+"/"+learning_prop['scope'], 'eps': args.test_eps}
    act = deepadfq.load(os.path.join(args.log_dir, args.log_fname), act_params)

    if args.ros_log:
        from envs.target_tracking.ros_wrapper import RosLog
        ros_log = RosLog(num_targets=args.nb_targets, wrapped_num=args.ros + args.render + args.record + 1)

    ep = 0
    ep_nlogdetcov = ['Episode nLogDetCov']
    time_elapsed = ['Elapsed Time (sec)']
    given_init_pose, test_init_pose = [], []
    # Use a fixed set of initial positions if given.
    if args.init_file_path != '.':
        import pickle
        given_init_pose = pickle.load(open(args.init_file_path, "rb"))

    while(ep < args.nb_test_steps): # test episode
        ep += 1
        episode_rew, nlogdetcov = 0, 0
        obs, done = env.reset(init_pose_list=given_init_pose), False
        test_init_pose.append({'agent':timelimit_env.env.agent.state,
                            'targets':[timelimit_env.env.targets[i].state for i in range(args.nb_targets)],
                            'belief_targets':[timelimit_env.env.belief_targets[i].state for i in range(args.nb_targets)]})
        s_time = time.time()
        while not done:
            if args.render:
                env.render()
            if args.ros_log:
                ros_log.log(env)
            obs, rew, done, info = env.step(act(obs[None])[0])
            episode_rew += rew
            nlogdetcov += info['mean_nlogdetcov']

        time_elapsed.append(time.time() - s_time)
        ep_nlogdetcov.append(nlogdetcov)
        print("Ep.%d - Episode reward : %.2f, Episode nLogDetCov : %.2f"%(ep, episode_rew, nlogdetcov))

    if args.record :
        env.moviewriter.finish()
    if args.ros_log :
        ros_log.save(args.log_dir)

    import pickle, tabulate
    pickle.dump(test_init_pose, open(os.path.join(args.log_dir,'test_init_pose.pkl'), 'wb'))
    f_result = open(os.path.join(args.log_dir, 'test_result.txt'), 'w')
    f_result.write(tabulate.tabulate([ep_nlogdetcov, time_elapsed], tablefmt='presto'))
    f_result.close()