Example #1
def main(args):
    assert BATCH_SIZE <= TRAIN_START <= REPLAY_BUFFER_SIZE
    assert TARGET_UPDATE_EVERY % UPDATE_EVERY == 0
    assert 84 % SIDE_BOXES == 0
    assert STRATEGY in ['final', 'future']
    print(args)
    env = make_atari('{}NoFrameskip-v4'.format(args.env))
    set_seed(env, args.seed)
    env_train = wrap_deepmind(env,
                              frame_stack=True,
                              episode_life=True,
                              clip_rewards=True)
    if args.weights:
        model = load_or_create_model(env_train, args.model)
        print_weights(model)
    elif args.debug:
        env, model, target_model, batch = load_for_debug()
        fit_batch(env, model, target_model, batch)
    elif args.play:
        env = wrap_deepmind(env)
        play(env)
    else:
        env_eval = wrap_deepmind(env, frame_stack=True)
        model = load_or_create_model(env_train, args.model)
        if args.view or args.images or args.eval:
            evaluate(env_eval, model, args.view, args.images)
        else:
            max_steps = 100 if args.test else MAX_STEPS
            train(env_train, env_eval, model, max_steps, args.name)
            if args.test:
                filename = save_model(model,
                                      EVAL_STEPS,
                                      logdir='.',
                                      name='test')
                load_or_create_model(env_train, filename)
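The example above relies on module-level constants and helpers, but the wrapper pattern itself is standard. A minimal, self-contained sketch of the same environment construction, assuming OpenAI baselines' atari_wrappers (the game name is an arbitrary choice and an Atari ROM must be installed):

import numpy as np
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

env = make_atari('BreakoutNoFrameskip-v4')           # NoopReset + MaxAndSkip on the raw env
env_train = wrap_deepmind(env,
                          frame_stack=True,          # stack the last 4 grayscale frames
                          episode_life=True,         # treat a lost life as episode end
                          clip_rewards=True)         # clip rewards to {-1, 0, +1}

obs = env_train.reset()
# With frame_stack=True the observation is a LazyFrames object; convert it before use.
print(np.array(obs).shape)                           # expected: (84, 84, 4)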
Example #2
    def __init__(self, params):
        self.env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari('SeaquestNoFrameskip-v4'), frame_stack=True)
        self.replay_memory_size = params.get('replay_memory', 10000)
        self.replay_memory = deque([], maxlen=self.replay_memory_size)
        self.n_steps = params.get('n_steps', 100000)
        self.training_start = params.get('training_start', 1000)
        self.training_interval = params.get('training_interval', 3)
        self.save_steps = params.get('save_steps', 50)
        self.copy_steps = params.get('copy_steps', 25)
        self.discount_rate = params.get('discount_rate', 0.95)
        self.skip_start = params.get('skip_start', 90)
        self.batch_size = params.get('batch_size', 50)
        self.iteration = params.get('iteration', 0)
        self.n_outputs = params.get('n_outputs', self.env.action_space.n)
        self.learning_rate = params.get('learning_rate', 0.001)
        self.global_step = tf.Variable(0, trainable=False, name='global_step')

        self.x = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name="input_placeholder")
        self.x_action = tf.placeholder(tf.int32, shape=[None], name="x_action")
        self.y = tf.placeholder(tf.float32, [None, 1])

        # set up the actor/critic Q-networks, the copy op, and the training op
        self.actor_q_values, actor_vars = self.dqn_network("q_network/actor")
        critic_q_values, self.critic_vars = self.dqn_network("q_network/critic")
        self.q_value = tf.reduce_sum(critic_q_values * tf.one_hot(self.x_action, self.n_outputs),
                                     axis=1, keep_dims=True)
        copy_ops = [actor_var.assign(self.critic_vars[var_name])
                    for var_name, actor_var in actor_vars.items()]
        self.copy_critic_to_actor = tf.group(*copy_ops)
        self.train_op = self.training_op()
Example #3
def initialize_env():

    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env,
                                       clip_rewards=False,
                                       frame_stack=True,
                                       pytorch_img=True)
    agent = Agent(in_channels=4, action_size=18, seed=0)

    #### initialize the networks from saved weights ####
    agent.qnetwork_target.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pth'))
    agent.qnetwork_local.load_model(
        torch.load('./data/dqn_Riverraid_local_model_state_dict.pth'))

    #### initialize the replay buffer ####
    while len(agent.memory) < BUFFER_INI:
        observation = env.reset()
        done = False
        while not done:
            action = random.sample(range(env.action_space.n), 1)[0]
            next_observation, reward, done, info = env.step(action)
            agent.memory.add(observation, action, reward, next_observation,
                             done)
            observation = next_observation
    print("Replay Buffer Initialized")
    return env, agent
Example #4
    def _thunk():
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        else:
            env = gym.make(env_id)
        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)
        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if add_timestep and len(
                obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        if log_dir is not None:
            env = bench.Monitor(env, os.path.join(log_dir, str(rank)),
                                allow_early_resets=allow_early_resets)

        if is_atari:
            if new_wrapper:
                env = wrap_carl_full(env)
            else:
                env = wrap_deepmind(env)
        # If the observation is an image with 1 or 3 channels, transpose it for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        return env
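Closures like `_thunk` above are normally returned by a `make_env`-style factory and handed to a vectorized environment so that each worker builds its own copy. A small sketch of that consumption pattern, assuming baselines' SubprocVecEnv; the factory below is a simplified stand-in, not the surrounding code of this example:

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(env_id, seed, rank):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _thunk

if __name__ == '__main__':
    # One thunk per worker; SubprocVecEnv calls each thunk inside its own subprocess.
    envs = SubprocVecEnv([make_env('PongNoFrameskip-v4', seed=0, rank=i) for i in range(4)])
    obs = envs.reset()          # batched observations, one row per worker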
Example #5
def evaluate(step,
             policy_net,
             device,
             env,
             n_actions,
             eps=0.05,
             num_episode=5):
    env = wrap_deepmind(env, clip_rewards=True)
    sa = m.ActionSelector(eps, eps, policy_net, EPS_DECAY, n_actions, device)
    e_rewards = []
    q = deque(maxlen=5)
    for i in range(num_episode):
        env.reset()
        e_reward = 0
        for _ in range(10):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            state = torch.cat(list(q))[1:].unsqueeze(0)
            # print(state.shape)
            action, eps = sa.select_action(state, False)  # not training during evaluation
            n_frame, reward, done, info = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

            e_reward += reward
        e_rewards.append(e_reward)

    with open("file.txt", 'a') as f:
        f.write("%f, %d, %d\n" %
                (float(sum(e_rewards)) / float(num_episode), step, num_episode))
Example #6
def worker(env_name, pipe, atari=False):
    if atari:
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                           frame_stack=True,
                                           scale=True)
    else:
        env = gym.make(env_name)
    s = env.reset()
    reward = 0
    done = False
    try:
        while True:
            pipe.send((s, reward, done))
            cmd, data = pipe.recv()
            if cmd == 'step':
                if isinstance(env.action_space, Box):
                    data = np.clip(data, env.action_space.low,
                                   env.action_space.high)
                s, reward, done, _ = env.step(data)
            else:
                break
            if done:
                s = env.reset()
    finally:
        pipe.close()
        env.close()
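The worker above implements a tiny pipe protocol: it first sends `(state, reward, done)`, then waits for a `(cmd, data)` tuple, stepping on `'step'` and shutting down on anything else. A parent-side sketch of driving it; the env name, the fixed action, and the `'close'` command value are arbitrary choices:

import multiprocessing as mp

if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=worker, args=('CartPole-v1', child_conn, False))
    p.start()

    s, reward, done = parent_conn.recv()        # worker sends the initial state first
    for _ in range(100):
        parent_conn.send(('step', 0))           # ('step', action): take one env step
        s, reward, done = parent_conn.recv()
    parent_conn.send(('close', None))           # any non-'step' command makes the worker exit
    p.join()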
Example #7
def evaluate(step, eva_net, env, num_episode=15):
    env = wrap_deepmind(env)
    e_rewards = []
    for i in range(num_episode):
        img = env.reset()
        sum_r = 0
        done = False
        state_buffer = []
        for _ in range(5):
            state_buffer.append(img)
        s = state_buffer[1:5]
        while not done:
            a = myDQN.choose_action(s, train=False)

            img, r, done, info = env.step(a)
            sum_r += r
            state_buffer.pop(0)
            state_buffer.append(img)
            s_ = state_buffer[1:5]
            s = s_

        e_rewards.append(sum_r)

    with open("file.txt", 'a') as f:
        f.write("%f, %d, %d\n" %
                (float(sum(e_rewards)) / float(num_episode), step, num_episode))
Example #8
    def setup(self):
        main_args = Singleton_arger()['main']
        Singleton_logger.setup(main_args['result_dir'],
                               multi_process=main_args['multi_process'])

        Singleton_evaluator.setup(main_args['env'],
                                  logger=Singleton_logger,
                                  num_episodes=10,
                                  model_dir=main_args['result_dir'],
                                  multi_process=main_args['multi_process'],
                                  visualize=False,
                                  rand_seed=main_args['rand_seed'])

        self.env = wrap_deepmind(make_atari(main_args['env']),
                                 frame_stack=True)

        if main_args['rand_seed'] >= 0:
            self.env.seed(main_args['rand_seed'])

        self.obs_shape = self.env.observation_space.shape
        self.nb_action = self.env.action_space.n
        self.agent = DQN()
        self.agent.setup(self.obs_shape, self.nb_action)
        self.result_dir = main_args['result_dir']
        self.reset()
Example #9
def DEBUG_time():
    env = make_atari(GAME)
    env = wrap_deepmind(env,
                        episode_life=EPISODE_LIFE,
                        clip_rewards=CLIP_REWARDS,
                        frame_stack=FRAME_STACK,
                        scale=SCALE)
    np.random.seed(1)
    env.seed(0)

    agent = Agent('cuda')
    env.reset()

    transition_list = [
        Transition(state=env.observation_space.sample(),
                   action=0,
                   state_=env.observation_space.sample(),
                   reward=0) for _ in range(32)
    ]

    batch = Transition(*zip(*transition_list))

    time_1 = time.time()
    print("len: {}".format(len(batch.state)))
    for i in range(1000):
        agent._state2tensor(batch.state)
    print("time: {}".format(time.time() - time_1))
    def __init__(self, agent, env_id, num_envs, timesteps):
        self.agent = agent
        self.num_actions = len(ACTIONS)
        self.num_envs = num_envs
        self.envs = []
        self.timesteps = timesteps

        self.states = np.zeros(shape=[num_envs, timesteps + 1, *INPUT_SHAPE],
                               dtype=np.uint8)
        self.actions = np.zeros(shape=[num_envs, timesteps], dtype=np.uint8)
        self.action_log_probs = np.zeros(shape=[num_envs, timesteps],
                                         dtype=np.float32)
        self.rewards = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
        self.returns = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
        self.advantages = np.zeros(shape=[num_envs, timesteps],
                                   dtype=np.float32)
        self.values = np.zeros(shape=[num_envs, timesteps + 1],
                               dtype=np.float32)
        self.news = np.zeros(shape=[num_envs, timesteps + 1], dtype=np.uint8)

        self.last_states = np.zeros([num_envs, *INPUT_SHAPE], dtype=np.uint8)

        self.last_states_new = np.zeros(num_envs, dtype=np.uint8)

        for n in range(num_envs):
            if env_id == "Haxball":
                env = HaxballEnvironment()
            else:
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=True, scale=False)

            self.envs.append(env)
            state = env.reset()
            self.last_states[n] = to_pytorch(state)
            self.last_states_new[:] = 1
Example #11
def main():
    env_id = get_args().env
    env = make_atari(env_id)
    env = wrap_deepmind(env,
                        frame_stack=True,
                        clip_rewards=False,
                        episode_life=True)
    env = Monitor(env)
    # rewards will appear higher than during training since rewards are not clipped

    agent = get_agent(env)

    # check for save path
    save_path = os.path.join('models', env_id + '.save')
    agent.load(save_path)

    obs = env.reset()
    renders = []
    while True:
        obs = np.expand_dims(obs.__array__(), axis=0)
        a, v = agent.step(obs)
        obs, reward, done, info = env.step(a)
        env.render()
        if done:
            print(info)
            obs = env.reset()
Example #12
def main():
    env_name = 'BreakoutNoFrameskip-v4'
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                       episode_life=True,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       scale=True)
    output_size = env.action_space.n
    input_shape = env.observation_space.shape

    with tf.Session() as sess:
        with tf.variable_scope('Breakout_lr'):
            input = tf.placeholder(tf.float32, [None, *input_shape])

            model = PPO(sess,
                        input,
                        models.nature_cnn(input),
                        actiontype.Discrete,
                        output_size,
                        learning_rate=lambda f: 2.5e-4 * (1 - f),
                        epochs=4,
                        minibatch_size=4,
                        gamma=0.99,
                        beta2=0.01,
                        name='Breakout_lr')
        train(sess,
              model,
              env_name,
              1e7,
              256,
              log_interval=5,
              num_envs=16,
              atari=True)
        #run_only(sess, model, env, render=True)
        env.close()
Example #13
    def _thunk():
        env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)

        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if add_timestep and len(
                obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        if log_dir is not None:
            env = Monitor(env,
                          os.path.join(log_dir, str(rank)),
                          allow_early_resets=allow_early_resets)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)

        # If the observation is an image with 1 or 3 channels, transpose it for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        return env
Example #14
def create_deepmind_env(flags):
    return atari_wrappers.wrap_pytorch(
        atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))
def create_env(flags):
    return wrap_pytorch(
        wrap_deepmind(
            make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))
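Both helpers return the same wrapping: DeepMind preprocessing plus a channel-first transpose via `wrap_pytorch`. A small usage sketch, assuming a torchbeast-style `atari_wrappers` module where `wrap_pytorch` moves the channel axis first; the `flags` object here is a hypothetical stand-in for the parsed command-line arguments:

from types import SimpleNamespace
import numpy as np

flags = SimpleNamespace(env='PongNoFrameskip-v4')
env = create_env(flags)

obs = env.reset()
# With frame_stack=True and a channel-first transpose, observations come out as (4, 84, 84).
print(np.array(obs).shape)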
Example #16
 def _thunk():
     env = make_atari(env_id, max_episode_steps=max_episode_steps)
     env.seed(seed + rank)
     env = Monitor(env,
                   logger.get_dir()
                   and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=True)
     return wrap_deepmind(env, **wrapper_kwargs)
Example #17
 def _thunk():
     env = make_atari(get_args().env)
     env.seed(seed + rank)
     env = wrap_deepmind(env,
                         frame_stack=True,
                         clip_rewards=False,
                         episode_life=False)
     env = Monitor(env, rank)
     return env
        def _thunk():
            env = make_atari(env_id)
            env.seed(SEED + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)

            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env
def get_env(name, seed):
    env = gym.make(name)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = './tmp/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)

    return env
def get_env(env_id, seed):
    """Get gym environment, per id and seed."""
    env = gym.make(env_id)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = './tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, os.path.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)

    return env
Example #22
def main():
    args = parser.parse_args()
    with tf.Session() as sess:
        # env = gym.make(args.env)
        # initializing atari environment
        env = make_atari(args.env)
        env = wrap_deepmind(env,
                            episode_life=False,
                            clip_rewards=False,
                            frame_stack=True,
                            scale=True)

        rank = MPI.COMM_WORLD.Get_rank()
        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)
        env.seed(workerseed)

        if args.inference:
            inference(
                env,
                sess,
                args.env,
                path_to_model=args.path_to_model,
                embedding_space_size=288,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])

            cbf(
                rank,
                env,
                sess,
                args.env,
                args.seed,
                args.debug,
                args.tensorboard,
                args.idf,
                replay_size=1000,
                batch_size=128,
                n_timesteps=args.num_timesteps,
                len_rollouts=256,
                n_optimizations=4,
                embedding_space_size=288,
                learning_rate=1e-5,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
Example #23
def get_env(game_name):
    """
    Wraps the environment in a couple of decorators formulated by deep mind, and
    implemented by OpenAi that perform preprocessing.

    :param game_name: The game that will be played.
    :return: The wrapped environment.
    """

    env = gym.make(game_name)
    if game_name == PONG:
        env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                            frame_stack=True, scale=False)
    return env
Example #24
 def make_env(test):
     # Use different random seeds for train and test envs
     env_seed = test_seed if test else train_seed
     env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                        episode_life=not test,
                                        clip_rewards=not test)
     env.seed(int(env_seed))
     if args.monitor:
         env = gym.wrappers.Monitor(
             env, args.outdir, mode='evaluation' if test else 'training')
     if args.render:
         misc.env_modifiers.make_rendered(env)
     return env
Example #25
def evaluate(agent,
             env,
             sess,
             restore=False,
             eval_episodes=eval_episodes,
             play=False):

    if restore:
        saver = tf.compat.v1.train.Saver()
        latestSnapshot = tf.train.latest_checkpoint(modelDir)
        if not latestSnapshot:
            raise Exception('No saved model found in: ' + modelDir)

        saver.restore(sess, latestSnapshot)
        print("Restored saved model from latest snapshot")

    eval_env = wrap_deepmind(env,
                             episode_life=False,
                             clip_rewards=False,
                             frame_stack=True,
                             no_op_reset=True)

    obs = eval_env.reset()
    eval_episode_rewards = [0.0]

    while (True):

        action = sess.run(agent.choose_action,
                          feed_dict={
                              agent.obs_input_ph: np.array(obs)[None, :],
                              agent.epsilon_ph: evaluation_ep
                          })
        next_obs, reward, done, info = eval_env.step(action)
        eval_episode_rewards[-1] += reward
        obs = next_obs

        eval_mean_reward = np.mean(eval_episode_rewards)

        if done:
            obs = eval_env.reset()
            no_of_episodes = len(eval_episode_rewards)
            if (restore):
                print("Mean reward after {} episodes is {}".format(
                    no_of_episodes, round(eval_mean_reward, 2)))
            if (play):
                break
            if (no_of_episodes >= eval_episodes):
                break
            eval_episode_rewards.append(0.0)

    return round(eval_mean_reward, 2)
Example #26
def test():
    """ test distillation and evaluation """
    LEARNING_RATE = 0.0001
    GAME = 'BreakoutNoFrameskip-v4'
    BATCH_SIZE = 32
    EPSILON = 0.05
    ADD_MEM_NUM = 3000
    UPDATE_NUM = 200
    EPOCH = 1
    MEM_SIZE = 50000
    MODEL_PATH = './model/teacher/breakout-1.h5f'
    LOSS_FUC = 'mse'
    EVAL_ITERATION = 3000

    logger = LogWriter(ROOT_PATH, BATCH_SIZE)
    logger.save_setting(args)

    env = make_atari(GAME)
    env = wrap_deepmind(env, frame_stack=True, scale=True)

    teacher = Teacher(MODEL_PATH, env, EPSILON, MEM_SIZE, EVAL_ITERATION)

    student = SingleDtStudent(env, LEARNING_RATE, logger, BATCH_SIZE, EPSILON,
                              teacher, ADD_MEM_NUM, UPDATE_NUM, EPOCH,
                              LOSS_FUC, TARGET_NET_SIZE)

    student.distill()

    logger.save_weights(student, 'student_{}'.format(LOSS_FUC))
    logger.log_total_time_cost()

    # log
    root = 'result_EVAL'
    if not os.path.exists(root):
        os.mkdir(root)
        print('*** Create folder: {} ***'.format(root))
    now_time = time.strftime('%y%m%d_%H%M%S', time.localtime())
    save_path = os.path.join(root, now_time).replace('\\', '/')
    if not os.path.exists(save_path):
        os.mkdir(save_path)
        print('*** Create folder: {} ***'.format(save_path))

    # evaluate teacher
    teacher.evaluate(save_path)

    # evaluate student
    for log_path in glob.glob('./result_DT/*'):
        Evaluator_deprecate(env,
                            log_path,
                            save_path,
                            eval_iteration=EVAL_ITERATION).evaluate()
def main():
    args = parser.parse_args()

    env = make_atari(args.env)
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True)
    set_global_seeds(args.seed)
    env.seed(args.seed)

    nA = env.action_space.n

    cur_time = datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    directory = 'results/' + cur_time + '_random'
    if not os.path.exists(directory):
        os.makedirs(directory)
    directory_m = 'model/' + cur_time + '_random'
    if not os.path.exists(directory_m):
        os.makedirs(directory_m)

    # For graphing
    best_reward = -float("inf")
    cur_reward = 0
    cur_ep_len = 0
    sum_rewards = 0
    num_episodes = 0
    graph_rewards = []
    graph_epi_lens = []
    graph_avg_rewards = []

    _ = env.reset()
    for t in range(args.num_timesteps):
        if t > 0 and t % int(1e3) == 0:
            print('# frame: %i. Best reward so far: %s.' % (t, best_reward))
            save_to_file(directory, args.env, graph_rewards, graph_epi_lens, graph_avg_rewards)

        action = np.random.choice(nA)
        _, reward, done, _ = env.step(action)
        cur_reward += reward
        cur_ep_len += 1
        if done:
            graph_epi_lens.append((cur_ep_len,t))
            cur_ep_len = 0
            if cur_reward > best_reward:
                best_reward = cur_reward
            graph_rewards.append((best_reward, t))
            sum_rewards += cur_reward
            num_episodes += 1
            graph_avg_rewards.append((sum_rewards / num_episodes, t))
            cur_reward = 0
            _ = env.reset()

    save_to_file(directory, args.env, graph_rewards, graph_epi_lens, graph_avg_rewards)
Example #28
 def __init__(self, env_name, params={}, **kwargs):
     self.load_params(params)
     envs = [gym.make(env_name) for _ in range(self.num_env)]
     envs[0] = gym.wrappers.Monitor(
         envs[0],
         kwargs.get('logdir', DEFAULT_LOGDIR),
         force=True,
         video_callable=lambda t: t % self.video_freq == 0)
     num_frames = self.input_shape[-1]
     envs = [
         wrap_deepmind(env, num_frames, self.end_of_life_penalty)
         for env in envs
     ]
     super().__init__(envs, **kwargs)
Example #29
    def _thunk():
        env = gym.make(env_id)
        is_atari = hasattr(gym.envs, 'atari') and isinstance(env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)
        env.seed(seed + rank)
        if is_atari:
            env = wrap_deepmind(env)

        # If the observation is an image with 1 or 3 channels, transpose it for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = WrapPyTorch(env)
        return env
Example #30
def evaluate(agent,
             env,
             eval_episodes=eval_episodes,
             restore=False,
             play=False):

    if restore:
        ckpt = tf.train.Checkpoint(model=agent.online_model())
        latestSnapshot = tf.train.latest_checkpoint(modelDir)
        if not latestSnapshot:
            raise Exception('No saved model found in: ' + modelDir)

        ckpt.restore(latestSnapshot)
        print("Restored saved model from latest snapshot")

    eval_env = wrap_deepmind(env,
                             episode_life=False,
                             clip_rewards=False,
                             frame_stack=True,
                             evaluate=True)
    obs = eval_env.reset()

    eval_episode_rewards = [0.0]

    while (True):

        action = agent.choose_action(obs=np.array(obs)[None, :],
                                     epsilon=evaluation_ep)
        next_obs, reward, done, info = eval_env.step(action)
        eval_episode_rewards[-1] += reward
        obs = next_obs

        eval_mean_reward = np.mean(eval_episode_rewards)

        if done:
            obs = eval_env.reset()
            no_of_episodes = len(eval_episode_rewards)
            if (restore):
                print("Mean reward after {} episodes is {}".format(
                    no_of_episodes, round(eval_mean_reward, 2)))
            if (play):
                break
            if (no_of_episodes >= eval_episodes):
                break
            eval_episode_rewards.append(0.0)

    return round(eval_mean_reward, 2)
Example #31
 def make_env(process_idx, test):
     # Use different random seeds for train and test envs
     process_seed = process_seeds[process_idx]
     env_seed = 2**31 - 1 - process_seed if test else process_seed
     env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                        episode_life=not test,
                                        clip_rewards=not test)
     env.seed(int(env_seed))
     if test:
         # Randomize actions like epsilon-greedy in evaluation as well
         env = chainerrl.wrappers.RandomizeAction(env, 0.05)
     if args.monitor:
         env = gym.wrappers.Monitor(
             env, args.outdir, mode='evaluation' if test else 'training')
     if args.render:
         misc.env_modifiers.make_rendered(env)
     return env
Example #32
def get_env(task):
    env_id = task.env_id
    env = gym.make(env_id)
    env = wrap_deepmind(env)
    return env