Code example #1
File: envs.py  Project: msgerasyov/phi_gcn
    def _thunk():
        env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)

        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if add_timestep and len(
                obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        if log_dir is not None:
            env = Monitor(env,
                          os.path.join(log_dir, str(rank)),
                          allow_early_resets=allow_early_resets)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        return env
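
The TransposeImage wrapper used above is defined elsewhere in the project. As a rough sketch of what such a wrapper typically does (an assumption based on the comment in the snippet, not the project's actual code), it moves the channel axis of (H, W, C) image observations to the front for PyTorch:

import gym
import numpy as np
from gym import spaces

class TransposeImage(gym.ObservationWrapper):
    """Hypothetical sketch: transpose (H, W, C) observations to (C, H, W)."""

    def __init__(self, env):
        super(TransposeImage, self).__init__(env)
        h, w, c = env.observation_space.shape
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(c, h, w),
            dtype=env.observation_space.dtype)

    def observation(self, observation):
        # Move the channel axis to the front for PyTorch's NCHW convention.
        return np.transpose(np.array(observation), (2, 0, 1))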
Code example #2
File: runAtari.py  Project: Osj1614/RLstudy
def main():
    env_name = 'BreakoutNoFrameskip-v4'
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                       episode_life=True,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       scale=True)
    output_size = env.action_space.n
    input_shape = env.observation_space.shape

    with tf.Session() as sess:
        with tf.variable_scope('Breakout_lr'):
            input = tf.placeholder(tf.float32, [None, *input_shape])

            model = PPO(sess,
                        input,
                        models.nature_cnn(input),
                        actiontype.Discrete,
                        output_size,
                        learning_rate=lambda f: 2.5e-4 * (1 - f),
                        epochs=4,
                        minibatch_size=4,
                        gamma=0.99,
                        beta2=0.01,
                        name='Breakout_lr')
        train(sess,
              model,
              env_name,
              1e7,
              256,
              log_interval=5,
              num_envs=16,
              atari=True)
        #run_only(sess, model, env, render=True)
        env.close()
Code example #3
    def setup(self):
        main_args = Singleton_arger()['main']
        Singleton_logger.setup(main_args['result_dir'],
                               multi_process=main_args['multi_process'])

        Singleton_evaluator.setup(main_args['env'],
                                  logger=Singleton_logger,
                                  num_episodes=10,
                                  model_dir=main_args['result_dir'],
                                  multi_process=main_args['multi_process'],
                                  visualize=False,
                                  rand_seed=main_args['rand_seed'])

        self.env = wrap_deepmind(make_atari(main_args['env']),
                                 frame_stack=True)

        if main_args['rand_seed'] >= 0:
            self.env.seed(main_args['rand_seed'])

        self.obs_shape = self.env.observation_space.shape
        self.nb_action = self.env.action_space.n
        self.agent = DQN()
        self.agent.setup(self.obs_shape, self.nb_action)
        self.result_dir = main_args['result_dir']
        self.reset()
Code example #4
    def _thunk():
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        else:
            env = gym.make(env_id)
        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)
        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if add_timestep and len(
                obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        if log_dir is not None:
            env = bench.Monitor(env, os.path.join(log_dir, str(rank)),
                                allow_early_resets=allow_early_resets)

        if is_atari:
            # env = wrap_deepmind(env)
            env = wrap_carl(env)

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        return env
Code example #5
    def __init__(self, params):
        self.env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari('SeaquestNoFrameskip-v4'), frame_stack=True)
        self.replay_memory_size = params['replay_memory'] if 'replay_memory' in params else 10000
        self.replay_memory = deque([], maxlen=self.replay_memory_size)
        self.n_steps = params['n_steps'] if 'n_steps' in params else 100000
        self.training_start = params['training_start'] if 'training_start' in params else 1000
        self.training_interval = params['training_interval'] if 'training_interval' in params else 3
        self.save_steps = params['save_steps'] if 'save_steps' in params else 50
        self.copy_steps = params['copy_steps'] if 'copy_steps' in params else 25
        self.discount_rate = params['discount_rate'] if 'discount_rate' in params else 0.95
        self.skip_start = params['skip_start'] if 'skip_start' in params else 90
        self.batch_size = params['batch_size'] if 'batch_size' in params else 50
        self.iteration = params['iteration'] if 'iteration' in params else 0
        self.n_outputs = params['n_outputs'] if 'n_outputs' in params else self.env.action_space.n
        self.learning_rate = params['learning_rate'] if 'learning_rate' in params else 0.001
        self.global_step = tf.Variable(0, trainable=False, name='global_step')

        self.x = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name="input_placeholder")
        self.x_action = tf.placeholder(tf.int32, shape=[None], name="x_action")
        self.y = tf.placeholder(tf.float32, [None, 1])

        # setup models, replay memory, and optimizer
        self.actor_q_values, actor_vars = self.dqn_network("q_network/actor")
        critic_q_values, self.critic_vars = self.dqn_network("q_network/critic")
        self.q_value = tf.reduce_sum(critic_q_values * tf.one_hot(self.x_action, self.n_outputs),
                                     axis=1, keep_dims=True)
        copy_ops = [actor_var.assign(self.critic_vars[var_name])
                    for var_name, actor_var in actor_vars.items()]
        self.copy_critic_to_actor = tf.group(*copy_ops)
        self.train_op = self.training_op()
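
The snippet relies on self.dqn_network and self.training_op, which are defined elsewhere in the project. A minimal sketch of a training op for this setup (assuming the usual mean-squared TD error and an Adam optimizer; not the author's actual code):

    def training_op(self):
        # Hypothetical sketch: squared error between the target placeholder
        # self.y and the critic's Q-value for the chosen action, minimized
        # with Adam at self.learning_rate.
        loss = tf.reduce_mean(tf.square(self.y - self.q_value))
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        return optimizer.minimize(loss, global_step=self.global_step)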
Code example #6
File: run.py  Project: AdamStelmaszczyk/dqn
def main(args):
    assert BATCH_SIZE <= TRAIN_START <= REPLAY_BUFFER_SIZE
    assert TARGET_UPDATE_EVERY % UPDATE_EVERY == 0
    assert 84 % SIDE_BOXES == 0
    assert STRATEGY in ['final', 'future']
    print(args)
    env = make_atari('{}NoFrameskip-v4'.format(args.env))
    set_seed(env, args.seed)
    env_train = wrap_deepmind(env,
                              frame_stack=True,
                              episode_life=True,
                              clip_rewards=True)
    if args.weights:
        model = load_or_create_model(env_train, args.model)
        print_weights(model)
    elif args.debug:
        env, model, target_model, batch = load_for_debug()
        fit_batch(env, model, target_model, batch)
    elif args.play:
        env = wrap_deepmind(env)
        play(env)
    else:
        env_eval = wrap_deepmind(env, frame_stack=True)
        model = load_or_create_model(env_train, args.model)
        if args.view or args.images or args.eval:
            evaluate(env_eval, model, args.view, args.images)
        else:
            max_steps = 100 if args.test else MAX_STEPS
            train(env_train, env_eval, model, max_steps, args.name)
            if args.test:
                filename = save_model(model,
                                      EVAL_STEPS,
                                      logdir='.',
                                      name='test')
                load_or_create_model(env_train, filename)
Code example #7
def initialize_env():

    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env,
                                       clip_rewards=False,
                                       frame_stack=True,
                                       pytorch_img=True)
    agent = Agent(in_channels=4, action_size=18, seed=0)

    #### Initialize the networks from saved weights ####
    agent.qnetwork_target.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pth'))
    agent.qnetwork_local.load_model(
        torch.load('./data/dqn_Riverraid_local_model_state_dict.pth'))

    #### Initialize the replay buffer ####
    while len(agent.memory) < BUFFER_INI:
        observation = env.reset()
        done = False
        while not done:
            action = random.sample(range(env.action_space.n), 1)[0]
            next_observation, reward, done, info = env.step(action)
            agent.memory.add(observation, action, reward, next_observation,
                             done)
            observation = next_observation
    print("Replay Buffer Initialized")
    return env, agent
Code example #8
    def __init__(self, agent, env_id, num_envs, timesteps):
        self.agent = agent
        self.num_actions = len(ACTIONS)
        self.num_envs = num_envs
        self.envs = []
        self.timesteps = timesteps

        self.states = np.zeros(shape=[num_envs, timesteps + 1, *INPUT_SHAPE],
                               dtype=np.uint8)
        self.actions = np.zeros(shape=[num_envs, timesteps], dtype=np.uint8)
        self.action_log_probs = np.zeros(shape=[num_envs, timesteps],
                                         dtype=np.float32)
        self.rewards = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
        self.returns = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
        self.advantages = np.zeros(shape=[num_envs, timesteps],
                                   dtype=np.float32)
        self.values = np.zeros(shape=[num_envs, timesteps + 1],
                               dtype=np.float32)
        self.news = np.zeros(shape=[num_envs, timesteps + 1], dtype=np.uint8)

        self.last_states = np.zeros([num_envs, *INPUT_SHAPE], dtype=np.uint8)

        self.last_states_new = np.zeros(num_envs, dtype=np.uint8)

        for n in range(num_envs):
            if env_id == "Haxball":
                env = HaxballEnvironment()
            else:
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=True, scale=False)

            self.envs.append(env)
            state = env.reset()
            self.last_states[n] = to_pytorch(state)
            self.last_states_new[:] = 1
Code example #9
File: play.py  Project: Sebster10/Pandas-Practice
def main():
    env_id = get_args().env
    env = make_atari(env_id)
    env = wrap_deepmind(env,
                        frame_stack=True,
                        clip_rewards=False,
                        episode_life=True)
    env = Monitor(env)
    # rewards will appear higher than during training since rewards are not clipped

    agent = get_agent(env)

    # check for save path
    save_path = os.path.join('models', env_id + '.save')
    agent.load(save_path)

    obs = env.reset()
    renders = []
    while True:
        obs = np.expand_dims(obs.__array__(), axis=0)
        a, v = agent.step(obs)
        obs, reward, done, info = env.step(a)
        env.render()
        if done:
            print(info)
            obs = env.reset()
Code example #10
File: model.py  Project: ling9601/RL
def DEBUG_time():
    env = make_atari(GAME)
    env = wrap_deepmind(env,
                        episode_life=EPISODE_LIFE,
                        clip_rewards=CLIP_REWARDS,
                        frame_stack=FRAME_STACK,
                        scale=SCALE)
    np.random.seed(1)
    env.seed(0)

    agent = Agent('cuda')
    env.reset()

    transaction_list = [
        Transition(state=env.observation_space.sample(),
                   action=0,
                   state_=env.observation_space.sample(),
                   reward=0) for i in range(32)
    ]

    batch = Transition(*zip(*transaction_list))

    time_1 = time.time()
    print("len: {}".format(len(batch.state)))
    for i in range(1000):
        agent._state2tensor(batch.state)
    print("time: {}".format(time.time() - time_1))
Code example #11
def worker(env_name, pipe, atari=False):
    if atari:
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                           frame_stack=True,
                                           scale=True)
    else:
        env = gym.make(env_name)
    s = env.reset()
    reward = 0
    done = False
    try:
        while True:
            pipe.send((s, reward, done))
            cmd, data = pipe.recv()
            if cmd == 'step':
                if isinstance(env.action_space, Box):
                    data = np.clip(data, env.action_space.low,
                                   env.action_space.high)
                s, reward, done, _ = env.step(data)
            else:
                break
            if done:
                s = env.reset()
    finally:
        pipe.close()
        env.close()
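
The worker above expects a (cmd, data) tuple after every observation it sends, and any command other than 'step' shuts it down. A hypothetical driver for this protocol (the environment name and step count here are illustrative, not from the original project):

import multiprocessing as mp

if __name__ == '__main__':
    parent_end, child_end = mp.Pipe()
    proc = mp.Process(target=worker,
                      args=('BreakoutNoFrameskip-v4', child_end, True))
    proc.start()

    obs, reward, done = parent_end.recv()      # initial observation
    for _ in range(100):
        parent_end.send(('step', 0))           # act with NOOP
        obs, reward, done = parent_end.recv()
    parent_end.send(('close', None))           # any non-'step' command exits the loop
    proc.join()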
Code example #12
 def _thunk():
     env = make_atari(env_id, max_episode_steps=max_episode_steps)
     env.seed(seed + rank)
     env = Monitor(env,
                   logger.get_dir()
                   and os.path.join(logger.get_dir(), str(rank)),
                   allow_early_resets=True)
     return wrap_deepmind(env, **wrapper_kwargs)
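
Thunks like this are usually collected into a list and handed to a vectorized environment. A sketch of that common pattern (assuming baselines' SubprocVecEnv and atari_wrappers are available; the Monitor wrapper is omitted here):

from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(env_id, seed, rank, **wrapper_kwargs):
    def _thunk():
        env = make_atari(env_id)
        env.seed(seed + rank)
        return wrap_deepmind(env, **wrapper_kwargs)
    return _thunk

if __name__ == '__main__':
    env_fns = [make_env('PongNoFrameskip-v4', 0, i, frame_stack=True)
               for i in range(8)]
    vec_env = SubprocVecEnv(env_fns)            # one subprocess per thunk
    obs = vec_env.reset()                       # batched obs: (8, 84, 84, 4)
    obs, rewards, dones, infos = vec_env.step([0] * 8)
    vec_env.close()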
Code example #13
def create_deepmind_env(flags):
    return atari_wrappers.wrap_pytorch(
        atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))
Code example #14
def create_env(flags):
    return wrap_pytorch(
        wrap_deepmind(
            make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))
Code example #15
        def _thunk():
            env = make_atari(env_id)
            env.seed(SEED + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)

            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env
Code example #17
 def _thunk():
     env = make_atari(get_args().env)
     env.seed(seed + rank)
     env = wrap_deepmind(env,
                         frame_stack=True,
                         clip_rewards=False,
                         episode_life=False)
     env = Monitor(env, rank)
     return env
Code example #18
def main():
    agent = 'A2C'

    num_envs = 1
    # num_envs = 2
    # num_envs = 4
    # num_envs = 8
    # num_envs = 16

    # env_name = 'PongDeterministic-v4'
    env_name = 'BreakoutDeterministic-v4'
    # env_name = 'SeaquestDeterministic-v4'
    print('Environment: {0}'.format(env_name))

    envs = [make_atari(env_name) for _ in range(num_envs)]
    # envs = [wrap_deepmind(make_atari(env_name)) for _ in range(num_envs)]
    # envs = [gym.make(env_name) for _ in range(num_envs)]
    for i, env in enumerate(envs):
        env.seed(SEED + i)

    state_dim = envs[0].observation_space.shape
    state_dim = state_dim[0] if len(state_dim) == 1 else state_dim
    # print state_dim

    print(envs[0].unwrapped.get_action_meanings())

    params = {"arch": agent,
                  "num_episodes": 500000,
                  "max_steps": 100000,
                  "learning_rate": 0.00025,
                  "gamma": 0.99,
                  "beta": 0.01,
                  "lambda": 1.0,
                  "state_dim": 4,
                  "action_dim": envs[0].action_space.n,
                  "print_every": 1,
                  "env_render": not use_cuda,
                  "use_cuda": use_cuda,
                  "use_preproc": True,
                  "resize_shape": (84, 84),
                  "history": 4,
                  "use_luminance": True,
                  'update_freq': 5,
                  # 'update_freq': 50,
                  'action_repeat': 4,
                  'num_envs': num_envs,
                  'save_every': 100,
                  'env_name': env_name,
                  'parallel': True
                  }

    print(sorted(params.items()))

    # eval_agent(envs[0], params)
    # eval_agent_parallel(envs, params)
    cache_eval_episode(envs[0], params)
Code example #19
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        help='train an agent to find optimal policy',
                        action='store_true')
    parser.add_argument(
        '--evaluate',
        nargs=1,
        help=
        'evaluates trained policy, pass no of evaluation_episodes as argument',
        type=int)
    parser.add_argument('--play',
                        help='let trained agent play',
                        action='store_true')
    parser.add_argument('--env',
                        nargs=1,
                        help='env used to train or evaluate',
                        type=str)

    args = parser.parse_args()

    env_id = args.env[0]
    env = make_atari(env_id)

    n_actions = env.action_space.n
    agent = DQN(n_actions)

    sess = make_session()

    if (args.train):
        train(agent, env, sess)

    if (args.evaluate):

        test_env = gym.wrappers.Monitor(env,
                                        saveVideoDir + 'testing',
                                        force=True)
        evaluation_reward = evaluate(agent,
                                     test_env,
                                     sess,
                                     restore=True,
                                     eval_episodes=args.evaluate[0])
        open(modelDir + 'accuracy_{}.txt'.format(args.evaluate[0]), 'w').write(
            'Average reward after evaluation of {} episodes is {}'.format(
                args.evaluate[0], round(evaluation_reward, 1)))
        test_env.close()

    if (args.play):
        play_env = gym.wrappers.Monitor(env, saveVideoDir + 'play', force=True)
        evaluate(agent, play_env, sess, restore=True, play=True)
        play_env.close()

    env.close()
Code example #20
def main():
    args = parser.parse_args()
    with tf.Session() as sess:
        # env = gym.make(args.env)
        # initializing atari environment
        env = make_atari(args.env)
        env = wrap_deepmind(env,
                            episode_life=False,
                            clip_rewards=False,
                            frame_stack=True,
                            scale=True)

        rank = MPI.COMM_WORLD.Get_rank()
        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)
        env.seed(workerseed)

        if args.inference:
            inference(
                env,
                sess,
                args.env,
                path_to_model=args.path_to_model,
                embedding_space_size=288,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])

            cbf(
                rank,
                env,
                sess,
                args.env,
                args.seed,
                args.debug,
                args.tensorboard,
                args.idf,
                replay_size=1000,
                batch_size=128,
                n_timesteps=args.num_timesteps,
                len_rollouts=256,
                n_optimizations=4,
                embedding_space_size=288,
                learning_rate=1e-5,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
Code example #21
File: train_dqn_ale.py  Project: uidilr/chainerrl
 def make_env(test):
     # Use different random seeds for train and test envs
     env_seed = test_seed if test else train_seed
     env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                        episode_life=not test,
                                        clip_rewards=not test)
     env.seed(int(env_seed))
     if args.monitor:
         env = gym.wrappers.Monitor(
             env, args.outdir, mode='evaluation' if test else 'training')
     if args.render:
         misc.env_modifiers.make_rendered(env)
     return env
Code example #22
File: policy_dist.py  Project: ling9601/my_research
def test():
    """ test distillation and evaluation """
    LEARNING_RATE = 0.0001
    GAME = 'BreakoutNoFrameskip-v4'
    BATCH_SIZE = 32
    EPSILON = 0.05
    ADD_MEM_NUM = 3000
    UPDATE_NUM = 200
    EPOCH = 1
    MEM_SIZE = 50000
    MODEL_PATH = './model/teacher/breakout-1.h5f'
    LOSS_FUC = 'mse'
    EVAL_ITERATION = 3000

    logger = LogWriter(ROOT_PATH, BATCH_SIZE)
    logger.save_setting(args)

    env = make_atari(GAME)
    env = wrap_deepmind(env, frame_stack=True, scale=True)

    teacher = Teacher(MODEL_PATH, env, EPSILON, MEM_SIZE, EVAL_ITERATION)

    student = SingleDtStudent(env, LEARNING_RATE, logger, BATCH_SIZE, EPSILON,
                              teacher, ADD_MEM_NUM, UPDATE_NUM, EPOCH,
                              LOSS_FUC, TARGET_NET_SIZE)

    student.distill()

    logger.save_weights(student, 'student_{}'.format(LOSS_FUC))
    logger.log_total_time_cost()

    # log
    root = 'result_EVAL'
    if not os.path.exists(root):
        os.mkdir(root)
        print('*** Create folder: {} ***'.format(root))
    now_time = time.strftime('%y%m%d_%H%M%S', time.localtime())
    save_path = os.path.join(root, now_time).replace('\\', '/')
    if not os.path.exists(save_path):
        os.mkdir(save_path)
        print('*** Create folder: {} ***'.format(save_path))

    # evaluate teacher
    teacher.evaluate(save_path)

    # evaluate student
    for log_path in glob.glob('./result_DT/*'):
        Evaluator_deprecate(env,
                            log_path,
                            save_path,
                            eval_iteration=EVAL_ITERATION).evaluate()
Code example #23
File: visualization.py  Project: ZhenkaiShou/project
def visualize(file_name):
    # Create folders.
    if not os.path.isdir(FIGURE_VISUALIZATION_DIR):
        os.makedirs(FIGURE_VISUALIZATION_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Only build main network for visualization.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")

    obs = env.reset()
    list_obs = []

    with tf.Session() as sess:
        # Load network parameters.
        saver = tf.train.Saver(var_list=main_network.variables)
        saver.restore(sess, SAVE_DIR + file_name)

        done = False
        while True:
            # Get the raw observation.
            raw_obs = env.render(mode="rgb_array")
            list_obs.append(raw_obs)

            env.render()
            # Get action.
            q = sess.run(main_network.q,
                         feed_dict={
                             main_network.Obs:
                             np.expand_dims(np.array(obs) / 255.0, 0)
                         })
            action = np.argmax(q[0])
            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            if done:
                # Get the last raw observation.
                raw_obs = env.render(mode="rgb_array")
                list_obs.append(raw_obs)
                break
            # Update the observation.
            obs = obs_next

    env.close()

    # Record the gameplay.
    imageio.mimsave(FIGURE_VISUALIZATION_DIR + "gameplay.gif",
                    [plot_obs(obs) for obs in list_obs],
                    fps=30)
Code example #24
def main():
    args = parser.parse_args()

    env = make_atari(args.env)
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True)
    set_global_seeds(args.seed)
    env.seed(args.seed)

    nA = env.action_space.n

    cur_time = datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    directory = 'results/' + cur_time + '_random'
    if not os.path.exists(directory):
        os.makedirs(directory)
    directory_m = 'model/' + cur_time + '_random'
    if not os.path.exists(directory_m):
        os.makedirs(directory_m)

    # For graphing
    best_reward = -float("inf")
    cur_reward = 0
    cur_ep_len = 0
    sum_rewards = 0
    num_episodes = 0
    graph_rewards = []
    graph_epi_lens = []
    graph_avg_rewards = []

    _ = env.reset()
    for t in range(args.num_timesteps):
        if t > 0 and t % int(1e3) == 0:
            print('# frame: %i. Best reward so far: %i.' % (t, best_reward,))
            save_to_file(directory, args.env, graph_rewards, graph_epi_lens, graph_avg_rewards)

        action = np.random.choice(nA)
        _, reward, done, _ = env.step(action)
        cur_reward += reward
        cur_ep_len += 1
        if done:
            graph_epi_lens.append((cur_ep_len,t))
            cur_ep_len = 0
            if cur_reward > best_reward:
                best_reward = cur_reward
            graph_rewards.append((best_reward, t))
            sum_rewards += cur_reward
            num_episodes += 1
            graph_avg_rewards.append((sum_rewards / num_episodes, t))
            cur_reward = 0
            _ = env.reset()

    save_to_file(directory, args.env, graph_rewards, graph_epi_lens, graph_avg_rewards)
Code example #25
File: envs.py  Project: XL2013/Pytorch-DDPG-A2C-PPO
    def _thunk():
        env = gym.make(env_id)
        is_atari = hasattr(gym.envs, 'atari') and isinstance(env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)
        env.seed(seed + rank)
        if is_atari:
            env = wrap_deepmind(env)

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = WrapPyTorch(env)
        return env
Code example #26
File: train_nsq_ale.py  Project: stjordanis/chainerrl
 def make_env(process_idx, test):
     # Use different random seeds for train and test envs
     process_seed = process_seeds[process_idx]
     env_seed = 2**31 - 1 - process_seed if test else process_seed
     env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                        episode_life=not test,
                                        clip_rewards=not test)
     env.seed(int(env_seed))
     if test:
         # Randomize actions like epsilon-greedy in evaluation as well
         env = chainerrl.wrappers.RandomizeAction(env, 0.05)
     if args.monitor:
         env = gym.wrappers.Monitor(
             env, args.outdir, mode='evaluation' if test else 'training')
     if args.render:
         misc.env_modifiers.make_rendered(env)
     return env
Code example #27
File: model.py  Project: ling9601/RL
def DEBUG():
    # Wrappers applied: auto-fire after reset, frame skip = 4, frame stack = 4,
    # max-frame pooling, scaling, reward clipping, and episode-life termination.
    env = make_atari(GAME)
    env = wrap_deepmind(env,
                        episode_life=EPISODE_LIFE,
                        clip_rewards=CLIP_REWARDS,
                        frame_stack=FRAME_STACK,
                        scale=SCALE)

    env.reset()

    for i in range(100):
        img, reward, done, _ = env.step(0)  # img shape (84,84,4)
        img = np.array(img).transpose((2, 0, 1))[0]
        cv2.imshow('1', img)
        cv2.waitKey(0)
        if (done):
            break
Code example #28
def make_atari_env(env_id, seed, name, horizon=None, allow_early_resets=False):
    """Create a wrapped, monitored gym.Env for Atari"""
    assert_admissibility(env_id)
    from atari_wrappers import make_atari, wrap_deepmind
    env = make_atari(env_id)
    if horizon is not None:
        # Override the default episode horizon
        # by hacking the private attribute of the `TimeLimit` wrapped env
        env._max_episode_steps = horizon
    # Wrap the `env` with `Monitor`
    env = Monitor(env,
                  logger.get_dir() and osp.join(logger.get_dir(), name),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    # Wrap (second wrapper) with DeepMind's wrapper
    env = wrap_deepmind(env)
    env.seed(seed)
    return env
Code example #29
def demo(num_episode=1):
    eps = 0.01
    env_raw = make_atari(args.env_name)
    env = wrap_deepmind(env_raw)
    c, h, w = m.fp(env.reset()).shape
    n_actions = env.action_space.n
    policy_net = m.DQN(h, w, n_actions, device).to(device)
    if device == "cuda":
        policy_net.load_state_dict(
            torch.load("models/" +
                       args.env_name.replace("NoFrameskip-v4", "") +
                       "_best.pth"))
    else:
        policy_net.load_state_dict(torch.load("models/"+args.env_name.replace("NoFrameskip-v4","")+\
            "_best.pth", map_location=torch.device('cpu')))
    policy_net.eval()
    sa = m.ActionSelector(eps, eps, policy_net, 100, n_actions, device)
    q = deque(maxlen=5)
    e_rewards = []
    for eee in range(num_episode):
        print("Demo episode %d/%d" % (eee + 1, num_episode) + "...")
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            if num_episode <= 1:
                env.render()
                time.sleep(0.02)
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward

        e_rewards.append(e_reward)
    avg_reward = float(sum(e_rewards)) / float(num_episode)
    env.close()
    print("Average reward of " + args.env_name + " is %.1f" % (avg_reward))
    print("Average std of " + args.env_name + " is %.1f" % (np.std(e_rewards)))
Code example #30
File: visualization.py  Project: ZhenkaiShou/project
def visualize(env_name, file_name, network_type):
    # Create folders.
    if not os.path.isdir(FIGURE_VISUALIZATION_DIR):
        os.makedirs(FIGURE_VISUALIZATION_DIR)

    # Obtain environment parameters.
    env = make_atari(env_name)
    obs_shape = env.observation_space.shape
    num_action = env.action_space.n

    # Build model graph.
    model_graph = ModelGraph(obs_shape, num_action, network_type=network_type)

    # Initialize session and load variables.
    sess = tf.InteractiveSession()
    model_graph.load(SAVE_DIR + file_name)

    obs = env.reset()
    list_obs = []

    while True:
        # Get the raw observation.
        raw_obs = env.render(mode="rgb_array")
        list_obs.append(raw_obs)

        env.render()
        # Get action.
        action = model_graph.act(np.expand_dims(np.array(obs), 0))
        # Interact with the environment.
        obs_next, reward, done, _ = env.step(action)
        if done:
            # Get the last raw observation.
            raw_obs = env.render(mode="rgb_array")
            list_obs.append(raw_obs)
            break
        # Update the observation.
        obs = obs_next

    env.close()

    # Record the gameplay.
    imageio.mimsave(FIGURE_VISUALIZATION_DIR + "gameplay.gif",
                    [plot_obs(obs) for obs in list_obs],
                    fps=30)
Code example #31
def eval(weight_file):
    q_policy = Q_Network()
    q_policy.load_state_dict(torch.load(weight_file, map_location='cpu'))
    q_policy.eval()

    env = make_atari(ENV)
    env = wrap_deepmind(env, frame_stack=True)

    observation = env.reset()
    done = False
    while not done:
        tmp_obs = torch.Tensor(observation).unsqueeze(0).permute(0, 3, 1, 2)
        action = q_policy.sampling_action(tmp_obs, 0.1)
        print(action)
        observation_new, reward, done, info = env.step(action)
        time.sleep(1)
        env.render()

    env.close()