def _thunk():
    env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)

    obs_shape = env.observation_space.shape
    if add_timestep and len(
            obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)

    if log_dir is not None:
        env = Monitor(env, os.path.join(log_dir, str(rank)),
                      allow_early_resets=allow_early_resets)

    if is_atari:
        if len(env.observation_space.shape) == 3:
            env = wrap_deepmind(env)

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)

    return env
def main():
    env_name = 'BreakoutNoFrameskip-v4'
    env = atari_wrappers.wrap_deepmind(
        atari_wrappers.make_atari(env_name),
        episode_life=True, clip_rewards=True, frame_stack=True, scale=True)
    output_size = env.action_space.n
    input_shape = env.observation_space.shape

    with tf.Session() as sess:
        with tf.variable_scope('Breakout_lr'):
            input = tf.placeholder(tf.float32, [None, *input_shape])
            model = PPO(sess, input, models.nature_cnn(input),
                        actiontype.Discrete, output_size,
                        learning_rate=lambda f: 2.5e-4 * (1 - f),
                        epochs=4, minibatch_size=4, gamma=0.99, beta2=0.01,
                        name='Breakout_lr')
        train(sess, model, env_name, 1e7, 256, log_interval=5,
              num_envs=16, atari=True)
        # run_only(sess, model, env, render=True)
    env.close()
def setup(self):
    main_args = Singleton_arger()['main']
    Singleton_logger.setup(main_args['result_dir'],
                           multi_process=main_args['multi_process'])
    Singleton_evaluator.setup(main_args['env'],
                              logger=Singleton_logger,
                              num_episodes=10,
                              model_dir=main_args['result_dir'],
                              multi_process=main_args['multi_process'],
                              visualize=False,
                              rand_seed=main_args['rand_seed'])
    self.env = wrap_deepmind(make_atari(main_args['env']), frame_stack=True)
    if main_args['rand_seed'] >= 0:
        self.env.seed(main_args['rand_seed'])
    self.obs_shape = self.env.observation_space.shape
    self.nb_action = self.env.action_space.n
    self.agent = DQN()
    self.agent.setup(self.obs_shape, self.nb_action)
    self.result_dir = main_args['result_dir']
    self.reset()
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)

    obs_shape = env.observation_space.shape
    if add_timestep and len(
            obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)

    if log_dir is not None:
        env = bench.Monitor(env, os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)

    if is_atari:
        # env = wrap_deepmind(env)
        env = wrap_carl(env)

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)

    return env
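# Hedged usage sketch (not part of the snippets above): `_thunk` closures like
# the two entries above are normally created once per worker rank and handed to
# a vectorized environment. `SubprocVecEnv`, `make_atari`, and `wrap_deepmind`
# are real OpenAI baselines APIs; the `make_env_fn` helper and its arguments are
# assumptions made for illustration only.
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def make_env_fn(env_id, seed, rank):
    def _thunk():
        env = make_atari(env_id)      # NoFrameskip env + noop reset + frame skip
        env.seed(seed + rank)         # distinct seed per worker
        return wrap_deepmind(env, frame_stack=True)
    return _thunk


if __name__ == '__main__':
    # Eight Breakout workers stepped in lockstep behind one vectorized interface.
    vec_env = SubprocVecEnv([make_env_fn('BreakoutNoFrameskip-v4', 0, rank)
                             for rank in range(8)])
    obs = vec_env.reset()             # stacked observations, shape (8, 84, 84, 4)
    vec_env.close()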
def __init__(self, params):
    self.env = atari_wrappers.wrap_deepmind(
        atari_wrappers.make_atari('SeaquestNoFrameskip-v4'), frame_stack=True)
    self.replay_memory_size = params['replay_memory'] if 'replay_memory' in params else 10000
    self.replay_memory = deque([], maxlen=self.replay_memory_size)
    self.n_steps = params['n_steps'] if 'n_steps' in params else 100000
    self.training_start = params['training_start'] if 'training_start' in params else 1000
    self.training_interval = params['training_interval'] if 'training_interval' in params else 3
    self.save_steps = params['save_steps'] if 'save_steps' in params else 50
    self.copy_steps = params['copy_steps'] if 'copy_steps' in params else 25
    self.discount_rate = params['discount_rate'] if 'discount_rate' in params else 0.95
    self.skip_start = params['skip_start'] if 'skip_start' in params else 90
    self.batch_size = params['batch_size'] if 'batch_size' in params else 50
    self.iteration = params['iteration'] if 'iteration' in params else 0
    self.n_outputs = params['n_outputs'] if 'n_outputs' in params else self.env.action_space.n
    self.learning_rate = params['learning_rate'] if 'learning_rate' in params else 0.001
    self.global_step = tf.Variable(0, trainable=False, name='global_step')

    self.x = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name="input_placeholder")
    self.x_action = tf.placeholder(tf.int32, shape=[None], name="x_action")
    self.y = tf.placeholder(tf.float32, [None, 1])

    # setup models, replay memory, and optimizer
    self.actor_q_values, actor_vars = self.dqn_network("q_network/actor")
    critic_q_values, self.critic_vars = self.dqn_network("q_network/critic")
    self.q_value = tf.reduce_sum(critic_q_values * tf.one_hot(self.x_action, self.n_outputs),
                                 axis=1, keep_dims=True)

    copy_ops = [actor_var.assign(self.critic_vars[var_name])
                for var_name, actor_var in actor_vars.items()]
    self.copy_critic_to_actor = tf.group(*copy_ops)

    self.train_op = self.training_op()
def main(args):
    assert BATCH_SIZE <= TRAIN_START <= REPLAY_BUFFER_SIZE
    assert TARGET_UPDATE_EVERY % UPDATE_EVERY == 0
    assert 84 % SIDE_BOXES == 0
    assert STRATEGY in ['final', 'future']
    print(args)

    env = make_atari('{}NoFrameskip-v4'.format(args.env))
    set_seed(env, args.seed)
    env_train = wrap_deepmind(env, frame_stack=True, episode_life=True, clip_rewards=True)

    if args.weights:
        model = load_or_create_model(env_train, args.model)
        print_weights(model)
    elif args.debug:
        env, model, target_model, batch = load_for_debug()
        fit_batch(env, model, target_model, batch)
    elif args.play:
        env = wrap_deepmind(env)
        play(env)
    else:
        env_eval = wrap_deepmind(env, frame_stack=True)
        model = load_or_create_model(env_train, args.model)
        if args.view or args.images or args.eval:
            evaluate(env_eval, model, args.view, args.images)
        else:
            max_steps = 100 if args.test else MAX_STEPS
            train(env_train, env_eval, model, max_steps, args.name)
            if args.test:
                filename = save_model(model, EVAL_STEPS, logdir='.', name='test')
                load_or_create_model(env_train, filename)
def initialize_env():
    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env, clip_rewards=False,
                                       frame_stack=True, pytorch_img=True)
    agent = Agent(in_channels=4, action_size=18, seed=0)

    # initialize the networks from saved checkpoints
    agent.qnetwork_target.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pth'))
    agent.qnetwork_local.load_model(
        torch.load('./data/dqn_Riverraid_local_model_state_dict.pth'))

    # initialize the replay buffer with random transitions
    while len(agent.memory) < BUFFER_INI:
        observation = env.reset()
        done = False
        while not done:
            action = random.sample(range(env.action_space.n), 1)[0]
            next_observation, reward, done, info = env.step(action)
            agent.memory.add(observation, action, reward, next_observation, done)
            observation = next_observation
    print("Replay Buffer Initialized")
    return env, agent
def __init__(self, agent, env_id, num_envs, timesteps):
    self.agent = agent
    self.num_actions = len(ACTIONS)
    self.num_envs = num_envs
    self.envs = []
    self.timesteps = timesteps
    self.states = np.zeros(shape=[num_envs, timesteps + 1, *INPUT_SHAPE], dtype=np.uint8)
    self.actions = np.zeros(shape=[num_envs, timesteps], dtype=np.uint8)
    self.action_log_probs = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.rewards = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.returns = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.advantages = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.values = np.zeros(shape=[num_envs, timesteps + 1], dtype=np.float32)
    self.news = np.zeros(shape=[num_envs, timesteps + 1], dtype=np.uint8)
    self.last_states = np.zeros([num_envs, *INPUT_SHAPE], dtype=np.uint8)
    self.last_states_new = np.zeros(num_envs, dtype=np.uint8)

    for n in range(num_envs):
        if env_id == "Haxball":
            env = HaxballEnvironment()
        else:
            env = make_atari(env_id)
            env = wrap_deepmind(env, frame_stack=True, scale=False)
        self.envs.append(env)
        state = env.reset()
        self.last_states[n] = to_pytorch(state)
    self.last_states_new[:] = 1
def main():
    env_id = get_args().env
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=True)
    # rewards will appear higher than during training since rewards are not clipped
    env = Monitor(env)
    agent = get_agent(env)

    # check for save path
    save_path = os.path.join('models', env_id + '.save')
    agent.load(save_path)

    obs = env.reset()
    renders = []
    while True:
        obs = np.expand_dims(obs.__array__(), axis=0)
        a, v = agent.step(obs)
        obs, reward, done, info = env.step(a)
        env.render()
        if done:
            print(info)
            obs = env.reset()
def DEBUG_time():
    env = make_atari(GAME)
    env = wrap_deepmind(env, episode_life=EPISODE_LIFE, clip_rewards=CLIP_REWARDS,
                        frame_stack=FRAME_STACK, scale=SCALE)
    np.random.seed(1)
    env.seed(0)
    agent = Agent('cuda')
    env.reset()

    transaction_list = [
        Transition(state=env.observation_space.sample(), action=0,
                   state_=env.observation_space.sample(), reward=0)
        for i in range(32)
    ]
    batch = Transition(*zip(*transaction_list))

    time_1 = time.time()
    print("len: {}".format(len(batch.state)))
    for i in range(1000):
        agent._state2tensor(batch.state)
    print("time: {}".format(time.time() - time_1))
def worker(env_name, pipe, atari=False):
    if atari:
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                           frame_stack=True, scale=True)
    else:
        env = gym.make(env_name)
    s = env.reset()
    reward = 0
    done = False
    try:
        while True:
            pipe.send((s, reward, done))
            cmd, data = pipe.recv()
            if cmd == 'step':
                if isinstance(env.action_space, Box):
                    data = np.clip(data, env.action_space.low, env.action_space.high)
                s, reward, done, _ = env.step(data)
            else:
                break
            if done:
                s = env.reset()
    finally:
        pipe.close()
        env.close()
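# Hedged sketch of the parent side of the pipe protocol used by `worker` above
# (an assumption, not code from that repo): the worker always sends
# (state, reward, done) first, then waits for a (cmd, data) tuple, stepping on
# 'step' and shutting down on any other command. Assumes a discrete-action env
# so that action 0 is valid; `run_one_episode` is an illustrative name.
import multiprocessing as mp


def run_one_episode(env_name, atari=False):
    parent_conn, child_conn = mp.Pipe()
    proc = mp.Process(target=worker, args=(env_name, child_conn, atari))
    proc.start()
    total_reward = 0.0
    while True:
        state, reward, done = parent_conn.recv()   # worker sends first
        total_reward += reward
        if done:
            parent_conn.send(('close', None))      # any non-'step' cmd stops the worker
            break
        parent_conn.send(('step', 0))              # fixed action; a policy would go here
    proc.join()
    return total_reward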
def _thunk():
    env = make_atari(env_id, max_episode_steps=max_episode_steps)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=True)
    return wrap_deepmind(env, **wrapper_kwargs)
def create_deepmind_env(flags):
    return atari_wrappers.wrap_pytorch(
        atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))
def create_env(flags):
    return wrap_pytorch(
        wrap_deepmind(
            make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))
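# Hedged check for the two `create_env` variants above (assumes the
# torchbeast-style atari_wrappers that provide `wrap_pytorch`, which moves the
# channel axis first): after make_atari -> wrap_deepmind(frame_stack=True,
# scale=False) -> wrap_pytorch, observations are expected to be uint8 arrays of
# shape (4, 84, 84), ready for a PyTorch conv net. The Namespace stands in for
# the real `flags` object.
import argparse
import numpy as np

flags = argparse.Namespace(env='PongNoFrameskip-v4')
env = create_env(flags)
obs = np.asarray(env.reset())
print(obs.shape, obs.dtype)   # expected: (4, 84, 84) uint8
env.close()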
def _thunk():
    env = make_atari(env_id)
    env.seed(SEED + rank)
    gym.logger.setLevel(logging.WARN)
    env = wrap_deepmind(env)
    # wrap the env one more time for getting total reward
    env = Monitor(env, rank)
    return env
def _thunk():
    env = make_atari(get_args().env)
    env.seed(seed + rank)
    env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
    env = Monitor(env, rank)
    return env
def main():
    agent = 'A2C'
    num_envs = 1
    # num_envs = 2
    # num_envs = 4
    # num_envs = 8
    # num_envs = 16

    # env_name = 'PongDeterministic-v4'
    env_name = 'BreakoutDeterministic-v4'
    # env_name = 'SeaquestDeterministic-v4'
    print('Environment: {0}'.format(env_name))

    envs = [make_atari(env_name) for _ in range(num_envs)]
    # envs = [wrap_deepmind(make_atari(env_name)) for _ in range(num_envs)]
    # envs = [gym.make(env_name) for _ in range(num_envs)]
    for i, env in enumerate(envs):
        env.seed(SEED + i)

    state_dim = envs[0].observation_space.shape
    state_dim = state_dim[0] if len(state_dim) == 1 else state_dim
    # print(state_dim)
    print(str(envs[0].unwrapped.get_action_meanings()))

    params = {"arch": agent,
              "num_episodes": 500000,
              "max_steps": 100000,
              "learning_rate": 0.00025,
              "gamma": 0.99,
              "beta": 0.01,
              "lambda": 1.0,
              "state_dim": 4,
              "action_dim": envs[0].action_space.n,
              "print_every": 1,
              "env_render": not use_cuda,
              "use_cuda": use_cuda,
              "use_preproc": True,
              "resize_shape": (84, 84),
              "history": 4,
              "use_luminance": True,
              'update_freq': 5,
              # 'update_freq': 50,
              'action_repeat': 4,
              'num_envs': num_envs,
              'save_every': 100,
              'env_name': env_name,
              'parallel': True}
    print(sorted(params.items()))

    # eval_agent(envs[0], params)
    # eval_agent_parallel(envs, params)
    cache_eval_episode(envs[0], params)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        help='train an agent to find optimal policy',
                        action='store_true')
    parser.add_argument('--evaluate',
                        nargs=1,
                        help='evaluates trained policy, pass no of evaluation_episodes as argument',
                        type=int)
    parser.add_argument('--play', help='let trained agent play', action='store_true')
    parser.add_argument('--env', nargs=1, help='env used to train or evaluate', type=str)
    args = parser.parse_args()

    env_id = args.env[0]
    env = make_atari(env_id)
    n_actions = env.action_space.n
    agent = DQN(n_actions)
    sess = make_session()

    if args.train:
        train(agent, env, sess)
    if args.evaluate:
        test_env = gym.wrappers.Monitor(env, saveVideoDir + 'testing', force=True)
        evaluation_reward = evaluate(agent, test_env, sess, restore=True,
                                     eval_episodes=args.evaluate[0])
        open(modelDir + 'accuracy_{}.txt'.format(args.evaluate[0]), 'w').write(
            'Average reward after evaluation of {} episodes is {}'.format(
                args.evaluate[0], round(evaluation_reward, 1)))
        test_env.close()
    if args.play:
        play_env = gym.wrappers.Monitor(env, saveVideoDir + 'play', force=True)
        evaluate(agent, play_env, sess, restore=True, play=True)
        play_env.close()
    env.close()
def main():
    args = parser.parse_args()
    with tf.Session() as sess:
        # env = gym.make(args.env)
        # initializing atari environment
        env = make_atari(args.env)
        env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                            frame_stack=True, scale=True)
        rank = MPI.COMM_WORLD.Get_rank()
        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)
        env.seed(workerseed)

        if args.inference:
            inference(
                env,
                sess,
                args.env,
                path_to_model=args.path_to_model,
                embedding_space_size=288,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
            cbf(
                rank,
                env,
                sess,
                args.env,
                args.seed,
                args.debug,
                args.tensorboard,
                args.idf,
                replay_size=1000,
                batch_size=128,
                n_timesteps=args.num_timesteps,
                len_rollouts=256,
                n_optimizations=4,
                embedding_space_size=288,
                learning_rate=1e-5,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
def make_env(test):
    # Use different random seeds for train and test envs
    env_seed = test_seed if test else train_seed
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                       episode_life=not test,
                                       clip_rewards=not test)
    env.seed(int(env_seed))
    if args.monitor:
        env = gym.wrappers.Monitor(
            env, args.outdir, mode='evaluation' if test else 'training')
    if args.render:
        misc.env_modifiers.make_rendered(env)
    return env
def test(): """ test distillation and evaluation """ LEARNING_RATE = 0.0001 GAME = 'BreakoutNoFrameskip-v4' BATCH_SIZE = 32 EPSILON = 0.05 ADD_MEM_NUM = 3000 UPDATE_NUM = 200 EPOCH = 1 MEM_SIZE = 50000 MODEL_PATH = './model/teacher/breakout-1.h5f' LOSS_FUC = 'mse' EVAL_ITERATION = 3000 logger = LogWriter(ROOT_PATH, BATCH_SIZE) logger.save_setting(args) env = make_atari(GAME) env = wrap_deepmind(env, frame_stack=True, scale=True) teacher = Teacher(MODEL_PATH, env, EPSILON, MEM_SIZE, EVAL_ITERATION) student = SingleDtStudent(env, LEARNING_RATE, logger, BATCH_SIZE, EPSILON, teacher, ADD_MEM_NUM, UPDATE_NUM, EPOCH, LOSS_FUC, TARGET_NET_SIZE) student.distill() logger.save_weights(student, 'student_{}'.format(LOSS_FUC)) logger.log_total_time_cost() # log root = 'result_EVAL' if not os.path.exists(root): os.mkdir(root) print('*** Create folder: {} ***'.format(root)) now_time = time.strftime('%y%m%d_%H%M%S', time.localtime()) save_path = os.path.join(root, now_time).replace('\\', '/') if not os.path.exists(save_path): os.mkdir(save_path) print('*** Create folder: {} ***'.format(save_path)) # evaluate teacher teacher.evaluate(save_path) # evaluate student for log_path in glob.glob('./result_DT/*'): Evaluator_deprecate(env, log_path, save_path, eval_iteration=EVAL_ITERATION).evaluate()
def visualize(file_name):
    # Create folders.
    if not os.path.isdir(FIGURE_VISUALIZATION_DIR):
        os.makedirs(FIGURE_VISUALIZATION_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Only build main network for visualization.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")

    obs = env.reset()
    list_obs = []
    with tf.Session() as sess:
        # Load network parameters.
        saver = tf.train.Saver(var_list=main_network.variables)
        saver.restore(sess, SAVE_DIR + file_name)

        done = False
        while True:
            # Get the raw observation.
            raw_obs = env.render(mode="rgb_array")
            list_obs.append(raw_obs)
            env.render()

            # Get action.
            q = sess.run(main_network.q, feed_dict={
                main_network.Obs: np.expand_dims(np.array(obs) / 255.0, 0)})
            action = np.argmax(q[0])

            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            if done:
                # Get the last raw observation.
                raw_obs = env.render(mode="rgb_array")
                list_obs.append(raw_obs)
                break

            # Update the observation.
            obs = obs_next
    env.close()

    # Record the gameplay.
    imageio.mimsave(FIGURE_VISUALIZATION_DIR + "gameplay.gif",
                    [plot_obs(obs) for obs in list_obs], fps=30)
def main():
    args = parser.parse_args()
    env = make_atari(args.env)
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                        frame_stack=True, scale=True)
    set_global_seeds(args.seed)
    env.seed(args.seed)
    nA = env.action_space.n

    cur_time = datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    directory = 'results/' + cur_time + '_random'
    if not os.path.exists(directory):
        os.makedirs(directory)
    directory_m = 'model/' + cur_time + '_random'
    if not os.path.exists(directory_m):
        os.makedirs(directory_m)

    # For graphing
    best_reward = -float("inf")
    cur_reward = 0
    cur_ep_len = 0
    sum_rewards = 0
    num_episodes = 0
    graph_rewards = []
    graph_epi_lens = []
    graph_avg_rewards = []

    _ = env.reset()
    for t in range(args.num_timesteps):
        if t > 0 and t % int(1e3) == 0:
            print('# frame: %i. Best reward so far: %i.' % (t, best_reward))
            save_to_file(directory, args.env, graph_rewards, graph_epi_lens,
                         graph_avg_rewards)
        action = np.random.choice(nA)
        _, reward, done, _ = env.step(action)
        cur_reward += reward
        cur_ep_len += 1
        if done:
            graph_epi_lens.append((cur_ep_len, t))
            cur_ep_len = 0
            if cur_reward > best_reward:
                best_reward = cur_reward
                graph_rewards.append((best_reward, t))
            sum_rewards += cur_reward
            num_episodes += 1
            graph_avg_rewards.append((sum_rewards / num_episodes, t))
            cur_reward = 0
            _ = env.reset()

    save_to_file(directory, args.env, graph_rewards, graph_epi_lens,
                 graph_avg_rewards)
def _thunk():
    env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    if is_atari:
        env = wrap_deepmind(env)

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = WrapPyTorch(env)

    return env
def make_env(process_idx, test):
    # Use different random seeds for train and test envs
    process_seed = process_seeds[process_idx]
    env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                       episode_life=not test,
                                       clip_rewards=not test)
    env.seed(int(env_seed))
    if test:
        # Randomize actions like epsilon-greedy in evaluation as well
        env = chainerrl.wrappers.RandomizeAction(env, 0.05)
    if args.monitor:
        env = gym.wrappers.Monitor(
            env, args.outdir, mode='evaluation' if test else 'training')
    if args.render:
        misc.env_modifiers.make_rendered(env)
    return env
def DEBUG():
    # The wrapped env applies: auto-fire after reset, skip_frame=4, stack_frame=4,
    # max-frame pooling, scaling, reward clipping, and episode_life handling.
    env = make_atari(GAME)
    env = wrap_deepmind(env, episode_life=EPISODE_LIFE, clip_rewards=CLIP_REWARDS,
                        frame_stack=FRAME_STACK, scale=SCALE)
    env.reset()
    for i in range(100):
        img, reward, done, _ = env.step(0)
        # img shape (84, 84, 4); show the first stacked frame
        img = np.array(img).transpose((2, 0, 1))[0]
        cv2.imshow('1', img)
        cv2.waitKey(0)
        if done:
            break
def make_atari_env(env_id, seed, name, horizon=None, allow_early_resets=False):
    """Create a wrapped, monitored gym.Env for Atari"""
    assert_admissibility(env_id)
    from atari_wrappers import make_atari, wrap_deepmind
    env = make_atari(env_id)
    if horizon is not None:
        # Override the default episode horizon
        # by hacking the private attribute of the `TimeLimit`-wrapped env
        env._max_episode_steps = horizon
    # Wrap the `env` with `Monitor`
    env = Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), name),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    # Wrap (second wrapper) with DeepMind's wrapper
    env = wrap_deepmind(env)
    env.seed(seed)
    return env
def demo(num_episode=1):
    eps = 0.01
    env_raw = make_atari(args.env_name)
    env = wrap_deepmind(env_raw)
    c, h, w = m.fp(env.reset()).shape
    n_actions = env.action_space.n

    policy_net = m.DQN(h, w, n_actions, device).to(device)
    if device == "cuda":
        policy_net.load_state_dict(
            torch.load("models/" + args.env_name.replace("NoFrameskip-v4", "") + "_best.pth"))
    else:
        policy_net.load_state_dict(
            torch.load("models/" + args.env_name.replace("NoFrameskip-v4", "") + "_best.pth",
                       map_location=torch.device('cpu')))
    policy_net.eval()
    sa = m.ActionSelector(eps, eps, policy_net, 100, n_actions, device)

    q = deque(maxlen=5)
    e_rewards = []
    for eee in range(num_episode):
        print("Demo episode %d/%d" % (eee + 1, num_episode) + "...")
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            if num_episode <= 1:
                env.render()
                time.sleep(0.02)
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward
        e_rewards.append(e_reward)

    avg_reward = float(sum(e_rewards)) / float(num_episode)
    env.close()
    print("Average reward of " + args.env_name + " is %.1f" % (avg_reward))
    print("Average std of " + args.env_name + " is %.1f" % (np.std(e_rewards)))
def visualize(env_name, file_name, network_type):
    # Create folders.
    if not os.path.isdir(FIGURE_VISUALIZATION_DIR):
        os.makedirs(FIGURE_VISUALIZATION_DIR)

    # Obtain environment parameters.
    env = make_atari(env_name)
    obs_shape = env.observation_space.shape
    num_action = env.action_space.n

    # Build model graph.
    model_graph = ModelGraph(obs_shape, num_action, network_type=network_type)

    # Initialize session and load variables.
    sess = tf.InteractiveSession()
    model_graph.load(SAVE_DIR + file_name)

    obs = env.reset()
    list_obs = []
    while True:
        # Get the raw observation.
        raw_obs = env.render(mode="rgb_array")
        list_obs.append(raw_obs)
        env.render()

        # Get action.
        action = model_graph.act(np.expand_dims(np.array(obs), 0))

        # Interact with the environment.
        obs_next, reward, done, _ = env.step(action)
        if done:
            # Get the last raw observation.
            raw_obs = env.render(mode="rgb_array")
            list_obs.append(raw_obs)
            break

        # Update the observation.
        obs = obs_next
    env.close()

    # Record the gameplay.
    imageio.mimsave(FIGURE_VISUALIZATION_DIR + "gameplay.gif",
                    [plot_obs(obs) for obs in list_obs], fps=30)
def eval(weight_file):
    q_policy = Q_Network()
    q_policy.load_state_dict(torch.load(weight_file, map_location='cpu'))
    q_policy.eval()

    env = make_atari(ENV)
    env = wrap_deepmind(env, frame_stack=True)
    observation = env.reset()
    done = False
    while not done:
        # HWC frame stack -> NCHW tensor for the network
        tmp_obs = torch.Tensor(observation).unsqueeze(0).permute(0, 3, 1, 2)
        action = q_policy.sampling_action(tmp_obs, 0.1)
        print(action)
        observation_new, reward, done, info = env.step(action)
        observation = observation_new
        time.sleep(1)
        env.render()
    env.close()