def run(self):
    """Run the agent to see it work."""
    from gym.wrappers import Monitor
    env = Monitor(self.env, './video', force=True)
    state = env.reset()
    reward_sum = 0
    episode_number = 0
    while episode_number < 2:
        # Forward the policy network and sample an action from the returned probability.
        aprob, h = policy_forward(state)
        # Randomly take one of two actions: we are sampling from a Bernoulli distribution here.
        action = 0 if np.random.uniform() < aprob else 1
        # Step the environment and get new measurements.
        state, reward, done, info = env.step(action)
        reward_sum += reward
        env.render()
        if done:  # an episode finished
            episode_number += 1
            print("Episode finished with total reward", reward_sum)
            reward_sum = 0
            state = env.reset()  # reset env
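# `policy_forward` is defined elsewhere in the original agent. A minimal sketch,
# assuming the classic two-layer "pong from pixels" network with weights stored
# in a module-level dict `model` (both the dict and the layer shapes are assumptions):
def policy_forward(x):
    h = np.dot(model['W1'], x)
    h[h < 0] = 0  # ReLU nonlinearity
    logp = np.dot(model['W2'], h)
    p = 1.0 / (1.0 + np.exp(-logp))  # sigmoid: probability of the first action
    return p, h  # action probability and hidden state (kept for backprop)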
def run_video_agent(model, eps=500):
    # `eps` is a step budget for the single rendered episode, not an episode count.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    env = gym.make('BipedalWalker-v3')
    # Record every episode to ./vid.
    env = Monitor(env, './vid', video_callable=lambda episode_id: True, force=True)
    obs = env.reset()
    fitness = 0.0
    for _ in range(eps):
        env.render()
        obs_tensor = torch.from_numpy(obs).float().to(device)
        action = model(obs_tensor).detach().cpu().numpy()
        new_obs, reward, done, info = env.step(action)
        fitness += reward
        obs = new_obs
        if done:
            break
    env.close()
    print("Best score ", fitness)
def simulate(env, agent, deterministic=True, num_episodes=3, render=True,
             wait_after_render=1e-3, render_kwargs=None, record_video=False):
    render_kwargs = render_kwargs or dict()
    assert env.max_episode_steps > 0
    if record_video:
        env = Monitor(env, directory='./data')
    episode_info = []
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        done = False
        episode_return = 0
        t = 0
        while not done:
            if render:
                env.render(**render_kwargs)
                time.sleep(wait_after_render)
            with torch.no_grad():
                action = agent.act(obs, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1
        episode_info.append((t, episode_return))
    return episode_info
def evaluate(agent, env, n_episodes=5, render=False, record=False):
    total_rewards = []
    if record:
        env = Monitor(env, './videos/', force=True)
    for episode in range(n_episodes):
        obs = env.reset()
        obs = obs_reshape(obs)
        total_reward = 0.0
        episode_length = 0
        done = False
        while not done:
            action = agent.act(obs.reshape(1, *obs.shape))
            next_obs, reward, done, _ = env.step(action[0])
            next_obs = obs_reshape(next_obs)
            obs = next_obs
            total_reward += reward
            episode_length += 1
            if render:
                env.render()
        total_rewards.append(total_reward)
        # print(f">> episode = {episode + 1} / {n_episodes}, total_reward = {total_reward:10.4f}, episode_length = {episode_length}")
    if render:
        env.close()
    return np.mean(total_rewards)
def play(agent_dir, num_episodes, max_episode_steps, save_videos):
    agent = get_agent(gin.query_parameter("train.agent"))(make_env_fn(
        gin.query_parameter("train.env_id"),
        episode_time_limit=max_episode_steps))
    agent.pretrain_setup(gin.query_parameter("train.total_timesteps"))
    ckpt_path = tf.train.latest_checkpoint(
        os.path.join(agent_dir, "best-weights"))
    checkpoint = tf.train.Checkpoint(agent)
    checkpoint.restore(
        ckpt_path).assert_existing_objects_matched().expect_partial()
    env = agent.make_env()
    if save_videos:
        env = Monitor(
            env,
            os.path.join(agent_dir, "monitor"),
            video_callable=lambda _: True,
            force=True,
        )
    try:
        episodes = 0
        obs = env.reset()
        while episodes < num_episodes:
            action = agent.act(np.expand_dims(obs, 0), deterministic=True).numpy()
            obs, _, done, _ = env.step(action[0])
            env.render()
            if done:
                obs = env.reset()
                episodes += 1
    except KeyboardInterrupt:
        env.close()
def test(model, args, verbose=True):
    # Initialize environment and model
    env = Monitor(gym.make(args.env), './recordings', force=True)
    model.eval()

    # Initialize variables
    done, ep_reward = False, []
    s = env.reset()
    hx, cx = init_hidden(1, args.size_hidden)

    # Generate rollout
    while not done:  # and step < env.spec.timestep_limit:
        # Render if enabled
        if args.render:
            env.render()

        # Take a step in the environment
        logit, _, _, _ = model.forward(s, hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.multinomial(1).data
        s, r, done, _ = env.step(action.squeeze().numpy())
        ep_reward.append(r)

    # Close environment and show performance
    env.close()
    if verbose:
        print('Test agent achieved a reward of', np.sum(ep_reward))
def enjoy(policy, env, save_path=None, save_video=False, obs_fn=None, nepochs=100):
    """Roll out the policy and optionally record the result with the Monitor wrapper."""
    if save_video:
        assert save_path is not None, 'A path to save videos must be provided!'
    policy.cuda()
    policy.eval()
    if save_video:
        env = Monitor(env, directory=save_path)
    for e in range(nepochs):  # was hard-coded to range(0, 100), which ignored `nepochs`
        done = False
        obs = env.reset()
        episode_rwd = 0
        while not done:
            env.render()
            if obs_fn is not None:
                obs = obs_fn(obs)
            obs = Variable(torch.from_numpy(obs[np.newaxis])).float().cuda()
            value, action, logprob, mean = policy(obs)
            action = action.data[0].cpu().numpy()
            obs, reward, done, _ = env.step(action)
            episode_rwd += reward
        print('Episode reward is', episode_rwd)
def main():
    """You can test your game when you finish setting up your environment.

    Input range from 0 to 5:
        0: South (Down)
        1: North (Up)
        2: East (Right)
        3: West (Left)
        4: Pick up
        5: Drop off
    """
    GAME = "Assignment1-Taxi-v2"
    env = gym.make(GAME)
    n_state = env.observation_space.n
    n_action = env.action_space.n
    env = Monitor(env, "taxi_simple", force=True)
    s = env.reset()
    steps = 100
    for step in range(steps):
        env.render()
        action = int(input("Please type in the next action:"))
        s, r, done, info = env.step(action)
        print(s)
        print(r)
        print(done)
        print(info)
    # close environment and monitor
    env.close()
class Simulation():
    def __init__(self, environment="CartPole-v0", save_every=5):
        env = gym.make(environment)
        self.env = Monitor(
            env, './video',
            video_callable=lambda episode_no: episode_no % save_every == 0,
            force=True)
        if environment == "Pong-v0":
            # Wrap the monitored env; wrapping the raw `env` here (as the original
            # did) silently drops the Monitor and disables recording for Pong.
            self.env = wrap_deepmind(self.env, frame_stack=True, scale=True)
        self.environment = environment
        # self.env.seed(0)

    def reset(self):
        observation = self.env.reset()
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation

    def step(self, action):
        observation, reward, is_done, info = self.env.step(action)
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation, reward, is_done, info

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()
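# A note on Monitor's `video_callable` argument, as used above: under the classic
# gym API it receives the episode index and returns True to record that episode.
# Passing None falls back to gym's default capped cubic schedule (episodes
# 0, 1, 8, 27, ... up to 1000, then every 1000th), and passing False disables
# video recording entirely. A minimal sketch of a reusable "record every Nth
# episode" trigger (the helper name is ours, not from any of the snippets):
def every_nth_episode(n):
    return lambda episode_id: episode_id % n == 0

# Usage: env = Monitor(env, './video', video_callable=every_nth_episode(5), force=True)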
class Environment(object):
    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)
        if record:
            self.game = Monitor(self.game, './video', force=True)
        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple  # keep a reference to the gym_ple import so PLE envs stay registered

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()
        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.ndarray = cv2.resize(screen, (self.height, self.width))  # resize to 84 x 84
        preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.ndarray = preprocessed.transpose((2, 0, 1))  # reorder to (C, W, H)
        preprocessed: np.ndarray = preprocessed.astype('float32') / 255.
        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
def cart_pole_1():
    env = gym.make('CartPole-v0')
    # print('[cart_pole_1]', env.action_space)  # Discrete(2)
    # print('[cart_pole_1]', env.observation_space)  # Box(4,)
    #
    # The action is a non-negative integer, 0 or 1. Box denotes an n-dimensional box,
    # so the observation is a 4-dimensional array. We can inspect the box's bounds:
    # print('[cart_pole_1]', env.observation_space.high)  # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    # print('[cart_pole_1]', env.observation_space.low)  # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    env = Monitor(env=env, directory='./tmp/cartpole-experiment-0202',
                  video_callable=False, write_upon_reset=True)
    observation = env.reset()  # reset the environment's state and return the observation
    for t in range(100):
        env.render()  # redraw one frame of the environment
        print('[cart_pole_1] observation old:', observation)
        action = env.action_space.sample()
        # action = t % 2
        print('[cart_pole_1] action', action)
        # Advance one time step; returns observation, reward, done, info.
        observation, reward, done, info = env.step(action)
        print('[cart_pole_1] observation new:', observation,
              '[reward, done, info]:', reward, done, info)
        if done:
            print("[observation] Done after {} time steps".format(t + 1))
            break
    env.close()
def play(N=1000):
    # Change this to 'AssaultNoFrameskip-v4' to play the second game
    env = wrap_atari_deepmind('BreakoutNoFrameskip-v4', False)
    env = Monitor(env, directory + "/", force=True)  # `directory` is a module-level path
    agent.copy(DQN_online[4], sess_o)
    tot_reward = []
    episode = 1
    i = 0
    while i < N:  # N is a step budget shared across episodes
        s = env.reset()
        terminal = False
        episode_reward = 0
        while not terminal:
            env.render()
            a = agent.get_action(agent, env, np.array(s))
            s_next, r, terminal, info = env.step(a)
            episode_reward += r
            i = i + 1
            s = s_next
        tot_reward.append(episode_reward)
        print("Episode reward: ", episode_reward)
        episode = episode + 1
    env.close()
def TestDQNAgent(sess, env, q_value_estimator, state_preprocessor, num_episodes,
                 experiment_dir, record_steps=1):
    EpisodeStats = namedtuple('Stats', ['episode_lengths', 'episode_rewards'])
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    ckpt_dir = os.path.join(experiment_dir, 'checkpoints')
    record_path = os.path.join(experiment_dir, 'record/tests/')
    if not os.path.exists(record_path):
        os.makedirs(record_path)

    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    if latest_checkpoint:
        print('\nLoading model checkpoint {}...'.format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    total_t = sess.run(tf.contrib.framework.get_global_step())

    epsilon = 0.1
    policy = make_epsilon_greedy_policy(q_value_estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory=record_path,
                  video_callable=lambda count: count % record_steps == 0,
                  resume=True)

    for i_episode in range(num_episodes):
        state = env.reset()
        state = state_preprocessor.process(sess, state)
        state = np.stack([state] * 4, axis=2)

        for t in itertools.count():
            env.render()
            print("\rStep {} ({}) | Episode {}/{}".format(
                t, total_t, i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_preprocessor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break
            state = next_state
            total_t += 1

        episode_stats = EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])
        yield total_t, episode_stats

    return stats
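# `make_epsilon_greedy_policy` is defined elsewhere in the original project. A
# minimal sketch consistent with how it is called above (the estimator's
# `predict(sess, states)` method is an assumption): the returned function maps
# an observation to a probability distribution over the nA actions, giving the
# greedy action probability 1 - epsilon + epsilon / nA.
def make_epsilon_greedy_policy(estimator, nA):
    def policy_fn(sess, observation, epsilon):
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    return policy_fn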
def main():
    finishedTraining = EPISODES
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("ResultsNew.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) + " Episode Eval Avg \n")

    for episode in range(EPISODES):
        state = env.reset()
        if episode % 20 == 0:
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")
            if ave_reward > 800 and finishedTraining > episode + 300:
                finishedTraining = episode + 300
        elif episode >= finishedTraining:
            break

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
def main():
    """Orchestrates agent and environment interactions."""
    # Create environment
    environment = gym.make(ENVIRONMENT)
    if RECORD:
        environment = Monitor(env=environment, directory=VIDEO_DIRECTORY,
                              video_callable=lambda episode_id: True, force=True)

    # Set random seeds
    environment.seed(0)
    np.random.seed(0)

    # Get action and state space sizes
    action_space = environment.action_space.n
    state_space = environment.observation_space.shape[0]

    # Instantiate agent
    agent = Agent(action_space, state_space)

    # Load model weights
    if path.exists(CHECKPOINT_DIRECTORY):
        agent.load(CHECKPOINT_DIRECTORY)

    # Initialise list of all rewards
    rewards = []
    for episode in range(EPISODES):
        # Get initial state
        state = environment.reset()
        state = np.reshape(state, (1, state_space))

        # Reset score for this episode
        score = 0
        for _ in range(STEPS):
            if RENDER:
                environment.render()

            # Agent selects action from state
            action = agent.act(state)

            # Agent performs action and makes an observation of the environment
            next_state, reward, done, _ = agent.observe(environment, action)
            next_state = np.reshape(next_state, (1, state_space))
            observation = (state, action, reward, next_state, done)

            # Agent remembers parameters of this time step
            agent.remember(observation)
            state = next_state

            # Agent retrains model
            agent.learn()

            score += reward
            if done:
                print("Episode: {}/{}. Reward: {:.2f}".format(
                    episode + 1, EPISODES, score))
                break
        rewards.append(score)

        # Average reward over the last 100 episodes
        average_reward = np.mean(rewards[-100:])
        print("Average reward: {:.2f}\n".format(average_reward))

    # Terminate environment
    environment.close()

    # Save model
    agent.save(CHECKPOINT_DIRECTORY)

    # Display performance over time
    summary(rewards)
def main(path, env_name, seed, render, n_test_rollouts=2):
    set_global_seeds(seed)

    # initialize environment
    env = gym.make(env_name)
    max_action = env.action_space.high

    # Load policy.
    save_recording = False  # hard-coded switch; `weight_file` is a module-level name
    if save_recording:
        saving_vid = ('/media/flowers/3C3C66F13C66A59C/data_save/gym_recording/'
                      'ddpg_cheetah_drop/' + weight_file[:-5])
        env = Monitor(env, saving_vid, force=True)
        # env.directory = '/media/flowers/3C3C66F13C66A59C/data_save/gym_recording/ddp_cheetah_drop'

    with tf.Session() as sess:
        # init = tf.global_variables_initializer()
        # sess.run(init)
        policy_file = glob.glob(path + '*.meta')[0]
        saver = tf.train.import_meta_graph(policy_file)
        saver.restore(sess, tf.train.latest_checkpoint(path))
        graph = tf.get_default_graph()
        obs0 = graph.get_tensor_by_name("obs0:0")
        actor_tf = graph.get_tensor_by_name("actor/Tanh:0")

        score = np.zeros([n_test_rollouts])
        successes = []
        for i in range(n_test_rollouts):
            done = False
            obs = env.reset()
            actions = []
            rewards = []
            observations = []
            while not done:
                inpt = obs
                feed_dict = {obs0: [inpt]}
                action = sess.run(actor_tf, feed_dict=feed_dict)
                actions.append(action)
                if render:
                    env.render()
                new_obs, r, done, info = env.step(action.flatten() * max_action)
                observations.append(new_obs)
                rewards.append(r)
                obs = new_obs
            if 'is_success' in info.keys():
                successes.append(info['is_success'])
            score[i] = sum(rewards)

        success_rate = np.mean(successes)
        print('Success rate = %f' % success_rate)
        print(score.max())
        print(score.min())
def run(episodes=1):
    env = gym.make('obstacle-v0')
    env = Monitor(env, 'out', force=True)
    for _ in range(episodes):
        env.reset()
        # Capture in-between frames by hooking the env's rendering callback
        # into the Monitor's video recorder.
        env.unwrapped.automatic_rendering_callback = env.video_recorder.capture_frame
        done = False
        while not done:
            action = env.unwrapped.dynamics.desired_action
            observation, reward, done, info = env.step(action)
            env.render()
    env.close()
def test():
    env = gym.make(args.env)
    act = deepq.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render(mode='test')
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Define reward for agent
            reward_agent = -1 if done else 0.1
            # Pass the shaped reward (the original passed `reward`, leaving
            # `reward_agent` unused).
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: %d, Evaluation Average Reward: %f' % (episode, ave_reward))
            # if ave_reward >= 200:
            #     break

    # save results for uploading
    # env.monitor.start('gym_results/CartPole-v0-experiment-1', force=True)
    env = Monitor(env, 'gym_results/CartPole-v0-experiment-1', force=True)
    total_reward = 0
    for i in range(100):
        state = env.reset()
        for j in range(200):
            env.render()
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
    env.close()
def validation(env):
    env = make_env(env)
    env = Monitor(env, './video', force=True)
    sonic = SonicAgent(env, TIMESTEPS_PER_EPISODE * EPISODES, True)
    sonic.load_model('sonic_model_final.h5')
    obs = env.reset()
    while True:
        action = sonic.policy(obs)
        # action = random.choice([a for a in range(env.action_space.n)])
        next_obs, reward, done, info = env.step(action)
        print("For action #{} the reward is {}".format(action, reward))
        env.render()
        obs = next_obs
        if done:
            env.close()
            break
def test():
    env = envs.make(args.env, render=bool(args.render), record=bool(args.record))
    act = simple.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def test():
    from baselines0.deepq.utils import BatchInput
    import json

    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = make_atari(args.env)
    env = models.wrap_atari_dqn(env)
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[learning_prop['num_units']] * learning_prop['num_layers'],
        dueling=bool(args.dueling),
        init_mean=args.init_mean,
        init_sd=args.init_sd,
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': model,
        'scope': learning_prop['scope'],
        'eps': args.test_eps
    }
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.record:
        env = Monitor(env, directory=args.log_dir)

    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, info = env.step(act(obs[None])[0])
            # Reset only the environment but not the recorder
            if args.record and done:
                obs, done = env.env.reset(), False
            episode_rew += rew
            t += 1
            if info['ale.lives'] == 0:
                print("Episode reward %.2f after %d steps" % (episode_rew, t))
                episode_rew = 0
                t = 0
def record_play(model, env):
    env = Monitor(env, './video', force=True)
    total_reward = 0
    state = env.reset()
    while True:
        action = model.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        env.render()
        time.sleep(0.03)
        total_reward += reward
        state = next_state
        if done:
            return total_reward
def test():
    env = make_atari(args.env)
    env = deepq.wrap_atari_dqn(env)
    act = deepq.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        t = 0
        while not done:
            if not args.record:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
            t += 1
        print("Episode reward %.2f after %d steps" % (episode_rew, t))
def visualize(env, net_params):
    print('Testing....\n')
    display = Display(visible=0, size=(1400, 900))
    env = Monitor(env, './video', force=True)
    nn = create_nn(net_params)
    display.start()
    state = env.reset()
    for _ in range(CONFIG['ep_max_steps']):
        env.render()
        action = get_action(nn, state)
        state, _, done, _ = env.step(action)
        if done:
            break
    else:
        # Episode hit the step limit without finishing: mark the recording
        # as complete manually so the Monitor finalizes the video.
        env.stats_recorder.save_complete()
        env.stats_recorder.done = True
    env.close()
    display.stop()
def recording(recording_env, recording_agent, weight_name):
    # watch a trained agent
    env = Monitor(recording_env, './video_%s' % (weight_name), force=True)
    window = []
    n_episode = 10
    for _ in range(n_episode):
        # add recording trigger
        state = env.reset()
        total_score = 0
        for j in range(MAX_STEP):
            state = state / 255.0  # normalize pixel observations
            action, _, _ = recording_agent.act(state, test=True)
            env.render()
            state, reward, done, _ = env.step(action)
            total_score += reward
            if done:
                break
        window.append(total_score)
        print('Total score for this episode {:.4f}'.format(total_score))
    print('Avg score {}'.format(np.mean(window)))
def main():
    env = gym.make(ENV_NAME)
    agent = Agent(num_actions=env.action_space.n)

    if TRAIN:  # Train mode
        for _ in range(NUM_EPISODES):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action(state)
                observation, reward, terminal, _ = env.step(action)
                # env.render()
                processed_observation = preprocess(observation, last_observation)
                state = agent.run(state, action, reward, terminal, processed_observation)
    else:  # Test mode
        # env.monitor.start(ENV_NAME + '-test')
        env = Monitor(env, './SpaceInvaders-1', force=True)
        for _ in range(NUM_EPISODES_AT_TEST):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action_at_test(state)
                observation, _, terminal, _ = env.step(action)
                env.render()
                processed_observation = preprocess(observation, last_observation)
                state = np.append(state[1:, :, :], processed_observation, axis=0)
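# `preprocess` is defined elsewhere in the original script. A sketch of the
# standard Atari frame preprocessing it appears to perform, consistent with the
# (4, 84, 84) state stacking above (the exact resize and dtype are assumptions;
# requires numpy and cv2): take the pixel-wise max of two consecutive frames to
# remove flicker, convert to grayscale, and resize to 84 x 84.
def preprocess(observation, last_observation):
    frame = np.maximum(observation, last_observation)  # de-flicker
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])  # grayscale
    resized = cv2.resize(gray, (84, 84)).astype(np.uint8)
    return resized.reshape(1, 84, 84)  # shaped for np.append along axis 0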
def test():
    from baselines0.deepq.utils import BatchInput

    env = make_atari(args.env)
    env = deepq.wrap_atari_dqn(env)
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[args.num_units] * args.num_layers,
        dueling=bool(args.dueling),
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': model,
        'scope': args.scope
    }
    act = deepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.record:
        env = Monitor(env, directory=args.log_dir)

    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while not done:
            if not args.record:
                env.render()
                # time.sleep(0.01)
            obs, rew, done, info = env.step(act(obs[None])[0])
            episode_rew += rew
            t += 1
            if info['ale.lives'] == 0:
                print("Episode reward %.2f after %d steps" % (episode_rew, t))
                episode_rew = 0
                t = 0
def run_random_agent(env_name='LunarLander-v2'):
    # The action filtering below assumes LunarLander-v2's four actions
    # (do nothing, fire left engine, fire main engine, fire right engine),
    # so that is the default here; the original default of 'CartPole-v0'
    # did not match the action comments below.
    env = Monitor(gym.make(env_name), './video')
    for i_episode in range(1):
        observation = env.reset()
        for t in range(100):
            env.render()
            print('at t', t)
            print('\t observation:', observation)
            print('\t action space:', env.action_space)
            # 4 actions: do nothing, left, center, right
            action = env.action_space.sample()
            if action == 1 or action == 3:  # don't fire side engines
                action = 0
            observation, reward, done, info = env.step(action)
            print('\t action', action)
            print('\t reward', reward)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()
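# The snippets above all use gym.wrappers.Monitor, which was deprecated in gym
# 0.20 and later removed. In current Gymnasium, video recording is handled by
# the RecordVideo wrapper instead. A minimal sketch of the modern equivalent
# (assumes gymnasium with moviepy installed; note the 5-tuple step API):
import gymnasium as gym
from gymnasium.wrappers import RecordVideo

env = gym.make("CartPole-v1", render_mode="rgb_array")
env = RecordVideo(env, video_folder="./video", episode_trigger=lambda ep: True)
obs, info = env.reset()
done = False
while not done:
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    done = terminated or truncated
env.close()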