def test_atari_preprocessing_scale(env_fn):
    # arbitrarily chosen number for stepping into env. and ensuring all observations are in the required range
    max_test_steps = 10

    for grayscale in [True, False]:
        for scaled in [True, False]:
            env = AtariPreprocessing(env_fn(), screen_size=84, grayscale_obs=grayscale,
                                     scale_obs=scaled, frame_skip=1, noop_max=0)
            obs = env.reset().flatten()
            done, step_i = False, 0
            max_obs = 1 if scaled else 255
            assert (0 <= obs).all() and (obs <= max_obs).all(), \
                'Obs. must be in range [0,{}]'.format(max_obs)
            while not done or step_i <= max_test_steps:
                obs, _, done, _ = env.step(env.action_space.sample())
                obs = obs.flatten()
                assert (0 <= obs).all() and (obs <= max_obs).all(), \
                    'Obs. must be in range [0,{}]'.format(max_obs)
                step_i += 1

            env.close()
def test_atari_preprocessing_grayscale(env_fn):
    import cv2

    env1 = env_fn()
    env2 = AtariPreprocessing(env_fn(), screen_size=84, grayscale_obs=True,
                              frame_skip=1, noop_max=0)
    env3 = AtariPreprocessing(env_fn(), screen_size=84, grayscale_obs=False,
                              frame_skip=1, noop_max=0)
    env1.seed(0)
    env2.seed(0)
    env3.seed(0)
    obs1 = env1.reset()
    obs2 = env2.reset()
    obs3 = env3.reset()

    assert obs1.shape == (210, 160, 3)
    assert obs2.shape == (84, 84)
    assert obs3.shape == (84, 84, 3)
    assert np.allclose(obs3, cv2.resize(obs1, (84, 84), interpolation=cv2.INTER_AREA))
    obs3_gray = cv2.cvtColor(obs3, cv2.COLOR_RGB2GRAY)
    # the edges of the numbers do not render quite the same in the grayscale, so we ignore them
    assert np.allclose(obs2[10:38], obs3_gray[10:38])
    # the paddle also does not render quite the same
    assert np.allclose(obs2[44:], obs3_gray[44:])

    env1.close()
    env2.close()
    env3.close()
def test_atari_preprocessing_grayscale(env_fn):
    import cv2

    env1 = env_fn()
    env2 = AtariPreprocessing(env_fn(), screen_size=84, grayscale_obs=True,
                              frame_skip=1, noop_max=0)
    env3 = AtariPreprocessing(env_fn(), screen_size=84, grayscale_obs=False,
                              frame_skip=1, noop_max=0)
    env1.reset()
    # take these steps to imitate actions of FireReset logic
    env1.step(1)
    obs1 = env1.step(2)[0]
    obs2 = env2.reset()
    obs3 = env3.reset()

    assert obs1.shape == (210, 160, 3)
    assert obs2.shape == (84, 84)
    assert obs3.shape == (84, 84, 3)
    np.testing.assert_allclose(obs3, cv2.resize(obs1, (84, 84), interpolation=cv2.INTER_AREA))
    obs3_gray = cv2.cvtColor(obs3, cv2.COLOR_RGB2GRAY)
    # the edges of the numbers do not render quite the same in the grayscale, so we ignore them
    np.testing.assert_allclose(obs2[10:], obs3_gray[10:])

    env1.close()
    env2.close()
    env3.close()
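# The tests above take an ``env_fn`` argument: a zero-argument factory that
# builds a fresh, unwrapped Atari environment. Below is a minimal sketch of
# how they might be driven with pytest, assuming the old gym API and an
# installed ALE Pong ROM; the fixture itself and the "PongNoFrameskip-v4" id
# are illustrative assumptions, not part of the original test module.
import gym
import numpy as np
import pytest
from gym.wrappers import AtariPreprocessing


@pytest.fixture
def env_fn():
    # Zero-argument factory: each call returns a fresh raw Atari env.
    # The NoFrameskip variant is used so that AtariPreprocessing controls
    # frame skipping itself.
    return lambda: gym.make("PongNoFrameskip-v4")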
for n_epi in range(100):
    s = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):
            env.render()
            s = np.array(s).reshape(shape)
            od = model(torch.from_numpy(s).float())
            prob = od['pi']
            # print(prob)
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)
            trn = (s.reshape(shape0), a, r / 100.0, np.array(s_prime), prob[0][a].item(), done)
            model.put_data(trn)
            s = s_prime
            score += r
            if done:
                break
        model.train_net()

    if n_epi % print_interval == 0 and n_epi != 0:
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval))
        score = 0.0

env.close()
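# The rollout loop above assumes that ``env``, ``score``, ``shape``, ``shape0``,
# ``T_horizon`` and ``print_interval`` are defined earlier, and that ``model``
# is a PPO-style module whose forward pass returns a dict with action
# probabilities under the key 'pi' and which exposes put_data()/train_net().
# Below is a minimal stand-in with that interface, useful only for
# smoke-testing the loop; the class name and the uniform policy are
# placeholders, not the original model.
import torch
import torch.nn as nn


class DummyModel(nn.Module):
    """Stand-in exposing the interface the rollout loop expects."""

    def __init__(self, n_actions):
        super().__init__()
        self.n_actions = n_actions
        self.data = []  # rollout buffer filled by put_data()

    def forward(self, x):
        # The loop indexes prob[0][a], so return a (1, n_actions) tensor of
        # action probabilities under the 'pi' key (uniform here).
        pi = torch.full((1, self.n_actions), 1.0 / self.n_actions)
        return {'pi': pi}

    def put_data(self, transition):
        # transition = (state, action, scaled_reward, next_state, prob_a, done)
        self.data.append(transition)

    def train_net(self):
        # A real model would run its policy-gradient update here; the
        # stand-in simply discards the collected rollout.
        self.data.clear()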
def main(args):
    env = gym.make(args.env)
    # Rescale images to 42x42 and turn into greyscale
    env = AtariPreprocessing(env, screen_size=42, grayscale_obs=True,
                             noop_max=1, terminal_on_life_loss=True)

    # A quick trick to give agent some sense of history/motion:
    # Give N successive frames instead of just one to the agent.
    # This deque will store N last frames to do this.
    state_stacker = deque(maxlen=FRAME_STACK_SIZE)
    new_deque = deque(maxlen=100)

    # Build models according to image shape and number of actions
    # that are available.
    # If we are evaluating, load existing model instead
    state_shape = RESOLUTION + (FRAME_STACK_SIZE, )
    model = None
    target_model = None
    if not args.evaluate:
        # Construct new models
        model, target_model = build_models(state_shape, env.action_space.n)
    else:
        # Load existing model
        model = keras.models.load_model(args.model_path)

    # Initialize replay memory (if training)
    replay_memory = None
    if not args.evaluate:
        replay_memory = ReplayMemory(REPLAY_SIZE, state_shape)

    # Open log file if we want to output results
    log_file = None
    if args.log is not None:
        log_file = open(args.log, "w")

    # Main training loop
    step_ctr = 0
    q_values_counter = 0
    q_values_summation = 0
    while step_ctr < args.steps:
        terminal = False
        episode_reward = 0
        # Keep track of losses
        losses = []

        # Reset frame stacker to empty frames
        state_stacker.clear()
        for i in range(FRAME_STACK_SIZE):
            state_stacker.append(np.zeros(RESOLUTION + (1, )))

        s1 = env.reset()
        # Preprocess state
        s1 = preprocess_state(s1, state_stacker)

        while not terminal:
            action, q_values = get_action(s1, model, env.action_space.n)
            # TODO
            # Here you might want to store q_values somewhere
            # for later plotting

            s2, reward, terminal, info = env.step(action)
            # print(reward)
            s2 = preprocess_state(s2, state_stacker)
            step_ctr += 1

            # Count episodic reward
            episode_reward += reward

            if args.show:
                env.render()

            # Skip training/replay memory stuff if we are evaluating
            if not args.evaluate:
                # Store the experience to replay memory
                replay_memory.add_experience(s1, action, reward, s2, terminal)

                # Check if we should do updates or saving model
                if (step_ctr % UPDATE_RATE) == 0:
                    if replay_memory.num_total > SAMPLES_TILL_TRAIN:
                        losses.append(update_model(model, target_model, replay_memory))
                if (step_ctr % TARGET_UPDATE_RATE) == 0:
                    update_target_model(model, target_model)
                if (step_ctr % SAVE_MODEL_EVERY_STEPS) == 0:
                    model.save(args.model_path)

            # s2 becomes s1 for the next iteration
            s1 = s2

            # If we want to limit fps, sleep a little bit
            if args.limit_fps:
                sleep(1 / 35.0)

        # storing another collection
        # storer_deque = []
        new_deque.append(episode_reward)

        # To avoid div-by-zero
        if len(losses) == 0:
            losses.append(0.0)

        # TODO
        # 1) Print out average training loss
        # 2) Track average reward over last 100 episodes
        # 3) Track average Q-value of this episode
        print('Average of q_values: ', np.average(q_values))

        # TODO average loss
        # Losses from this episode are already stored in the list `losses`.
        # Compute average loss and include it in the printout below
        q_values_counter += len(q_values)
        q_values_summation += np.sum(q_values)
        print('Average of losses: ', np.average(losses))
        print('Average reward over the last 100 episodes: ', np.average(new_deque))
        running_average_q_values = q_values_summation / q_values_counter
        print('Running average of the q_values: ', running_average_q_values)

        # Legend:
        # - Episode reward: Reward from the previous episode
        # - Steps: Total number of agent steps taken in this training
        s = "Episode reward: {:.1f}\tSteps: {}\t".format(
            episode_reward,
            step_ctr,
        )

        # Print our log message
        print(s)
        # If we have a log file, print it there as well
        if log_file is not None:
            log_file.write(s + "\n")

    env.close()
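# main() above relies on a preprocess_state(frame, state_stacker) helper that
# is not shown in this snippet. The sketch below illustrates one way it could
# work, assuming RESOLUTION = (42, 42) and that the Keras models consume
# (42, 42, FRAME_STACK_SIZE) float arrays; the normalization and exact layout
# are assumptions, not the original implementation.
import numpy as np


def preprocess_state(frame, state_stacker):
    """Append the newest frame to the deque and return the stacked state.

    Assumes ``frame`` is a (42, 42) grayscale image from AtariPreprocessing
    and ``state_stacker`` is a deque(maxlen=FRAME_STACK_SIZE) of (42, 42, 1)
    arrays, as initialized in main() above.
    """
    # Scale pixel values to [0, 1] and add a channel axis -> (42, 42, 1).
    frame = frame.astype(np.float32)[..., None] / 255.0
    state_stacker.append(frame)
    # Concatenate the N most recent frames along the channel axis to get a
    # (42, 42, FRAME_STACK_SIZE) array, matching state_shape in main().
    return np.concatenate(list(state_stacker), axis=-1)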