def test_atari_preprocessing_scale(env_fn):
    """Check that ``AtariPreprocessing`` keeps observations in the expected range.

    Every combination of ``grayscale_obs`` and ``scale_obs`` is exercised: the
    wrapped environment is reset and then stepped with random actions, and
    each observation must lie in ``[0, 1]`` when scaling is enabled and in
    ``[0, 255]`` otherwise.

    Args:
        env_fn: Zero-argument callable returning a fresh environment to wrap.
    """
    # Arbitrarily chosen budget of steps taken in each configuration while
    # checking that all observations are in the required range.
    max_test_steps = 10

    def assert_in_range(obs, max_obs):
        assert (0 <= obs).all() and (obs <= max_obs).all(), \
            'Obs. must be in range [0,{}]'.format(max_obs)

    for grayscale in [True, False]:
        for scaled in [True, False]:
            env = AtariPreprocessing(env_fn(),
                                     screen_size=84,
                                     grayscale_obs=grayscale,
                                     scale_obs=scaled,
                                     frame_skip=1,
                                     noop_max=0)
            try:
                max_obs = 1 if scaled else 255
                assert_in_range(env.reset().flatten(), max_obs)

                done, step_i = False, 0
                # Stop at the end of the episode OR once the step budget is
                # used up. The previous ``or`` kept calling step() on a
                # finished episode and did not bound long episodes at all.
                while not done and step_i <= max_test_steps:
                    obs, _, done, _ = env.step(env.action_space.sample())
                    assert_in_range(obs.flatten(), max_obs)
                    step_i += 1
            finally:
                # Release the emulator even when an assertion fails.
                env.close()
def main(args):
    """Run the DQN training (or evaluation) loop on an Atari environment.

    The environment named by ``args.env`` is wrapped with
    ``AtariPreprocessing`` (42x42 grayscale frames). Episodes are played until
    ``args.steps`` agent steps have been taken. Unless ``args.evaluate`` is
    set, every transition is stored in a replay memory and the model, target
    model and on-disk checkpoint are updated on their respective step
    intervals. Per-episode statistics are printed and, when ``args.log`` is
    given, the episode summary line is also written to that file.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``env``, ``evaluate``, ``model_path``, ``log``, ``steps``,
            ``show`` and ``limit_fps``.
    """
    env = gym.make(args.env)
    # Rescale images to 42x42 and turn into greyscale
    env = AtariPreprocessing(env,
                             screen_size=42,
                             grayscale_obs=True,
                             noop_max=1,
                             terminal_on_life_loss=True)

    # A quick trick to give agent some sense of history/motion:
    # give N successive frames instead of just one to the agent.
    # This deque stores the N last frames to do this.
    state_stacker = deque(maxlen=FRAME_STACK_SIZE)
    # Rewards of (at most) the 100 most recent episodes.
    new_deque = deque(maxlen=100)

    log_file = None
    try:
        # Build models according to image shape and number of actions that
        # are available. If we are evaluating, load existing model instead.
        state_shape = RESOLUTION + (FRAME_STACK_SIZE, )
        target_model = None
        replay_memory = None
        if not args.evaluate:
            model, target_model = build_models(state_shape,
                                               env.action_space.n)
            # Replay memory is only needed when training.
            replay_memory = ReplayMemory(REPLAY_SIZE, state_shape)
        else:
            model = keras.models.load_model(args.model_path)

        # Open log file if we want to output results
        if args.log is not None:
            log_file = open(args.log, "w")

        # Main training loop
        step_ctr = 0
        q_values_counter = 0
        q_values_summation = 0
        while step_ctr < args.steps:
            terminal = False
            episode_reward = 0
            # Keep track of losses
            losses = []

            # Reset frame stacker to empty frames
            state_stacker.clear()
            for _ in range(FRAME_STACK_SIZE):
                state_stacker.append(np.zeros(RESOLUTION + (1, )))

            s1 = preprocess_state(env.reset(), state_stacker)

            while not terminal:
                action, q_values = get_action(s1, model, env.action_space.n)

                s2, reward, terminal, info = env.step(action)
                s2 = preprocess_state(s2, state_stacker)
                step_ctr += 1
                # Count episodic reward
                episode_reward += reward

                if args.show:
                    env.render()

                # Skip training/replay memory stuff if we are evaluating
                if not args.evaluate:
                    # Store the experience to replay memory
                    replay_memory.add_experience(s1, action, reward, s2,
                                                 terminal)

                    # Check if we should do updates or saving model
                    if (step_ctr % UPDATE_RATE) == 0:
                        if replay_memory.num_total > SAMPLES_TILL_TRAIN:
                            losses.append(
                                update_model(model, target_model,
                                             replay_memory))
                    if (step_ctr % TARGET_UPDATE_RATE) == 0:
                        update_target_model(model, target_model)
                    if (step_ctr % SAVE_MODEL_EVERY_STEPS) == 0:
                        model.save(args.model_path)

                # s2 becomes s1 for the next iteration
                s1 = s2

                # If we want to limit fps, sleep little bit
                if args.limit_fps:
                    sleep(1 / 35.0)

            new_deque.append(episode_reward)

            # To avoid averaging an empty list
            if not losses:
                losses.append(0.0)

            # NOTE(review): `q_values` is whatever get_action() returned on
            # the final step of the episode, so the Q-value statistics below
            # cover only that last step, not the whole episode.
            print('Average of q_values: ', np.average(q_values))
            q_values_counter += len(q_values)
            q_values_summation += np.sum(q_values)

            print('Average of losses: ', np.average(losses))
            print('Average of first 100 revolts: ', np.average(new_deque))
            running_average_q_values = q_values_summation / q_values_counter
            print('Running average of the q_values: ',
                  running_average_q_values)

            # Legend:
            # - Episode reward: Reward from the previous episode
            # - Steps: Total number of agent steps taken in this training
            s = "Episode reward: {:.1f}\tSteps: {}\t".format(
                episode_reward,
                step_ctr,
            )
            # Print our log message
            print(s)
            # If we have a log file, print it there as well
            if log_file is not None:
                log_file.write(s + "\n")
    finally:
        # Always flush/close the log and release the emulator, including
        # when training is interrupted by an exception.
        if log_file is not None:
            log_file.close()
        env.close()
# Number of episodes between two score reports.
print_interval = 3

for n_epi in range(100):
    s = env.reset()
    done = False
    while not done:
        # Gather at most T_horizon transitions, then run one training update.
        for t in range(T_horizon):
            env.render()
            s = np.array(s).reshape(shape)
            od = model(torch.from_numpy(s).float())
            prob = od['pi']
            # Sample an action index from the categorical policy output.
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)

            # Transition tuple handed to the model; the reward is divided
            # by 100 and the probability of the sampled action is included.
            trn = (
                s.reshape(shape0),
                a,
                r / 100.0,
                np.array(s_prime),
                prob[0][a].item(),
                done,
            )
            model.put_data(trn)

            s = s_prime
            score += r
            if done:
                break

        model.train_net()

    # Report the average score of the last `print_interval` episodes
    # (skipping episode 0) and restart the accumulator.
    if n_epi != 0 and n_epi % print_interval == 0:
        print("# of episode :{}, avg score : {:.1f}".format(
            n_epi, score / print_interval))
        score = 0.0
args = parser.parse_args()

# Hyper-parameters stored next to the checkpoint. A context manager is used
# so the file handle is closed as soon as the JSON has been parsed.
with open(args.checkpoint + "/args.json") as args_file:
    params = json.load(args_file)

# Rebuild the environment stack described by the saved parameters.
env = gym.make(params["env_name"])
if params["atari"]:
    env = AtariPreprocessing(env)
    env = FrameStack(env, params["framestack"])
env = Wrapper(env)
env = VectorWrapper(env)
env = TorchWrapper(env)

# Policy and agent classes are looked up by name in this module.
policy = getattr(sys.modules[__name__], params["policy_name"])(
    params["framestack"] if params["atari"] else env.state_size,
    env.action_size,
    continuous=params["continuous"],
    stochastic_value=params["sv"],
    feature_extraction=params["feature_extraction"])
agent = getattr(sys.modules[__name__], params["agent_name"])(env, policy,
                                                             **params)
policy.load_state_dict(torch.load(args.checkpoint + "/model.pth"))

# Play episodes forever, rendering every step (stop with Ctrl+C).
while True:
    obs = env.reset()
    done = False
    while not done:
        action = agent.select_action(obs)[0]
        obs, rew, done, _ = env.step(action)
        env.render()
import gym
import numpy as np
from gym.wrappers import AtariPreprocessing

from DuelDDQN_Agent import Agent

env = gym.make("BreakoutNoFrameskip-v4")
env = AtariPreprocessing(env, grayscale_obs=False)

num_episodes = 10
# NOTE(review): presumably (input channels, number of actions) - confirm
# against the Agent constructor.
agent = Agent(3, 4)
scores = []

for i in range(num_episodes):
    state = env.reset()
    done = False
    score = 0
    while not done:
        # The agent is fed observations with the last axis moved to the front.
        state_chw = np.moveaxis(state, -1, 0)
        action = agent.choose_action(state_chw)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state_chw, action, reward,
                               np.moveaxis(next_state, -1, 0), done)
        agent.learn()
        score += reward
        # Advance to the new observation. Without this the agent kept acting
        # on (and storing) the reset observation for the whole episode.
        state = next_state

    scores.append(score)
    # Save whenever this episode matches or beats the best score so far.
    if max(scores) <= score:
        agent.save_models()
    print(f"Episode {i}, Score {score}, Epsilon {agent.epsilon}")

env.close()
class Game():
    """Thin Atari game facade that keeps a buffer of the most recent frames.

    The Gym environment is wrapped with ``AtariPreprocessing`` and the last
    ``last_n_frames`` preprocessed screens are kept in a bounded deque so
    they can be stacked into a single model input.
    """

    def __init__(self,
                 game_name,
                 start_noop=2,
                 last_n_frames=4,
                 frameskip=4,
                 grayscale_obs=True,
                 scale_obs=False):
        """Create and reset the environment and pre-fill the frame buffer.

        Args:
            game_name: Gym environment id passed to ``gym.make``.
            start_noop: Upper bound on the number of start-up actions taken
                by ``start_game``.
            last_n_frames: Number of frames kept in the buffer and stacked
                by ``get_input``.
            frameskip: ``frame_skip`` forwarded to ``AtariPreprocessing``.
            grayscale_obs: Forwarded to ``AtariPreprocessing``.
            scale_obs: Forwarded to ``AtariPreprocessing``.
        """
        self.start_noop = start_noop
        self.last_n_frames = last_n_frames
        self.frameskip = frameskip
        self.buffer = deque([], self.last_n_frames)
        self.env = gym.make(game_name)

        # Hacks to make environment deterministic and compatible with Atari
        # Preprocessing
        self.env.unwrapped.frameskip = 1
        if 'NoFrameskip' not in self.env.spec.id:
            print('Environment is not Frameskip version.')
            self.env.spec.id += '-NoFrameskip'

        self.envWrapped = AtariPreprocessing(self.env,
                                             frame_skip=self.frameskip,
                                             grayscale_obs=grayscale_obs,
                                             scale_obs=scale_obs)
        self.envWrapped.reset()
        self.n_actions = self.env.action_space.n

        init_screen = self.get_screen()
        # Screen dimension is represented as (CHW) for PyTorch
        self.scr_dims = tuple([self.last_n_frames] + list(init_screen.shape))

        # Fill the whole buffer with the initial screen. The count must be
        # `last_n_frames` (the buffer length and the first entry of
        # `scr_dims`); using `frameskip` left the buffer underfilled
        # whenever frameskip < last_n_frames.
        for _ in range(self.last_n_frames):
            self.buffer.append(init_screen.copy())

    def start_game(self):
        """Clear the buffer, take the start-up actions and refill the buffer."""
        self.buffer.clear()

        # Random starting operations to simulate human conditions.
        # 0 corresponds to No-Op action, 1 corresponds to Fire.
        noop_action = 0
        # In breakout, nothing happens unless first 'Fired'.
        if 'Breakout' in self.env.spec.id:
            noop_action = 1
        for _ in range(random.randint(1, self.start_noop)):
            self.step(noop_action)

        # Fill remaining buffer by most recent frame to send a valid input to
        # model
        if len(self.buffer) > 0:
            last_screen = self.buffer[-1]
        else:
            last_screen = self.get_screen()
        while len(self.buffer) < self.buffer.maxlen:
            self.buffer.append(last_screen.copy())

    def get_screen(self):
        """Return the current observation from the preprocessing wrapper."""
        # NOTE(review): relies on the wrapper's private `_get_obs()` helper.
        return self.envWrapped._get_obs()

    def get_input(self):
        """Return the buffered frames stacked along a new leading axis."""
        # With the defaults this is presumably 4 frames of 84x84 each,
        # i.e. a 4x84x84 array - confirm against the wrapper's screen size.
        return np.stack(tuple(self.buffer), axis=0)

    def get_n_actions(self):
        """Return the number of actions of the environment."""
        return self.n_actions

    def reset_env(self):
        """Reset the gym environment and run the start-up sequence."""
        # NOTE(review): this resets the inner env, not `self.envWrapped`, so
        # the preprocessing wrapper's own reset logic is bypassed. Confirm
        # that this is intentional before changing it.
        self.env.reset()
        self.start_game()

    def get_screen_dims(self):
        """Return the model input dimensions computed in ``__init__``."""
        return self.scr_dims

    def step(self, action):
        """Apply ``action``, buffer the new screen and return the outcome.

        Args:
            action: Action forwarded to the wrapped environment.

        Returns:
            Tuple ``(reward, done)`` where ``reward`` is clipped to
            ``[-1.0, 1.0]``.
        """
        screen, reward, done, _ = self.envWrapped.step(action)

        # ALE takes care of the max pooling of the last 2 frames
        # Refer: "https://danieltakeshi.github.io/2016/11/25/
        # frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/"
        self.buffer.append(screen)

        # reward is clipped between -1 and 1
        reward = np.clip(reward, -1.0, 1.0)
        return reward, done
class GymEnvWrapper(gym.Env):
    """Wraps an OpenAI Gym environment to be able to modify its dimensions corresponding to MDP Playground. The documentation for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.

    Currently supported dimensions:
        transition noise (discrete)
        reward delay
        reward noise

    Also supports wrapping with AtariPreprocessing from OpenAI Gym or wrap_deepmind from Ray Rllib.
    """

    # Should not be a gym.Wrapper because gym.Wrapper has member variables
    # observation_space and action_space while here with irrelevant_features
    # we would have multiple observation_spaces and this could cause conflict
    # with code that assumes any subclass of gym.Wrapper should have these
    # member variables. However, it _should_ be at least a gym.Env.
    # Does it need to be a subclass of base_class because some external code
    # may check if it's an AtariEnv, for instance, and do further stuff based
    # on that?

    def __init__(self, env, **config):
        """Configure noise, delay, optional Atari wrappers and the spaces.

        Args:
            env: The Gym environment to wrap.
            **config: Recognised keys are ``seed``, ``delay``,
                ``transition_noise``, ``reward_noise``, ``state_space_type``
                ("discrete" or "continuous"), ``wrap_deepmind_ray``,
                ``atari_preprocessing``, ``frame_skip``, ``grayscale_obs``
                and ``irrelevant_features`` (config for an ``RLToyEnv``
                whose spaces are appended to the base env's spaces).
        """
        self.config = copy.deepcopy(config)
        self.env = env

        seed_int = config["seed"] if "seed" in config else None

        self.seed(seed_int)  # seed
        # IMP Move below code from here to seed()? Because if seed is called
        # during the run of an env, the expectation is that all obs., act.
        # space, etc. seeds are set? Only Atari in Gym seems to do something
        # similar, the others I saw there don't seem to set seed for obs.,
        # act. spaces.
        # IMP Apparently Atari also has a seed. Without this, for
        # beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the
        # remaining times with the same action sequence!! With setting this
        # seed, I got the same reward of 44.0 when I ran about 20 times.
        # TODO If this is really a wrapper, should it be modifying the seed
        # of the env?
        self.env.seed(seed_int)  # seed
        obs_space_seed = self.np_random.randint(sys.maxsize)  # random
        act_space_seed = self.np_random.randint(sys.maxsize)  # random
        self.env.observation_space.seed(obs_space_seed)  # seed
        self.env.action_space.seed(act_space_seed)  # seed

        if "delay" in config:
            self.delay = config["delay"]
            assert config["delay"] >= 0
            self.reward_buffer = [0.0] * (self.delay)
        else:
            self.delay = 0

        if "transition_noise" in config:
            self.transition_noise = config["transition_noise"]
            if config["state_space_type"] == "continuous":
                assert callable(self.transition_noise), (
                    "transition_noise must be a function when env is continuous, it was of type:"
                    + str(type(self.transition_noise)))
            else:
                assert self.transition_noise <= 1.0 and self.transition_noise >= 0.0, (
                    "transition_noise must be a value in [0.0, 1.0] when env is discrete, it was:"
                    + str(self.transition_noise))
        else:
            if config["state_space_type"] == "discrete":
                self.transition_noise = 0.0
            else:
                self.transition_noise = lambda a: 0.0

        if "reward_noise" in config:
            if callable(config["reward_noise"]):
                self.reward_noise = config["reward_noise"]
            else:
                # A number is treated as the std. dev. of zero-mean normal
                # noise drawn from the RNG passed to the callable.
                reward_noise_std = config["reward_noise"]
                self.reward_noise = lambda a: a.normal(0, reward_noise_std)
        else:
            self.reward_noise = None

        if ("wrap_deepmind_ray" in config
                and config["wrap_deepmind_ray"]):  # hack ##TODO remove?
            self.env = wrap_deepmind(self.env, dim=42, framestack=True)
        elif "atari_preprocessing" in config and config["atari_preprocessing"]:
            self.frame_skip = 4  # default for AtariPreprocessing
            if "frame_skip" in config:
                self.frame_skip = config["frame_skip"]
            self.grayscale_obs = False
            if "grayscale_obs" in config:
                self.grayscale_obs = config["grayscale_obs"]

            # Use AtariPreprocessing with frame_skip.
            # noop_max set to 1 because we want to keep the vanilla env as
            # deterministic as possible and setting it 0 was not allowed.
            # TODO noop_max=0 is possible in new Gym version, so update Gym
            # version.
            self.env = AtariPreprocessing(
                self.env,
                frame_skip=self.frame_skip,
                grayscale_obs=self.grayscale_obs,
                noop_max=1,
            )
            print("self.env.noop_max set to: ", self.env.noop_max)

        if "irrelevant_features" in config:
            irr_toy_env_conf = config["irrelevant_features"]
            if "seed" not in irr_toy_env_conf:
                irr_toy_env_conf["seed"] = self.np_random.randint(
                    sys.maxsize)  # random

            self.irr_toy_env = RLToyEnv(**irr_toy_env_conf)

            if config["state_space_type"] == "discrete":
                self.action_space = Tuple(
                    (self.env.action_space, self.irr_toy_env.action_space))
                # TODO for image observations, concatenate to 1 obs. space
                # here and in step() and reset()?
                self.observation_space = Tuple(
                    (self.env.observation_space,
                     self.irr_toy_env.observation_space))
            else:
                # TODO Check the test case added for cont. irr features case
                # and code for it in run_experiments.py.
                env_obs_low = self.env.observation_space.low
                env_obs_high = self.env.observation_space.high
                env_obs_dtype = env_obs_low.dtype
                irr_env_obs_low = self.irr_toy_env.observation_space.low
                irr_env_obs_high = self.irr_toy_env.observation_space.high
                irr_env_obs_dtype = self.irr_toy_env.observation_space.low.dtype
                assert env_obs_dtype == irr_env_obs_dtype, (
                    "Datatypes of base env and irrelevant toy env should match. Were: "
                    + str(env_obs_dtype) + ", " + str(irr_env_obs_dtype))
                ext_low = np.concatenate((env_obs_low, irr_env_obs_low))
                ext_high = np.concatenate((env_obs_high, irr_env_obs_high))
                self.observation_space = Box(low=ext_low,
                                             high=ext_high,
                                             dtype=env_obs_dtype)

                env_act_low = self.env.action_space.low
                env_act_high = self.env.action_space.high
                env_act_dtype = env_act_low.dtype
                # Remembered so step() can split a concatenated action.
                self.env_act_shape = env_act_low.shape
                assert (len(self.env_act_shape) == 1
                        ), "Length of shape of action space should be 1."
                irr_env_act_low = self.irr_toy_env.action_space.low
                irr_env_act_high = self.irr_toy_env.action_space.high
                # TODO Apparently, observations are np.float64 and actions
                # np.float32 for Mujoco, so obs. and act. dtypes of the base
                # env are not asserted to match.
                ext_low = np.concatenate((env_act_low, irr_env_act_low))
                ext_high = np.concatenate((env_act_high, irr_env_act_high))
                # TODO Use BoxExtended here and above?
                self.action_space = Box(low=ext_low,
                                        high=ext_high,
                                        dtype=env_act_dtype)

            self.observation_space.seed(obs_space_seed)  # seed
            self.action_space.seed(act_space_seed)  # seed
        else:
            self.action_space = self.env.action_space
            self.observation_space = self.env.observation_space

        self.total_episodes = 0

        # NOTE: experimental Mujoco-specific hacks (action_space_max,
        # time_unit, dummy_eval, dummy_seed, action_loss_weight) are not
        # handled by this wrapper.

        super(GymEnvWrapper, self).__init__()

    def step(self, action):
        """Step the wrapped env, applying transition noise, reward delay and reward noise.

        Args:
            action: Action for the base env. When ``irrelevant_features`` is
                configured it is a pair (discrete) or a concatenated vector
                (continuous) that also carries the irrelevant env's action.

        Returns:
            Tuple ``(next_state, reward, done, info)``; ``info`` is the base
            env's info unchanged.
        """
        self.total_transitions_episode += 1

        if (self.config["state_space_type"] == "discrete"
                and self.transition_noise > 0.0):
            # With probability `transition_noise` the action is replaced by
            # one of the other actions, chosen uniformly.
            # NOTE(review): with "irrelevant_features" the incoming action is
            # indexed below (action[0] / slices), which does not look
            # compatible with the integer resampling done here - confirm
            # whether both features are meant to be combined.
            n_actions = self.env.action_space.n
            probs = (np.ones(shape=(n_actions, )) * self.transition_noise /
                     (n_actions - 1))
            probs[action] = 1 - self.transition_noise
            old_action = action
            # Index the size-1 sample explicitly: calling int() on a 1-d
            # array is deprecated in recent NumPy releases.
            action = int(
                self.np_random.choice(n_actions, size=1,
                                      p=probs)[0])  # random
            if old_action != action:
                self.total_noisy_transitions_episode += 1
        else:  # cont. envs
            # TODO self.total_abs_noise_in_transition_episode is not
            # accumulated yet for continuous envs.
            pass

        if "irrelevant_features" in self.config:
            if self.config["state_space_type"] == "discrete":
                next_state, reward, done, info = self.env.step(action[0])
                next_state_irr, _, _, _ = self.irr_toy_env.step(action[1])
                next_state = tuple([next_state, next_state_irr])
            else:
                split = self.env_act_shape[0]
                next_state, reward, done, info = self.env.step(action[:split])
                next_state_irr, _, _, _ = self.irr_toy_env.step(
                    action[split:])
                next_state = np.concatenate((next_state, next_state_irr))
        else:
            next_state, reward, done, info = self.env.step(action)

        # Reward delay: the reward handed out now is the one generated
        # `delay` steps ago. The current reward is queued BEFORE the terminal
        # check so that it is part of the final pay-out; previously the
        # terminal step's own reward was silently dropped.
        self.reward_buffer.append(reward)
        if done:
            # If episode is finished return the rewards that were delayed
            # and not handed out before ##TODO add test case for this
            reward = np.sum(self.reward_buffer)
            self.reward_buffer.clear()
        else:
            reward = self.reward_buffer.pop(0)

        # TODO Would be better to parameterise this in terms of state, action
        # and time_step as well. Would need to change implementation to have
        # a queue for the rewards achieved and then pick the reward that was
        # generated delay timesteps ago.
        noise_in_reward = (self.reward_noise(self.np_random)
                           if self.reward_noise else 0)  # random
        self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
        self.total_reward_episode += reward
        reward += noise_in_reward
        return next_state, reward, done, info

    def reset(self):
        """Print the previous episode's noise stats, clear counters and reset the env(s).

        Returns:
            The base env's reset state; combined with the irrelevant env's
            reset state (tuple for discrete, concatenation for continuous)
            when ``irrelevant_features`` is configured.
        """
        # On episode "end" stuff (to not be invoked when reset() called when
        # self.total_episodes = 0; end is in quotes because it may not be a
        # true episode end reached by reaching a terminal state, but reset()
        # may have been called in the middle of an episode):
        if self.total_episodes != 0:
            print(
                "Noise stats for previous episode num.: " +
                str(self.total_episodes) +
                " (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): "
                + str(self.total_abs_noise_in_reward_episode) + " " +
                str(self.total_abs_noise_in_transition_episode) + " " +
                str(self.total_reward_episode) + " " +
                str(self.total_noisy_transitions_episode) + " " +
                str(self.total_transitions_episode))

        # On episode start stuff:
        self.reward_buffer = [0.0] * (self.delay)
        self.total_episodes += 1

        self.total_abs_noise_in_reward_episode = 0
        # only present in continuous spaces
        self.total_abs_noise_in_transition_episode = 0
        # only present in discrete spaces
        self.total_noisy_transitions_episode = 0
        self.total_reward_episode = 0
        self.total_transitions_episode = 0

        reset_state = self.env.reset()
        if "irrelevant_features" in self.config:
            reset_state_irr = self.irr_toy_env.reset()
            if self.config["state_space_type"] == "discrete":
                reset_state = tuple([reset_state, reset_state_irr])
            else:
                reset_state = np.concatenate((reset_state, reset_state_irr))
        return reset_state

    def seed(self, seed=None):
        """Initialises the Numpy RNG for the environment by calling a utility for this in Gym.

        Parameters
        ----------
        seed : int
            seed to initialise the np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.

        Returns
        -------
        int
            The seed returned by Gym
        """
        # If seed is None, you get a randomly generated seed from gym.utils...
        self.np_random, self.seed_ = gym.utils.seeding.np_random(
            seed)  # random
        print("Env SEED set to: " + str(seed) +
              ". Returned seed from Gym: " + str(self.seed_))
        return self.seed_
# Wrap the environment with FrameStack (second argument 4) and build the
# agent from the resulting action count and observation shape.
env = FrameStack(env, 4)
agent = Agent(env.action_space.n, env.observation_space.shape)

# Run settings and bookkeeping. Several of these are not used in the part
# of the script visible here.
num_batch = 32
frames = 0
rand_frames = 0
scores = []
writer = SummaryWriter('Python Scripts/reward_ddqn')
noop_max = 30
num_episode = 10000

# Warm-up phase: play whole episodes with uniformly sampled actions and
# append every transition to the agent's replay buffer. The frame count is
# only checked between episodes, so the phase ends after the first episode
# that pushes `rand_frames` above 25000.
while True:
    done = False
    state = env.reset()
    while not done:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        agent.replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        rand_frames += 1
        print("Random Frame: " + str(rand_frames))
    if rand_frames > 25000:
        break

# Training phase: actions now come from the agent.
# NOTE(review): in the part shown here `state` is never advanced to
# `next_state`; the loop body presumably continues beyond this excerpt -
# confirm against the full file.
for episode in range(num_episode):
    done = False
    state = env.reset()
    score = 0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        agent.replay_buffer.append((state, action, reward, next_state, done))