def which_env(name):
    if 'Grid' in name:
        env = ImgObsWrapper(gym.make(name))
        test_env = ImgObsWrapper(gym.make(name))
    else:
        env = make_atari(name)
        test_env = make_atari(name)
    return env, test_env, (env.observation_space.shape, env.action_space.n)

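# Hedged usage sketch for which_env (not part of the original source): the
# MiniGrid id is illustrative, and the Atari branch assumes a baselines-style
# make_atari is importable.
env, test_env, (obs_shape, n_actions) = which_env('MiniGrid-Empty-8x8-v0')
obs = env.reset()
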
def init_env(self):
    env = ImgObsWrapper(self.init())
    env.reset()
    print("agent pos: {}".format(env.agent_pos))
    self.action_space = env.action_space
    self.action_dim = env.action_space.n
    self.obs_dim = env.observation_space.shape
    return env

def thunk():
    env = gym.make(gym_id)
    env = ImgObsWrapper(env)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    if args.capture_video:
        if idx == 0:
            env = Monitor(env, f'videos/{experiment_name}')
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env

def main(argv):
    env = e_lib.EmptyMultigoal(size=FLAGS.size, n_goals=FLAGS.n_goals, n_traps=FLAGS.n_traps)
    env = e_lib.SymbolicObsWrapper(env)
    env = ImgObsWrapper(env)
    env = helx.rl.environment.from_gym(env)
    n_features = jnp.prod(env.observation_spec().shape)

    logger = helx.logging.TerminalLogger()
    agent = a_lib.SarsaLambda(env, FLAGS.alpha, FLAGS.lamda, n_features, logger)

    helx.rl.experiment.run(env, agent, FLAGS.train_episodes)
    helx.rl.experiment.run(env, agent, FLAGS.eval_episodes, True)

def inner_objective(
    ind: cgp.IndividualSingleGenome, network_params: dict, curriculum_params: dict, seeds
) -> float:
    rule = ind.to_torch()

    reward_per_seed = []
    reward_per_seed_mean = []
    for seed in seeds:
        seed = int(seed)
        torch.manual_seed(seed=seed)
        rng = np.random.default_rng(seed=seed)

        # environment and network initialization
        env = DynamicMiniGrid(seed=seed)
        env = ImgObsWrapper(env)
        state = env.respawn()["image"][:, :, 0].flatten()
        policy_net = Network(n_inputs=np.size(state), **network_params)

        rewards_over_alterations = run_curriculum(
            env=env, net=policy_net, rule=rule, **curriculum_params, rng=rng)

        reward_per_seed.append(rewards_over_alterations)
        reward_per_seed_mean.append(np.mean(rewards_over_alterations))

    ind.reward_matrix = reward_per_seed
    reward_mean = np.mean(reward_per_seed_mean)
    return float(reward_mean)

def get_env(env_name):
    env = gym.make(env_name)
    if env_name.startswith('MiniGrid'):
        env = ImgObsWrapper(env)
    # TODO include atari here or put get_env in gym_utils
    # env = make_atari('SpaceInvadersNoFrameskip-v0')
    # env = WrapFrame(env)
    return env

def mini_grid_wrapper(env_id, max_frames=0, clip_rewards=True):
    env = gym.make(env_id)
    env = ReseedWrapper(env, seeds=[0])
    env = RGBImgObsWrapper(env)
    env = ImgObsWrapper(env)
    if max_frames:
        env = pfrl.wrappers.ContinuingTimeLimit(
            env, max_episode_steps=max_frames)
    # env = atari_wrappers.MaxAndSkipEnv(env, skip=0)
    env = atari_wrappers.wrap_deepmind(
        env, episode_life=False, clip_rewards=clip_rewards)
    return env

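# Hedged usage sketch for mini_grid_wrapper (not part of the original source):
# the env id and frame budget are illustrative, and gym_minigrid plus pfrl
# must be installed for the wrapper chain above to import.
env = mini_grid_wrapper('MiniGrid-Empty-8x8-v0', max_frames=2000)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
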
def __init__(self, name, horizon=None, gamma=0.99, history_length=4,
             fixed_seed=None, use_pixels=False):
    """
    Constructor.

    Args:
        name (str): name of the environment;
        horizon (int, None): the horizon;
        gamma (float, 0.99): the discount factor;
        history_length (int, 4): number of frames to form a state;
        fixed_seed (int, None): if passed, it fixes the seed of the environment
            at every reset. This way, the environment is fixed rather than
            procedurally generated;
        use_pixels (bool, False): if True, MiniGrid's default 7x7x3 observations
            are converted to an image of resolution 56x56x3.

    """
    # MDP creation
    self._not_pybullet = True
    self._first = True

    env = gym.make(name)
    obs_high = 10.
    if use_pixels:
        env = RGBImgPartialObsWrapper(env)  # Get pixel observations
        obs_high = 255.
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    self.env = env

    self._fixed_seed = fixed_seed

    self._img_size = env.observation_space.shape[0:2]
    self._history_length = history_length

    # Get the default horizon
    if horizon is None:
        horizon = self.env.max_steps

    # MDP properties
    action_space = Discrete(self.env.action_space.n)
    observation_space = Box(
        low=0., high=obs_high,
        shape=(history_length, self._img_size[1], self._img_size[0]))
    # Hack to ignore the gym time limit (do not use np.inf, since MiniGrid
    # returns r(t) = 1 - 0.9 * t / T)
    self.env.max_steps = horizon + 1
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    Environment.__init__(self, mdp_info)

    self._state = None

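# A quick, hedged check of the shapes the docstring above refers to (not part
# of the original source; the env id is illustrative and requires gym_minigrid):
# ImgObsWrapper alone keeps MiniGrid's 7x7x3 partial view, while
# RGBImgPartialObsWrapper followed by ImgObsWrapper yields a 56x56x3 image.
probe = gym.make('MiniGrid-Empty-8x8-v0')
print(ImgObsWrapper(probe).observation_space.shape)                           # (7, 7, 3)
print(ImgObsWrapper(RGBImgPartialObsWrapper(probe)).observation_space.shape)  # (56, 56, 3)
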
def calculate_validation_fitness(champion, seed, network_params, curriculum_params):
    rule = champion.to_torch()

    torch.manual_seed(seed=seed)
    rng = np.random.default_rng(seed=seed)

    # environment and network initialization
    env = DynamicMiniGrid(seed=seed)
    env = ImgObsWrapper(env)
    state = env.respawn()["image"][:, :, 0].flatten()
    policy_net = Network(n_inputs=np.size(state), **network_params)

    rewards_over_alterations = run_curriculum(
        env=env, net=policy_net, rule=rule, **curriculum_params, rng=rng)

    return rewards_over_alterations

def get_env_constructor(self, env_name):
    env_type = self.get_env_type(env_name)
    if env_type == 'mg':
        constructor = lambda: MiniGridRewardNormalize(
            ImgObsWrapper(gym.make(env_name)),
            scale=self.env_infos[env_name].reward_scale,
            shift=self.env_infos[env_name].reward_shift)
    elif env_type == 'gym':
        constructor = lambda: GymRewardNormalize(
            gym.make(env_name),
            scale=self.env_infos[env_name].reward_scale,
            shift=self.env_infos[env_name].reward_shift)
    elif env_type in ['tab', 'vcomp']:
        constructor = self.env_infos[env_name].constructor
    else:
        assert False
    return constructor

def set_env(params):
    if params.env == 'hanoi':
        from hanoi_env.env import HanoiEnv
        params.model_type = 'rnn'
        env = HanoiEnv()
        env.set_env_parameters(max_count=params.max_count,
                               num_disks=params.num_disks,
                               num_pegs=params.num_pegs,
                               allow_impossible=params.allow_impossible,
                               continual=params.continual,
                               initial_peg=params.initial_peg)
    elif params.env == 'lightbot_minigrid':
        from gym_minigrid.envs import LightbotEnv as LightbotMinigridEnv
        from gym_minigrid.wrappers import ImgObsWrapper, AgentViewWrapper
        params.model_type = 'cnn'
        env = LightbotMinigridEnv(params.puzzle_name,
                                  reward_fn=params.rewards,
                                  max_steps=params.max_count,
                                  toggle_ontop=False)
        env = ImgObsWrapper(AgentViewWrapper(env, agent_view_size=9))
    elif params.env == 'lightbot':
        from lightbot_env.env import LightbotEnv
        params.model_type = 'mlp'
        env = LightbotEnv(params.puzzle_name)
        env.set_env_parameters(max_count=params.max_count,
                               testing=params.testing,
                               reward_fn=params.rewards,
                               random_init=params.random_init,
                               allow_impossible=params.allow_impossible)
    elif params.env == 'fourrooms':
        from fourrooms.fourrooms import Fourrooms
        params.model_type = 'mlp'
        env = Fourrooms(max_count=params.max_count)
    elif params.env == 'fourrooms_minigrid':
        params.model_type = 'cnn'
        raise NotImplementedError
    return env, params

def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    gin.parse_config_files_and_bindings(
        [os.path.join(mon_minigrid.GIN_FILES_PREFIX, 'classic_fourrooms.gin')],
        bindings=FLAGS.gin_bindings,
        skip_unknown=False)
    env_id = mon_minigrid.register_environment()
    env = gym.make(env_id)
    env = RGBImgObsWrapper(env)  # Get pixel observations
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    env.reset()
    num_frames = 0
    max_num_frames = 500

    if not tf.io.gfile.exists(FLAGS.file_path):
        tf.io.gfile.makedirs(FLAGS.file_path)

    undisc_return = 0
    while num_frames < max_num_frames:
        # Act randomly
        obs, reward, done, _ = env.step(env.action_space.sample())
        undisc_return += reward
        num_frames += 1

        # Draw environment frame just for simple visualization
        plt.imshow(obs)
        path = os.path.join(FLAGS.file_path, 'obs_{}.png'.format(num_frames))
        plt.savefig(path)
        plt.clf()

        if done:
            break

    print('Undiscounted return: %.2f' % undisc_return)
    env.close()

def get_env_constructor(self, env_name):
    env_type = self.get_env_type(env_name)
    if env_type == 'mg':
        constructor = lambda: MiniGridRewardNormalize(
            ImgObsWrapper(gym.make(env_name)),
            scale=self.env_infos[env_name].reward_scale,
            shift=self.env_infos[env_name].reward_shift)
    elif env_type == 'gym':
        constructor = lambda: GymRewardNormalize(
            gym.make(env_name),
            scale=self.env_infos[env_name].reward_scale,
            shift=self.env_infos[env_name].reward_shift)
    elif env_type == 'tab':
        # Here you should explicitly design the reward structure
        constructor = self.env_infos[env_name].constructor
    else:
        assert False
    return constructor

        self.grid.wall_rect(0, 0, width, height)

        # Place the goals
        for _ in range(self.n_goals):
            self.place_obj(Goal())

        # Place the traps
        for _ in range(self.n_traps):
            self.place_obj(Lava())

        # Place the agent
        if self.agent_start_pos is not None:
            self.agent_pos = self.agent_start_pos
            self.agent_dir = self.agent_start_dir
        else:
            self.place_agent()

        self.mission = "get to the green goal square, avoid the lava"


if __name__ == "__main__":
    # debugging
    env = EmptyMultigoal(size=5, n_goals=1, n_traps=1)
    env = SymbolicObsWrapper(env)
    env = PartialObsWrapper(env, agent_view_size=1)
    env = ImgObsWrapper(env)
    o = env.reset()
    print(o.shape)
    env.render()
    print(o)

def main(args):
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)

    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)

    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01)

    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):
        s = torch.Tensor(s[None, :])
        _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
        scheduler.update(t)
        return best_action

    def train(batch):
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        value = (q(s) * a).sum(dim=-1)
        next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()
    q_target.load_state_dict(q.state_dict())
    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1
        state = next_state.copy()

        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        if t % args.update_every == 0:
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    env = Monitor(env, directory=path)
    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break

import wandb

wandb.init(project=args.wandb_project_name, entity=args.wandb_entity,
           sync_tensorboard=True, config=vars(args), name=experiment_name,
           monitor_gym=True, save_code=True)
writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
# env = wrap_atari(env)
env = ImgObsWrapper(env)
# env = gym.wrappers.RecordEpisodeStatistics(env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
# env = wrap_deepmind(
#     env,
#     clip_rewards=True,
#     frame_stack=True,
#     scale=False,
# )
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

def main(run_id=0, checkpoint=None, rec_interval=10, save_interval=100):
    print({section: dict(config[section]) for section in config.sections()})
    train_method = grid_config['TrainMethod']

    # Create environment
    env_id = grid_config['EnvID']
    env_type = grid_config['EnvType']

    if env_type == 'mario':
        print('Mario environment not fully implemented - thomaseh')
        raise NotImplementedError
        env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    elif env_type == 'grid':
        env = ImgObsWrapper(RGBImgObsWrapper(gym.make(env_id)))
    else:
        raise NotImplementedError

    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    # Load configuration parameters
    is_load_model = checkpoint is not None
    is_render = False
    model_path = 'models/{}_{}_run{}_model'.format(env_id, train_method, run_id)
    predictor_path = 'models/{}_{}_run{}_vae'.format(env_id, train_method, run_id)

    writer = SummaryWriter(logdir='runs/{}_{}_run{}'.format(env_id, train_method, run_id))

    use_cuda = grid_config.getboolean('UseGPU')
    use_gae = grid_config.getboolean('UseGAE')
    use_noisy_net = grid_config.getboolean('UseNoisyNet')

    lam = float(grid_config['Lambda'])
    num_worker = int(grid_config['NumEnv'])

    num_step = int(grid_config['NumStep'])
    num_rollouts = int(grid_config['NumRollouts'])
    num_pretrain_rollouts = int(grid_config['NumPretrainRollouts'])

    ppo_eps = float(grid_config['PPOEps'])
    epoch = int(grid_config['Epoch'])
    mini_batch = int(grid_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(grid_config['LearningRate'])
    entropy_coef = float(grid_config['Entropy'])
    gamma = float(grid_config['Gamma'])
    int_gamma = float(grid_config['IntGamma'])
    clip_grad_norm = float(grid_config['ClipGradNorm'])
    ext_coef = float(grid_config['ExtCoef'])
    int_coef = float(grid_config['IntCoef'])
    sticky_action = grid_config.getboolean('StickyAction')
    action_prob = float(grid_config['ActionProb'])
    life_done = grid_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(grid_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    hidden_dim = int(grid_config['HiddenDim'])

    if train_method == 'RND':
        agent = RNDAgent
    elif train_method == 'generative':
        agent = GenerativeAgent
    else:
        raise NotImplementedError

    if grid_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif grid_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    elif grid_config['EnvType'] == 'grid':
        env_type = GridEnvironment
    else:
        raise NotImplementedError

    # Initialize agent
    agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        history_size=1,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        update_proportion=1.0,
        hidden_dim=hidden_dim
    )

    # Load pre-existing model
    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            agent.vae.load_state_dict(torch.load(predictor_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))
            agent.vae.load_state_dict(
                torch.load(predictor_path, map_location='cpu'))
        print('load finished!')

    # Create workers to run in environments
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(
            env_id,
            is_render,
            idx,
            child_conn,
            sticky_action=sticky_action,
            p=action_prob,
            life_done=life_done,
        )
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 1, 84, 84], dtype='float32')

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # Initialize stats dict
    stats = {
        'total_reward': [],
        'ep_length': [],
        'num_updates': [],
        'frames_seen': [],
    }

    # Main training loop
    while True:
        total_state = np.zeros([num_worker * num_step, 1, 84, 84], dtype='float32')
        total_next_obs = np.zeros([num_worker * num_step, 1, 84, 84])
        total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_ext_values, total_int_values, total_policy, \
            total_policy_np = [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout (collect data)
        for step in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(states / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_obs = np.zeros([num_worker, 1, 84, 84])
            next_states = np.zeros([num_worker, 1, 84, 84])
            rewards, dones, real_dones, log_rewards = [], [], [], []
            for idx, parent_conn in enumerate(parent_conns):
                s, r, d, rd, lr, stat = parent_conn.recv()
                next_states[idx] = s
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs[idx, 0] = s[0, :, :]
                total_next_obs[idx * num_step + step, 0] = s[0, :, :]

                if rd:
                    stats['total_reward'].append(stat[0])
                    stats['ep_length'].append(stat[1])
                    stats['num_updates'].append(global_update)
                    stats['frames_seen'].append(global_step)

            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            # Compute total reward = intrinsic reward + external reward
            intrinsic_reward = agent.compute_intrinsic_reward(next_obs / 255.)
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            for idx, state in enumerate(states):
                total_state[idx * num_step + step] = state
            total_int_reward.append(intrinsic_reward)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]
            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array(
            [discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        writer.add_scalar('data/raw_int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/raw_int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculate
        # None Episodic
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        # obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        # random_obs_choice = np.random.randint(total_next_obs.shape[0])
        # random_obs = total_next_obs[random_obs_choice].copy()
        total_next_obs /= 255.
        if global_update < num_pretrain_rollouts:
            recon_losses, kld_losses = agent.train_just_vae(total_state / 255., total_next_obs)
        else:
            recon_losses, kld_losses = agent.train_model(total_state / 255., ext_target, int_target,
                                                         total_action, total_adv, total_next_obs,
                                                         total_policy)
        writer.add_scalar('data/reconstruction_loss_per_rollout', np.mean(recon_losses), global_update)
        writer.add_scalar('data/kld_loss_per_rollout', np.mean(kld_losses), global_update)

        global_step += (num_worker * num_step)

        if global_update % rec_interval == 0:
            with torch.no_grad():
                # random_obs_norm = total_next_obs[random_obs_choice]
                # reconstructed_state = agent.reconstruct(random_obs_norm)
                # random_obs_norm = (random_obs_norm - random_obs_norm.min()) / (random_obs_norm.max() - random_obs_norm.min())
                # reconstructed_state = (reconstructed_state - reconstructed_state.min()) / (reconstructed_state.max() - reconstructed_state.min())
                # writer.add_image('Original', random_obs, global_update)
                # writer.add_image('Original Normalized', random_obs_norm, global_update)
                random_state = total_next_obs[np.random.randint(total_next_obs.shape[0])]
                reconstructed_state = agent.reconstruct(random_state)
                writer.add_image('Original', random_state, global_update)
                writer.add_image('Reconstructed', reconstructed_state, global_update)

        if global_update % save_interval == 0:
            print('Saving model at global step={}, num rollouts={}.'.format(
                global_step, global_update))
            torch.save(agent.model.state_dict(), model_path + "_{}.pt".format(global_update))
            torch.save(agent.vae.state_dict(), predictor_path + '_{}.pt'.format(global_update))

            # Save stats to pickle file
            with open('models/{}_{}_run{}_stats_{}.pkl'.format(env_id, train_method, run_id, global_update), 'wb') as f:
                pickle.dump(stats, f)

        global_update += 1
        if global_update == num_rollouts + num_pretrain_rollouts:
            print('Finished Training.')
            break

class GridEnvironment(Environment):
    def __init__(self, env_id, is_render, env_idx, child_conn,
                 history_size=1, h=84, w=84, life_done=True,
                 sticky_action=False, p=0.25):
        super(GridEnvironment, self).__init__()
        self.daemon = True
        self.env = ImgObsWrapper(
            RGBImgObsWrapper(ReseedWrapper(gym.make(env_id))))

        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(GridEnvironment, self).run()
        while True:
            action = self.child_conn.recv()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            s, reward, done, info = self.env.step(action)

            if max_step_per_episode < self.steps:
                done = True

            log_reward = reward
            force_done = done

            self.history[0, :, :] = self.pre_proc(s)

            self.rall += reward
            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {} Reward: {} Recent Reward: {} Visited Room: [{}]"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist),
                            info.get('episode', {}).get('visited_rooms', {})))
                self.history = self.reset()

            self.child_conn.send([
                self.history[:, :, :], reward, force_done, done, log_reward,
                [self.rall, self.steps]
            ])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        self.get_init_state(self.pre_proc(s))
        return self.history[:, :, :]

    def pre_proc(self, X):
        X = np.array(Image.fromarray(X).convert('L')).astype('float32')
        x = cv2.resize(X, (self.h, self.w))
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)

def BobEnv(size):
    return ImgObsWrapper(RGBImgPartialObsWrapper(_BobEnv(size)))

def _wrap_minigrid_env(env):
    from gym_minigrid.wrappers import ImgObsWrapper
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    env = bench.Monitor(env, logger.get_dir())
    return env

def callback(_locals, _globals):
    n_steps = _locals['_']
    if n_steps and (n_steps % 1000 == 0):
        print(n_steps)
        print(_locals['episode_successes'])
        # env.render()
        # time.sleep(0.2)
    n_steps += 1
    # Returning False will stop training early
    return True


# Create log dir
log_dir = f"{EXPERIMENT_DIR}/sb/gym"
os.makedirs(log_dir, exist_ok=True)

# Create environment
env_name = 'MiniGrid-FourRooms-v1'
env = FullyObsWrapper(ImgObsWrapper(gym.make(env_name)))
env.max_steps = 100000
# env.step = partial(stochastic_step, env)
env = DummyVecEnv([lambda: env])

# Train a model
model = DQN(policy=MlpPolicy, env=env,
            tensorboard_log=f"{EXPERIMENT_DIR}/sb/tensorboard/{env_name}")
model.learn(total_timesteps=10000000, callback=callback)

def wrap_env(env, opt):
    env = ImgObsWrapper(env)
    env = FrameStack(env, k=opt.er.hist_len)
    env = TorchWrapper(env, device=opt.device)
    env = SeedWrapper(env, opt.seed) if opt.seed is not None else env
    return env

              'max_timesteps': 500,
              'action_mode': 'discrete'}
    # create env
    ENV = gym.make('BallBeamThrow-v0', **kwargs)
    BD_BOUNDS = [[0, 3]]
    INITIAL_GENOTYPE_SIZE = 11
    MINI = False
    EVALUATE_INDIVIDUAL = evaluate_beam
    BD_GENOTYPE = 1

if ENV_NAME == 'grid':
    import gym_minigrid  # must still be imported
    from gym_minigrid.wrappers import ImgObsWrapper  # must still be imported
    # create env
    ENV = ImgObsWrapper(gym.make('MiniGrid-Empty-8x8-v0'))
    BD_BOUNDS = [[0, 7], [0, 7]]
    NB_CELLS = 64
    INITIAL_GENOTYPE_SIZE = 11
    MINI = False
    EVALUATE_INDIVIDUAL = evaluate_grid
    BD_GENOTYPE = 1

if ENV_NAME == 'bipedal':
    # global variable for the environment
    ENV = gym.make('BipedalWalker-v3')
    BD_BOUNDS = [[-1, 1], [0, 1]]
    INITIAL_GENOTYPE_SIZE = 118
    MINI = False
    EVALUATE_INDIVIDUAL = evaluate_bipedal
    BD_GENOTYPE = 1

            if reward_mean > 500:
                break

    def play(self, num_episodes, render=True):
        """Test the trained agent."""
        for episode in range(num_episodes):
            state = self.env.reset()
            total_reward = 0.0
            while True:
                if render:
                    self.env.render()
                action = self.get_action(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done:
                    print(
                        f"Total reward: {total_reward} in episode {episode + 1}"
                    )
                    break


if __name__ == "__main__":
    env = gym.make("MiniGrid-Empty-8x8-v0")
    env = RGBImgPartialObsWrapper(env)  # Get pixel observations
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    agent = Agent(env)
    print("Number of actions: ", agent.actions)
    agent.train(percentile=99.9, num_iterations=64, num_episodes=128)
    agent.play(num_episodes=3)