def init_env(self):
    # Keep only the image part of the MiniGrid observation.
    env = ImgObsWrapper(self.init())
    env.reset()
    print("agent pos: {}".format(env.agent_pos))
    self.action_space = env.action_space
    self.action_dim = env.action_space.n
    self.obs_dim = env.observation_space.shape
    return env
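# init_env() above relies on ImgObsWrapper stripping MiniGrid's dict observation
# down to the raw image array. A minimal, standalone sketch of that behavior;
# the env id below is illustrative only and not taken from this repo.
import gym
from gym_minigrid.wrappers import ImgObsWrapper

raw = gym.make('MiniGrid-Empty-8x8-v0')
print(raw.observation_space)              # dict observation; 'image' is the egocentric view
wrapped = ImgObsWrapper(raw)
print(wrapped.observation_space.shape)    # typically (7, 7, 3) with the default view size
print(wrapped.action_space.n)             # 7 discrete actions in MiniGrid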
# Sampler, QNetwork (q_network), linear_schedule, args, device and writer are
# defined earlier in this script.
sampler = Sampler(env)
optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
loss_fn = nn.MSELoss()
print(device.__repr__())
print(q_network)
print(f"Using {torch.cuda.device_count()} GPUs")
n_params = sum(p.numel() for p in q_network.parameters())
writer.add_scalar("n_params", n_params)
print("Number of parameters:", n_params)

# TRY NOT TO MODIFY: start the game
obs = env.reset()
episode_reward = 0
n = torch.zeros((1, 1))
for global_step in range(args.total_timesteps):
    # ALGO LOGIC: put action logic here
    epsilon = linear_schedule(args.start_e, args.end_e,
                              args.exploration_fraction * args.total_timesteps,
                              global_step)
    obs = np.array(obs)
    env.render()
    # action, logits, _ = sampler.sample(q_network, obs, device, n, epsilon)
    logits = q_network.forward(obs.reshape((1,) + obs.shape), device)
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        # Exploit: pick the action with the highest predicted Q-value.
        action = torch.argmax(logits, dim=1).item()
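# The loop above anneals epsilon via linear_schedule(start_e, end_e, duration, t).
# A minimal sketch of such a helper, assuming a plain linear interpolation that
# is clamped at end_e; the repo's own implementation may differ.
def linear_schedule(start_e: float, end_e: float, duration: int, t: int) -> float:
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

# e.g. linear_schedule(1.0, 0.05, 10_000, 0)      -> 1.0
#      linear_schedule(1.0, 0.05, 10_000, 20_000) -> 0.05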
import os
from collections import deque

import gym
import numpy as np
import torch
import torch.optim as optim
from gym.wrappers import Monitor
from gym_minigrid.wrappers import ImgObsWrapper

# QNetwork, Memory, Transition, LinearSchedule and one_hot are defined elsewhere in the repo.


def main(args):
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)

    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01)

    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):
        # Epsilon-greedy action selection; epsilon is taken from the linear schedule.
        s = torch.Tensor(s[None, :])
        _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
        scheduler.update(t)
        return best_action

    def train(batch):
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        # Q(s, a) for the actions actually taken.
        value = (q(s) * a).sum(dim=-1)
        # One-step TD target computed with the target network.
        next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()
    q_target.load_state_dict(q.state_dict())
    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1
        state = next_state.copy()

        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        if t % args.update_every == 0:
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    # Record a few evaluation episodes with the learned policy.
    env = Monitor(env, directory=path)
    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break
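# main() above relies on a replay buffer exposing push(...), sample(batch_size)
# and len(), plus a Transition namedtuple that train() unzips. A minimal sketch
# matching that usage; the repo's actual Memory/Transition may differ.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))


class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are evicted first

    def push(self, state, action, next_state, reward, done):
        self.buffer.append(Transition(state, action, next_state, reward, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)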
from collections import deque

import cv2
import gym
import numpy as np
from PIL import Image
from gym_minigrid.wrappers import ImgObsWrapper, RGBImgObsWrapper, ReseedWrapper

# Environment (the process-like base class) and max_step_per_episode are defined
# elsewhere in this module.


class GridEnvironment(Environment):
    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=1,
                 h=84,
                 w=84,
                 life_done=True,
                 sticky_action=False,
                 p=0.25):
        super(GridEnvironment, self).__init__()
        self.daemon = True
        # Fix the seed sequence, render full RGB frames, and keep only the image observation.
        self.env = ImgObsWrapper(
            RGBImgObsWrapper(ReseedWrapper(gym.make(env_id))))
        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(GridEnvironment, self).run()
        while True:
            action = self.child_conn.recv()

            # Sticky actions: with probability p, repeat the previous action.
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            s, reward, done, info = self.env.step(action)

            if max_step_per_episode < self.steps:
                done = True

            log_reward = reward
            force_done = done

            self.history[0, :, :] = self.pre_proc(s)

            self.rall += reward
            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {} Reward: {} Recent Reward: {} Visited Room: [{}]"
                      .format(self.episode, self.env_idx, self.steps, self.rall,
                              np.mean(self.recent_rlist),
                              info.get('episode', {}).get('visited_rooms', {})))
                self.history = self.reset()

            self.child_conn.send([
                self.history[:, :, :], reward, force_done, done, log_reward,
                [self.rall, self.steps]
            ])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        self.get_init_state(self.pre_proc(s))
        return self.history[:, :, :]

    def pre_proc(self, X):
        # Convert the RGB frame to grayscale and resize to (h, w).
        X = np.array(Image.fromarray(X).convert('L')).astype('float32')
        x = cv2.resize(X, (self.w, self.h))  # cv2.resize takes (width, height)
        return x

    def get_init_state(self, s):
        # `s` is already preprocessed by the caller; fill the whole frame history with it.
        for i in range(self.history_size):
            self.history[i, :, :] = s
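# GridEnvironment is meant to run as a worker: run() blocks on child_conn.recv()
# for an action and answers with [history, reward, force_done, done, log_reward,
# [total_reward, steps]]. A sketch of how a parent process might drive it,
# assuming the Environment base class is a multiprocessing.Process subclass;
# the env id and variable names here are illustrative only.
from multiprocessing import Pipe

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    worker = GridEnvironment('MiniGrid-Empty-8x8-v0',
                             is_render=False,
                             env_idx=0,
                             child_conn=child_conn)
    worker.start()                      # launches the worker's run() loop

    parent_conn.send(0)                 # send an action index
    history, reward, force_done, done, log_reward, (rall, steps) = parent_conn.recv()
    print(history.shape, reward, done)

    worker.terminate()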