Code example #1
for global_step in range(args.total_timesteps):
    # ALGO LOGIC: put action logic here
    epsilon = linear_schedule(args.start_e, args.end_e,
                              args.exploration_fraction * args.total_timesteps,
                              global_step)
    obs = np.array(obs)
    env.render()
    # action, logits, _,  = sampler.sample(q_network, obs, device, n, epsilon)
    logits = q_network.forward(obs.reshape((1, ) + obs.shape), device)
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = torch.argmax(logits, dim=1).tolist()[0]
    # EXPERIMENTAL: please fix soon
    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, reward, done, info = env.step(action)
    episode_reward += reward
    # TRY NOT TO MODIFY: record rewards for plotting purposes
    # ALGO LOGIC: training.
    # when storing n, we want to keep its computational graph
    # another way of doing it is to store:
    #   init_obs, action, reward, (subsequent_obs), done
    # problem: Levy sampling is stochastic

    # alternatively, keep the tensor n and retain its graph for the backward
    # pass, but do garbage collection

    rb.put((obs, action, reward, next_obs, done))

    if global_step > args.learning_starts and global_step % args.train_frequency == 0:
        s_obs, s_actions, s_rewards, s_next_obses, s_dones = rb.sample(args.batch_size)
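
The loop above calls a linear_schedule helper that is not defined in the snippet. A minimal sketch, assuming the usual linear-decay signature (start value, end value, decay duration, current step) implied by the call site:

def linear_schedule(start_e: float, end_e: float, duration: float, t: int) -> float:
    # interpolate linearly from start_e down to end_e over `duration` steps,
    # then hold the final value
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)
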
Code example #2
def main(args):
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)
    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01)

    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):

        s = torch.Tensor(s[None,:])
        _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
            scheduler.update(t)
        return best_action

    def train(batch):
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        # Q(s, a) for the actions actually taken in the batch
        value = (q(s) * a).sum(dim=-1)
        # one-step TD target from the frozen target network; only q's parameters
        # are in the optimizer, so q_target effectively acts as a constant here
        next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()

    q_target.load_state_dict(q.state_dict())

    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1

        state = next_state.copy()
        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        if t % args.update_every == 0:
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    env = Monitor(env, directory=path)

    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break
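
Code example #2 depends on a Transition namedtuple and a Memory replay buffer that are not shown. A minimal sketch, assuming a FIFO buffer whose sample() returns a list of Transition tuples, matching the memory.push(state, action, next_state, reward, done) call and the Transition(*zip(*batch)) unpacking in train():

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class Memory:
    """Fixed-capacity FIFO replay buffer of Transition tuples."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # same field order as memory.push(state, action, next_state, reward, done)
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # uniform sampling without replacement
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
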
Code example #3
class GridEnvironment(Environment):
    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=1,
                 h=84,
                 w=84,
                 life_done=True,
                 sticky_action=False,
                 p=0.25):
        super(GridEnvironment, self).__init__()
        self.daemon = True
        self.env = ImgObsWrapper(
            RGBImgObsWrapper(ReseedWrapper(gym.make(env_id))))
        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(GridEnvironment, self).run()
        while True:
            action = self.child_conn.recv()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            s, reward, done, info = self.env.step(action)

            # max_step_per_episode is assumed to be defined at module level
            if max_step_per_episode < self.steps:
                done = True

            log_reward = reward
            force_done = done

            self.history[0, :, :] = self.pre_proc(s)

            self.rall += reward
            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Visited Room: [{}]"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist),
                            info.get('episode', {}).get('visited_rooms', {})))

                self.history = self.reset()

            self.child_conn.send([
                self.history[:, :, :], reward, force_done, done, log_reward,
                [self.rall, self.steps]
            ])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        self.get_init_state(self.pre_proc(s))
        return self.history[:, :, :]

    def pre_proc(self, X):
        # convert the RGB observation to grayscale and resize to (h, w);
        # note that cv2.resize expects dsize as (width, height)
        X = np.array(Image.fromarray(X).convert('L')).astype('float32')
        x = cv2.resize(X, (self.w, self.h))
        return x

    def get_init_state(self, s):
        # s is already preprocessed by the caller (see reset), so fill every
        # frame of the history with it directly
        for i in range(self.history_size):
            self.history[i, :, :] = s
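
GridEnvironment receives actions over child_conn and sends transitions back, so it is meant to run as a worker process behind a multiprocessing Pipe. A minimal driver sketch, assuming Environment subclasses multiprocessing.Process (as the daemon attribute and run() override suggest); the env id and step budget below are placeholders:

from multiprocessing import Pipe

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    worker = GridEnvironment(env_id='MiniGrid-Empty-8x8-v0',
                             is_render=False,
                             env_idx=0,
                             child_conn=child_conn)
    worker.start()

    for _ in range(100):
        action = 0  # replace with the agent's action for the latest history
        parent_conn.send(action)
        # the worker resets itself internally when an episode ends
        history, reward, force_done, done, log_reward, (rall, steps) = parent_conn.recv()

    worker.terminate()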