Example #1
def reset(self):
    Agent.reset(self)
    # Rebuild the online network and a fresh target network, then copy the
    # online weights into the target so both start out identical.
    self.Q = self.model_lambda()
    self.target_Q = self.model_lambda()
    self.target_Q.set_weights(self.Q.get_weights())
    # Clear the replay buffer and the counter driving target-network syncs.
    self.buffer.reset()
    self.updates_since_target_updated = 0
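
This method is a fragment of a larger DQN-style agent class, so `Agent`, `model_lambda` and `buffer` are defined elsewhere. Below is a minimal sketch of the pattern it relies on, assuming `model_lambda` is a zero-argument factory returning a fresh Keras model (the `set_weights`/`get_weights` calls suggest Keras, but that is an assumption):

import tensorflow as tf

def model_lambda() -> tf.keras.Model:
    # Hypothetical factory: a small Q-network (4-dim observations,
    # 2 discrete actions); the real factory is supplied to the agent.
    return tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation="relu", input_shape=(4,)),
        tf.keras.layers.Dense(2),
    ])

Q = model_lambda()
target_Q = model_lambda()
# Hard update: copy the online weights into the target network, exactly
# what reset() does right after rebuilding both models.
target_Q.set_weights(Q.get_weights())

Resetting `updates_since_target_updated` to zero lets the training loop count gradient steps and hard-copy the weights again once the counter reaches the sync period.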
Example #2
def train(nb_steps: int, env: Env, agent: Agent, start_obs: Arrayable):
    """Trains for one epoch.

    :args nb_steps: number of interaction steps
    :args env: environment
    :args agent: interacting agent
    :args start_obs: starting observation

    :return: final observation
    """
    agent.train()
    agent.reset()
    obs = start_obs
    for _ in range(nb_steps):
        # interact
        obs, _, _ = interact(env, agent, obs)
    return obs
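
Both this loop and the evaluator below call an `interact` helper that returns an `(obs, reward, done)` triple. The real helper is defined elsewhere; a minimal sketch of what it could look like, assuming the agent exposes `step`/`observe` methods and that this repo's `Env.step` returns the same triple (both assumptions):

def interact(env, agent, obs):
    # One interaction step: the agent picks an action, the environment
    # advances, and the agent records the transition for learning.
    action = agent.step(obs)
    next_obs, reward, done = env.step(action)
    agent.observe(next_obs, reward, done)
    return next_obs, reward, done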
Example #3
def evaluate(
        dt: float,
        epoch: int,
        env: Env,
        agent: Agent,
        eval_gap: float,  # noqa: C901
        time_limit: Optional[float] = None,
        eval_return: bool = False,
        progress_bar: bool = False,
        video: bool = False,
        no_log: bool = False,
        test: bool = False,
        eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: gap between evaluations, in physical time
        (divided by dt to obtain the corresponding number of epochs)
    :args time_limit: maximal physical time (number of steps divided by dt)
        spent in the environment
    :args eval_return: do we evaluate the return (otherwise only the
        specific evaluation is performed)?
    :args progress_bar: use a progress bar?
    :args video: log a video of the interaction?
    :args no_log: disable logging of results?
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy,
        remove the noise before evaluating

    :return: the evaluated return, or None if no return is evaluated
    """
    log_gap = int(eval_gap / dt)
    agent.eval()
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()
    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else range(nb_steps)
        for _ in iter_range:
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0), np.stack(dones, axis=0))
        tag = "noisy" if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag} return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:  # don't log when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
        if video:
            log_video("demo", epoch, np.stack(imgs, axis=0))

    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R
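
`compute_return` is imported from elsewhere; given how it is called on arrays stacked along a leading time axis, a plausible sketch is the following. This is a hypothetical stand-in, assuming `rewards` and `dones` may be batched over parallel environments and that rewards after each trajectory's first termination should not count:

import numpy as np

def compute_return(rewards: np.ndarray, dones: np.ndarray) -> float:
    # Reshape to (time, nb_envs) so single-environment rollouts also work.
    rewards = rewards.reshape(rewards.shape[0], -1)
    dones = dones.reshape(dones.shape[0], -1).astype(bool)
    # alive[t] is True while the trajectory has not terminated strictly
    # before step t; the reward of the terminating step is still counted.
    terminated_before = np.cumsum(dones[:-1], axis=0).astype(bool)
    alive = np.concatenate(
        [np.ones((1, dones.shape[1]), dtype=bool), ~terminated_before],
        axis=0)
    # Sum masked rewards over time, then average over environments.
    return float((rewards * alive).sum(axis=0).mean())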