Example #1
    def plan(self, state, observation):
        for i in range(self.config['iterations']):
            if (i + 1) % 10 == 0:
                logger.debug('{} / {}'.format(i + 1,
                                              self.config['iterations']))
            self.run(safe_deepcopy_env(state), observation)
        return self.get_plan()
Example #2
    def get_trajectories(self, initial_state, initial_observation=None,
                         as_observations=True, full_trajectories=True, include_leaves=True):
        """
            Get a list of visited nodes/states/trajectories corresponding to the node subtree

        :param initial_state: the state at the root
        :param initial_observation: the observation for the root state
        :param as_observations: if True, return observations; if False, return the nodes themselves
        :param full_trajectories: if True, return a list of observation sequences; else a flat list of observations
        :param include_leaves: include leaves or only expanded nodes
        :return: the list of trajectories
        """
        trajectories = []
        if initial_observation is None:
            initial_observation = initial_state.reset()
        if not as_observations:
            initial_observation = self  # Return this node instead of this observation
        if self.children:
            for action, child in self.children.items():
                next_state = safe_deepcopy_env(initial_state)
                next_observation, _, _, _ = next_state.step(action)
                child_trajectories = child.get_trajectories(next_state, next_observation,
                                                            as_observations, full_trajectories, include_leaves)
                if full_trajectories:
                    trajectories.extend([[initial_observation] + trajectory for trajectory in child_trajectories])
                else:
                    trajectories.extend(child_trajectories)
            if not full_trajectories:
                trajectories.append(initial_observation)
        elif include_leaves:
            trajectories = [[initial_observation]] if full_trajectories else [initial_observation]
        return trajectories
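The recursion above enumerates root-to-node paths over a children dictionary keyed by actions. Below is a minimal, self-contained sketch of that pattern, covering only the default case (full trajectories, leaves included), with a toy Node class standing in for the planner nodes and plain names replacing environment states and observations:

# Toy stand-in for the planner nodes above: children keyed by action, no env stepping.
class Node:
    def __init__(self, name, children=None):
        self.name = name
        self.children = children or {}

    def trajectories(self):
        """Every root-to-leaf path in the subtree, as lists of node names."""
        if not self.children:
            return [[self.name]]
        return [[self.name] + path
                for child in self.children.values()
                for path in child.trajectories()]

tree = Node("s0", {0: Node("s1", {0: Node("s3")}), 1: Node("s2")})
print(tree.trajectories())  # [['s0', 's1', 's3'], ['s0', 's2']]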
Example #3
    def expand(self, next_layer, count=1):
        """
            Expand the node by querying the oracle model for every possible action
        :param next_layer: list of nodes at the next depth, to be updated with new children nodes
        :param count: number of times each transition must be evaluated
        """
        if self.state is None:
            raise Exception("The state should be set before expanding a node")
        try:
            actions = self.state.get_available_actions()
        except AttributeError:
            actions = range(1, self.state.action_space.n)

        self.planner.openings += count

        if self.done and PlaTyPOOSNode.STOP_ON_ANY_TERMINAL_STATE:
            return

        for _ in range(count):
            for action in actions:
                state = safe_deepcopy_env(self.state)
                state.seed(self.planner.np_random.randint(2**30))
                _, reward, done, _ = state.step(action)

                if action not in self.children:
                    self.children[action] = type(self)(self,
                                                       self.planner,
                                                       state,
                                                       depth=self.depth + 1)
                    next_layer.append(self.children[action])

                self.children[action].update(reward, done)
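A minimal sketch of the expansion bookkeeping above: each action is evaluated count times and its child accumulates a visit count and a running mean reward. The sample_transition function is a hypothetical stand-in for seeding and stepping a deep copy of the environment.

import random

def sample_transition(action):
    # Hypothetical stochastic reward model replacing state.step(action)
    return 0.1 * action + random.gauss(0.0, 0.05)

children = {}
count, n_actions = 3, 4
for _ in range(count):
    for action in range(n_actions):
        reward = sample_transition(action)
        child = children.setdefault(action, {"visits": 0, "mean_reward": 0.0})
        child["visits"] += 1
        child["mean_reward"] += (reward - child["mean_reward"]) / child["visits"]

for action, child in sorted(children.items()):
    print(action, child)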
Example #4
    def robustify_env(self):
        """
            An important distinction from RobustEPC: the nominal LPV model is stabilized.

            We start with a system:
                dx = A(theta)x + Bu + omega,
            that we first stabilize with u0 = Kx, without constraint satisfaction.
            Then, we predict the interval of the stabilized system under additional controls:
                dx = (A(theta) + BK)x + Bu' + omega
            where A0 + BK is stable, which eases the similarity transformation to a Metzler system.
        """
        from highway_env.interval import LPV
        a0, da = self.config["A0"], self.config["dA"]
        K = 2 * self.feedback.K0[:, :(self.feedback.K0.shape[1] // 2)]
        da = da / 100
        # da = [np.zeros(a0.shape)]
        lpv = LPV(a0=a0,
                  da=da,
                  x0=self.env.unwrapped.state.squeeze(-1),
                  b=self.B,
                  d=self.config["D"],
                  k=K,
                  omega_i=self.config["omega"])
        robust_env = safe_deepcopy_env(self.env)
        robust_env.unwrapped.lpv = lpv
        robust_env.unwrapped.automatic_record_callback = None
        return robust_env
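For intuition on the pre-stabilization described in the docstring, here is a standalone check that a closed loop A0 + B K can be stable even when A0 alone is not; the matrices below are made-up illustrations, not the values held in self.config or self.feedback:

import numpy as np

A0 = np.array([[0.0, 1.0],
               [0.0, 0.0]])     # double integrator: not asymptotically stable on its own
B = np.array([[0.0],
              [1.0]])
K = np.array([[-1.0, -2.0]])    # a stabilizing state feedback u0 = K x

A_cl = A0 + B @ K
print(np.linalg.eigvals(A0))    # [0, 0]
print(np.linalg.eigvals(A_cl))  # both ≈ -1: left half-plane, so the closed loop is stable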
Example #5
    def plan(self, observation):
        action_distribution = Normal(
            torch.zeros(self.config["horizon"], self.action_size),
            torch.ones(self.config["horizon"], self.action_size))
        for i in range(self.config["iterations"]):
            # Evaluate J action sequences from the current belief (in batch)
            actions = action_distribution.sample([self.config["candidates"]
                                                  ])  # Sample actions
            candidates = [
                safe_deepcopy_env(self.env)
                for _ in range(self.config["candidates"])
            ]
            returns = torch.zeros(self.config["candidates"])
            # Sample next states
            for t in range(self.config["horizon"]):
                for c, candidate in enumerate(candidates):
                    _, reward, _, _ = candidate.step(actions[c, t])
                    returns[c] += self.config["gamma"]**t * reward

            # Re-fit belief to the K best action sequences
            _, topk = returns.topk(self.config["top_candidates"],
                                   largest=True,
                                   sorted=False)  # K ← argsort({R(j)})
            best_actions = actions[topk]
            # Update belief with new means and standard deviations
            action_distribution = Normal(
                best_actions.mean(dim=0),
                best_actions.std(dim=0, unbiased=False))
        # Return the mean of the refined action distribution (the planned action sequence)
        return action_distribution.mean.tolist()
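The loop above is the cross-entropy method: sample candidate action sequences, keep the top-k by return, and re-fit the sampling distribution to that elite set. The standalone sketch below runs the same loop with the environment rollouts replaced by a known objective, reward = -||a - target||^2, so it can be executed directly; target and the population sizes are arbitrary choices:

import torch
from torch.distributions import Normal

target = torch.tensor([0.3, -0.7])
action_size, candidates, top_k, iterations = 2, 64, 8, 10

distribution = Normal(torch.zeros(action_size), torch.ones(action_size))
for _ in range(iterations):
    actions = distribution.sample((candidates,))                # (candidates, action_size)
    returns = -((actions - target) ** 2).sum(dim=1)             # higher is better
    _, topk = returns.topk(top_k, largest=True, sorted=False)   # elite set
    elite = actions[topk]
    distribution = Normal(elite.mean(dim=0), elite.std(dim=0, unbiased=False))

print(distribution.mean)  # converges towards target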
Example #6
    def plan(self, state, observation):
        for self.episode in range(self.config['episodes']):
            if (self.episode+1) % max(self.config['episodes'] // 10, 1) == 0:
                logger.debug('{} / {}'.format(self.episode+1, self.config['episodes']))
            self.run(safe_deepcopy_env(state))

        return self.get_plan()
Example #7
    def plan(self, state, observation):
        self.root.state = safe_deepcopy_env(state)
        self.root.state.seed()
        # self.root.state = state
        for epoch in np.arange(self.config["budget"] // state.action_space.n):
            logger.debug("Expansion {}/{}".format(
                epoch + 1, self.config["budget"] // state.action_space.n))
            self.run()

        return self.get_plan()
Example #8
    def expand(self):
        try:
            actions = self.state.get_available_actions()
        except AttributeError:
            actions = range(self.state.action_space.n)
        for action in actions:
            # Simulate transition
            state = safe_deepcopy_env(self.state)
            next_observation, reward, done, _ = self.planner.step(state, action)
            # Record the transition
            next_node = self.planner.get_node(next_observation)
            next_node.state = state
            next_node.parents.add(self)
            self.rewards[action] = reward
            self.children[action] = next_node
Example #9
    def expand(self):
        self.planner.leaves.remove(self)
        if self.state is None:
            raise Exception("The state should be set before expanding a node")
        try:
            actions = self.state.get_available_actions()
        except AttributeError:
            actions = range(self.state.action_space.n)
        for action in actions:
            self.children[action] = type(self)(self,
                                               self.planner,
                                               state=safe_deepcopy_env(self.state),
                                               depth=self.depth + 1)
            observation, reward, done, _ = self.planner.step(self.children[action].state, action)
            self.planner.leaves.append(self.children[action])
            self.children[action].update(reward, done, observation)
Example #10
    def expand(self, state, leaves, update_children=False):
        if state is None:
            raise Exception("The state should be set before expanding a node")
        try:
            actions = state.get_available_actions()
        except AttributeError:
            actions = range(state.action_space.n)
        for action in actions:
            self.children[action] = type(self)(self, self.planner)
            if update_children:
                _, reward, done, _ = safe_deepcopy_env(state).step(action)
                self.children[action].update(reward, done)

        idx = leaves.index(self)
        leaves = leaves[:idx] + list(self.children.values()) + leaves[idx + 1:]
        return leaves
Example #11
    def get_obs_visits(self, state=None):
        visits = defaultdict(int)
        updates = defaultdict(int)
        if hasattr(self, "observation"):
            for node in self.get_trajectories(full_trajectories=False,
                                              include_leaves=False):
                if hasattr(node, "observation"):
                    visits[str(node.observation)] += 1
                    if hasattr(node, "updates_count"):
                        updates[str(node.observation)] += node.updates_count
        else:  # Replay required
            for node in self.get_trajectories(full_trajectories=False,
                                              include_leaves=False):
                replay_state = safe_deepcopy_env(state)
                for action in node.path():
                    observation, _, _, _ = replay_state.step(action)
                visits[str(observation)] += 1
        return visits, updates
Example #12
    def plan(self, state, observation):
        done = False
        episode = 0
        while not done:
            best, challenger = self.run(safe_deepcopy_env(state))

            # Stopping rule
            done = (best is not None and
                    challenger.value_upper - best.value_lower < self.config["accuracy"])
            done = done or episode > self.config["episodes"]

            episode += 1
            if episode % 10 == 0:
                logger.debug('Episode {}: delta = {}/{}'.format(
                    episode, challenger.value_upper - best.value_lower,
                    self.config["accuracy"]))
        self.budget_used = episode * self.config["horizon"]
        return self.get_plan()
Example #13
    def estimateQ(self, state, action):
        if self.depth == self.planner.config["horizon"]:
            return
        # logger.debug(f"Run estimateQ at {state.mdp.state},{action} with depth {self.depth}")

        for i in range(self.planner.config["C"]):
            next_state = safe_deepcopy_env(state)
            # We need randomness
            next_state.seed(self.planner.np_random.randint(2**30))

            observation, reward, done, _ = next_state.step(action)
            # observation = str(observation) + str(i)  # Prevent state merge
            self.get_child(observation).count += 1
            self.get_child(observation).state = next_state
        for next_state_node in self.children.values():
            next_state_node.estimateV(next_state_node.state)
        # Backup: reward from the last sampled transition plus the discounted,
        # count-weighted average of the children's value estimates
        self.value = reward + self.planner.config["gamma"] * sum(
            next_state_node.value * next_state_node.count for next_state_node
            in self.children.values()) / self.planner.config["C"]
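The estimateQ/estimateV pair above implements a sparse-sampling style backup: each action is sampled C times and the Q value combines the reward with the discounted, count-weighted values of the sampled successors. A textbook version of that recursion, on a toy generative model rather than this planner's node bookkeeping, looks like the following; the random-walk step model, horizon, C and gamma are arbitrary:

import random

def step(state, action):
    # Toy generative model: noisy move on the integers, reward for reaching 0
    next_state = state + action + random.choice([-1, 0, 1])
    return next_state, (1.0 if next_state == 0 else 0.0)

def estimate_q(state, action, depth, horizon=3, C=4, gamma=0.9):
    if depth == horizon:
        return 0.0
    total = 0.0
    for _ in range(C):
        next_state, reward = step(state, action)
        total += reward + gamma * estimate_v(next_state, depth + 1, horizon, C, gamma)
    return total / C

def estimate_v(state, depth, horizon=3, C=4, gamma=0.9):
    return max(estimate_q(state, a, depth, horizon, C, gamma) for a in (-1, 0, 1))

print(estimate_v(state=2, depth=0))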
Example #14
    def robustify_env(self):
        """
            Make a robust version of the environment:
                1. compute the dynamics polytope (A0, dA)
                2. set the LPV interval predictor, so that it can be stepped with the environment
                3. the environment, when provided with an interval predictor, should return pessimistic rewards
                4. disable the recording of environment transitions, since we are not observing when planning.
        :return: the robust version of the environment.
        """
        a0, da = self.polytope()
        from highway_env.interval import LPV
        lpv = LPV(a0=a0,
                  da=da,
                  x0=self.env.unwrapped.state.squeeze(-1),
                  b=self.B,
                  d=self.config["D"],
                  omega_i=self.config["omega"])
        robust_env = safe_deepcopy_env(self.env)
        robust_env.unwrapped.lpv = lpv
        robust_env.unwrapped.automatic_record_callback = None
        return robust_env
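To make "interval predictor" concrete, here is a one-step interval-arithmetic bound on A @ x when A ranges over [A0 - dA, A0 + dA] and x over a box. This only illustrates what predicting an interval of states means; it is not the highway_env.interval.LPV implementation, and the matrices below are made up:

import numpy as np

def interval_matvec(A_lo, A_hi, x_lo, x_hi):
    # Each term A[i, j] * x[j] is bounded by the min/max of the four corner products
    corners = np.stack([A_lo * x_lo, A_lo * x_hi, A_hi * x_lo, A_hi * x_hi])
    return corners.min(axis=0).sum(axis=1), corners.max(axis=0).sum(axis=1)

A0 = np.array([[0.0, 1.0], [-1.0, -1.0]])
dA = np.array([[0.0, 0.0], [0.2, 0.1]])        # uncertainty on the second row
x_lo, x_hi = np.array([0.9, -0.1]), np.array([1.1, 0.1])

lo, hi = interval_matvec(A0 - dA, A0 + dA, x_lo, x_hi)
print(lo, hi)  # entry-wise bounds on A @ x over the whole uncertainty set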
Example #15
def evaluate(experiment):
    # Prepare workspace
    seed, agent_config, env_config, path = experiment
    gym.logger.set_level(gym.logger.DISABLED)
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Make environment
    env = load_environment(env_config)

    # Make agent
    agent_name, agent_config = agent_config
    agent = load_agent(agent_config, env)

    # Evaluate
    print("Evaluating agent {} on seed {}".format(agent_name, seed))
    evaluation = Evaluation(env,
                            agent,
                            directory=path.parent / agent_name,
                            num_episodes=1,
                            sim_seed=seed,
                            display_env=True,
                            display_agent=True,
                            display_rewards=False)
    estimate_value = False
    if estimate_value:
        rewards, values, terminal = [], [], False
        evaluation.seed(episode=0)
        evaluation.reset()
        evaluation.training = False
        gamma = 0.99 or agent.config["gamma"]  # note: 'or' short-circuits, so gamma is fixed at 0.99 here
        while not terminal:
            # Estimate state value
            oracle_env = safe_deepcopy_env(agent.env)
            oracle = load_agent(agent_configs()["oracle"], oracle_env)
            oracle_done, oracle_rewards = False, []
            while not oracle_done:
                action = oracle.act(None)
                _, oracle_reward, oracle_done, _ = oracle_env.step(action)
                oracle_rewards.append(oracle_reward)
            value = np.sum([
                gamma**t * oracle_rewards[t]
                for t in range(len(oracle_rewards))
            ])
            values.append(value)

            reward, terminal = evaluation.step()
            rewards.append(reward)
        evaluation.close()

        returns = [
            np.sum(
                [gamma**t * rewards[k + t] for t in range(len(rewards[k:]))])
            for k in range(len(rewards))
        ]

        # Save intermediate results
        df = pd.DataFrame({
            "agent": agent_name,
            "time": range(len(rewards)),
            "seed": [seed] * len(rewards),
            "reward": rewards,
            "return": returns,
            "value": values
        })
    else:
        evaluation.test()
        rewards = evaluation.monitor.stats_recorder.episode_rewards_[0]
        length = evaluation.monitor.stats_recorder.episode_lengths[0]
        total_reward = np.sum(rewards)

        cum_discount = lambda signal, gamma: np.sum(
            [gamma**t * signal[t] for t in range(len(signal))])
        return_ = cum_discount(rewards, 0.9)
        return_undisc = cum_discount(rewards, 0.99)
        result = {
            "agent": agent_name,
            "seed": seed,
            "total_reward": total_reward,
            "return": return_,
            "return_undisc": return_undisc,
            "length": length,
        }
        df = pd.DataFrame.from_records([result])
    with open(path, 'a') as f:
        df.to_csv(f,
                  sep=',',
                  encoding='utf-8',
                  header=f.tell() == 0,
                  index=False)
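As a quick standalone check of the discounted-return arithmetic used above (both cum_discount and the returns-to-go list built in the estimate_value branch), with made-up rewards:

import numpy as np

rewards = [1.0, 0.0, 2.0, 1.0]
gamma = 0.9

cum_discount = lambda signal, gamma: np.sum([gamma ** t * signal[t] for t in range(len(signal))])
return_ = cum_discount(rewards, gamma)                                   # 1 + 0 + 2*0.81 + 1*0.729 ≈ 3.349
returns_to_go = [cum_discount(rewards[k:], gamma) for k in range(len(rewards))]

print(return_)        # ≈ 3.349
print(returns_to_go)  # ≈ [3.349, 2.61, 2.9, 1.0]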
Example #16
    def plan(self, state, observation):
        self.available_budget = self.config["budget"]
        while self.available_budget > 0:
            rollout = self.rollout(safe_deepcopy_env(state), observation)
            self.update(rollout)
        return self.get_plan()
Example #17
    def plan(self, state, observation):
        self.root = self.get_node(observation, state=state)
        for _ in np.arange(self.config["episodes"]):
            self.run(safe_deepcopy_env(state), observation)

        return self.get_plan()