Example #1
class OfflineOpenLoopPlanner(object):
    '''OfflineOpenLoopPlanner implements a gradient-based planner that optimizes
    a sequence of actions in an offline setting.

    Note:
        For details please refer to the NIPS 2017 paper:
        "Scalable Planning with Tensorflow for Hybrid Nonlinear Domains".

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
    '''
    def __init__(self, compiler: Compiler, batch_size: int,
                 horizon: int) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon

    def build(self, learning_rate: float) -> None:
        '''Builds the offline open loop planning ops.'''
        self._build_policy_graph()
        self._build_optimizer_graph(learning_rate)

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size,
                                      self.horizon)
        self._policy.build('planning')

    def _build_optimizer_graph(self, learning_rate: float) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)
        self._optimizer.build(learning_rate, self.batch_size, self.horizon)

    def run(self,
            epochs: int,
            show_progress: bool = True) -> Tuple[ActionArray, PolicyVarsArray]:
        '''Runs action optimizer for the given number of training `epochs`.

        Args:
            epochs (int): The number of training epochs.
            show_progress (bool): The boolean flag for showing current progress.

        Returns:
            Tuple[ActionArray, PolicyVarsArray]: The sequence of actions and
            policy variables optimized after training.
        '''
        actions, policy_vars = self._optimizer.run(epochs,
                                                   show_progress=show_progress)
        return actions, policy_vars
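
A minimal usage sketch for OfflineOpenLoopPlanner, assuming the same parsing and
compilation steps as the test fixture in Example #2 (RDDLParser, then Compiler
with batch_mode=True); import statements are omitted, as in the examples themselves.

# Hedged usage sketch (not part of the library): wire a parsed RDDL model
# through the compiler into the offline planner and optimize a plan.
with open('rddl/deterministic/Navigation.rddl') as file:
    parser = RDDLParser()
    parser.build()
    rddl = parser.parse(file.read())
    rddl.build()

compiler = Compiler(rddl, batch_mode=True)
planner = OfflineOpenLoopPlanner(compiler, batch_size=64, horizon=40)
planner.build(learning_rate=0.01)                 # builds policy + optimizer ops
actions, policy_vars = planner.run(epochs=50)     # optimized action sequence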
Example #2
    @classmethod
    def setUpClass(cls):

        # initialize hyper-parameters
        cls.horizon = 40
        cls.batch_size = 64
        cls.epochs = 50
        cls.learning_rate = 0.01

        # parse RDDL file
        with open('rddl/deterministic/Navigation.rddl') as file:
            parser = RDDLParser()
            parser.build()
            rddl = parser.parse(file.read())
            rddl.build()

        # initialize RDDL2TensorFlow compiler
        cls.rddl2tf = Compiler(rddl, batch_mode=True)

        # initialize open-loop policy
        cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
        cls.policy.build('test')

        # initialize ActionOptimizer
        cls.optimizer = ActionOptimizer(cls.rddl2tf, cls.policy)
        cls.optimizer.build(cls.learning_rate, cls.batch_size, cls.horizon)
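
A possible test method for this fixture, sketched under the assumption that
ActionOptimizer.run takes (epochs, show_progress) as in Example #1 and returns
the optimized actions and policy variables:

    def test_run(self):
        # Hedged sketch: run the optimizer built in setUpClass and sanity-check
        # that an action sequence and its policy variables come back.
        actions, policy_vars = self.optimizer.run(self.epochs, show_progress=False)
        self.assertIsNotNone(actions)
        self.assertIsNotNone(policy_vars)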
Example #3
 def _build_optimizer_graph(self) -> None:
     '''Builds the action optimizer ops.'''
     self._optimizer = ActionOptimizer(self._compiler, self._policy)
Example #4
class OnlineOpenLoopPlanner(object):
    '''OnlineOpenLoopPlanner implements a gradient-based planner that optimizes
    a sequence of actions in an online setting (i.e., interleaving planning and
    execution).

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
        parallel_plans (bool): The boolean flag for optimizing parallel sequences of actions.
    '''

    def __init__(self,
            compiler: Compiler,
            batch_size: int,
            horizon: int,
            parallel_plans: bool = True) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon
        self.parallel_plans = parallel_plans

    def build(self,
            learning_rate: float,
            epochs: int,
            show_progress: bool = True) -> None:
        '''Builds the online open loop planning ops.'''
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.show_progress = show_progress
        self._build_policy_graph()
        self._build_optimizer_graph()

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size, self.horizon, self.parallel_plans)
        self._policy.build('planning')

    def _build_optimizer_graph(self) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)

    def __call__(self, state: StateTensor,
                 t: int) -> Tuple[Action, PolicyVars, float, float]:
        '''Returns the action to be executed in the current `state` at timestep `t`.

        Args:
            state (StateTensor): The current state.
            t (int): The current timestep.

        Returns:
            Tuple[Action, PolicyVars, float, float]: The action and policy
            variables optimized for the current timestep, along with the graph
            building time and the optimization time (in seconds).
        '''

        # initialize action optimizer
        with self._compiler.graph.as_default():
            with tf.name_scope('timestep{}'.format(t)):
                start = time.time()
                self._optimizer.build(self.learning_rate, self.batch_size, self.horizon - t, parallel_plans=False)
                end = time.time()
                building_time = end - start

        # optimize next action
        start = time.time()
        initial_state = tuple(self._batch_tensor(fluent) for fluent in state)
        actions, policy_vars = self._optimizer.run(self.epochs, initial_state, self.show_progress)

        # outputs
        action = tuple(fluent[0] for fluent in actions)
        policy_vars = tuple(np.expand_dims(var[(self.horizon-1) - t], axis=0) for var in policy_vars)
        end = time.time()
        optimization_time = end - start

        return action, policy_vars, building_time, optimization_time

    def _batch_tensor(self, fluent):
        tensor = np.stack([fluent[0]] * self.batch_size)
        if len(tensor.shape) == 1:
            tensor = np.expand_dims(tensor, -1)
        return tensor
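
A sketch of the interleaved plan-and-execute loop this class targets; `compiler`
is built as in the other examples, and `env` with its `reset`/`step` methods is
a hypothetical stand-in for whatever simulator provides the current state:

# Hedged usage sketch (not part of the library).
planner = OnlineOpenLoopPlanner(compiler, batch_size=64, horizon=40)
planner.build(learning_rate=0.01, epochs=50, show_progress=False)

state = env.reset()                       # hypothetical environment
for t in range(planner.horizon):
    # re-optimize the remaining plan from the current state at every step
    action, policy_vars, build_time, opt_time = planner(state, t)
    state = env.step(action)              # hypothetical transition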
Example #5
class StochasticPlanner(Planner):
    """StochasticPlanner abstract class implements basic methods for
    online stochastic gradient-based planners.

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        compiler_cls (rddl2tf.Compiler): The RDDL-to-TensorFlow compiler class.
        config (Dict[str, Any]): The planner config dict.
    """

    __metaclass__ = abc.ABCMeta

    def __init__(self, rddl, compiler_cls, config):
        super().__init__(rddl, compiler_cls, config)

        self.initial_state = None

        self.steps_to_go = None
        self.sequence_length = None

        self.optimizer = None
        self.grads_and_vars = None

        self.avg_total_reward = None
        self.loss = None

        self.init_op = None
        self.warm_start_op = None
        self.train_op = None

        self.summaries = None

        self.stats = {"loss": pd.DataFrame()}

    def build(self):
        with self.graph.as_default():
            self._build_policy_ops()
            self._build_initial_state_ops()
            self._build_sequence_length_ops()
            self._build_trajectory_ops()
            self._build_loss_ops()
            self._build_optimization_ops()
            self._build_summary_ops()
            self._build_init_ops()

    @abc.abstractmethod
    def __call__(self, state, timestep):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_policy_ops(self):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_trajectory_ops(self):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_loss_ops(self):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_summary_ops(self):
        raise NotImplementedError

    def _build_init_ops(self):
        self.init_op = tf.global_variables_initializer()

    def _build_initial_state_ops(self):
        with tf.name_scope("initial_state"):
            self.initial_state = tuple(
                tf.placeholder(t.dtype, t.shape) for t in self.compiler.initial_state()
            )

    def _build_sequence_length_ops(self):
        with tf.name_scope("sequence_length"):
            self.steps_to_go = tf.placeholder(tf.int32, shape=())
            self.sequence_length = tf.tile(
                tf.reshape(self.steps_to_go, [1]), [self.batch_size]
            )

    def _build_optimization_ops(self):
        with tf.name_scope("optimization"):
            self.optimizer = ActionOptimizer(self.config["optimization"])
            self.optimizer.build()
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

    def _get_batch_initial_state(self, state):
        batch_size = self.compiler.batch_size
        return tuple(
            map(
                lambda fluent: np.tile(
                    fluent, (batch_size, *([1] * len(fluent.shape)))
                ),
                state.values(),
            )
        )

    def _get_action(self, actions, feed_dict):
        action_fluent_ordering = self.compiler.rddl.domain.action_fluent_ordering
        actions = self._sess.run(actions, feed_dict=feed_dict)
        action = collections.OrderedDict(
            {
                name: fluent[0][0]
                for name, fluent in zip(action_fluent_ordering, actions)
            }
        )
        return action

    @property
    def horizon(self):
        horizon = self.config["horizon"]
        if self.config.get("planning_horizon"):
            horizon = min(horizon, self.config["planning_horizon"])
        return horizon

    def epochs(self, timestep):
        if self.config.get("epoch_scheduler"):
            scheduler = EpochScheduler(*self.config["epoch_scheduler"])
            epochs = scheduler(timestep)
        else:
            epochs = self.config["epochs"]
        return epochs

    def run(self, timestep, feed_dict):
        if timestep == 0 or not self.warm_start_op:
            self._sess.run(self.init_op)
        else:
            self._sess.run(self.warm_start_op)

        if self.summaries:
            logdir = os.path.join(self.config.get("logdir"), f"timestep={timestep}")
            writer = tf.compat.v1.summary.FileWriter(logdir)

        run_id = self.config.get("run_id", 0)
        pid = os.getpid()
        position = run_id % self.config.get("num_workers", 1)
        epochs = self.epochs(timestep)
        desc = f"(pid={pid}) Run #{run_id:<3d} / step={timestep:<3d}"

        with trange(
            epochs, desc=desc, unit="epoch", position=position, leave=False
        ) as t:

            losses = []

            loss_ = self._sess.run(self.loss, feed_dict=feed_dict)
            losses.append(loss_)

            for step in t:
                self._sess.run(self.train_op, feed_dict=feed_dict)

                loss_, avg_total_reward_ = self._sess.run(
                    [self.loss, self.avg_total_reward], feed_dict=feed_dict
                )

                losses.append(loss_)

                if self.summaries:
                    summary_ = self._sess.run(self.summaries, feed_dict=feed_dict)
                    writer.add_summary(summary_, step)

                t.set_postfix(
                    loss=f"{loss_:10.4f}", avg_total_reward=f"{avg_total_reward_:10.4f}"
                )

            self.stats["loss"][timestep] = pd.Series(losses)

        if self.summaries:
            writer.close()

    def save_stats(self):
        for key, value in self.stats.items():
            filepath = os.path.join(self.config["logdir"], f"{key}.csv")
            value.to_csv(filepath, index=False)
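
For reference, the config keys StochasticPlanner actually reads, gathered from the
code above into one illustrative dict; the values are placeholders and the
"optimization" sub-dict is only assumed to be whatever ActionOptimizer expects:

# Hedged config sketch (keys collected from StochasticPlanner's accesses above).
config = {
    "horizon": 40,                  # problem horizon
    "planning_horizon": 10,         # optional cap on the lookahead (see `horizon` property)
    "epochs": 50,                   # gradient steps per timestep
    # "epoch_scheduler": (...),     # optional EpochScheduler arguments
    "optimization": {},             # ActionOptimizer settings (contents not shown in this excerpt)
    "logdir": "/tmp/tfplan",        # where summaries and the stats CSVs are written
    "run_id": 0,
    "num_workers": 1,
}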
Example #6
 def _build_optimization_ops(self):
     with tf.name_scope("optimization"):
         self.optimizer = ActionOptimizer(self.config["optimization"])
         self.optimizer.build()
         self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
         self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)
Example #7
class Tensorplan(Planner):
    """Tensorplan class implements the Planner interface
    for the offline gradient-based planner (i.e., tensorplan).

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super(Tensorplan, self).__init__(rddl, DefaultCompiler, config)

        self.policy = None
        self.initial_state = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.optimizer = None
        self.train_op = None
        self.init_op = None

        self.best_plan_idx = None
        self.best_plan = None

        self._plan = None

        self.writer = None
        self.summaries = None

    @property
    def logdir(self):
        return self.config.get(
            "logdir") or f"/tmp/tfplan/tensorplan/{self.rddl}"

    def build(self):
        """Builds planner ops."""
        with self.graph.as_default():
            self._build_policy_ops()
            self._build_initial_state_ops()
            self._build_trajectory_ops()
            self._build_loss_ops()
            self._build_optimization_ops()
            self._build_solution_ops()
            self._build_summary_ops()
            self._build_init_ops()

    def _build_init_ops(self):
        self.init_op = tf.global_variables_initializer()

    def _build_policy_ops(self):
        horizon = self.config["horizon"]
        self.policy = OpenLoopPolicy(self.compiler,
                                     horizon,
                                     parallel_plans=True)
        self.policy.build("tensorplan")

    def _build_initial_state_ops(self):
        self.initial_state = self.compiler.initial_state()

    def _build_trajectory_ops(self):
        self.simulator = Simulator(self.compiler, self.policy)
        self.simulator.build()
        self.trajectory, self.final_state, self.total_reward = self.simulator.trajectory(
            self.initial_state)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_optimization_ops(self):
        self.optimizer = ActionOptimizer(self.config["optimization"])
        self.optimizer.build()
        self.train_op = self.optimizer.minimize(self.loss)

    def _build_solution_ops(self):
        self.best_plan_idx = tf.argmax(self.total_reward, axis=0)
        self.best_plan = tuple(action[self.best_plan_idx]
                               for action in self.trajectory.actions)

    def _build_summary_ops(self):
        tf.compat.v1.summary.histogram("total_reward", self.total_reward)
        tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
        tf.compat.v1.summary.scalar("loss", self.loss)
        self.summaries = tf.compat.v1.summary.merge_all()

    def run(self):
        """Run the planner for the given number of epochs.

        Returns:
            plan (Sequence[np.ndarray]): The best solution plan.
        """
        self.writer = tf.compat.v1.summary.FileWriter(self.logdir, self.graph)

        self._sess.run(self.init_op)

        run_id = self.config.get("run_id", 0)
        pid = os.getpid()
        position = run_id % self.config.get("num_workers", 1)
        epochs = self.config["epochs"]
        desc = f"(pid={pid}) Run #{run_id:<3d}"

        with trange(epochs,
                    desc=desc,
                    unit="epoch",
                    position=position,
                    leave=False) as t:

            for step in t:
                _, loss_, avg_total_reward_, summary_ = self._sess.run([
                    self.train_op, self.loss, self.avg_total_reward,
                    self.summaries
                ])

                self.writer.add_summary(summary_, step)

                t.set_postfix(loss=f"{loss_:10.4f}",
                              avg_total_reward=f"{avg_total_reward_:10.4f}")

        self.writer.close()

        plan_ = self._sess.run(self.best_plan)
        return plan_

    def __call__(self, state, timestep):
        """Returns the action for the given `timestep`."""
        # find plan
        if self._plan is None:
            self._plan = self.run()

        # select action for given timestep
        action_fluent_ordering = self.compiler.rddl.domain.action_fluent_ordering
        action = OrderedDict({
            name: action[timestep]
            for name, action in zip(action_fluent_ordering, self._plan)
        })
        return action
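
A usage sketch for Tensorplan, assuming the base Planner class sets up the
session and graph from the `rddl` identifier; the rddlgym id and config values
below are illustrative only:

# Hedged usage sketch (not part of the library).
config = {
    "horizon": 40,
    "epochs": 200,
    "optimization": {},          # ActionOptimizer settings (contents not shown here)
    "logdir": "/tmp/tfplan/tensorplan/my-run",
}

planner = Tensorplan("Navigation-v1", config)    # rddlgym id (illustrative)
planner.build()

# The first call triggers the offline optimization (run()) and caches the best
# plan; later calls only index the cached plan by timestep. Tensorplan's
# __call__ does not use `state`, so None is passed here.
action = planner(None, timestep=0)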
Example #8
 def _build_optimization_ops(self):
     self.optimizer = ActionOptimizer(self.config["optimization"])
     self.optimizer.build()
     self.train_op = self.optimizer.minimize(self.loss)
Example #9
 def _build_optimizer_graph(self, learning_rate: float) -> None:
     '''Builds the action optimizer ops.'''
     self._optimizer = ActionOptimizer(self._compiler, self._policy)
     self._optimizer.build(learning_rate, self.batch_size, self.horizon)