Example #1
    def _build_scenario_policy_ops(self):
        horizon = self.horizon - 1

        self.scenario_policy = OpenLoopPolicy(
            self.compiler, horizon, parallel_plans=True
        )
        self.scenario_policy.build("scenario_policy")
Example #2
def cell(request):
    rddl = request.param
    model = rddlgym.make(rddl, mode=rddlgym.AST)
    compiler = DefaultCompiler(model, batch_size=BATCH_SIZE)
    compiler.init()
    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("tensorplan")
    yield SimulationCell(compiler, policy)
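
The `cell` function above reads like a pytest fixture with its decorator stripped. A minimal sketch of how it might be declared and consumed, assuming `BATCH_SIZE` and `HORIZON` are module-level constants and using a hypothetical rddlgym id as the fixture parameter:

import pytest

BATCH_SIZE = 64   # assumed value
HORIZON = 20      # assumed value

@pytest.fixture(params=["Navigation-v2"])   # hypothetical rddlgym id
def cell(request):
    ...   # fixture body exactly as in Example #2 above


def test_simulation_cell_is_built(cell):
    # pytest injects the SimulationCell yielded by the fixture above
    assert cell is not None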
Example #3
    def _build_policy_ops(self):
        horizon = self.horizon
        self.policy = OpenLoopPolicy(self.compiler,
                                     horizon,
                                     parallel_plans=False)
        self.policy.build("planning")

        if "warm_start" in self.config:
            self.warm_start_op = self.policy._build_warm_start_op()
Example #4
def simulator(request):
    rddl = request.param
    model = rddlgym.make(rddl, mode=rddlgym.AST)

    compiler = ReparameterizationCompiler(model, batch_size=BATCH_SIZE)
    compiler.init()

    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("planning")

    simulator = Simulator(compiler, policy, config=None)
    simulator.build()
    yield simulator
Example #5
class OfflineOpenLoopPlanner(object):
    '''OfflineOpenLoopPlanner implements a gradient-based planner that optimizes
    a sequence of actions in an offline setting.

    Note:
        For details please refer to the NIPS 2017 paper:
        "Scalable Planning with Tensorflow for Hybrid Nonlinear Domains".

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
    '''
    def __init__(self, compiler: Compiler, batch_size: int,
                 horizon: int) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon

    def build(self, learning_rate: float) -> None:
        '''Builds the offline open loop planning ops.'''
        self._build_policy_graph()
        self._build_optimizer_graph(learning_rate)

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size,
                                      self.horizon)
        self._policy.build('planning')

    def _build_optimizer_graph(self, learning_rate: float) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)
        self._optimizer.build(learning_rate, self.batch_size, self.horizon)

    def run(self,
            epochs: int,
            show_progress: bool = True) -> Tuple[ActionArray, PolicyVarsArray]:
        '''Runs the action optimizer for the given number of training `epochs`.

        Args:
            epochs (int): The number of training epochs.
            show_progress (bool): The boolean flag for showing current progress.

        Returns:
            Tuple[ActionArray, PolicyVarsArray]: The sequence of actions and
            policy variables optimized after training.
        '''
        actions, policy_vars = self._optimizer.run(epochs,
                                                   show_progress=show_progress)
        return actions, policy_vars
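
A minimal usage sketch for the class above; it only exercises the constructor, `build`, and `run` signatures shown here and assumes `compiler` is an already-initialized rddl2tf Compiler for the target model (the hyper-parameter values are illustrative):

# `compiler` is assumed to be an rddl2tf Compiler built elsewhere for the model.
planner = OfflineOpenLoopPlanner(compiler, batch_size=128, horizon=40)
planner.build(learning_rate=0.01)

# Optimize the whole action sequence offline, then inspect the result.
actions, policy_vars = planner.run(epochs=200, show_progress=True)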
Example #6
    def setUpClass(cls):

        # initialize hyper-parameters
        cls.horizon = 40
        cls.batch_size = 64
        cls.epochs = 50
        cls.learning_rate = 0.01

        # parse RDDL file
        with open('rddl/deterministic/Navigation.rddl') as file:
            parser = RDDLParser()
            parser.build()
            rddl = parser.parse(file.read())
            rddl.build()

        # initialize RDDL2TensorFlow compiler
        cls.rddl2tf = Compiler(rddl, batch_mode=True)

        # initialize open-loop policy
        cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
        cls.policy.build('test')

        # initialize ActionOptimizer
        cls.optimizer = ActionOptimizer(cls.rddl2tf, cls.policy)
        cls.optimizer.build(cls.learning_rate, cls.batch_size, cls.horizon)
Example #7
    def setUpClass(cls):

        # initialize hyper-parameters
        cls.horizon = 40
        cls.batch_size = 64

        # parse RDDL file
        with open('rddl/deterministic/Navigation.rddl') as file:
            parser = RDDLParser()
            parser.build()
            rddl = parser.parse(file.read())
            rddl.build()

        # initialize RDDL2TensorFlow compiler
        cls.rddl2tf = Compiler(rddl, batch_mode=True)

        # initialize open-loop policy
        cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
        cls.policy.build('test')

        # execute policy for the given horizon and initial state
        with cls.rddl2tf.graph.as_default():
            cls.state = cls.rddl2tf.compile_initial_state(cls.batch_size)
            cls.actions = []
            for t in range(cls.horizon - 1, -1, -1):
                timestep = tf.constant(t,
                                       dtype=tf.float32,
                                       shape=(cls.batch_size, 1))
                action = cls.policy(cls.state, timestep)
                cls.actions.append(action)
Example #8
    def setUpClass(cls):

        # initialize hyper-parameters
        cls.horizon = 40
        cls.batch_size = 1

        # parse RDDL file
        with open('rddl/deterministic/Navigation.rddl') as file:
            parser = RDDLParser()
            parser.build()
            rddl = parser.parse(file.read())
            rddl.build()

        # initialize RDDL2TensorFlow compiler
        cls.rddl2tf = Compiler(rddl, batch_mode=True)

        # initialize open-loop policy
        cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
        cls.policy.build('test')

        # sample policy variables to initialize open-loop policy
        cls.policy_variables = []
        for shape in cls.rddl2tf.rddl.action_size:
            size = [cls.horizon] + list(shape)
            cls.policy_variables.append(
                np.random.uniform(low=-1.0, high=1.0, size=size))

        # initialize action evaluator
        cls.evaluator = ActionEvaluator(cls.rddl2tf, cls.policy)
Example #9
def cell(request):
    rddl = request.param
    model = rddlgym.make(rddl, mode=rddlgym.AST)

    compiler = ReparameterizationCompiler(model, batch_size=BATCH_SIZE)
    compiler.init()

    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("planning")

    with compiler.graph.as_default():
        reparameterization_map = compiler.get_cpfs_reparameterization()
        cell_samples = utils.get_noise_samples(reparameterization_map,
                                               BATCH_SIZE,
                                               horizon=1)
        cell_noise, encoding = utils.encode_noise_samples_as_inputs(
            cell_samples)

    cell = SimulationCell(compiler, policy, config={"encoding": encoding})
    cell.cell_noise = cell_noise
    yield cell
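
A sketch of how such a cell can be stepped, mirroring what HindsightPlanner does in Example #18 below: a timestep column is concatenated with one slice of the encoded noise and fed to the cell together with a batched initial state (here taken from `compiler.initial_state()` as in Example #16; `compiler` and `cell` are the objects built in the fixture above):

import tensorflow as tf

with compiler.graph.as_default():
    initial_state = compiler.initial_state()
    timesteps = tf.zeros((BATCH_SIZE, 1), dtype=tf.float32)
    inputs = tf.concat([timesteps, cell.cell_noise[:, 0, ...]], axis=1)
    output, next_state = cell(inputs, initial_state)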
Example #10
def non_parallel_plans(compiler):
    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=False)
    policy.build("non_parallel_plans")
    return policy
Example #11
def parallel_plans(compiler):
    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("parallel_plans")
    return policy
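
Both fixtures above take a `compiler` argument, presumably provided by another fixture; a sketch of how it could be defined, following the same pattern as Examples #2, #4, and #9 (the rddlgym id and the choice of compiler class are assumptions):

import pytest
import rddlgym

BATCH_SIZE = 64   # assumed value

@pytest.fixture(params=["Navigation-v2"])   # hypothetical rddlgym id
def compiler(request):
    model = rddlgym.make(request.param, mode=rddlgym.AST)
    compiler_ = DefaultCompiler(model, batch_size=BATCH_SIZE)
    compiler_.init()
    return compiler_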
Example #12
    def _build_policy_ops(self):
        horizon = self.config["horizon"]
        self.policy = OpenLoopPolicy(self.compiler,
                                     horizon,
                                     parallel_plans=True)
        self.policy.build("tensorplan")
Example #13
    def _build_base_policy_ops(self):
        horizon = 1
        self.base_policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=False)
        self.base_policy.build("base_policy")
Example #14
class StraightLinePlanner(StochasticPlanner):
    """StraightLinePlanner class implements the online gradient-based
    planner that chooses the next action based on the lower bound of
    the Value function of the start state.

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super().__init__(rddl, ReparameterizationCompiler, config)

        self.policy = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.writer = None
        self.summaries = None

    def _build_policy_ops(self):
        horizon = self.horizon
        self.policy = OpenLoopPolicy(self.compiler,
                                     horizon,
                                     parallel_plans=False)
        self.policy.build("planning")

        if "warm_start" in self.config:
            self.warm_start_op = self.policy._build_warm_start_op()

    def _build_trajectory_ops(self):
        with tf.name_scope("scenarios"):
            self.simulator = Simulator(self.compiler, self.policy, config=None)
            self.simulator.build()
            (
                self.trajectory,
                self.final_state,
                self.total_reward,
            ) = self.simulator.trajectory(self.initial_state,
                                          self.sequence_length)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_summary_ops(self):
        if self.config["verbose"]:

            with tf.name_scope("summary"):
                _ = tf.compat.v1.summary.FileWriter(self.config["logdir"],
                                                    self.graph)
                tf.compat.v1.summary.scalar("avg_total_reward",
                                            self.avg_total_reward)
                tf.compat.v1.summary.scalar("loss", self.loss)

                tf.compat.v1.summary.histogram("total_reward",
                                               self.total_reward)
                tf.compat.v1.summary.histogram("scenario_noise",
                                               self.simulator.noise)

                for grad, variable in self.grads_and_vars:
                    var_name = variable.name
                    tf.compat.v1.summary.histogram(f"{var_name}_grad", grad)
                    tf.compat.v1.summary.histogram(var_name, variable)

                self.summaries = tf.compat.v1.summary.merge_all()

    def __call__(self, state, timestep):
        scenario_noise = utils.evaluate_noise_samples_as_inputs(
            self._sess, self.simulator.samples)

        steps_to_go = self.config["horizon"] - timestep
        if self.config.get("planning_horizon"):
            steps_to_go = min(steps_to_go, self.config["planning_horizon"])

        feed_dict = {
            self.initial_state: self._get_batch_initial_state(state),
            self.simulator.noise: scenario_noise,
            self.steps_to_go: steps_to_go,
        }

        self.run(timestep, feed_dict)

        action = self._get_action(self.trajectory.actions, feed_dict)
        return action
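
A usage sketch for the planner above (not a documented entry point): the config keys mirror the ones the class reads (`horizon`, `planning_horizon`, `verbose`, `logdir`, plus the optional `warm_start` flag); the values, the rddlgym id, the `epochs` key, and the `build()` call on the unshown StochasticPlanner base class are assumptions:

config = {
    "horizon": 40,
    "planning_horizon": 10,     # optional cap on the lookahead (see __call__ above)
    "epochs": 100,              # assumed to be consumed by the base-class run()
    "verbose": False,
    "logdir": "/tmp/tfplan/straightline",
}

planner = StraightLinePlanner("Navigation-v2", config)   # hypothetical rddlgym id
planner.build()                                          # assumed base-class method

# `state` is the current environment state (format assumed); __call__ re-optimizes
# the open-loop plan from that state and returns its first action.
action = planner(state, timestep=0)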
Example #15
class OnlineOpenLoopPlanner(object):
    '''OnlineOpenLoopPlanner implements a gradient-based planner that optimizes
    a sequence of actions in an online setting (i.e., interleaving planning and
    execution).

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
        parallel_plans (bool): The boolean flag for optimizing parallel sequences of actions.
    '''

    def __init__(self,
            compiler: Compiler,
            batch_size: int,
            horizon: int,
            parallel_plans: bool = True) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon
        self.parallel_plans = parallel_plans

    def build(self,
            learning_rate: float,
            epochs: int,
            show_progress: bool = True) -> None:
        '''Builds the online open loop planning ops.'''
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.show_progress = show_progress
        self._build_policy_graph()
        self._build_optimizer_graph()

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size, self.horizon, self.parallel_plans)
        self._policy.build('planning')

    def _build_optimizer_graph(self) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)

    def __call__(self, state: StateTensor,
                 t: int) -> Tuple[ActionArray, PolicyVarsArray, float, float]:
        '''Returns the action to be executed in the current `state` at timestep `t`.

        Args:
            state (StateTensor): The current state.
            t (int): The current timestep.

        Returns:
            Tuple[ActionArray, PolicyVarsArray, float, float]: The action and
            policy variables optimized for the current timestep, plus the
            graph-building time and the optimization time in seconds.
        '''

        # initialize action optimizer
        with self._compiler.graph.as_default():
            with tf.name_scope('timestep{}'.format(t)):
                start = time.time()
                self._optimizer.build(self.learning_rate, self.batch_size, self.horizon - t, parallel_plans=False)
                end = time.time()
                building_time = end - start

        # optimize next action
        start = time.time()
        initial_state = tuple(self._batch_tensor(fluent) for fluent in state)
        actions, policy_vars = self._optimizer.run(self.epochs, initial_state, self.show_progress)

        # outputs
        action = tuple(fluent[0] for fluent in actions)
        policy_vars = tuple(np.expand_dims(var[(self.horizon-1) - t], axis=0) for var in policy_vars)
        end = time.time()
        optimization_time = end - start

        return action, policy_vars, building_time, optimization_time

    def _batch_tensor(self, fluent):
        tensor = np.stack([fluent[0]] * self.batch_size)
        if len(tensor.shape) == 1:
            tensor = np.expand_dims(tensor, -1)
        return tensor
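
A usage sketch for the online planner above; `compiler` is assumed to be an rddl2tf Compiler for the target model, and `state` a tuple of fluent values in the format `_batch_tensor` expects; the hyper-parameter values are illustrative:

planner = OnlineOpenLoopPlanner(compiler, batch_size=128, horizon=40,
                                parallel_plans=False)
planner.build(learning_rate=0.01, epochs=100, show_progress=False)

# At each decision point the planner rebuilds the optimizer for the remaining
# horizon, re-optimizes, and also reports the timing of both phases.
action, policy_vars, building_time, optimization_time = planner(state, 0)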
Example #16
class Tensorplan(Planner):
    """Tensorplan class implements the Planner interface
    for the offline gradient-based planner (i.e., tensorplan).

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super(Tensorplan, self).__init__(rddl, DefaultCompiler, config)

        self.policy = None
        self.initial_state = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.optimizer = None
        self.train_op = None

        self.best_plan_idx = None
        self.best_plan = None

        self._plan = None

        self.writer = None
        self.summaries = None

    @property
    def logdir(self):
        return self.config.get(
            "logdir") or f"/tmp/tfplan/tensorplan/{self.rddl}"

    def build(self):
        """Builds planner ops."""
        with self.graph.as_default():
            self._build_policy_ops()
            self._build_initial_state_ops()
            self._build_trajectory_ops()
            self._build_loss_ops()
            self._build_optimization_ops()
            self._build_solution_ops()
            self._build_summary_ops()
            self._build_init_ops()

    def _build_init_ops(self):
        self.init_op = tf.global_variables_initializer()

    def _build_policy_ops(self):
        horizon = self.config["horizon"]
        self.policy = OpenLoopPolicy(self.compiler,
                                     horizon,
                                     parallel_plans=True)
        self.policy.build("tensorplan")

    def _build_initial_state_ops(self):
        self.initial_state = self.compiler.initial_state()

    def _build_trajectory_ops(self):
        self.simulator = Simulator(self.compiler, self.policy)
        self.simulator.build()
        self.trajectory, self.final_state, self.total_reward = self.simulator.trajectory(
            self.initial_state)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_optimization_ops(self):
        self.optimizer = ActionOptimizer(self.config["optimization"])
        self.optimizer.build()
        self.train_op = self.optimizer.minimize(self.loss)

    def _build_solution_ops(self):
        self.best_plan_idx = tf.argmax(self.total_reward, axis=0)
        self.best_plan = tuple(action[self.best_plan_idx]
                               for action in self.trajectory.actions)

    def _build_summary_ops(self):
        tf.compat.v1.summary.histogram("total_reward", self.total_reward)
        tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
        tf.compat.v1.summary.scalar("loss", self.loss)
        self.summaries = tf.compat.v1.summary.merge_all()

    def run(self):
        """Run the planner for the given number of epochs.

        Returns:
            plan (Sequence[np.ndarray]): The best solution plan.
        """
        self.writer = tf.compat.v1.summary.FileWriter(self.logdir, self.graph)

        self._sess.run(self.init_op)

        run_id = self.config.get("run_id", 0)
        pid = os.getpid()
        position = run_id % self.config.get("num_workers", 1)
        epochs = self.config["epochs"]
        desc = f"(pid={pid}) Run #{run_id:<3d}"

        with trange(epochs,
                    desc=desc,
                    unit="epoch",
                    position=position,
                    leave=False) as t:

            for step in t:
                _, loss_, avg_total_reward_, summary_ = self._sess.run([
                    self.train_op, self.loss, self.avg_total_reward,
                    self.summaries
                ])

                self.writer.add_summary(summary_, step)

                t.set_postfix(loss=f"{loss_:10.4f}",
                              avg_total_reward=f"{avg_total_reward_:10.4f}")

        self.writer.close()

        plan_ = self._sess.run(self.best_plan)
        return plan_

    def __call__(self, state, timestep):
        """Returns the action for the given `timestep`."""
        # find plan
        if self._plan is None:
            self._plan = self.run()

        # select action for given timestep
        action_fluent_ordering = self.compiler.rddl.domain.action_fluent_ordering
        action = OrderedDict({
            name: action[timestep]
            for name, action in zip(action_fluent_ordering, self._plan)
        })
        return action
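
A usage sketch for Tensorplan; the config keys are the ones the class reads above (`horizon`, `epochs`, `optimization`, plus the optional `logdir`, `run_id`, and `num_workers`), but the values, the shape of the `optimization` sub-dict handed to ActionOptimizer, and the rddlgym id are assumptions:

config = {
    "horizon": 40,
    "epochs": 300,
    # Passed verbatim to ActionOptimizer; its expected keys are assumed here.
    "optimization": {"optimizer": "RMSProp", "learning_rate": 0.01},
    "logdir": "/tmp/tfplan/tensorplan/Navigation-v2",   # optional
}

planner = Tensorplan("Navigation-v2", config)   # hypothetical rddlgym id
planner.build()

# The first call runs the optimization and caches the best plan; later calls
# just index the cached plan by timestep (`state` is not used by __call__).
action = planner(state=None, timestep=0)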
Example #17
    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size, self.horizon, self.parallel_plans)
        self._policy.build('planning')
Example #18
class HindsightPlanner(StochasticPlanner):
    """HindsightPlanner class implements an online gradient-based
    planner that chooses the next action based on the upper bound of
    the Value function of the current state.

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super().__init__(rddl, ReparameterizationCompiler, config)

        self.base_policy = None
        self.scenario_policy = None

        self.next_state = None

        self.cell = None
        self.cell_noise = None
        self.cell_samples = None
        self.action = None
        self.reward = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.scenario_total_reward = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.writer = None
        self.summaries = None

    def _build_policy_ops(self):
        self._build_base_policy_ops()
        self._build_scenario_policy_ops()

        if "warm_start" in self.config:
            warm_start_base_policy = tf.group(*[
                tf.assign(var1[:, 0, :],
                          tf.reduce_mean(var2[:, 0, :], axis=0, keepdims=True))
                for var1, var2 in zip(
                    self.base_policy._policy_variables,
                    self.scenario_policy._policy_variables)
            ])
            with tf.control_dependencies([warm_start_base_policy]):
                warm_start_scenario_policy = self.scenario_policy._build_warm_start_op()
            self.warm_start_op = tf.group(warm_start_base_policy,
                                          warm_start_scenario_policy)

    def _build_base_policy_ops(self):
        horizon = 1
        self.base_policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=False)
        self.base_policy.build("base_policy")

    def _build_scenario_policy_ops(self):
        horizon = self.horizon - 1

        self.scenario_policy = OpenLoopPolicy(
            self.compiler, horizon, parallel_plans=True
        )
        self.scenario_policy.build("scenario_policy")

    def _build_trajectory_ops(self):
        self._build_scenario_start_states_ops()
        self._build_scenario_trajectory_ops()

    def _build_scenario_start_states_ops(self):
        with tf.name_scope("current_action"):

            with tf.name_scope("reparameterization"):
                reparameterization_map = self.compiler.get_cpfs_reparameterization()
                self.cell_samples = utils.get_noise_samples(
                    reparameterization_map, self.batch_size, horizon=1
                )
                self.cell_noise, encoding = utils.encode_noise_samples_as_inputs(
                    self.cell_samples
                )

            self.cell = SimulationCell(
                self.compiler, self.base_policy, config={"encoding": encoding}
            )

            timesteps = tf.zeros((self.batch_size, 1), dtype=tf.float32)

            inputs = tf.concat([timesteps, self.cell_noise[:, 0, ...]], axis=1)
            output, self.next_state = self.cell(inputs, self.initial_state)

            self.action = output[1]
            self.reward = tf.squeeze(output[3])

    def _build_scenario_trajectory_ops(self):
        with tf.name_scope("scenarios"):
            self.simulator = Simulator(self.compiler, self.scenario_policy, config=None)
            self.simulator.build()
            (
                self.trajectory,
                self.final_state,
                self.scenario_total_reward,
            ) = self.simulator.trajectory(self.next_state, self.sequence_length)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.total_reward = self.reward + self.scenario_total_reward
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_summary_ops(self):
        if self.config["verbose"]:

            with tf.name_scope("summary"):
                _ = tf.compat.v1.summary.FileWriter(self.config["logdir"], self.graph)
                tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
                tf.compat.v1.summary.scalar("loss", self.loss)

                tf.compat.v1.summary.histogram("reward", self.reward)
                tf.compat.v1.summary.histogram(
                    "scenario_total_reward", self.scenario_total_reward
                )
                tf.compat.v1.summary.histogram("total_reward", self.total_reward)
                tf.compat.v1.summary.histogram("next_state_noise", self.cell_noise)
                tf.compat.v1.summary.histogram("scenario_noise", self.simulator.noise)

                for grad, variable in self.grads_and_vars:
                    var_name = variable.name
                    tf.compat.v1.summary.histogram(f"{var_name}_grad", grad)
                    tf.compat.v1.summary.histogram(var_name, variable)

                self.summaries = tf.compat.v1.summary.merge_all()

    def __call__(self, state, timestep):
        next_state_noise = utils.evaluate_noise_samples_as_inputs(
            self._sess, self.cell_samples
        )
        scenario_noise = utils.evaluate_noise_samples_as_inputs(
            self._sess, self.simulator.samples
        )

        steps_to_go = self.config["horizon"] - timestep
        if self.config.get("planning_horizon"):
            steps_to_go = min(steps_to_go, self.config["planning_horizon"])
        steps_to_go -= 1

        feed_dict = {
            self.initial_state: self._get_batch_initial_state(state),
            self.cell_noise: next_state_noise,
            self.simulator.noise: scenario_noise,
            self.steps_to_go: steps_to_go
        }

        self.run(timestep, feed_dict)

        action = self._get_action(self.action, feed_dict)
        return action
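
The driving pattern is the same as for StraightLinePlanner in Example #14; the sketch below only adds the `warm_start` key, whose mere presence is what `_build_policy_ops` checks above. As before, the values, the rddlgym id, the `epochs` key, and the `build()` call on the base class are assumptions:

config = {
    "horizon": 40,
    "planning_horizon": 10,
    "epochs": 100,                        # assumed base-class setting
    "verbose": False,
    "logdir": "/tmp/tfplan/hindsight",
    "warm_start": True,                   # presence enables the warm-start ops above
}

planner = HindsightPlanner("Navigation-v2", config)   # hypothetical rddlgym id
planner.build()                                       # assumed base-class method
action = planner(state, timestep=0)                   # state: current env state (assumed format)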