class OfflineOpenLoopPlanner(object):
    '''OfflineOpenLoopPlanner implements a gradient-based planner that optimizes
    a sequence of actions in an offline setting.

    Note:
        For details please refer to the NIPS 2017 paper:
        "Scalable Planning with Tensorflow for Hybrid Nonlinear Domains".

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
    '''

    def __init__(self, compiler: Compiler, batch_size: int, horizon: int) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon

    def build(self, learning_rate: float) -> None:
        '''Builds the offline open loop planning ops.'''
        self._build_policy_graph()
        self._build_optimizer_graph(learning_rate)

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size, self.horizon)
        self._policy.build('planning')

    def _build_optimizer_graph(self, learning_rate: float) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)
        self._optimizer.build(learning_rate, self.batch_size, self.horizon)

    def run(self, epochs: int, show_progress: bool = True) -> Tuple[ActionArray, PolicyVarsArray]:
        '''Runs the action optimizer for the given number of training `epochs`.

        Args:
            epochs (int): The number of training epochs.
            show_progress (bool): The boolean flag for showing current progress.

        Returns:
            Tuple[ActionArray, PolicyVarsArray]: The sequence of actions and
            policy variables optimized after training.
        '''
        actions, policy_vars = self._optimizer.run(epochs, show_progress=show_progress)
        return actions, policy_vars
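# --- Usage sketch (illustrative, not part of the library) --------------------
# A minimal end-to-end offline planning run, assuming the pyrddl parser and the
# rddl2tf Compiler referenced in the docstring above; the import paths and the
# Navigation domain file mirror the test fixture below and may differ by version.

from pyrddl.parser import RDDLParser
from rddl2tf.compiler import Compiler

with open('rddl/deterministic/Navigation.rddl') as file:
    parser = RDDLParser()
    parser.build()
    rddl = parser.parse(file.read())
    rddl.build()

compiler = Compiler(rddl, batch_mode=True)

planner = OfflineOpenLoopPlanner(compiler, batch_size=64, horizon=40)
planner.build(learning_rate=0.01)

# optimize the action sequence and return the resulting actions and policy variables
actions, policy_vars = planner.run(epochs=50, show_progress=True)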
def setUpClass(cls):
    # initialize hyper-parameters
    cls.horizon = 40
    cls.batch_size = 64
    cls.epochs = 50
    cls.learning_rate = 0.01

    # parse RDDL file
    with open('rddl/deterministic/Navigation.rddl') as file:
        parser = RDDLParser()
        parser.build()
        rddl = parser.parse(file.read())
        rddl.build()

    # initialize RDDL2TensorFlow compiler
    cls.rddl2tf = Compiler(rddl, batch_mode=True)

    # initialize open-loop policy
    cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
    cls.policy.build('test')

    # initialize ActionOptimizer
    cls.optimizer = ActionOptimizer(cls.rddl2tf, cls.policy)
    cls.optimizer.build(cls.learning_rate, cls.batch_size, cls.horizon)
class OnlineOpenLoopPlanner(object):
    '''OnlineOpenLoopPlanner implements a gradient-based planner that optimizes
    a sequence of actions in an online setting (i.e., interleaving planning and execution).

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
        parallel_plans (bool): The boolean flag for optimizing parallel sequences of actions.
    '''

    def __init__(self,
                 compiler: Compiler,
                 batch_size: int,
                 horizon: int,
                 parallel_plans: bool = True) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon
        self.parallel_plans = parallel_plans

    def build(self, learning_rate: float, epochs: int, show_progress: bool = True) -> None:
        '''Builds the online open loop planning ops.'''
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.show_progress = show_progress
        self._build_policy_graph()
        self._build_optimizer_graph()

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size, self.horizon, self.parallel_plans)
        self._policy.build('planning')

    def _build_optimizer_graph(self) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)

    def __call__(self, state: StateTensor, t: int) -> Tuple[Action, PolicyVars, float, float]:
        '''Returns the action to be executed in the current `state` at timestep `t`.

        Args:
            state (StateTensor): The current state.
            t (int): The current timestep.

        Returns:
            Tuple[Action, PolicyVars, float, float]: The action and policy variables
            optimized for the current timestep, together with the graph building
            time and the optimization time (in seconds).
        '''
        # initialize action optimizer
        with self._compiler.graph.as_default():
            with tf.name_scope('timestep{}'.format(t)):
                start = time.time()
                self._optimizer.build(self.learning_rate, self.batch_size, self.horizon - t, parallel_plans=False)
                end = time.time()
                building_time = end - start

                # optimize next action
                start = time.time()
                initial_state = tuple(self._batch_tensor(fluent) for fluent in state)
                actions, policy_vars = self._optimizer.run(self.epochs, initial_state, self.show_progress)

                # outputs
                action = tuple(fluent[0] for fluent in actions)
                policy_vars = tuple(np.expand_dims(var[(self.horizon - 1) - t], axis=0) for var in policy_vars)
                end = time.time()
                optimization_time = end - start

                return action, policy_vars, building_time, optimization_time

    def _batch_tensor(self, fluent):
        tensor = np.stack([fluent[0]] * self.batch_size)
        if len(tensor.shape) == 1:
            tensor = np.expand_dims(tensor, -1)
        return tensor
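# --- Usage sketch (illustrative, not part of the library) --------------------
# Online planning interleaved with execution. Only the planner calls mirror the
# class above; how the initial state is obtained and how actions are applied to
# the environment is left elided, since that depends on the simulator in use.

compiler = Compiler(rddl, batch_mode=True)  # `rddl` parsed as in the sketch above

planner = OnlineOpenLoopPlanner(compiler, batch_size=64, horizon=40)
planner.build(learning_rate=0.01, epochs=50, show_progress=False)

state = ...  # initial StateTensor from the simulator (elided)
for t in range(planner.horizon):
    action, policy_vars, building_time, optimization_time = planner(state, t)
    state = ...  # apply `action` in the environment and observe the next state (elided)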
class StochasticPlanner(Planner):
    """StochasticPlanner abstract class implements basic methods for
    online stochastic gradient-based planners.

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        compiler_cls (rddl2tf.Compiler): The RDDL-to-TensorFlow compiler class.
        config (Dict[str, Any]): The planner config dict.
    """

    __metaclass__ = abc.ABCMeta

    def __init__(self, rddl, compiler_cls, config):
        super().__init__(rddl, ReparameterizationCompiler, config)

        self.initial_state = None
        self.steps_to_go = None
        self.sequence_length = None

        self.optimizer = None
        self.grads_and_vars = None

        self.avg_total_reward = None
        self.loss = None

        self.init_op = None
        self.warm_start_op = None
        self.train_op = None

        self.summaries = None

        self.stats = {"loss": pd.DataFrame()}

    def build(self):
        with self.graph.as_default():
            self._build_policy_ops()
            self._build_initial_state_ops()
            self._build_sequence_length_ops()
            self._build_trajectory_ops()
            self._build_loss_ops()
            self._build_optimization_ops()
            self._build_summary_ops()
            self._build_init_ops()

    @abc.abstractmethod
    def __call__(self, state, timestep):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_policy_ops(self):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_trajectory_ops(self):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_loss_ops(self):
        raise NotImplementedError

    @abc.abstractmethod
    def _build_summary_ops(self):
        raise NotImplementedError

    def _build_init_ops(self):
        self.init_op = tf.global_variables_initializer()

    def _build_initial_state_ops(self):
        with tf.name_scope("initial_state"):
            self.initial_state = tuple(
                tf.placeholder(t.dtype, t.shape) for t in self.compiler.initial_state()
            )

    def _build_sequence_length_ops(self):
        with tf.name_scope("sequence_length"):
            self.steps_to_go = tf.placeholder(tf.int32, shape=())
            self.sequence_length = tf.tile(
                tf.reshape(self.steps_to_go, [1]), [self.batch_size]
            )

    def _build_optimization_ops(self):
        with tf.name_scope("optimization"):
            self.optimizer = ActionOptimizer(self.config["optimization"])
            self.optimizer.build()
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

    def _get_batch_initial_state(self, state):
        batch_size = self.compiler.batch_size
        return tuple(
            map(
                lambda fluent: np.tile(
                    fluent, (batch_size, *([1] * len(fluent.shape)))
                ),
                state.values(),
            )
        )

    def _get_action(self, actions, feed_dict):
        action_fluent_ordering = self.compiler.rddl.domain.action_fluent_ordering
        actions = self._sess.run(actions, feed_dict=feed_dict)
        action = collections.OrderedDict(
            {
                name: fluent[0][0]
                for name, fluent in zip(action_fluent_ordering, actions)
            }
        )
        return action

    @property
    def horizon(self):
        horizon = self.config["horizon"]
        if self.config.get("planning_horizon"):
            horizon = min(horizon, self.config["planning_horizon"])
        return horizon

    def epochs(self, timestep):
        if self.config.get("epoch_scheduler"):
            scheduler = EpochScheduler(*self.config["epoch_scheduler"])
            epochs = scheduler(timestep)
        else:
            epochs = self.config["epochs"]
        return epochs

    def run(self, timestep, feed_dict):
        if timestep == 0 or not self.warm_start_op:
            self._sess.run(self.init_op)
        elif self.warm_start_op:
            self._sess.run(self.warm_start_op)

        if self.summaries:
            logdir = os.path.join(self.config.get("logdir"), f"timestep={timestep}")
            writer = tf.compat.v1.summary.FileWriter(logdir)

        run_id = self.config.get("run_id", 0)
        pid = os.getpid()
        position = run_id % self.config.get("num_workers", 1)

        epochs = self.epochs(timestep)
        desc = f"(pid={pid}) Run #{run_id:<3d} / step={timestep:<3d}"

        with trange(
            epochs, desc=desc, unit="epoch", position=position, leave=False
        ) as t:
            losses = []

            loss_ = self._sess.run(self.loss, feed_dict=feed_dict)
            losses.append(loss_)

            for step in t:
                self._sess.run(self.train_op, feed_dict=feed_dict)

                loss_, avg_total_reward_ = self._sess.run(
                    [self.loss, self.avg_total_reward], feed_dict=feed_dict
                )
                losses.append(loss_)

                if self.summaries:
                    summary_ = self._sess.run(self.summaries, feed_dict=feed_dict)
                    writer.add_summary(summary_, step)

                t.set_postfix(
                    loss=f"{loss_:10.4f}",
                    avg_total_reward=f"{avg_total_reward_:10.4f}"
                )

        self.stats["loss"][timestep] = pd.Series(losses)

        if self.summaries:
            writer.close()

    def save_stats(self):
        for key, value in self.stats.items():
            filepath = os.path.join(self.config["logdir"], f"{key}.csv")
            value.to_csv(filepath, index=False)
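# --- Config sketch (illustrative) ---------------------------------------------
# The keys below are the ones StochasticPlanner reads above (config["optimization"],
# "horizon", "planning_horizon", "epochs" / "epoch_scheduler", "logdir", "run_id",
# "num_workers"); the nested optimization schema and the concrete values are
# assumptions, not recommended settings.

config = {
    "optimization": {"optimizer": "GradientDescent", "learning_rate": 0.01},
    "horizon": 40,
    "planning_horizon": 20,   # optional: caps the effective planning horizon
    "epochs": 50,             # or provide "epoch_scheduler" (see epochs() above)
    "logdir": "/tmp/tfplan/stochastic",
    "run_id": 0,
    "num_workers": 1,
}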
class Tensorplan(Planner):
    """Tensorplan class implements the Planner interface
    for the offline gradient-based planner (i.e., tensorplan).

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super(Tensorplan, self).__init__(rddl, DefaultCompiler, config)

        self.policy = None
        self.initial_state = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.optimizer = None
        self.train_op = None

        self.best_plan_idx = None
        self.best_plan = None

        self._plan = None

        self.writer = None
        self.summaries = None

    @property
    def logdir(self):
        return self.config.get("logdir") or f"/tmp/tfplan/tensorplan/{self.rddl}"

    def build(self):
        """Builds planner ops."""
        with self.graph.as_default():
            self._build_policy_ops()
            self._build_initial_state_ops()
            self._build_trajectory_ops()
            self._build_loss_ops()
            self._build_optimization_ops()
            self._build_solution_ops()
            self._build_summary_ops()
            self._build_init_ops()

    def _build_init_ops(self):
        self.init_op = tf.global_variables_initializer()

    def _build_policy_ops(self):
        horizon = self.config["horizon"]
        self.policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=True)
        self.policy.build("tensorplan")

    def _build_initial_state_ops(self):
        self.initial_state = self.compiler.initial_state()

    def _build_trajectory_ops(self):
        self.simulator = Simulator(self.compiler, self.policy)
        self.simulator.build()
        self.trajectory, self.final_state, self.total_reward = self.simulator.trajectory(
            self.initial_state
        )

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_optimization_ops(self):
        self.optimizer = ActionOptimizer(self.config["optimization"])
        self.optimizer.build()
        self.train_op = self.optimizer.minimize(self.loss)

    def _build_solution_ops(self):
        self.best_plan_idx = tf.argmax(self.total_reward, axis=0)
        self.best_plan = tuple(
            action[self.best_plan_idx] for action in self.trajectory.actions
        )

    def _build_summary_ops(self):
        tf.compat.v1.summary.histogram("total_reward", self.total_reward)
        tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
        tf.compat.v1.summary.scalar("loss", self.loss)
        self.summaries = tf.compat.v1.summary.merge_all()

    def run(self):
        """Runs the planner for the configured number of epochs.

        Returns:
            plan (Sequence[np.ndarray]): The best solution plan.
""" self.writer = tf.compat.v1.summary.FileWriter(self.logdir, self.graph) self._sess.run(self.init_op) run_id = self.config.get("run_id", 0) pid = os.getpid() position = run_id % self.config.get("num_workers", 1) epochs = self.config["epochs"] desc = f"(pid={pid}) Run #{run_id:<3d}" with trange(epochs, desc=desc, unit="epoch", position=position, leave=False) as t: for step in t: _, loss_, avg_total_reward_, summary_ = self._sess.run([ self.train_op, self.loss, self.avg_total_reward, self.summaries ]) self.writer.add_summary(summary_, step) t.set_postfix(loss=f"{loss_:10.4f}", avg_total_reward=f"{avg_total_reward_:10.4f}") self.writer.close() plan_ = self._sess.run(self.best_plan) return plan_ def __call__(self, state, timestep): """Returns the action for the given `timestep`.""" # find plan if self._plan is None: self._plan = self.run() # select action for given timestep action_fluent_ordering = self.compiler.rddl.domain.action_fluent_ordering action = OrderedDict({ name: action[timestep] for name, action in zip(action_fluent_ordering, self._plan) }) return action