def _build_scenario_policy_ops(self):
    horizon = self.horizon - 1
    self.scenario_policy = OpenLoopPolicy(
        self.compiler, horizon, parallel_plans=True
    )
    self.scenario_policy.build("scenario_policy")
def cell(request):
    # builds a SimulationCell fixture for the RDDL domain given by `request.param`
    rddl = request.param
    model = rddlgym.make(rddl, mode=rddlgym.AST)
    compiler = DefaultCompiler(model, batch_size=BATCH_SIZE)
    compiler.init()

    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("tensorplan")

    yield SimulationCell(compiler, policy)
def _build_policy_ops(self):
    horizon = self.horizon
    self.policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=False)
    self.policy.build("planning")

    if "warm_start" in self.config:
        self.warm_start_op = self.policy._build_warm_start_op()
def simulator(request):
    # builds a Simulator fixture over a parallel-plans open-loop policy
    rddl = request.param
    model = rddlgym.make(rddl, mode=rddlgym.AST)
    compiler = ReparameterizationCompiler(model, batch_size=BATCH_SIZE)
    compiler.init()

    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("planning")

    simulator = Simulator(compiler, policy, config=None)
    simulator.build()

    yield simulator
class OfflineOpenLoopPlanner(object):
    '''OfflineOpenLoopPlanner implements a gradient-based planner that
    optimizes a sequence of actions in an offline setting.

    Note:
        For details please refer to NIPS 2017 paper:
        "Scalable Planning with Tensorflow for Hybrid Nonlinear Domains".

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
    '''

    def __init__(self, compiler: Compiler, batch_size: int, horizon: int) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon

    def build(self, learning_rate: float) -> None:
        '''Builds the offline open loop planning ops.'''
        self._build_policy_graph()
        self._build_optimizer_graph(learning_rate)

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(self._compiler, self.batch_size, self.horizon)
        self._policy.build('planning')

    def _build_optimizer_graph(self, learning_rate: float) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)
        self._optimizer.build(learning_rate, self.batch_size, self.horizon)

    def run(self, epochs: int, show_progress: bool = True) -> Tuple[ActionArray, PolicyVarsArray]:
        '''Runs the action optimizer for the given number of training `epochs`.

        Args:
            epochs (int): The number of training epochs.
            show_progress (bool): The boolean flag for showing current progress.

        Returns:
            Tuple[ActionArray, PolicyVarsArray]: The sequence of actions and
            policy variables optimized after training.
        '''
        actions, policy_vars = self._optimizer.run(epochs, show_progress=show_progress)
        return actions, policy_vars
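# Usage sketch (added for illustration; not part of the original module). It
# assumes `compiler` is an already-initialized RDDL2TensorFlow compiler, as in
# the fixtures above, and the hyper-parameter values are placeholders only.
def _offline_planning_example(compiler, batch_size=64, horizon=40):
    # optimize the full open-loop plan before any execution
    planner = OfflineOpenLoopPlanner(compiler, batch_size, horizon)
    planner.build(learning_rate=0.01)
    actions, policy_vars = planner.run(epochs=50, show_progress=True)
    return actions, policy_vars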
def setUpClass(cls):
    # initialize hyper-parameters
    cls.horizon = 40
    cls.batch_size = 64
    cls.epochs = 50
    cls.learning_rate = 0.01

    # parse RDDL file
    with open('rddl/deterministic/Navigation.rddl') as file:
        parser = RDDLParser()
        parser.build()
        rddl = parser.parse(file.read())
        rddl.build()

    # initialize RDDL2TensorFlow compiler
    cls.rddl2tf = Compiler(rddl, batch_mode=True)

    # initialize open-loop policy
    cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
    cls.policy.build('test')

    # initialize ActionOptimizer
    cls.optimizer = ActionOptimizer(cls.rddl2tf, cls.policy)
    cls.optimizer.build(cls.learning_rate, cls.batch_size, cls.horizon)
def setUpClass(cls):
    # initialize hyper-parameters
    cls.horizon = 40
    cls.batch_size = 64

    # parse RDDL file
    with open('rddl/deterministic/Navigation.rddl') as file:
        parser = RDDLParser()
        parser.build()
        rddl = parser.parse(file.read())
        rddl.build()

    # initialize RDDL2TensorFlow compiler
    cls.rddl2tf = Compiler(rddl, batch_mode=True)

    # initialize open-loop policy
    cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
    cls.policy.build('test')

    # execute policy for the given horizon and initial state
    with cls.rddl2tf.graph.as_default():
        cls.state = cls.rddl2tf.compile_initial_state(cls.batch_size)
        cls.actions = []
        for t in range(cls.horizon - 1, -1, -1):
            timestep = tf.constant(t, dtype=tf.float32, shape=(cls.batch_size, 1))
            action = cls.policy(cls.state, timestep)
            cls.actions.append(action)
def setUpClass(cls):
    # initialize hyper-parameters
    cls.horizon = 40
    cls.batch_size = 1

    # parse RDDL file
    with open('rddl/deterministic/Navigation.rddl') as file:
        parser = RDDLParser()
        parser.build()
        rddl = parser.parse(file.read())
        rddl.build()

    # initialize RDDL2TensorFlow compiler
    cls.rddl2tf = Compiler(rddl, batch_mode=True)

    # initialize open-loop policy
    cls.policy = OpenLoopPolicy(cls.rddl2tf, cls.batch_size, cls.horizon)
    cls.policy.build('test')

    # sample policy variables to initialize open-loop policy
    cls.policy_variables = []
    for shape in cls.rddl2tf.rddl.action_size:
        size = [cls.horizon] + list(shape)
        cls.policy_variables.append(
            np.random.uniform(low=-1.0, high=1.0, size=size))

    # initialize action evaluator
    cls.evaluator = ActionEvaluator(cls.rddl2tf, cls.policy)
def cell(request):
    # builds a SimulationCell fixture with reparameterization noise inputs
    rddl = request.param
    model = rddlgym.make(rddl, mode=rddlgym.AST)
    compiler = ReparameterizationCompiler(model, batch_size=BATCH_SIZE)
    compiler.init()

    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("planning")

    with compiler.graph.as_default():
        reparameterization_map = compiler.get_cpfs_reparameterization()
        cell_samples = utils.get_noise_samples(
            reparameterization_map, BATCH_SIZE, horizon=1)
        cell_noise, encoding = utils.encode_noise_samples_as_inputs(cell_samples)

    cell = SimulationCell(compiler, policy, config={"encoding": encoding})
    cell.cell_noise = cell_noise
    yield cell
def non_parallel_plans(compiler):
    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=False)
    policy.build("non_parallel_plans")
    return policy
def parallel_plans(compiler):
    policy = OpenLoopPolicy(compiler, HORIZON, parallel_plans=True)
    policy.build("parallel_plans")
    return policy
def _build_policy_ops(self):
    horizon = self.config["horizon"]
    self.policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=True)
    self.policy.build("tensorplan")
def _build_base_policy_ops(self):
    horizon = 1
    self.base_policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=False)
    self.base_policy.build("base_policy")
class StraightLinePlanner(StochasticPlanner):
    """StraightLinePlanner class implements the online gradient-based planner
    that chooses the next action based on the lower bound of the Value
    function of the start state.

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super().__init__(rddl, ReparameterizationCompiler, config)

        self.policy = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.writer = None
        self.summaries = None

    def _build_policy_ops(self):
        horizon = self.horizon
        self.policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=False)
        self.policy.build("planning")

        if "warm_start" in self.config:
            self.warm_start_op = self.policy._build_warm_start_op()

    def _build_trajectory_ops(self):
        with tf.name_scope("scenarios"):
            self.simulator = Simulator(self.compiler, self.policy, config=None)
            self.simulator.build()
            (
                self.trajectory,
                self.final_state,
                self.total_reward,
            ) = self.simulator.trajectory(self.initial_state, self.sequence_length)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_summary_ops(self):
        if self.config["verbose"]:
            with tf.name_scope("summary"):
                _ = tf.compat.v1.summary.FileWriter(self.config["logdir"], self.graph)
                tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
                tf.compat.v1.summary.scalar("loss", self.loss)
                tf.compat.v1.summary.histogram("total_reward", self.total_reward)
                tf.compat.v1.summary.histogram("scenario_noise", self.simulator.noise)

                for grad, variable in self.grads_and_vars:
                    var_name = variable.name
                    tf.compat.v1.summary.histogram(f"{var_name}_grad", grad)
                    tf.compat.v1.summary.histogram(var_name, variable)

                self.summaries = tf.compat.v1.summary.merge_all()

    def __call__(self, state, timestep):
        scenario_noise = utils.evaluate_noise_samples_as_inputs(
            self._sess, self.simulator.samples)

        steps_to_go = self.config["horizon"] - timestep
        if self.config.get("planning_horizon"):
            steps_to_go = min(steps_to_go, self.config["planning_horizon"])

        feed_dict = {
            self.initial_state: self._get_batch_initial_state(state),
            self.simulator.noise: scenario_noise,
            self.steps_to_go: steps_to_go,
        }

        self.run(timestep, feed_dict)

        action = self._get_action(self.trajectory.actions, feed_dict)
        return action
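# Usage sketch (illustrative only): drives StraightLinePlanner from an
# environment loop. `env` stands for any Gym-style object with reset()/step()
# (an assumption, not defined in these snippets), and `planner.build()` is
# assumed to be provided by the StochasticPlanner base class.
def _straightline_rollout(rddl_id, env, horizon=40):
    config = {"horizon": horizon, "verbose": False, "logdir": "/tmp/straightline"}
    planner = StraightLinePlanner(rddl_id, config)
    planner.build()

    state = env.reset()
    for timestep in range(horizon):
        # each call re-optimizes a single open-loop plan over the sampled scenarios
        action = planner(state, timestep)
        state, reward, done, _ = env.step(action)
        if done:
            break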
class OnlineOpenLoopPlanner(object):
    '''OnlineOpenLoopPlanner implements a gradient-based planner that
    optimizes a sequence of actions in an online setting (i.e.,
    interleaving planning and execution).

    Args:
        compiler (:obj:`rddl2tf.compiler.Compiler`): A RDDL2TensorFlow compiler.
        batch_size (int): The size of the batch used in policy simulation.
        horizon (int): The number of timesteps.
        parallel_plans (bool): The boolean flag for optimizing parallel sequences of actions.
    '''

    def __init__(self,
                 compiler: Compiler,
                 batch_size: int,
                 horizon: int,
                 parallel_plans: bool = True) -> None:
        self._compiler = compiler
        self.batch_size = batch_size
        self.horizon = horizon
        self.parallel_plans = parallel_plans

    def build(self, learning_rate: float, epochs: int, show_progress: bool = True) -> None:
        '''Builds the online open loop planning ops.'''
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.show_progress = show_progress
        self._build_policy_graph()
        self._build_optimizer_graph()

    def _build_policy_graph(self) -> None:
        '''Builds the open loop policy ops.'''
        self._policy = OpenLoopPolicy(
            self._compiler, self.batch_size, self.horizon, self.parallel_plans)
        self._policy.build('planning')

    def _build_optimizer_graph(self) -> None:
        '''Builds the action optimizer ops.'''
        self._optimizer = ActionOptimizer(self._compiler, self._policy)

    def __call__(self, state: StateTensor, t: int) -> Tuple[Action, PolicyVars, float, float]:
        '''Returns the action to be executed in the current `state` at timestep `t`.

        Args:
            state (StateTensor): The current state.
            t (int): The current timestep.

        Returns:
            Tuple[Action, PolicyVars, float, float]: The action and policy
            variables optimized for the current timestep, together with the
            graph building time and the optimization time.
        '''
        # initialize action optimizer
        with self._compiler.graph.as_default():
            with tf.name_scope('timestep{}'.format(t)):
                start = time.time()
                self._optimizer.build(
                    self.learning_rate, self.batch_size, self.horizon - t,
                    parallel_plans=False)
                end = time.time()
                building_time = end - start

        # optimize next action
        start = time.time()
        initial_state = tuple(self._batch_tensor(fluent) for fluent in state)
        actions, policy_vars = self._optimizer.run(self.epochs, initial_state, self.show_progress)

        # outputs
        action = tuple(fluent[0] for fluent in actions)
        policy_vars = tuple(
            np.expand_dims(var[(self.horizon - 1) - t], axis=0) for var in policy_vars)
        end = time.time()
        optimization_time = end - start

        return action, policy_vars, building_time, optimization_time

    def _batch_tensor(self, fluent):
        tensor = np.stack([fluent[0]] * self.batch_size)
        if len(tensor.shape) == 1:
            tensor = np.expand_dims(tensor, -1)
        return tensor
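# Usage sketch (illustrative only): interleaves planning and execution with
# OnlineOpenLoopPlanner. It assumes `compiler` is an initialized
# RDDL2TensorFlow compiler and `env_step` is a hypothetical callable that
# applies an action and returns the next state; neither is defined above.
def _online_planning_example(compiler, initial_state, env_step, horizon=40):
    planner = OnlineOpenLoopPlanner(compiler, batch_size=64, horizon=horizon)
    planner.build(learning_rate=0.01, epochs=50, show_progress=False)

    state = initial_state
    for t in range(horizon):
        # re-optimize the remaining plan at every timestep
        action, policy_vars, building_time, optimization_time = planner(state, t)
        state = env_step(action)
    return state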
class Tensorplan(Planner):
    """Tensorplan class implements the Planner interface
    for the offline gradient-based planner (i.e., tensorplan).

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super(Tensorplan, self).__init__(rddl, DefaultCompiler, config)

        self.policy = None
        self.initial_state = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.optimizer = None
        self.train_op = None

        self.best_plan_idx = None
        self.best_plan = None

        self._plan = None

        self.writer = None
        self.summaries = None

    @property
    def logdir(self):
        return self.config.get("logdir") or f"/tmp/tfplan/tensorplan/{self.rddl}"

    def build(self):
        """Builds planner ops."""
        with self.graph.as_default():
            self._build_policy_ops()
            self._build_initial_state_ops()
            self._build_trajectory_ops()
            self._build_loss_ops()
            self._build_optimization_ops()
            self._build_solution_ops()
            self._build_summary_ops()
            self._build_init_ops()

    def _build_init_ops(self):
        self.init_op = tf.global_variables_initializer()

    def _build_policy_ops(self):
        horizon = self.config["horizon"]
        self.policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=True)
        self.policy.build("tensorplan")

    def _build_initial_state_ops(self):
        self.initial_state = self.compiler.initial_state()

    def _build_trajectory_ops(self):
        self.simulator = Simulator(self.compiler, self.policy)
        self.simulator.build()
        self.trajectory, self.final_state, self.total_reward = self.simulator.trajectory(
            self.initial_state)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_optimization_ops(self):
        self.optimizer = ActionOptimizer(self.config["optimization"])
        self.optimizer.build()
        self.train_op = self.optimizer.minimize(self.loss)

    def _build_solution_ops(self):
        self.best_plan_idx = tf.argmax(self.total_reward, axis=0)
        self.best_plan = tuple(
            action[self.best_plan_idx] for action in self.trajectory.actions)

    def _build_summary_ops(self):
        tf.compat.v1.summary.histogram("total_reward", self.total_reward)
        tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
        tf.compat.v1.summary.scalar("loss", self.loss)
        self.summaries = tf.compat.v1.summary.merge_all()

    def run(self):
        """Runs the planner for the given number of training epochs.

        Returns:
            plan (Sequence[np.ndarray]): The best solution plan.
        """
        self.writer = tf.compat.v1.summary.FileWriter(self.logdir, self.graph)

        self._sess.run(self.init_op)

        run_id = self.config.get("run_id", 0)
        pid = os.getpid()
        position = run_id % self.config.get("num_workers", 1)
        epochs = self.config["epochs"]
        desc = f"(pid={pid}) Run #{run_id:<3d}"

        with trange(epochs, desc=desc, unit="epoch", position=position, leave=False) as t:
            for step in t:
                _, loss_, avg_total_reward_, summary_ = self._sess.run([
                    self.train_op,
                    self.loss,
                    self.avg_total_reward,
                    self.summaries,
                ])

                self.writer.add_summary(summary_, step)

                t.set_postfix(
                    loss=f"{loss_:10.4f}",
                    avg_total_reward=f"{avg_total_reward_:10.4f}")

        self.writer.close()

        plan_ = self._sess.run(self.best_plan)
        return plan_

    def __call__(self, state, timestep):
        """Returns the action for the given `timestep`."""
        # find plan
        if self._plan is None:
            self._plan = self.run()

        # select action for the given timestep
        action_fluent_ordering = self.compiler.rddl.domain.action_fluent_ordering
        action = OrderedDict({
            name: action[timestep]
            for name, action in zip(action_fluent_ordering, self._plan)
        })
        return action
def _build_policy_graph(self) -> None:
    '''Builds the open loop policy ops.'''
    self._policy = OpenLoopPolicy(
        self._compiler, self.batch_size, self.horizon, self.parallel_plans)
    self._policy.build('planning')
class HindsightPlanner(StochasticPlanner):
    """HindsightPlanner class implements an online gradient-based planner
    that chooses the next action based on the upper bound of the Value
    function of the current state.

    Args:
        rddl (str): A RDDL domain/instance filepath or rddlgym id.
        config (Dict[str, Any]): The planner config dict.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, rddl, config):
        super().__init__(rddl, ReparameterizationCompiler, config)

        self.base_policy = None
        self.scenario_policy = None

        self.next_state = None

        self.cell = None
        self.cell_noise = None
        self.cell_samples = None

        self.action = None
        self.reward = None

        self.simulator = None
        self.trajectory = None
        self.final_state = None
        self.scenario_total_reward = None
        self.total_reward = None

        self.avg_total_reward = None
        self.loss = None

        self.writer = None
        self.summaries = None

    def _build_policy_ops(self):
        self._build_base_policy_ops()
        self._build_scenario_policy_ops()

        if "warm_start" in self.config:
            warm_start_base_policy = tf.group(*[
                tf.assign(
                    var1[:, 0, :],
                    tf.reduce_mean(var2[:, 0, :], axis=0, keepdims=True))
                for var1, var2 in zip(
                    self.base_policy._policy_variables,
                    self.scenario_policy._policy_variables)
            ])

            with tf.control_dependencies([warm_start_base_policy]):
                warm_start_scenario_policy = self.scenario_policy._build_warm_start_op()

            self.warm_start_op = tf.group(
                warm_start_base_policy, warm_start_scenario_policy)

    def _build_base_policy_ops(self):
        horizon = 1
        self.base_policy = OpenLoopPolicy(self.compiler, horizon, parallel_plans=False)
        self.base_policy.build("base_policy")

    def _build_scenario_policy_ops(self):
        horizon = self.horizon - 1
        self.scenario_policy = OpenLoopPolicy(
            self.compiler, horizon, parallel_plans=True
        )
        self.scenario_policy.build("scenario_policy")

    def _build_trajectory_ops(self):
        self._build_scenario_start_states_ops()
        self._build_scenario_trajectory_ops()

    def _build_scenario_start_states_ops(self):
        with tf.name_scope("current_action"):
            with tf.name_scope("reparameterization"):
                reparameterization_map = self.compiler.get_cpfs_reparameterization()
                self.cell_samples = utils.get_noise_samples(
                    reparameterization_map, self.batch_size, horizon=1
                )
                self.cell_noise, encoding = utils.encode_noise_samples_as_inputs(
                    self.cell_samples
                )

            self.cell = SimulationCell(
                self.compiler, self.base_policy, config={"encoding": encoding}
            )

            timesteps = tf.zeros((self.batch_size, 1), dtype=tf.float32)
            inputs = tf.concat([timesteps, self.cell_noise[:, 0, ...]], axis=1)

            output, self.next_state = self.cell(inputs, self.initial_state)
            self.action = output[1]
            self.reward = tf.squeeze(output[3])

    def _build_scenario_trajectory_ops(self):
        with tf.name_scope("scenarios"):
            self.simulator = Simulator(self.compiler, self.scenario_policy, config=None)
            self.simulator.build()
            (
                self.trajectory,
                self.final_state,
                self.scenario_total_reward,
            ) = self.simulator.trajectory(self.next_state, self.sequence_length)

    def _build_loss_ops(self):
        with tf.name_scope("loss"):
            self.total_reward = self.reward + self.scenario_total_reward
            self.avg_total_reward = tf.reduce_mean(self.total_reward)
            self.loss = tf.square(self.avg_total_reward)

    def _build_summary_ops(self):
        if self.config["verbose"]:
            with tf.name_scope("summary"):
                _ = tf.compat.v1.summary.FileWriter(self.config["logdir"], self.graph)
                tf.compat.v1.summary.scalar("avg_total_reward", self.avg_total_reward)
                tf.compat.v1.summary.scalar("loss", self.loss)
                tf.compat.v1.summary.histogram("reward", self.reward)
                tf.compat.v1.summary.histogram(
                    "scenario_total_reward", self.scenario_total_reward
                )
                tf.compat.v1.summary.histogram("total_reward", self.total_reward)
                tf.compat.v1.summary.histogram("next_state_noise", self.cell_noise)
                tf.compat.v1.summary.histogram("scenario_noise", self.simulator.noise)

                for grad, variable in self.grads_and_vars:
                    var_name = variable.name
                    tf.compat.v1.summary.histogram(f"{var_name}_grad", grad)
                    tf.compat.v1.summary.histogram(var_name, variable)

                self.summaries = tf.compat.v1.summary.merge_all()

    def __call__(self, state, timestep):
        next_state_noise = utils.evaluate_noise_samples_as_inputs(
            self._sess, self.cell_samples
        )
        scenario_noise = utils.evaluate_noise_samples_as_inputs(
            self._sess, self.simulator.samples
        )

        steps_to_go = self.config["horizon"] - timestep
        if self.config.get("planning_horizon"):
            steps_to_go = min(steps_to_go, self.config["planning_horizon"])
        steps_to_go -= 1

        feed_dict = {
            self.initial_state: self._get_batch_initial_state(state),
            self.cell_noise: next_state_noise,
            self.simulator.noise: scenario_noise,
            self.steps_to_go: steps_to_go,
        }

        self.run(timestep, feed_dict)

        action = self._get_action(self.action, feed_dict)
        return action
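# Illustrative helper (not from the original sources): selects between the two
# online stochastic planners defined above; the string keys are arbitrary.
# StraightLinePlanner optimizes one shared plan across all sampled scenarios
# (a lower bound on the value), while HindsightPlanner shares only the first
# action and optimizes one plan per scenario (an upper bound).
def _make_stochastic_planner(name, rddl_id, config):
    planners = {
        "straightline": StraightLinePlanner,
        "hindsight": HindsightPlanner,
    }
    return planners[name](rddl_id, config)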