def run_episode(
    env: Env, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run an episode and return the collected Trajectory.

    After max_steps (if specified), the environment is assumed to be terminal.
    The mdp_id of the episode can also be specified.
    """
    trajectory = Trajectory()
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        num_steps += 1
    return trajectory
def training_step(self, batch, batch_idx: int, optimizer_idx: int = 0):
    assert (optimizer_idx == 0) or (self._num_optimizing_steps > 1)

    if self._training_step_generator is None:
        if self._training_batch_type and isinstance(batch, dict):
            batch = self._training_batch_type.from_dict(batch)
        self._training_step_generator = self.train_step_gen(batch, batch_idx)

    ret = next(self._training_step_generator)

    if optimizer_idx == self._num_optimizing_steps - 1:
        if not self._verified_steps:
            try:
                next(self._training_step_generator)
            except StopIteration:
                self._verified_steps = True
            if not self._verified_steps:
                raise RuntimeError(
                    "training_step_gen() yields too many times. "
                    "The number of yields should match the number of optimizers,"
                    f" in this case {self._num_optimizing_steps}"
                )
        self._training_step_generator = None
        SummaryWriterContext.increase_global_step()

    return ret
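# For context: training_step() above pulls exactly one yielded value per optimizer
# from train_step_gen(). A minimal sketch of such a generator follows. The
# two-optimizer (critic/actor) split and the compute_*_loss helpers are
# illustrative assumptions, not the library's actual implementation.
def train_step_gen(self, batch, batch_idx: int):
    # Consumed by training_step(optimizer_idx=0), e.g. the critic optimizer.
    critic_loss = self.compute_critic_loss(batch)  # hypothetical helper
    yield critic_loss

    # Consumed by training_step(optimizer_idx=1), e.g. the actor optimizer.
    # With two optimizers, self._num_optimizing_steps == 2, so yielding a
    # third time would trigger the RuntimeError raised in training_step().
    actor_loss = self.compute_actor_loss(batch)  # hypothetical helper
    yield actor_loss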
def __iter__(self):
    t = tqdm(total=self.dataloader_size, desc="iterating dataloader")
    for batch in self.dataloader:
        batch_size = get_batch_size(batch)
        yield batch
        t.update(batch_size)
        SummaryWriterContext.increase_global_step()
    # Clean up if needed (e.g. Petastorm DataLoader)
    if hasattr(self.dataloader, "__exit__"):
        self.dataloader.__exit__(None, None, None)
async def async_run_episode(
    env: EnvWrapper,
    agent: Agent,
    mdp_id: int = 0,
    max_steps: Optional[int] = None,
    fill_info: bool = False,
) -> Trajectory:
    """
    Run an episode and return the collected Trajectory.

    NOTE: this function is an async coroutine in order to support async
    env.step(). If you are using it with a regular env.step() method, use the
    non-async run_episode(), which wraps this function.

    After max_steps (if specified), the environment is assumed to be terminal.
    The mdp_id of the episode can also be specified.
    """
    trajectory = Trajectory()
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    step_is_coroutine = asyncio.iscoroutinefunction(env.step)
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        if step_is_coroutine:
            next_obs, reward, terminal, info = await env.step(action)
        else:
            next_obs, reward, terminal, info = env.step(action)
        if not fill_info:
            info = None
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
            info=info,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
def test_global_step(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
            SummaryWriterContext.increase_global_step()
            SummaryWriterContext.add_scalar("test", torch.zeros(1))
        writer.add_scalar.assert_has_calls(
            [
                call("test", torch.ones(1), global_step=0),
                call("test", torch.zeros(1), global_step=1),
            ]
        )
        self.assertEqual(2, len(writer.add_scalar.mock_calls))
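# The test above doubles as a usage recipe: inside a summary_writer_context(writer)
# block, SummaryWriterContext.add_scalar() tags each scalar with the current global
# step, and increase_global_step() advances that counter. Below is a minimal sketch
# of the same pattern in a training loop; the loss values are hypothetical, and
# summary_writer_context / SummaryWriterContext are assumed to be imported from the
# same module as in the snippets above (module path omitted).
from tempfile import TemporaryDirectory

from torch.utils.tensorboard import SummaryWriter

with TemporaryDirectory() as tmp_dir:
    writer = SummaryWriter(tmp_dir)
    with summary_writer_context(writer):
        for step in range(10):
            loss_for_step = 1.0 / (step + 1)  # hypothetical loss value
            # Logged with global_step equal to the current counter value.
            SummaryWriterContext.add_scalar("loss", loss_for_step)
            # Advance the counter so the next scalar is logged at step + 1.
            SummaryWriterContext.increase_global_step()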
def train_network(self, train_dataset, eval_dataset, epochs: int):
    num_batches = int(len(train_dataset) / self.minibatch_size)
    logger.info(
        "Read in batch data set of size {} examples. Data split "
        "into {} batches of size {}.".format(
            len(train_dataset), num_batches, self.minibatch_size
        )
    )

    start_time = time.time()
    for epoch in range(epochs):
        train_dataset.reset_iterator()
        data_streamer = DataStreamer(train_dataset, pin_memory=self.trainer.use_gpu)

        feed_pages(
            data_streamer,
            len(train_dataset),
            epoch,
            self.minibatch_size,
            self.trainer.use_gpu,
            TrainingPageHandler(self.trainer),
            batch_preprocessor=self.batch_preprocessor,
        )

        if hasattr(self.trainer, "q_network_cpe"):
            # TODO: Add CPE support to SAC
            eval_dataset.reset_iterator()
            data_streamer = DataStreamer(eval_dataset, pin_memory=self.trainer.use_gpu)
            eval_page_handler = EvaluationPageHandler(self.trainer, self.evaluator, self)
            feed_pages(
                data_streamer,
                len(eval_dataset),
                epoch,
                self.minibatch_size,
                self.trainer.use_gpu,
                eval_page_handler,
                batch_preprocessor=self.batch_preprocessor,
            )

        SummaryWriterContext.increase_global_step()

    through_put = (len(train_dataset) * epochs) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )
def training_step(self, batch, batch_idx: int, optimizer_idx: int):
    if self._training_step_generator is None:
        self._training_step_generator = self.train_step_gen(batch, batch_idx)

    ret = next(self._training_step_generator)

    if optimizer_idx == self._num_optimizing_steps - 1:
        if not self._verified_steps:
            try:
                next(self._training_step_generator)
            except StopIteration:
                self._verified_steps = True
            if not self._verified_steps:
                raise RuntimeError("training_step_gen() yields too many times")
        self._training_step_generator = None
        SummaryWriterContext.increase_global_step()

    return ret
def run_episode(
    env: EnvWrapper, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run an episode and return the collected Trajectory.

    After max_steps (if specified), the environment is assumed to be terminal.
    The mdp_id of the episode can also be specified.
    """
    trajectory = Trajectory()
    # pyre-fixme[16]: `EnvWrapper` has no attribute `reset`.
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        # pyre-fixme[16]: `EnvWrapper` has no attribute `step`.
        next_obs, reward, terminal, _ = env.step(action)
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
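# Taken together, the episode runners above rely on a small Agent surface:
# act(obs, possible_actions_mask) returning (action, log_prob), plus the
# post_step(transition) and post_episode(trajectory) hooks. The stub below is a
# hypothetical, duck-typed illustration of that contract; the uniform-random
# policy and the RandomDiscreteAgent name are assumptions, not library code.
import numpy as np


class RandomDiscreteAgent:
    def __init__(self, num_actions: int):
        self.num_actions = num_actions

    def act(self, obs, possible_actions_mask=None):
        # Pick uniformly among allowed actions and report the log-probability.
        if possible_actions_mask is not None:
            allowed = np.flatnonzero(np.asarray(possible_actions_mask))
        else:
            allowed = np.arange(self.num_actions)
        action = int(np.random.choice(allowed))
        log_prob = float(np.log(1.0 / len(allowed)))
        return action, log_prob

    def post_step(self, transition) -> None:
        # Per-transition hook, e.g. to push into a replay buffer.
        pass

    def post_episode(self, trajectory) -> None:
        # Per-episode hook, e.g. for on-policy updates.
        pass


# Usage sketch (assumes an EnvWrapper exposing reset/step/possible_actions_mask):
# trajectory = run_episode(env, RandomDiscreteAgent(num_actions=2), max_steps=200)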
def run_episode(env: Env, agent: Agent, max_steps: Optional[int] = None) -> float:
    """
    Return sum of rewards from episode.
    After max_steps (if specified), the environment is assumed to be terminal.
    """
    ep_reward = 0.0
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        obs = next_obs
        ep_reward += reward
        num_steps += 1
        if max_steps is not None and num_steps > max_steps:
            terminal = True
        agent.post_step(reward, terminal)
        SummaryWriterContext.increase_global_step()
    return ep_reward
def handle(self, tdp: PreprocessedTrainingBatch) -> None:
    SummaryWriterContext.increase_global_step()
    self.trainer_or_evaluator.train(tdp)