def recv(
    self
) -> Union[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], Tuple[
        np.ndarray, dict], np.ndarray]:  # noqa:E125
    """Receive result from low-level worker.

    If the last "send" call carried a null action (None), this returns only
    a single observation; otherwise it returns a tuple of
    (obs, rew, done, info).
    """
    if hasattr(self, "get_result"):
        deprecation(
            "get_result will soon be deprecated. "
            "Please use send and recv for your own EnvWorker."
        )
        if not self.is_reset:
            self.result = self.get_result()  # type: ignore
    return self.result
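# Example (a sketch, not library code): how a caller unpacks recv()
# depending on the last signal sent. `interact` is a hypothetical helper,
# and `worker` is assumed to be any concrete EnvWorker subclass instance.
def interact(worker, action):
    worker.send(action)
    if action is None:
        return worker.recv()  # "reset" signal: a single observation
    obs, rew, done, info = worker.recv()  # "step" signal: full transition
    return obs, rew, done, info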
def send(self, action: Optional[np.ndarray]) -> None:
    """Send action signal to low-level worker.

    When action is None, it indicates sending a "reset" signal; otherwise
    it indicates a "step" signal. The paired return value of the subsequent
    "recv" call depends on which signal was sent.
    """
    if hasattr(self, "send_action"):
        deprecation(
            "send_action will soon be deprecated. "
            "Please use send and recv for your own EnvWorker."
        )
        if action is None:
            self.is_reset = True
            self.result = self.reset()
        else:
            self.is_reset = False
            self.send_action(action)  # type: ignore
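# Example (a sketch, not library code): a toy in-process class following
# the same send/recv contract, shown without the real EnvWorker base class
# and its other abstract methods for brevity. `ToyWorker` is a hypothetical
# name; the old gym API (reset -> obs, step -> 4-tuple) is assumed, matching
# the docstrings above.
import gym


class ToyWorker:
    def __init__(self, env_fn):
        self.env = env_fn()
        self.result = None

    def send(self, action):
        if action is None:  # "reset" signal
            self.result = self.env.reset()
        else:  # "step" signal
            self.result = self.env.step(action)

    def recv(self):
        # obs after a reset signal, (obs, rew, done, info) after a step
        return self.result


worker = ToyWorker(lambda: gym.make("CartPole-v0"))
worker.send(None)
obs = worker.recv()
worker.send(worker.env.action_space.sample())
obs, rew, done, info = worker.recv()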
def __init__(
    self,
    learning_type: str,
    policy: BasePolicy,
    max_epoch: int,
    batch_size: int,
    train_collector: Optional[Collector] = None,
    test_collector: Optional[Collector] = None,
    buffer: Optional[ReplayBuffer] = None,
    step_per_epoch: Optional[int] = None,
    repeat_per_collect: Optional[int] = None,
    episode_per_test: Optional[int] = None,
    update_per_step: Union[int, float] = 1,
    update_per_epoch: Optional[int] = None,
    step_per_collect: Optional[int] = None,
    episode_per_collect: Optional[int] = None,
    train_fn: Optional[Callable[[int, int], None]] = None,
    test_fn: Optional[Callable[[int, Optional[int]], None]] = None,
    stop_fn: Optional[Callable[[float], bool]] = None,
    save_best_fn: Optional[Callable[[BasePolicy], None]] = None,
    save_checkpoint_fn: Optional[Callable[[int, int, int], str]] = None,
    resume_from_log: bool = False,
    reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None,
    logger: BaseLogger = LazyLogger(),
    verbose: bool = True,
    show_progress: bool = True,
    test_in_train: bool = True,
    save_fn: Optional[Callable[[BasePolicy], None]] = None,
):
    if save_fn:
        deprecation(
            "save_fn in trainer is marked as deprecated and will be "
            "removed in the future. Please use save_best_fn instead."
        )
        assert save_best_fn is None
        save_best_fn = save_fn

    self.policy = policy
    self.buffer = buffer

    self.train_collector = train_collector
    self.test_collector = test_collector

    self.logger = logger
    self.start_time = time.time()
    self.stat: DefaultDict[str, MovAvg] = defaultdict(MovAvg)
    self.best_reward = 0.0
    self.best_reward_std = 0.0
    self.start_epoch = 0
    self.gradient_step = 0
    self.env_step = 0
    self.max_epoch = max_epoch
    self.step_per_epoch = step_per_epoch

    # either one of these two
    self.step_per_collect = step_per_collect
    self.episode_per_collect = episode_per_collect

    self.update_per_step = update_per_step
    self.repeat_per_collect = repeat_per_collect

    self.episode_per_test = episode_per_test
    self.batch_size = batch_size

    self.train_fn = train_fn
    self.test_fn = test_fn
    self.stop_fn = stop_fn
    self.save_best_fn = save_best_fn
    self.save_checkpoint_fn = save_checkpoint_fn

    self.reward_metric = reward_metric
    self.verbose = verbose
    self.show_progress = show_progress
    self.test_in_train = test_in_train
    self.resume_from_log = resume_from_log

    self.is_run = False
    self.last_rew, self.last_len = 0.0, 0

    self.epoch = self.start_epoch
    self.best_epoch = self.start_epoch
    self.stop_fn_flag = False
    self.iter_num = 0
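# Example (a sketch, not library code): how these arguments typically reach
# this constructor through an off-policy subclass. `train` is a hypothetical
# helper; the policy and collectors are assumed to be built elsewhere, and
# the hyperparameter values are illustrative, not recommended defaults.
from tianshou.data import Collector
from tianshou.policy import BasePolicy
from tianshou.trainer import OffpolicyTrainer


def train(
    policy: BasePolicy,
    train_collector: Collector,
    test_collector: Collector,
) -> dict:
    trainer = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=10,
        step_per_epoch=10000,
        step_per_collect=10,  # exclusive with episode_per_collect
        episode_per_test=100,
        batch_size=64,
        update_per_step=0.1,  # one gradient step per ten env steps
    )
    return trainer.run()  # iterates all epochs, returns the stats dict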