def _flip_past(self, observation_n, reward_n, done_n, info):
    """Step with no-op actions until every env's observation has caught up
    to the reward buffer's remote timestamp, merging each intermediate
    step's rewards/dones into the caller's accumulators in place.
    """
    # Per-env target timestamps recorded by the rewarder at reset time.
    targets = [info_i['reward_buffer.remote_time'] for info_i in info['n']]

    while True:
        new_obs_n, new_rew_n, new_done_n, new_info = self.env.step(
            [[] for _ in range(self.n)])

        # 'diagnostics.image_remote_time' can be absent (e.g. while an env
        # is resetting). Targets are timestamps (> 0), so a missing value
        # defaults to 0 and counts as "still needs to catch up".
        lags = [
            target - info_i.get('diagnostics.image_remote_time', 0)
            for target, info_i in zip(targets, new_info['n'])
        ]
        behind = sum(1 for lag in lags if lag > 0)

        # Fold this step's results into the caller's arrays regardless.
        rewarder.merge_n(observation_n, reward_n, done_n, info,
                         new_obs_n, new_rew_n, new_done_n, new_info)

        if behind == 0:
            return
        logger.debug(
            '[LabCoreSync] Still waiting on %d envs to catch up to their targets: %s',
            behind, lags)
def _step(self, action_n):
    """Step the wrapped env, folding in any reward/done/info state that was
    buffered during reset, then return the processed observation tuple.
    """
    obs_n, rew_n, dones_n, info = self.env.step(action_n)

    # Reset stashed partial results on self; merge them into this step's
    # output exactly once, then clear the stash.
    if self.reward_n is not None:
        rewarder.merge_n(
            obs_n, rew_n, dones_n, info,
            [None] * self.n, self.reward_n, self.done_n, self.info,
        )
        self.reward_n = self.done_n = self.info = None

    return self._observation(dones_n, info), rew_n, dones_n, info
def _reset(self):
    """Reset the wrapped env, prime empty reward/done/info accumulators on
    self, take one no-op step to pull in the first batch of rewarder data,
    and return the processed observation.
    """
    obs_n = self.env.reset()

    # Fresh accumulators; merged into the next real _step's results.
    self.reward_n = [0] * self.n
    self.done_n = [False] * self.n
    self.info = {'n': [{} for _ in range(self.n)]}

    new_obs_n, new_rew_n, new_done_n, new_info = self.env.step(
        [[] for _ in range(self.n)])
    rewarder.merge_n(
        obs_n, self.reward_n, self.done_n, self.info,
        new_obs_n, new_rew_n, new_done_n, new_info,
    )
    return self._observation(self.done_n, self.info)
def _step(self, action_n):
    """Step with a trailing 'c' keypress appended to each action (the remote
    interprets it as committing the action), block until every env reports
    exactly one post-commit reward, then fast-forward observations past the
    rewarder's reset timestamps via _flip_past.
    """
    # Press-and-release of 'c' "commits" the action on the remote side.
    commit = [
        spaces.KeyEvent.by_name('c', down=True),
        spaces.KeyEvent.by_name('c', down=False),
    ]
    action_n = [list(action) + commit for action in action_n]

    obs_n, rew_n, dones_n, info = self.env.step(action_n)

    # Fold in any reward/done/info state stashed during reset.
    if self.reward_n is not None:
        rewarder.merge_n(
            obs_n, rew_n, dones_n, info,
            [None] * self.n, self.reward_n, self.done_n, self.info,
        )
        self.reward_n = self.done_n = self.info = None

    # Spin on no-op steps until every env has received its post-commit
    # reward (stats.reward.count goes from 0 to 1), merging as we go.
    while True:
        pending = sum(
            1 for info_i in info['n'] if info_i['stats.reward.count'] == 0)
        if pending == 0:
            break
        logger.debug(
            '[LabCoreSync] Still waiting on %d envs to receive their post-commit reward',
            pending)
        new_obs_n, new_rew_n, new_done_n, new_info = self.env.step(
            [[] for _ in range(self.n)])
        rewarder.merge_n(obs_n, rew_n, dones_n, info,
                         new_obs_n, new_rew_n, new_done_n, new_info)

    assert all(
        info_i['stats.reward.count'] == 1 for info_i in info['n']
    ), "Expected all stats.reward.counts to be 1: {}".format(info)

    # Fast forward until the observation is caught up with the rewarder.
    self._flip_past(obs_n, rew_n, dones_n, info)
    return obs_n, rew_n, dones_n, info
def _reset(self):
    """Reset the wrapped env and spin (peeking rewards for done envs) until
    every env has produced a non-None observation, merging intermediate
    rewards into fresh accumulators on self.
    """
    obs_n = self.env.reset()
    self.reward_n = [0] * self.n
    self.done_n = [False] * self.n
    self.info = {'n': [{} for _ in range(self.n)]}

    while any(ob is None for ob in obs_n):
        # Done envs only peek: popping reward/done here would merge results
        # across an episode boundary.
        action_n = [
            [spaces.PeekReward] if done else []
            for done in self.done_n
        ]
        new_obs_n, new_rew_n, new_done_n, new_info = self.env.step(action_n)
        rewarder.merge_n(obs_n, self.reward_n, self.done_n, self.info,
                         new_obs_n, new_rew_n, new_done_n, new_info)
    return obs_n
def _step(self, action_n):
    """Step the wrapped env, fold in any reset-time stashed rewards, then
    spin (peeking rewards for done envs) until every observation is
    non-None, merging intermediate results along the way.
    """
    obs_n, rew_n, dones_n, info = self.env.step(action_n)

    # Merge state stashed on self during reset, exactly once.
    if self.reward_n is not None:
        rewarder.merge_n(obs_n, rew_n, dones_n, info,
                         [None] * self.n, self.reward_n, self.done_n,
                         self.info)
        self.reward_n = self.done_n = self.info = None

    while any(ob is None for ob in obs_n):
        # Done envs only peek: popping reward/done here would merge results
        # across an episode boundary.
        next_action_n = [
            [spaces.PeekReward] if done else []
            for done in dones_n
        ]
        new_obs_n, new_rew_n, new_done_n, new_info = self.env.step(
            next_action_n)
        rewarder.merge_n(obs_n, rew_n, dones_n, info,
                         new_obs_n, new_rew_n, new_done_n, new_info)
    return obs_n, rew_n, dones_n, info