Example 1
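A multi-environment worker loop: real actions are applied to the worker's own environments, while an imagined step deep-copies each environment once per candidate action and steps the copies as predictions.
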
    def start(self):
        timing = AttrDict({'copying': 0, 'prediction': 0})

        while True:
            actions, step_type = self.step_queue.get()
            if actions is None:  # stop signal
                for i, e in enumerate(self.envs):
                    log.info('Closing env %d', self.env_indices[i])
                    e.close()
                log.info('Stop worker %r...', self.env_indices)
                break

            if step_type == StepType.REAL:
                envs = self.envs
                self.imagined_envs = None
            else:  # step_type == StepType.IMAGINED

                if self.imagined_envs is None:
                    # initializing new prediction, let's report timing for the previous one
                    if timing.prediction > 0 and self._verbose:
                        log.debug(
                            'Multi-env copy took %.6f s, prediction took %.6f s',
                            timing.copying,
                            timing.prediction,
                        )

                    timing.prediction = 0
                    timing.copying = time.time()

                    self.imagined_envs = []
                    # we expect a list of actions for every environment in this worker (list of lists)
                    assert len(actions) == len(self.envs)
                    for env_idx in range(len(actions)):
                        for _ in actions[env_idx]:
                            imagined_env = copy.deepcopy(self.envs[env_idx])
                            self.imagined_envs.append(imagined_env)
                    timing.copying = time.time() - timing.copying

                envs = self.imagined_envs
                actions = np.asarray(actions).flatten()

            assert len(envs) == len(actions)

            # Collect obs, reward, done, and info for each env
            prediction_start = time.time()
            results = [env.step(action) for env, action in zip(envs, actions)]

            # pack results per-env
            results = np.split(np.array(results), len(self.envs))

            if step_type == StepType.IMAGINED:
                timing.prediction += time.time() - prediction_start

            # If this is a real step and the env is done, reset
            if step_type == StepType.REAL:
                for i, result in enumerate(results):
                    obs, reward, done, info = result[0]
                    if done:
                        obs = self.envs[i].reset()
                    results[i] = (obs, reward, done, info)  # collapse dimension of size 1

            self.result_queue.put(results)
            self.step_queue.task_done()
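
Example 1 relies on a few names that are not part of the snippet (StepType, AttrDict, log, and the step_queue / result_queue pair). The sketch below is only an assumption about what that scaffolding and the driver side of the protocol could look like; the (actions, step_type) message format and the None stop signal are taken from the code above, everything else is hypothetical.

from enum import Enum


class StepType(Enum):
    # hypothetical definition; the real enum is defined elsewhere in the source
    REAL = 0
    IMAGINED = 1


# Hypothetical driver-side usage, assuming step_queue and result_queue are
# multiprocessing.JoinableQueue objects shared with the worker process:
#
#   step_queue.put((actions, StepType.REAL))            # one action per env in this worker
#   results = result_queue.get()                        # per-env (obs, reward, done, info)
#
#   step_queue.put((action_lists, StepType.IMAGINED))   # a list of candidate actions per env
#   results = result_queue.get()
#
#   step_queue.put((None, None))                        # stop signal: worker closes its envs and exits
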
Example 2
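A more elaborate variant of the same worker: the task queue carries typed messages (init, terminate, reset, real or imagined steps, info requests), imagined copies are closed as soon as a real step or reset arrives, and multi-agent episodes count as done only when every agent is done.
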
    def start(self):
        real_envs = []
        imagined_envs = None

        timing = AttrDict({'copying': 0, 'prediction': 0})

        while True:
            actions, msg_type = safe_get(self.task_queue)

            if msg_type == MsgType.INIT:
                self._init(real_envs)
                self.task_queue.task_done()
                continue

            if msg_type == MsgType.TERMINATE:
                self._terminate(real_envs, imagined_envs)
                self.task_queue.task_done()
                break

            # handling actual workload
            envs = real_envs
            if msg_type in (MsgType.RESET, MsgType.STEP_REAL, MsgType.STEP_REAL_RESET):
                if imagined_envs is not None:
                    for imagined_env in imagined_envs:
                        imagined_env.close()
                imagined_envs = None
            elif msg_type == MsgType.INFO:
                pass
            else:

                if imagined_envs is None:
                    # initializing new prediction, let's report timing for the previous one
                    if timing.prediction > 0 and self._verbose:
                        log.debug(
                            'Multi-env copy took %.6f s, prediction took %.6f s',
                            timing.copying,
                            timing.prediction,
                        )

                    timing.prediction = 0
                    timing.copying = time.time()

                    imagined_envs = []
                    # we expect a list of actions for every environment in this worker (list of lists)
                    assert len(actions) == len(real_envs)
                    for env_idx in range(len(actions)):
                        for _ in actions[env_idx]:
                            imagined_env = copy.deepcopy(real_envs[env_idx])
                            imagined_envs.append(imagined_env)
                    timing.copying = time.time() - timing.copying

                envs = imagined_envs
                actions = np.asarray(actions).flatten()

            if msg_type == MsgType.RESET:
                results = [env.reset() for env in envs]
            elif msg_type == MsgType.INFO:
                results = [self._get_info(env) for env in envs]
            else:
                assert len(envs) == len(actions)

                reset = [False] * len(actions)
                if msg_type == MsgType.STEP_REAL_RESET:
                    actions, reset = zip(*actions)

                # Collect obs, reward, done, and info
                prediction_start = time.time()
                results = [
                    env.step(action) for env, action in zip(envs, actions)
                ]
                self.timestep += 1

                # pack results per-env
                results = np.split(np.array(results), len(real_envs))

                if msg_type == MsgType.STEP_IMAGINED:
                    timing.prediction += time.time() - prediction_start

                # If this is a real step and the env is done, reset
                if msg_type in (MsgType.STEP_REAL, MsgType.STEP_REAL_RESET):
                    for i, result in enumerate(results):
                        obs, reward, done, info = result[0]

                        # an episode is over when all agents are done (multi-agent)
                        # or when the single done flag is set (single-agent)
                        if self.is_multiagent and all(done):
                            is_done = True
                        elif not self.is_multiagent and done:
                            is_done = True
                        else:
                            is_done = False

                        if is_done or reset[i]:
                            obs = real_envs[i].reset()
                            if not self.is_multiagent:
                                info = self._get_info(real_envs[i])  # info for the new episode

                        results[i] = (obs, reward, done, info)

            self.result_queue.put(results)
            self.task_queue.task_done()
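
Example 2 additionally relies on MsgType and safe_get, neither of which is shown. Below is a minimal sketch under the assumption that MsgType is a plain Enum and safe_get is a get() wrapper that retries on queue.Empty; both definitions are hypothetical, and only the member names are taken from the code above.

import queue
from enum import Enum


class MsgType(Enum):
    # hypothetical values; only the member names appear in the code above
    INIT = 0
    TERMINATE = 1
    RESET = 2
    INFO = 3
    STEP_REAL = 4
    STEP_IMAGINED = 5
    STEP_REAL_RESET = 6


def safe_get(q, timeout=1.0):
    # hypothetical helper: block on the queue, retrying quietly when get() times out
    while True:
        try:
            return q.get(timeout=timeout)
        except queue.Empty:
            continue

One detail worth noting: for MsgType.STEP_REAL_RESET the worker expects every element of actions to be an (action, reset_flag) pair, which it unpacks with zip(*actions) so that individual environments can be force-reset after a real step.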