def test_step_takes_steps_for_all_non_waiting_envs(self):
    # Stub out worker creation so no real subprocesses are spawned.
    SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker(
        worker_id, EnvironmentResponse("step", worker_id, worker_id)
    )
    manager = SubprocessEnvManager(mock_env_factory, 3)
    # Fake the step queue: workers 0 and 1 report a step, then the queue is empty.
    manager.step_queue = Mock()
    manager.step_queue.get_nowait.side_effect = [
        EnvironmentResponse("step", 0, StepResponse(0, None)),
        EnvironmentResponse("step", 1, StepResponse(1, None)),
        EmptyQueue(),
    ]
    step_mock = Mock()
    last_steps = [Mock(), Mock(), Mock()]
    manager.env_workers[0].previous_step = last_steps[0]
    manager.env_workers[1].previous_step = last_steps[1]
    manager.env_workers[2].previous_step = last_steps[2]
    # Worker 2 is still waiting on a previous step, so step() must skip it.
    manager.env_workers[2].waiting = True
    manager._take_step = Mock(return_value=step_mock)
    res = manager.step()
    for i, env in enumerate(manager.env_workers):
        if i < 2:
            env.send.assert_called_with("step", step_mock)
            manager.step_queue.get_nowait.assert_called()
            # Check that the "last steps" are set to the value returned for each step
            self.assertEqual(
                manager.env_workers[i].previous_step.current_all_brain_info, i
            )
            self.assertEqual(
                manager.env_workers[i].previous_step.previous_all_brain_info,
                last_steps[i].current_all_brain_info,
            )
    # Only the two non-waiting workers contribute step results.
    assert res == [
        manager.env_workers[0].previous_step,
        manager.env_workers[1].previous_step,
    ]
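
The test above relies on a few helpers that aren't shown in the snippet. Below is a minimal sketch of what they might look like; the import paths, the `mock_env_factory` body, and the `MockEnvWorker` fields are assumptions for illustration, not the original definitions.

# Sketch of the helpers the test assumes. Import paths and definitions
# are guesses based on the names used in the snippet, not verbatim originals.
from unittest.mock import Mock
from queue import Empty as EmptyQueue
from mlagents.envs.subprocess_env_manager import (
    SubprocessEnvManager,
    EnvironmentResponse,
    StepResponse,
)


def mock_env_factory(worker_id: int):
    # Any stand-in environment object will do for this test.
    return Mock()


class MockEnvWorker:
    # Stands in for the real worker: records sends, echoes a canned response.
    def __init__(self, worker_id, resp=None):
        self.worker_id = worker_id
        self.process = None
        self.conn = None
        self.send = Mock()
        self.recv = Mock(return_value=resp)
        self.previous_step = None
        self.waiting = False
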
Example 2
    def advance(self, env: SubprocessEnvManager) -> int:
        # Step all environments once and time how long the step takes.
        with hierarchical_timer("env_step"):
            time_start_step = time()
            new_step_infos = env.step()
            delta_time_step = time() - time_start_step

        # Feed each new step's experiences to every trainer.
        for step_info in new_step_infos:
            for brain_name, trainer in self.trainers.items():
                if brain_name in self.trainer_metrics:
                    self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
                trainer.add_experiences(
                    step_info.previous_all_brain_info,
                    step_info.current_all_brain_info,
                    step_info.brain_name_to_action_info[brain_name].outputs,
                )
                trainer.process_experiences(
                    step_info.previous_all_brain_info, step_info.current_all_brain_info
                )
        # Advance each trainer and update its policy once it has enough data.
        for brain_name, trainer in self.trainers.items():
            if brain_name in self.trainer_metrics:
                self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
            if self.train_model and trainer.get_step <= trainer.get_max_steps:
                trainer.increment_step(len(new_step_infos))
                if trainer.is_ready_update():
                    # Perform gradient descent with experience buffer
                    with hierarchical_timer("update_policy"):
                        trainer.update_policy()
                    env.set_policy(brain_name, trainer.policy)
        return len(new_step_infos)
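
`advance` returns the number of new step infos so a caller can track total environment steps. A minimal driver loop, sketched under the assumption that a controller object owns `advance()` and the `trainers` dict used above (the loop itself is illustrative, not the project's actual entry point):

def run_training(controller, env: SubprocessEnvManager) -> int:
    # Illustrative driver: keep stepping while any trainer still has
    # step budget left, using the same get_step/get_max_steps properties
    # referenced in advance() above.
    total_env_steps = 0
    while any(
        trainer.get_step <= trainer.get_max_steps
        for trainer in controller.trainers.values()
    ):
        total_env_steps += controller.advance(env)
    return total_env_steps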