Code Example #1
File: appo.py  Project: synctext/sample-factory
    def init_workers(self):
        """
        Initialize all types of workers and start their worker processes.
        """

        actor_queues = [faster_fifo.Queue() for _ in range(self.cfg.num_workers)]

        policy_worker_queues = dict()
        for policy_id in range(self.cfg.num_policies):
            policy_worker_queues[policy_id] = []
            for i in range(self.cfg.policy_workers_per_policy):
                policy_worker_queues[policy_id].append(TorchJoinableQueue())

        log.info('Initializing learners...')
        policy_locks = [multiprocessing.Lock() for _ in range(self.cfg.num_policies)]
        resume_experience_collection_cv = [multiprocessing.Condition() for _ in range(self.cfg.num_policies)]

        learner_idx = 0
        for policy_id in range(self.cfg.num_policies):
            learner_worker = LearnerWorker(
                learner_idx, policy_id, self.cfg, self.obs_space, self.action_space,
                self.report_queue, policy_worker_queues[policy_id], self.traj_buffers,
                policy_locks[policy_id], resume_experience_collection_cv[policy_id],
            )
            learner_worker.start_process()
            learner_worker.init()

            self.learner_workers[policy_id] = learner_worker
            learner_idx += 1

        log.info('Initializing policy workers...')
        for policy_id in range(self.cfg.num_policies):
            self.policy_workers[policy_id] = []

            policy_queue = faster_fifo.Queue()
            self.policy_queues[policy_id] = policy_queue

            for i in range(self.cfg.policy_workers_per_policy):
                policy_worker = PolicyWorker(
                    i, policy_id, self.cfg, self.obs_space, self.action_space, self.traj_buffers,
                    policy_queue, actor_queues, self.report_queue, policy_worker_queues[policy_id][i],
                    policy_locks[policy_id], resume_experience_collection_cv[policy_id],
                )
                self.policy_workers[policy_id].append(policy_worker)
                policy_worker.start_process()

        log.info('Initializing actors...')

        # We support actor worker initialization in groups, which can be useful for envs that,
        # e.g., crash when too many environments are initialized in parallel.
        # Currently the limit is not used, since none of the envs supported out of the box
        # require it, so we parallelize initialization as much as possible.
        # If your environment does need this, a better solution might be a global lock,
        # such as FileLock (see doom_gym.py).
        self.actor_workers = []
        max_parallel_init = int(1e9)  # might be useful to limit this for some envs
        worker_indices = list(range(self.cfg.num_workers))
        for i in range(0, self.cfg.num_workers, max_parallel_init):
            workers = self.init_subset(worker_indices[i:i + max_parallel_init], actor_queues)
            self.actor_workers.extend(workers)
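The comment above mentions initializing actor workers in groups. The following minimal sketch isolates that chunked-initialization idea; init_group and num_workers are placeholders for illustration, not sample-factory APIs.

def init_in_groups(num_workers, max_parallel_init, init_group):
    """Initialize workers in chunks of at most max_parallel_init at a time."""
    workers = []
    indices = list(range(num_workers))
    for start in range(0, num_workers, max_parallel_init):
        # init_group is assumed to block until every worker in the chunk is ready
        workers.extend(init_group(indices[start:start + max_parallel_init]))
    return workers

With max_parallel_init set to a huge value, as in the snippet above, the loop degenerates into a single fully parallel initialization pass.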
Code Example #2
    def __init__(self,
                 player_id,
                 make_env_func,
                 env_config,
                 use_multiprocessing=False,
                 reset_on_init=True):
        self.player_id = player_id
        self.make_env_func = make_env_func
        self.env_config = env_config
        self.reset_on_init = reset_on_init
        if use_multiprocessing:
            self.process = Process(target=self.start, daemon=False)
            self.task_queue, self.result_queue = faster_fifo.Queue(), faster_fifo.Queue()
        else:
            self.process = threading.Thread(target=self.start)
            self.task_queue, self.result_queue = Queue(), Queue()

        self.process.start()
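This constructor only wires up the transport: with use_multiprocessing the worker runs in a separate process and communicates over faster_fifo queues, otherwise it runs as a thread with regular in-process queues. Below is a minimal sketch of the kind of loop the target (start) might run; the 'terminate' sentinel and the (method_name, args) task format are assumptions for illustration, not the actual sample-factory protocol.

def _worker_loop(make_env_func, env_config, task_queue, result_queue, reset_on_init=True):
    env = make_env_func(env_config)  # call convention assumed for this sketch
    if reset_on_init:
        env.reset()
    while True:
        task = task_queue.get()
        if task == 'terminate':  # hypothetical shutdown sentinel
            break
        method_name, args = task
        result_queue.put(getattr(env, method_name)(*args))
    env.close()

The same loop works for both transports, since faster_fifo.Queue and queue.Queue expose the same basic get/put interface.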
Code Example #3
File: learner.py  Project: erikwijmans/sample-factory
    def __init__(
        self,
        worker_idx,
        policy_id,
        cfg,
        obs_space,
        action_space,
        report_queue,
        policy_worker_queues,
        shared_buffers,
        policy_lock,
        resume_experience_collection_cv,
    ):
        log.info('Initializing the learner %d for policy %d', worker_idx,
                 policy_id)

        self.worker_idx = worker_idx
        self.policy_id = policy_id

        self.cfg = cfg

        # PBT-related stuff
        self.should_save_model = True  # set to true if we need to save the model to disk on the next training iteration
        self.load_policy_id = None  # non-None when we need to replace our parameters with another policy's parameters
        self.pbt_mutex = threading.Lock()
        self.new_cfg = None  # non-None when we need to update the learning hyperparameters

        self.terminate = False

        self.obs_space = obs_space
        self.action_space = action_space

        self.rollout_tensors = shared_buffers.tensor_trajectories
        self.traj_tensors_available = shared_buffers.is_traj_tensor_available
        self.policy_versions = shared_buffers.policy_versions
        self.stop_experience_collection = shared_buffers.stop_experience_collection

        self.device = None
        self.actor_critic = None
        self.optimizer = None
        self.policy_lock = policy_lock
        self.resume_experience_collection_cv = resume_experience_collection_cv

        self.task_queue = faster_fifo.Queue()
        self.report_queue = report_queue

        self.initialized_event = MultiprocessingEvent()
        self.initialized_event.clear()

        self.model_saved_event = MultiprocessingEvent()
        self.model_saved_event.clear()

        # queues corresponding to policy workers using the same policy
        # we send weight updates via these queues
        self.policy_worker_queues = policy_worker_queues

        self.experience_buffer_queue = Queue()

        self.tensor_batch_pool = ObjectPool()
        self.tensor_batcher = TensorBatcher(self.tensor_batch_pool)

        self.with_training = True  # set to False for debugging no-training regime
        self.train_in_background = self.cfg.train_in_background_thread  # set to False for debugging

        self.training_thread = Thread(target=self._train_loop) if self.train_in_background else None
        self.train_thread_initialized = threading.Event()

        self.is_training = False

        self.train_step = self.env_steps = 0

        # decay rate at which summaries are collected
        # save summaries every 20 seconds in the beginning, but decay to every 4 minutes in the limit, because we
        # do not need frequent summaries for longer experiments
        self.summary_rate_decay_seconds = LinearDecay([(0, 20), (100000, 120),
                                                       (1000000, 240)])
        self.last_summary_time = 0

        self.last_saved_time = self.last_milestone_time = 0

        self.discarded_experience_over_time = deque([], maxlen=30)
        self.discarded_experience_timer = time.time()
        self.num_discarded_rollouts = 0

        self.process = Process(target=self._run, daemon=True)
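When train_in_background_thread is enabled, the learner above feeds batches through experience_buffer_queue to a training thread, and uses threading events to signal readiness and shutdown. A minimal, self-contained sketch of that background-training pattern, assuming a placeholder train_on_batch callable:

import queue
import threading

def train_loop(batch_queue, initialized_event, terminate_event, train_on_batch):
    initialized_event.set()  # signal the main thread that the trainer is ready
    while not terminate_event.is_set():
        try:
            batch = batch_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        # GPU-heavy PyTorch ops generally release the GIL, so training here can
        # overlap with the main thread's batching and inter-process communication
        train_on_batch(batch)

The main thread would create threading.Thread(target=train_loop, args=(...)), start it, and push batches onto batch_queue, mirroring the training_thread / train_thread_initialized pair in the constructor above.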
Code Example #4
File: appo.py  Project: synctext/sample-factory
    def __init__(self, cfg):
        super().__init__(cfg)

        tmp_env = make_env_func(self.cfg, env_config=None)
        self.obs_space = tmp_env.observation_space
        self.action_space = tmp_env.action_space
        self.num_agents = tmp_env.num_agents

        self.reward_shaping_scheme = None
        if self.cfg.with_pbt:
            if hasattr(tmp_env.unwrapped, '_reward_shaping_wrapper'):
                # noinspection PyProtectedMember
                self.reward_shaping_scheme = tmp_env.unwrapped._reward_shaping_wrapper.reward_shaping_scheme
            else:
                try:
                    from envs.doom.multiplayer.doom_multiagent_wrapper import MultiAgentEnv
                    if isinstance(tmp_env.unwrapped, MultiAgentEnv):
                        self.reward_shaping_scheme = tmp_env.unwrapped.default_reward_shaping
                except ImportError:
                    pass

        tmp_env.close()

        # shared memory allocation
        self.traj_buffers = SharedBuffers(self.cfg, self.num_agents, self.obs_space, self.action_space)

        self.actor_workers = None

        self.report_queue = faster_fifo.Queue(20 * 1000 * 1000)
        self.policy_workers = dict()
        self.policy_queues = dict()

        self.learner_workers = dict()

        self.workers_by_handle = None

        self.policy_inputs = [[] for _ in range(self.cfg.num_policies)]
        self.policy_outputs = dict()
        for worker_idx in range(self.cfg.num_workers):
            for split_idx in range(self.cfg.worker_num_splits):
                self.policy_outputs[(worker_idx, split_idx)] = dict()

        self.policy_avg_stats = dict()
        self.policy_lag = [dict() for _ in range(self.cfg.num_policies)]

        self.last_timing = dict()
        self.env_steps = dict()
        self.samples_collected = [0 for _ in range(self.cfg.num_policies)]
        self.total_env_steps_since_resume = 0

        # currently this applies only to the current run, not experiment as a whole
        # to change this behavior we'd need to save the state of the main loop to a filesystem
        self.total_train_seconds = 0

        self.last_report = time.time()
        self.last_experiment_summaries = 0

        self.report_interval = 5.0  # sec
        self.experiment_summaries_interval = self.cfg.experiment_summaries_interval  # sec

        self.avg_stats_intervals = (2, 12, 60)  # 10 seconds, 1 minute, 5 minutes

        self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
        self.throughput_stats = [deque([], maxlen=5) for _ in range(self.cfg.num_policies)]
        self.avg_stats = dict()
        self.stats = dict()  # regular (non-averaged) stats

        self.writers = dict()
        writer_keys = list(range(self.cfg.num_policies))
        for key in writer_keys:
            summary_dir = join(summaries_dir(experiment_dir(cfg=self.cfg)), str(key))
            summary_dir = ensure_dir_exists(summary_dir)
            self.writers[key] = SummaryWriter(summary_dir, flush_secs=20)

        self.pbt = PopulationBasedTraining(self.cfg, self.reward_shaping_scheme, self.writers)
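Note the throwaway environment at the top of this constructor: a single instance is created only to read its observation/action spaces and agent count, then closed before any worker processes exist. A minimal sketch of that pattern, reusing the make_env_func call convention from the snippet (the num_agents default of 1 is an assumption):

def query_env_info(make_env_func, cfg):
    tmp_env = make_env_func(cfg, env_config=None)
    obs_space, action_space = tmp_env.observation_space, tmp_env.action_space
    num_agents = getattr(tmp_env, 'num_agents', 1)  # single-agent default assumed for this sketch
    tmp_env.close()
    return obs_space, action_space, num_agents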
Code Example #5
    def init_workers(self):
        actor_queues = [
            faster_fifo.Queue() for _ in range(self.cfg.num_workers)
        ]

        policy_worker_queues = dict()
        for policy_id in range(self.cfg.num_policies):
            policy_worker_queues[policy_id] = []
            for i in range(self.cfg.policy_workers_per_policy):
                policy_worker_queues[policy_id].append(TorchJoinableQueue())

        log.info('Initializing learners...')
        policy_locks = [
            multiprocessing.Lock() for _ in range(self.cfg.num_policies)
        ]
        resume_experience_collection_cv = [
            multiprocessing.Condition() for _ in range(self.cfg.num_policies)
        ]

        learner_idx = 0
        for policy_id in range(self.cfg.num_policies):
            learner_worker = LearnerWorker(
                learner_idx,
                policy_id,
                self.cfg,
                self.obs_space,
                self.action_space,
                self.report_queue,
                policy_worker_queues[policy_id],
                self.traj_buffers,
                policy_locks[policy_id],
                resume_experience_collection_cv[policy_id],
            )
            learner_worker.start_process()
            learner_worker.init()

            self.learner_workers[policy_id] = learner_worker
            learner_idx += 1

        log.info('Initializing policy workers...')
        for policy_id in range(self.cfg.num_policies):
            self.policy_workers[policy_id] = []

            policy_queue = faster_fifo.Queue()
            self.policy_queues[policy_id] = policy_queue

            for i in range(self.cfg.policy_workers_per_policy):
                policy_worker = PolicyWorker(
                    i,
                    policy_id,
                    self.cfg,
                    self.obs_space,
                    self.action_space,
                    self.traj_buffers,
                    policy_queue,
                    actor_queues,
                    self.report_queue,
                    policy_worker_queues[policy_id][i],
                    policy_locks[policy_id],
                    resume_experience_collection_cv[policy_id],
                )
                self.policy_workers[policy_id].append(policy_worker)
                policy_worker.start_process()

        log.info('Initializing actors...')

        self.actor_workers = []
        max_parallel_init = int(1e9)  # might be useful to limit this for some envs
        worker_indices = list(range(self.cfg.num_workers))
        for i in range(0, self.cfg.num_workers, max_parallel_init):
            workers = self.init_subset(worker_indices[i:i + max_parallel_init],
                                       actor_queues)
            self.actor_workers.extend(workers)
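Both versions of init_workers set up the same queue topology: one request queue shared by all policy workers of a given policy (many actors fan in), and one response queue per actor worker (policy workers fan out). A minimal single-process sketch of that wiring, with placeholder counts and payloads:

import faster_fifo

num_actor_workers, num_policies = 4, 2  # placeholder counts

# one response queue per actor worker
actor_queues = [faster_fifo.Queue() for _ in range(num_actor_workers)]
# one shared request queue per policy
policy_queues = {p: faster_fifo.Queue() for p in range(num_policies)}

# an actor asks policy 0 for actions, tagging the request with its own index
policy_queues[0].put((1, 'rollout_step_payload'))

# a policy worker pops the request and replies on the requesting actor's own queue
actor_idx, payload = policy_queues[0].get()
actor_queues[actor_idx].put('actions_payload')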
Code Example #6
    def __init__(self, cfg, num_agents, obs_space, action_space):
        self.cfg = cfg
        self.num_agents = num_agents
        self.envs_per_split = cfg.num_envs_per_worker // cfg.worker_num_splits
        self.num_traj_buffers = self.calc_num_trajectory_buffers()

        num_actions = calc_num_actions(action_space)
        num_action_logits = calc_num_logits(action_space)

        hidden_size = get_hidden_size(self.cfg)

        log.debug('Allocating shared memory for trajectories')
        self._tensors = TensorDict()

        # policy inputs
        obs_dict = TensorDict()
        self._tensors['obs'] = obs_dict
        if isinstance(obs_space, spaces.Dict):
            for space_name, space in obs_space.spaces.items():
                obs_dict[space_name] = self.init_tensor(
                    space.dtype, space.shape)
        else:
            raise Exception('Only Dict observation spaces are supported')

        # env outputs
        self._tensors['rewards'] = self.init_tensor(torch.float32, [1])
        self._tensors['rewards'].fill_(-42.42)  # if we're using uninitialized values it will be obvious
        self._tensors['dones'] = self.init_tensor(torch.bool, [1])
        self._tensors['dones'].fill_(True)
        self._tensors['policy_id'] = self.init_tensor(torch.int, [1])
        self._tensors['policy_id'].fill_(-1)  # -1 is an invalid policy index, experience from policy "-1" is always ignored

        # policy outputs
        policy_outputs = [('actions', num_actions),
                          ('action_logits', num_action_logits),
                          ('log_prob_actions', 1), ('values', 1),
                          ('policy_version', 1), ('rnn_states', hidden_size)]

        policy_outputs = [PolicyOutput(*po) for po in policy_outputs]
        policy_outputs = sorted(policy_outputs,
                                key=lambda policy_output: policy_output.name)

        for po in policy_outputs:
            self._tensors[po.name] = self.init_tensor(torch.float32, [po.size])

        ensure_memory_shared(self._tensors)

        # this is for performance optimization
        # indexing in numpy arrays is faster than in PyTorch tensors
        self.tensors = self.tensor_dict_to_numpy()

        # copying small policy outputs (e.g. individual value predictions & action logits) to shared memory is a
        # bottleneck on the policy worker. For optimization purposes we create additional tensors to hold
        # just concatenated policy outputs. Rollout workers parse the data and add it to the trajectory buffers
        # in a proper format
        policy_outputs_combined_size = sum(po.size for po in policy_outputs)
        policy_outputs_shape = [
            self.cfg.num_workers,
            self.cfg.worker_num_splits,
            self.envs_per_split,
            self.num_agents,
            policy_outputs_combined_size,
        ]

        self.policy_outputs = policy_outputs
        self._policy_output_tensors = torch.zeros(policy_outputs_shape,
                                                  dtype=torch.float32)
        self._policy_output_tensors.share_memory_()
        self.policy_output_tensors = self._policy_output_tensors.numpy()

        self._policy_versions = torch.zeros([self.cfg.num_policies],
                                            dtype=torch.int32)
        self._policy_versions.share_memory_()
        self.policy_versions = self._policy_versions.numpy()

        # a list of boolean flags to be shared among components that indicate that experience collection should be
        # temporarily stopped (e.g. due to too much experience accumulated on the learner)
        self._stop_experience_collection = torch.ones([self.cfg.num_policies],
                                                      dtype=torch.bool)
        self._stop_experience_collection.share_memory_()
        self.stop_experience_collection = self._stop_experience_collection.numpy()

        queue_max_size_bytes = self.num_traj_buffers * 40  # 40 bytes to encode an int should be enough?
        self.free_buffers_queue = faster_fifo.Queue(
            max_size_bytes=queue_max_size_bytes)

        # since all buffers are initially free, we add all buffer indices to the queue
        self.free_buffers_queue.put_many_nowait(
            [int(i) for i in np.arange(self.num_traj_buffers)])
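Two mechanisms from this constructor are worth isolating: a torch tensor placed in shared memory (with a numpy view for fast indexing) and a faster_fifo queue used as a pool of free trajectory-buffer indices. A minimal, self-contained sketch with placeholder sizes:

import faster_fifo
import numpy as np
import torch

num_policies, num_traj_buffers = 2, 8  # placeholder sizes

# shared-memory tensor, visible to forked child processes; the numpy view shares its storage
policy_versions = torch.zeros([num_policies], dtype=torch.int32)
policy_versions.share_memory_()
policy_versions_np = policy_versions.numpy()

# free-buffer pool: every trajectory buffer index starts out free
free_buffers = faster_fifo.Queue(max_size_bytes=num_traj_buffers * 40)
free_buffers.put_many_nowait([int(i) for i in np.arange(num_traj_buffers)])

buf_idx = free_buffers.get()  # a producer grabs a free buffer to write a trajectory into
free_buffers.put(buf_idx)     # the consumer returns the index once the data has been used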