Example No. 1
def set_gpus_for_process(process_idx, num_gpus_per_process, process_type, gpu_mask=None):
    available_gpus = get_available_gpus()
    if gpu_mask is not None:
        assert len(available_gpus) >= len(gpu_mask)
        available_gpus = [available_gpus[g] for g in gpu_mask]
    num_gpus = len(available_gpus)
    gpus_to_use = []

    if num_gpus == 0:
        os.environ[CUDA_ENVVAR] = ''
        log.debug('Not using GPUs for %s process %d', process_type, process_idx)
    else:
        first_gpu_idx = process_idx * num_gpus_per_process
        for i in range(num_gpus_per_process):
            index_mod_num_gpus = (first_gpu_idx + i) % num_gpus
            gpus_to_use.append(available_gpus[index_mod_num_gpus])

        os.environ[CUDA_ENVVAR] = ','.join([str(g) for g in gpus_to_use])
        log.info(
            'Set environment var %s to %r for %s process %d',
            CUDA_ENVVAR, os.environ[CUDA_ENVVAR], process_type, process_idx,
        )
        log.debug('Visible devices: %r', torch.cuda.device_count())

    return gpus_to_use
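
The indexing above assigns GPUs to processes round-robin by process index. A minimal sketch of that indexing, assuming four available GPUs and one GPU per process (the values are illustrative, not from the library):

available_gpus = [0, 1, 2, 3]
num_gpus_per_process = 1
for process_idx in range(6):
    first_gpu_idx = process_idx * num_gpus_per_process
    gpus_to_use = [available_gpus[(first_gpu_idx + i) % len(available_gpus)]
                   for i in range(num_gpus_per_process)]
    print(process_idx, gpus_to_use)  # processes 4 and 5 wrap around to GPUs 0 and 1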
Example No. 2
    def cat(self, dict_of_tensor_arrays, macro_batch_size, use_pinned_memory, timing):
        """
        Here 'macro_batch' is the overall size of experience per iteration.
        Macro-batch = mini-batch * num_batches_per_iteration
        """

        tensor_batch = self.batch_pool.get()

        if tensor_batch is not None:
            old_batch_size = tensor_batch_size(tensor_batch)
            if old_batch_size != macro_batch_size:
                # this can happen due to PBT changing batch size during the experiment
                log.warning('Tensor macro-batch size changed from %d to %d!', old_batch_size, macro_batch_size)
                log.warning('Discarding the cached tensor batch!')
                del tensor_batch
                tensor_batch = None

        if tensor_batch is None:
            tensor_batch = copy_dict_structure(dict_of_tensor_arrays)
            log.info('Allocating new CPU tensor batch (could not get from the pool)')

            for d1, cache_d, key, tensor_arr, _ in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
                cache_d[key] = torch.cat(tensor_arr, dim=0)
                if use_pinned_memory:
                    cache_d[key] = cache_d[key].pin_memory()
        else:
            with timing.add_time('batcher_mem'):
                for d1, cache_d, key, tensor_arr, cache_t in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
                    offset = 0
                    for t in tensor_arr:
                        first_dim = t.shape[0]
                        cache_t[offset:offset + first_dim].copy_(t)
                        offset += first_dim

        return tensor_batch
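
The else-branch above reuses a cached batch by copying each tensor into the right slice instead of re-allocating with torch.cat. A minimal sketch of that offset-copy pattern, assuming plain PyTorch and made-up shapes:

import torch

tensor_arr = [torch.randn(8, 3), torch.randn(8, 3)]   # two rollout chunks
cache_t = torch.empty(16, 3)                          # preallocated macro-batch tensor

offset = 0
for t in tensor_arr:
    first_dim = t.shape[0]
    cache_t[offset:offset + first_dim].copy_(t)       # in-place copy, no new allocation
    offset += first_dim

assert torch.equal(cache_t, torch.cat(tensor_arr, dim=0))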
Example No. 3
    def _parse_info(self, info, done):
        if self.reward_shaping_scheme is None:
            # skip reward calculation
            return 0.0

        # by default these are negative values if no weapon is selected
        selected_weapon = info.get('SELECTED_WEAPON', 0.0)
        selected_weapon = int(max(0, selected_weapon))
        selected_weapon_ammo = float(max(0.0, info.get('SELECTED_WEAPON_AMMO', 0.0)))
        self.selected_weapon.append(selected_weapon)

        was_dead = self.prev_dead
        is_alive = not info.get('DEAD', 0.0)
        just_respawned = was_dead and is_alive

        shaping_reward = 0.0
        if not done and not just_respawned:
            shaping_reward, deltas = self._delta_rewards(info)

            shaping_reward += self._selected_weapon_rewards(
                selected_weapon, selected_weapon_ammo, deltas,
            )

            if abs(shaping_reward) > 2.5 and not self.print_once:
                log.info('Large shaping reward %.3f for %r', shaping_reward, deltas)
                self.print_once = True

        if done and 'FRAGCOUNT' in self.reward_structure:
            sorted_rew = sorted(self.reward_structure.items(), key=operator.itemgetter(1))
            sum_rew = sum(r for key, r in sorted_rew)
            sorted_rew = {key: f'{r:.3f}' for key, r in sorted_rew}
            log.info('Sum rewards: %.3f, reward structure: %r', sum_rew, sorted_rew)

        return shaping_reward
Example No. 4
    def reset(self):
        if self._episode_recording_dir is not None and self._record_id > 0:
            # save actions to text file
            with open(join(self._episode_recording_dir, 'actions.json'),
                      'w') as actions_file:
                json.dump(self._recorded_actions, actions_file)

            # rename previous episode dir
            reward = self._recorded_episode_reward + self._recorded_episode_shaping_reward
            new_dir_name = self._episode_recording_dir + f'_r{reward:.2f}'
            os.rename(self._episode_recording_dir, new_dir_name)
            log.info(
                'Finished recording %s (rew %.3f, shaping %.3f)',
                new_dir_name,
                reward,
                self._recorded_episode_shaping_reward,
            )

        dir_name = f'ep_{self._record_id:03d}_p{self._player_id}'
        self._episode_recording_dir = join(self._record_to, dir_name)
        ensure_dir_exists(self._episode_recording_dir)

        self._record_id += 1
        self._frame_id = 0
        self._recorded_episode_reward = 0
        self._recorded_episode_shaping_reward = 0

        self._recorded_actions = []

        return self.env.reset()
Example No. 5
    def _init(self):
        """
        Initialize env runners, which actually do all the work. We also do some utility work here, e.g.
        setting process affinity (a performance optimization).
        """

        log.info('Initializing envs for env runner %d...', self.worker_idx)

        if self.cfg.force_envs_single_thread:
            from threadpoolctl import threadpool_limits
            threadpool_limits(limits=1, user_api=None)

        if self.cfg.set_workers_cpu_affinity:
            set_process_cpu_affinity(self.worker_idx, self.cfg.num_workers)
        psutil.Process().nice(min(self.cfg.default_niceness + 10, 20))

        self.env_runners = []
        for split_idx in range(self.num_splits):
            env_runner = VectorEnvRunner(
                self.cfg,
                self.vector_size // self.num_splits,
                self.worker_idx,
                split_idx,
                self.num_agents,
                self.shared_buffers,
                self.reward_shaping,
            )
            env_runner.init()
            self.env_runners.append(env_runner)
Example No. 6
    def calc_num_trajectory_buffers(self):
        """
        This calculates the number of shared trajectory (rollout) buffers required by the system to operate
        without interruptions.
        This consists of:
        1) at least one trajectory buffer per agent, such that we always have a location to save new experience
        2) a few trajectory buffers to hold data currently processed by the learner (including potential backlog)
        3) (potentially) some extra trajectory buffers to keep the system operational. These might be required,
        e.g., in multi-agent envs when some agents are deactivated during the rollout. Such agents, together with the
        learner, can hold on to many buffers, so that active agents might not have enough free buffers to continue
        collecting experience.
        """

        # at least one trajectory buffer per agent, plus an extra worker's worth of buffers as slack
        num_traj_buffers = (self.cfg.num_workers + 1) * self.cfg.num_envs_per_worker * self.num_agents

        max_minibatches_to_accumulate = self.cfg.num_minibatches_to_accumulate
        if max_minibatches_to_accumulate == -1:
            # default value
            max_minibatches_to_accumulate = 2 * self.cfg.num_batches_per_iteration

        # Let each learner accumulate enough full sets of experience to pause learning
        max_experience_on_learners = max_minibatches_to_accumulate * self.cfg.batch_size * self.cfg.num_policies
        num_traj_buffers += max_experience_on_learners / self.cfg.rollout

        # Configurable excess ratio to be safe
        assert self.cfg.traj_buffers_excess_ratio >= 1.0
        num_traj_buffers = self.cfg.traj_buffers_excess_ratio * num_traj_buffers

        num_traj_buffers = int(math.ceil(num_traj_buffers))

        log.info('Using a total of %d trajectory buffers', num_traj_buffers)
        return num_traj_buffers
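
A worked example of the calculation above with illustrative (non-default) config values, assuming num_minibatches_to_accumulate == -1 so the 2 * num_batches_per_iteration default applies:

import math

num_workers, num_envs_per_worker, num_agents = 8, 4, 2
batch_size, num_batches_per_iteration, num_policies, rollout = 1024, 1, 1, 32
traj_buffers_excess_ratio = 1.3

num_traj_buffers = (num_workers + 1) * num_envs_per_worker * num_agents          # 72
max_minibatches_to_accumulate = 2 * num_batches_per_iteration                    # 2
max_experience_on_learners = max_minibatches_to_accumulate * batch_size * num_policies
num_traj_buffers += max_experience_on_learners / rollout                         # 72 + 64 = 136
num_traj_buffers = int(math.ceil(traj_buffers_excess_ratio * num_traj_buffers))  # 177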
Example No. 7
    def _init(self, init_info):
        log.info('Initializing env for player %d, init_info: %r...', self.player_id, init_info)
        env = init_multiplayer_env(self.make_env_func, self.player_id, self.env_config, init_info)
        if self.reset_on_init:
            env.reset()
        return env
Example No. 8
    def _set_game_mode(self, mode):
        if mode == 'replay':
            self.game.set_mode(Mode.PLAYER)
        else:
            if self.async_mode:
                log.info('Starting in async mode! Use this only for testing, otherwise PLAYER mode is much faster')
                self.game.set_mode(Mode.ASYNC_PLAYER)
            else:
                self.game.set_mode(Mode.PLAYER)
Example No. 9
    def _vizdoom_variables_bug_workaround(self, info, done):
        """Some variables don't get reset to zero on game.new_episode(). This fixes it (also check overflow?)."""
        if done and 'DAMAGECOUNT' in info:
            log.info('DAMAGECOUNT value on done: %r', info.get('DAMAGECOUNT'))

        if self._last_episode_info is not None:
            bugged_vars = ['DEATHCOUNT', 'HITCOUNT', 'DAMAGECOUNT']
            for v in bugged_vars:
                if v in info:
                    info[v] -= self._last_episode_info.get(v, 0)
Example No. 10
def run(cfg):
    cfg = maybe_load_from_checkpoint(cfg)

    algo = DmlabLevelGenerator(cfg)
    algo.initialize()
    status = algo.run()
    algo.finalize()

    log.info('Exit...')
    return status
Example No. 11
def main():
    env_name = 'doom_battle'
    env = create_env(env_name, cfg=default_cfg(env=env_name))

    env.reset()
    done = False
    while not done:
        env.render()
        obs, rew, done, info = env.step(env.action_space.sample())

    log.info('Done!')
Example No. 12
    def test_minigrid_env(self):
        env_name = 'MiniGrid-Empty-Random-5x5-v0'
        env = create_env(env_name, cfg=default_cfg(env=env_name))
        log.info('Env action space: %r', env.action_space)
        log.info('Env obs space: %r', env.observation_space)

        env.reset()
        total_rew = 0
        for i in range(1000):
            obs, rew, done, info = env.step(env.action_space.sample())
            total_rew += rew
            if done:
                env.reset()
Example No. 13
    def _handle_reset(self):
        """
        Reset all envs, one split at a time (double-buffering), and send requests to policy workers to get
        actions for the very first env step.
        """
        for split_idx, env_runner in enumerate(self.env_runners):
            policy_inputs = env_runner.reset(self.report_queue)
            self._enqueue_policy_request(split_idx, policy_inputs)

        log.info('Finished reset for worker %d', self.worker_idx)
        safe_put(self.report_queue,
                 dict(finished_reset=self.worker_idx),
                 queue_name='report')
Example No. 14
    def test_doom_multiagent_parallel(self):
        num_workers = 16
        workers = []

        for i in range(num_workers):
            log.info('Starting worker #%d', i)
            worker = Process(target=self.doom_multiagent,
                             args=(self.make_standard_dm, i, 200))
            worker.start()
            workers.append(worker)
            time.sleep(0.01)

        for i in range(num_workers):
            workers[i].join()
Example No. 15
    def __init__(self, env, initial_difficulty=None):
        super().__init__(env)

        self._min_difficulty = 0
        self._max_difficulty = 150
        self._difficulty_step = 10
        self._curr_difficulty = 20 if initial_difficulty is None else initial_difficulty
        self._difficulty_std = 10

        log.info('Starting with bot difficulty %d', self._curr_difficulty)

        self._adaptive_curriculum = True
        if initial_difficulty == self._max_difficulty:
            log.debug('Starting at max difficulty, disable adaptive skill curriculum')
            self._adaptive_curriculum = False
Example No. 16
    def calc_num_trajectory_buffers(self):
        # calculate how many buffers are required per env runner to collect one "macro batch" for training
        # once macro batch is collected, all buffers will be released
        # we could have just copied the tensors on the learner to avoid this complicated logic, but it's better for
        # performance to keep data in shared buffers until they're needed
        samples_per_iteration = self.cfg.num_batches_per_iteration * self.cfg.batch_size * self.cfg.num_policies
        num_traj_buffers = samples_per_iteration / (self.cfg.num_workers * self.cfg.num_envs_per_worker * self.num_agents * self.cfg.rollout)

        # make sure we definitely have enough buffers to actually never wait
        # usually it'll be just two buffers and we swap back and forth
        num_traj_buffers *= 3

        # make sure we have at least two to swap between so we never actually have to wait
        num_traj_buffers = math.ceil(max(num_traj_buffers, self.cfg.min_traj_buffers_per_worker))
        log.info('Using %d sets of trajectory buffers', num_traj_buffers)
        return num_traj_buffers
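
A worked example of this per-worker calculation, with illustrative (non-default) values and min_traj_buffers_per_worker assumed to be 2:

import math

num_workers, num_envs_per_worker, num_agents = 8, 4, 2
batch_size, num_batches_per_iteration, num_policies, rollout = 1024, 1, 1, 32
min_traj_buffers_per_worker = 2

samples_per_iteration = num_batches_per_iteration * batch_size * num_policies   # 1024
num_traj_buffers = samples_per_iteration / (num_workers * num_envs_per_worker * num_agents * rollout)  # 0.5
num_traj_buffers *= 3                                                            # 1.5
num_traj_buffers = math.ceil(max(num_traj_buffers, min_traj_buffers_per_worker)) # 2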
Example No. 17
    def _update_weights(self, timing):
        learner_policy_version = self.policy_versions[self.policy_id].item()
        if self.latest_policy_version < learner_policy_version and self.shared_model_weights is not None:
            with timing.timeit('weight_update'):
                with self.policy_lock:
                    self.actor_critic.load_state_dict(self.shared_model_weights)

            self.latest_policy_version = learner_policy_version

            if self.num_policy_updates % 10 == 0:
                log.info(
                    'Updated weights on worker %d-%d, policy_version %d (%.5f)',
                    self.policy_id, self.worker_idx, self.latest_policy_version, timing.weight_update,
                )

            self.num_policy_updates += 1
Example No. 18
def dmlab_ensure_global_cache_initialized(experiment_dir,
                                          all_levels_for_experiment,
                                          num_policies, level_cache_dir):
    global DMLAB_GLOBAL_LEVEL_CACHE

    assert multiprocessing.current_process().name == 'MainProcess', \
        'make sure you initialize DMLab cache before child processes are forked'

    DMLAB_GLOBAL_LEVEL_CACHE = []
    for policy_id in range(num_policies):
        # the level cache is shared between independently training policies; this is the easiest way to set it up

        log.info('Initializing level cache for policy %d...', policy_id)
        cache = DmlabLevelCacheGlobal(level_cache_dir, experiment_dir,
                                      all_levels_for_experiment, policy_id)
        DMLAB_GLOBAL_LEVEL_CACHE.append(cache)
Example No. 19
    def replay(env, rec_path):
        doom = env.unwrapped
        doom.mode = 'replay'
        doom._ensure_initialized()
        doom.game.replay_episode(rec_path)

        episode_reward = 0
        start = time.time()

        while not doom.game.is_episode_finished():
            doom.game.advance_action()
            r = doom.game.get_last_reward()
            episode_reward += r
            log.info('Episode reward: %.3f, time so far: %.1f s', episode_reward, time.time() - start)

        log.info('Finishing replay')
        doom.close()
Example No. 20
    def __init__(
        self, worker_idx, policy_id, cfg, obs_space, action_space, shared_buffers, policy_queue, actor_queues,
        report_queue, task_queue, policy_lock, resume_experience_collection_cv
    ):
        log.info('Initializing policy worker %d for policy %d', worker_idx, policy_id)

        self.worker_idx = worker_idx
        self.policy_id = policy_id
        self.cfg = cfg

        self.obs_space = obs_space
        self.action_space = action_space

        self.device = None
        self.actor_critic = None
        self.shared_model_weights = None
        self.policy_lock = policy_lock
        self.resume_experience_collection_cv = resume_experience_collection_cv

        self.policy_queue = policy_queue
        self.actor_queues = actor_queues
        self.report_queue = report_queue

        # queue other components use to talk to this particular worker
        self.task_queue = task_queue

        self.initialized = False
        self.terminate = False
        self.initialized_event = multiprocessing.Event()
        self.initialized_event.clear()

        self.shared_buffers = shared_buffers
        self.tensors_individual_transitions = self.shared_buffers.tensors_individual_transitions
        self.policy_versions = shared_buffers.policy_versions
        self.stop_experience_collection = shared_buffers.stop_experience_collection

        self.latest_policy_version = -1
        self.num_policy_updates = 0

        self.requests = []

        self.total_num_samples = 0

        self.process = TorchProcess(target=self._run, daemon=True)
Example No. 21
    def reset(self, report_queue):
        """
        Do the very first reset for all environments in a vector. Populate shared memory with initial obs.
        Note that this is called only once, at the very beginning of training. After this the envs should auto-reset.

        :param report_queue: we use report queue to monitor reset progress (see appo.py). This can be a lengthy
        process.
        :return: first requests for policy workers (to generate actions for the very first env step)
        """

        for env_i, e in enumerate(self.envs):
            observations = e.reset()

            if self.cfg.decorrelate_envs_on_one_worker:
                env_i_split = self.num_envs * self.split_idx + env_i
                decorrelate_steps = self.cfg.rollout * env_i_split + self.cfg.rollout * random.randint(
                    0, 4)

                log.info('Decorrelating experience for %d frames...',
                         decorrelate_steps)
                for decorrelate_step in range(decorrelate_steps):
                    actions = [
                        e.action_space.sample() for _ in range(self.num_agents)
                    ]
                    observations, rew, dones, info = e.step(actions)

            for agent_i, obs in enumerate(observations):
                actor_state = self.actor_states[env_i][agent_i]
                actor_state.set_trajectory_data(dict(obs=obs),
                                                self.traj_buffer_idx,
                                                self.rollout_step)
                # rnn state is already initialized at zero

            # log.debug(
            #     'Reset progress w:%d-%d finished %d/%d, still initializing envs...',
            #     self.worker_idx, self.split_idx, env_i + 1, len(self.envs),
            # )
            safe_put(report_queue,
                     dict(initialized_env=(self.worker_idx, self.split_idx,
                                           env_i)),
                     queue_name='report')

        policy_request = self._format_policy_request()
        return policy_request
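
The number of decorrelation steps grows with the env's position in the vector plus a random component. An illustrative computation for one env (values made up, random part fixed at 2):

rollout, num_envs, split_idx, env_i = 32, 4, 1, 2
env_i_split = num_envs * split_idx + env_i               # 6
decorrelate_steps = rollout * env_i_split + rollout * 2  # 192 + 64 = 256 random steps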
Example No. 22
    def __init__(self, num_agents, make_env_func, env_config, skip_frames):
        gym.Env.__init__(self)
        RewardShapingInterface.__init__(self)

        self.num_agents = num_agents
        log.debug('Multi agent env, num agents: %d', self.num_agents)
        self.skip_frames = skip_frames  # number of frames to skip (1 = no skip)

        env = make_env_func(
            player_id=-1
        )  # temporary env just to query observation_space and stuff
        self.action_space = env.action_space
        self.observation_space = env.observation_space

        self.default_reward_shaping = get_default_reward_shaping(env)
        env.close()

        self.current_reward_shaping = [
            self.default_reward_shaping for _ in range(self.num_agents)
        ]

        self.make_env_func = make_env_func

        self.safe_init = env_config is not None and env_config.get(
            'safe_init', False)

        if self.safe_init:
            sleep_seconds = env_config.worker_index * 1.0
            log.info(
                'Sleeping %.3f seconds to avoid creating all envs at once',
                sleep_seconds)
            time.sleep(sleep_seconds)
            log.info('Done sleeping at %d', env_config.worker_index)

        self.env_config = env_config
        self.workers = None

        # only needed when rendering
        self.enable_rendering = False
        self.last_obs = None

        self.reset_on_init = True

        self.initialized = False
Example No. 23
    def _advance_rollouts(self, data, timing):
        """
        Process incoming request from policy worker. Use the data (policy outputs, actions) to advance the simulation
        by one step on the corresponding VectorEnvRunner.

        If we successfully managed to advance the simulation, send requests to policy workers to get actions for the
        next step. If we completed the entire rollout, also send request to the learner!

        :param data: request from the policy worker, containing actions and other policy outputs
        :param timing: profiling stuff
        """
        split_idx = data['split_idx']

        runner = self.env_runners[split_idx]
        policy_request, complete_rollouts, episodic_stats = runner.advance_rollouts(
            data, timing)

        with timing.add_time('complete_rollouts'):
            if complete_rollouts:
                self._enqueue_complete_rollouts(split_idx, complete_rollouts)

                if self.num_complete_rollouts == 0 and not self.cfg.benchmark:
                    # we just finished our first complete rollouts, perfect time to wait for experience decorrelation
                    # this guarantees that there won't be any "old" trajectories when we awaken
                    delay = (float(self.worker_idx) / self.cfg.num_workers
                             ) * self.cfg.decorrelate_experience_max_seconds
                    log.info(
                        'Worker %d, sleep for %.3f sec to decorrelate experience collection',
                        self.worker_idx,
                        delay,
                    )
                    time.sleep(delay)
                    log.info('Worker %d awakens!', self.worker_idx)

                self.num_complete_rollouts += len(
                    complete_rollouts['rollouts'])

        with timing.add_time('enqueue_policy_requests'):
            if policy_request is not None:
                self._enqueue_policy_request(split_idx, policy_request)

        if episodic_stats:
            self._report_stats(episodic_stats)
Example No. 24
def test_env_performance(make_env, env_type, verbose=False):
    t = Timing()
    with t.timeit('init'):
        env = make_env(AttrDict({'worker_index': 0, 'vector_index': 0}))
        total_num_frames, frames = 10000, 0

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()

            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                if verbose:
                    env.render()
                    time.sleep(1.0 / 40)

                obs, rew, done, info = env.step(env.action_space.sample())
                if verbose:
                    log.info('Received reward %.3f', rew)

                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS',
              t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)
    env.close()
Example No. 25
    def report(self, env_frames):
        now = time.time()
        self.last_report = now

        self.fps_stats.append((now, env_frames))
        if len(self.fps_stats) <= 1:
            return

        fps = []
        for avg_interval in self.avg_stats_intervals:
            past_moment, past_frames = self.fps_stats[max(
                0,
                len(self.fps_stats) - 1 - avg_interval)]
            fps.append((env_frames - past_frames) / (now - past_moment))

        fps_str = []
        for interval, fps_value in zip(self.avg_stats_intervals, fps):
            fps_str.append(
                f'{int(interval * self.report_every_sec)} sec: {fps_value:.1f}'
            )
        fps_str = f'({", ".join(fps_str)})'
        log.info('Sampling FPS: %s. Total frames collected: %d', fps_str,
                 env_frames)
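
The FPS values are computed over sliding windows of past reports. A minimal sketch of that windowed calculation with made-up numbers (fps_stats holds (timestamp, total_env_frames) pairs):

fps_stats = [(0.0, 0), (10.0, 20000), (20.0, 41000)]
avg_interval = 2   # look back two report periods
now, env_frames = fps_stats[-1]
past_moment, past_frames = fps_stats[max(0, len(fps_stats) - 1 - avg_interval)]
fps = (env_frames - past_frames) / (now - past_moment)   # 41000 / 20.0 = 2050.0 FPS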
Example No. 26
    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        if obs is None:
            return obs, rew, done, info

        self.orig_env_reward += rew

        shaping_rew = self._parse_info(info, done)
        rew += shaping_rew
        self.total_shaping_reward += shaping_rew

        if self.verbose:
            log.info('Original env reward before shaping: %.3f', self.orig_env_reward)
            player_id = 1
            if hasattr(self.env.unwrapped, 'player_id'):
                player_id = self.env.unwrapped.player_id

            log.info(
                'Total shaping reward is %.3f for %d (done %d)',
                self.total_shaping_reward, player_id, done,
            )

        # remember new variable values
        for var_name in self.reward_shaping_scheme['delta'].keys():
            self.prev_vars[var_name] = info.get(var_name, 0.0)

        self.prev_dead = bool(info.get('DEAD', 0.0))  # float -> bool

        if done:
            if self.true_reward_func is None:
                true_reward = self.orig_env_reward
            else:
                true_reward = self.true_reward_func(info)

            info['true_reward'] = true_reward

        return obs, rew, done, info
Example No. 27
    def doom_multiagent(make_multi_env, worker_index, num_steps=1000):
        env_config = AttrDict({
            'worker_index': worker_index,
            'vector_index': 0,
            'safe_init': False
        })
        multi_env = make_multi_env(env_config)

        obs = multi_env.reset()

        visualize = False
        start = time.time()

        for i in range(num_steps):
            actions = [multi_env.action_space.sample()] * len(obs)
            obs, rew, dones, infos = multi_env.step(actions)

            if visualize:
                multi_env.render()

            if i % 100 == 0 or any(dones):
                log.info('Rew %r done %r info %r', rew, dones, infos)

            if all(dones):
                multi_env.reset()

        took = time.time() - start
        log.info('Took %.3f seconds for %d steps', took, num_steps)
        log.info('Server steps per second: %.1f', num_steps / took)
        log.info('Observations fps: %.1f',
                 num_steps * multi_env.num_agents / took)
        log.info(
            'Environment fps: %.1f',
            num_steps * multi_env.num_agents * multi_env.skip_frames / took)

        multi_env.close()
Example No. 28
    def test_voxel_env(self):
        env_name = 'voxel_env_Sokoban'
        env = create_env(env_name, cfg=default_cfg(env=env_name))
        log.info('Env action space: %r', env.action_space)
        log.info('Env obs space: %r', env.observation_space)

        env.reset()
        total_rew = 0
        for i in range(1000):
            obs, rew, done, info = env.step(
                [env.action_space.sample() for _ in range(env.num_agents)])
            total_rew += sum(rew)

        log.info('Total rew: %.3f', total_rew)
Example No. 29
def run(run_description, args):
    experiments = run_description.experiments
    max_parallel = args.max_parallel

    log.info('Starting processes with base cmds: %r',
             [e.cmd for e in experiments])
    log.info('Max parallel processes is %d', max_parallel)
    log.info(
        'Monitor log files using\n\n\ttail -f train_dir/%s/**/**/sf_log.txt\n\n',
        run_description.run_name)

    processes = []
    processes_per_gpu = {g: [] for g in range(args.num_gpus)}

    experiments = run_description.generate_experiments(args.train_dir)
    next_experiment = next(experiments, None)

    def find_least_busy_gpu():
        least_busy_gpu = None
        gpu_available_processes = 0

        for gpu_id in range(args.num_gpus):
            available_processes = args.experiments_per_gpu - len(
                processes_per_gpu[gpu_id])
            if available_processes > gpu_available_processes:
                gpu_available_processes = available_processes
                least_busy_gpu = gpu_id

        return least_busy_gpu, gpu_available_processes

    def can_squeeze_another_process():
        if len(processes) >= max_parallel:
            return False

        if args.experiments_per_gpu > 0:
            least_busy_gpu, gpu_available_processes = find_least_busy_gpu()
            if gpu_available_processes <= 0:
                return False

        return True

    failed_processes = []
    last_log_time = 0
    log_interval = 3  # seconds

    while len(processes) > 0 or next_experiment is not None:
        while can_squeeze_another_process() and next_experiment is not None:
            cmd, name, root_dir, exp_env_vars = next_experiment

            cmd_tokens = cmd.split(' ')

            # workaround to make sure we're running the correct python executable from our virtual env
            if cmd_tokens[0].startswith('python'):
                cmd_tokens[0] = sys.executable
                log.debug('Using Python executable %s', cmd_tokens[0])

            ensure_dir_exists(join(args.train_dir, root_dir))

            envvars = os.environ.copy()

            best_gpu = None
            if args.experiments_per_gpu > 0:
                best_gpu, best_gpu_available_processes = find_least_busy_gpu()
                log.info(
                    'The least busy gpu is %d where we can run %d more processes',
                    best_gpu,
                    best_gpu_available_processes,
                )
                envvars['CUDA_VISIBLE_DEVICES'] = f'{best_gpu}'

            log.info('Starting process %r', cmd_tokens)

            if exp_env_vars is not None:
                for key, value in exp_env_vars.items():
                    log.info('Adding env variable %r %r', key, value)
                    envvars[str(key)] = str(value)

            process = subprocess.Popen(cmd_tokens,
                                       stdout=None,
                                       stderr=None,
                                       env=envvars)
            process.gpu_id = best_gpu
            process.proc_cmd = cmd

            processes.append(process)

            if process.gpu_id is not None:
                processes_per_gpu[process.gpu_id].append(process.proc_cmd)

            log.info('Started process %s on GPU %r', process.proc_cmd,
                     process.gpu_id)
            log.info('Waiting for %d seconds before starting next process',
                     args.pause_between)
            time.sleep(args.pause_between)

            next_experiment = next(experiments, None)

        remaining_processes = []
        for process in processes:
            if process.poll() is None:
                remaining_processes.append(process)
                continue
            else:
                if process.gpu_id is not None:
                    processes_per_gpu[process.gpu_id].remove(process.proc_cmd)
                log.info('Process %r finished with code %r', process.proc_cmd,
                         process.returncode)
                if process.returncode != 0:
                    failed_processes.append(
                        (process.proc_cmd, process.pid, process.returncode))
                    log.error('WARNING: RETURN CODE IS %r', process.returncode)

        processes = remaining_processes

        if time.time() - last_log_time > log_interval:
            if failed_processes:
                log.error(
                    'Failed processes: %s', ', '.join([
                        f'PID: {p[1]} code: {p[2]}' for p in failed_processes
                    ]))
            last_log_time = time.time()

        time.sleep(0.1)

    log.info('Done!')

    return 0
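
A tiny sketch of the least-busy-GPU selection used above, with illustrative state (2 experiment slots per GPU, GPU 0 already full); the chosen GPU would then be exported via CUDA_VISIBLE_DEVICES:

experiments_per_gpu = 2
processes_per_gpu = {0: ['cmd_a', 'cmd_b'], 1: ['cmd_c']}

least_busy_gpu, gpu_available_processes = None, 0
for gpu_id, procs in processes_per_gpu.items():
    available_processes = experiments_per_gpu - len(procs)
    if available_processes > gpu_available_processes:
        least_busy_gpu, gpu_available_processes = gpu_id, available_processes

print(least_busy_gpu, gpu_available_processes)   # -> 1 1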
Example No. 30
def enjoy(cfg, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    # env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space,
                                       env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(
        LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    true_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()
    rnn_states = torch.zeros(
        [env.num_agents, get_hidden_size(cfg)],
        dtype=torch.float32,
        device=device)
    episode_reward = np.zeros(env.num_agents)
    finished_episode = [False] * env.num_agents

    with torch.no_grad():
        while not max_frames_reached(num_frames):
            obs_torch = AttrDict(transform_dict_observations(obs))
            for key, x in obs_torch.items():
                obs_torch[key] = torch.from_numpy(x).to(device).float()

            policy_outputs = actor_critic(obs_torch,
                                          rnn_states,
                                          with_action_distribution=True)

            # sample actions from the distribution by default
            actions = policy_outputs.actions

            action_distribution = policy_outputs.action_distribution
            if isinstance(action_distribution, ContinuousActionDistribution):
                if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                    actions = action_distribution.means

            actions = actions.cpu().numpy()

            rnn_states = policy_outputs.rnn_states

            for _ in range(render_action_repeat):
                if not cfg.no_render:
                    target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                obs, rew, done, infos = env.step(actions)

                episode_reward += rew
                num_frames += 1

                for agent_i, done_flag in enumerate(done):
                    if done_flag:
                        finished_episode[agent_i] = True
                        episode_rewards[agent_i].append(
                            episode_reward[agent_i])
                        true_rewards[agent_i].append(infos[agent_i].get(
                            'true_reward', episode_reward[agent_i]))
                        log.info(
                            'Episode finished for agent %d at %d frames. Reward: %.3f, true_reward: %.3f',
                            agent_i, num_frames, episode_reward[agent_i],
                            true_rewards[agent_i][-1])
                        rnn_states[agent_i] = torch.zeros(
                            [get_hidden_size(cfg)],
                            dtype=torch.float32,
                            device=device)
                        episode_reward[agent_i] = 0

                # if episode terminated synchronously for all agents, pause a bit before starting a new one
                if all(done):
                    if not cfg.no_render:
                        env.render()
                    time.sleep(0.05)

                if all(finished_episode):
                    finished_episode = [False] * env.num_agents
                    avg_episode_rewards_str, avg_true_reward_str = '', ''
                    for agent_i in range(env.num_agents):
                        avg_rew = np.mean(episode_rewards[agent_i])
                        avg_true_rew = np.mean(true_rewards[agent_i])
                        if not np.isnan(avg_rew):
                            if avg_episode_rewards_str:
                                avg_episode_rewards_str += ', '
                            avg_episode_rewards_str += f'#{agent_i}: {avg_rew:.3f}'
                        if not np.isnan(avg_true_rew):
                            if avg_true_reward_str:
                                avg_true_reward_str += ', '
                            avg_true_reward_str += f'#{agent_i}: {avg_true_rew:.3f}'

                    log.info('Avg episode rewards: %s, true rewards: %s',
                             avg_episode_rewards_str, avg_true_reward_str)
                    log.info(
                        'Avg episode reward: %.3f, avg true_reward: %.3f',
                        np.mean([
                            np.mean(episode_rewards[i])
                            for i in range(env.num_agents)
                        ]),
                        np.mean([
                            np.mean(true_rewards[i])
                            for i in range(env.num_agents)
                        ]))

                # VizDoom multiplayer stuff
                # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                #     key = f'PLAYER{player}_FRAGCOUNT'
                #     if key in infos[0]:
                #         log.debug('Score for player %d: %r', player, infos[0][key])

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)