def set_gpus_for_process(process_idx, num_gpus_per_process, process_type, gpu_mask=None):
    available_gpus = get_available_gpus()
    if gpu_mask is not None:
        # make sure the mask does not request more GPUs than are actually available
        assert len(available_gpus) >= len(gpu_mask)
        available_gpus = [available_gpus[g] for g in gpu_mask]
    num_gpus = len(available_gpus)
    gpus_to_use = []

    if num_gpus == 0:
        os.environ[CUDA_ENVVAR] = ''
        log.debug('Not using GPUs for %s process %d', process_type, process_idx)
    else:
        first_gpu_idx = process_idx * num_gpus_per_process
        for i in range(num_gpus_per_process):
            index_mod_num_gpus = (first_gpu_idx + i) % num_gpus
            gpus_to_use.append(available_gpus[index_mod_num_gpus])

        os.environ[CUDA_ENVVAR] = ','.join([str(g) for g in gpus_to_use])
        log.info(
            'Set environment var %s to %r for %s process %d',
            CUDA_ENVVAR, os.environ[CUDA_ENVVAR], process_type, process_idx,
        )
        log.debug('Visible devices: %r', torch.cuda.device_count())

    return gpus_to_use
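# Hedged usage sketch (not part of the original source): how a spawned worker process might pin
# itself to a single GPU using the helper above. `_example_worker_entry` and `worker_idx` are
# illustrative names; `set_gpus_for_process` and `log` come from the code in this module.
def _example_worker_entry(worker_idx):
    # each worker gets one GPU, chosen round-robin over the visible devices
    gpus = set_gpus_for_process(worker_idx, num_gpus_per_process=1, process_type='actor')
    if not gpus:
        # an empty list means no GPUs are visible, so the worker falls back to CPU
        log.debug('Worker %d falls back to CPU-only execution', worker_idx)
    return gpus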
def _game_init(self, with_locking=True, max_parallel=10):
    lock_file = lock = None
    if with_locking:
        lock_file = doom_lock_file(max_parallel)
        lock = FileLock(lock_file)

    init_attempt = 0
    while True:
        init_attempt += 1
        try:
            if with_locking:
                with lock.acquire(timeout=20):
                    self.game.init()
            else:
                self.game.init()

            break
        except Timeout:
            if with_locking:
                log.debug(
                    'Another process currently holds the lock %s, attempt: %d',
                    lock_file, init_attempt,
                )
        except Exception as exc:
            log.warning('VizDoom game.init() threw an exception %r. Terminate process...', exc)
            from sample_factory.envs.env_utils import EnvCriticalError
            raise EnvCriticalError()
def find_available_port(start_port, increment=1000):
    port = start_port
    while port < 65535 and not is_udp_port_available(port):
        port += increment

    log.debug('Port %r is available', port)
    return port
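# Hedged sketch (an assumption, not necessarily the library's actual implementation):
# `is_udp_port_available`, used by find_available_port() above, could be implemented as a
# simple UDP bind test. The name `is_udp_port_available_sketch` is used to avoid confusion
# with the real helper.
import socket

def is_udp_port_available_sketch(port):
    """Return True if a UDP socket can be bound to `port` on all interfaces."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(('', port))
        return True
    except OSError:
        return False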
def print_stats(self, fps, sample_throughput, total_env_steps):
    fps_str = []
    for interval, fps_value in zip(self.avg_stats_intervals, fps):
        fps_str.append(f'{int(interval * self.report_interval)} sec: {fps_value:.1f}')
    fps_str = f'({", ".join(fps_str)})'

    samples_per_policy = ', '.join([f'{p}: {s:.1f}' for p, s in sample_throughput.items()])

    lag_stats = self.policy_lag[0]
    lag = AttrDict()
    for key in ['min', 'avg', 'max']:
        lag[key] = lag_stats.get(f'version_diff_{key}', -1)
    policy_lag_str = f'min: {lag.min:.1f}, avg: {lag.avg:.1f}, max: {lag.max:.1f}'

    log.debug(
        'Fps is %s. Total num frames: %d. Throughput: %s. Samples: %d. Policy #0 lag: (%s)',
        fps_str, total_env_steps, samples_per_policy, sum(self.samples_collected), policy_lag_str,
    )

    if 'reward' in self.policy_avg_stats:
        policy_reward_stats = []
        for policy_id in range(self.cfg.num_policies):
            reward_stats = self.policy_avg_stats['reward'][policy_id]
            if len(reward_stats) > 0:
                policy_reward_stats.append((policy_id, f'{np.mean(reward_stats):.3f}'))
        log.debug('Avg episode reward: %r', policy_reward_stats)
def _learner_load_model(self, policy_id, replacement_policy):
    log.debug('Asking learner %d to load model from %d', policy_id, replacement_policy)

    load_task = (PbtTask.LOAD_MODEL, (policy_id, replacement_policy))
    learner_worker = self.learner_workers[policy_id]
    learner_worker.task_queue.put((TaskType.PBT, load_task))
def load_from_checkpoint(cfg):
    filename = cfg_file(cfg)
    if not os.path.isfile(filename):
        raise Exception(f'Could not load saved parameters for experiment {cfg.experiment}')

    with open(filename, 'r') as json_file:
        json_params = json.load(json_file)
        log.warning('Loading existing experiment configuration from %s', filename)
        loaded_cfg = AttrDict(json_params)

    # override the parameters in config file with values passed from command line
    for key, value in cfg.cli_args.items():
        if key in loaded_cfg and loaded_cfg[key] != value:
            log.debug('Overriding arg %r with value %r passed from command line', key, value)
            loaded_cfg[key] = value

    # incorporate extra CLI parameters that were not present in JSON file
    for key, value in vars(cfg).items():
        if key not in loaded_cfg:
            log.debug('Adding new argument %r=%r that is not in the saved config file!', key, value)
            loaded_cfg[key] = value

    return loaded_cfg
def _learner_update_cfg(self, policy_id):
    learner_worker = self.learner_workers[policy_id]

    log.debug('Sending learning configuration to learner %d...', policy_id)
    cfg_task = (PbtTask.UPDATE_CFG, (policy_id, self.policy_cfg[policy_id]))
    learner_worker.task_queue.put((TaskType.PBT, cfg_task))
def __init__(self, cfg, obs_space, timing):
    super().__init__(cfg, timing)

    obs_shape = get_obs_shape(obs_space)
    input_ch = obs_shape.obs[0]
    log.debug('Num input channels: %d', input_ch)

    if cfg.encoder_subtype == 'convnet_simple':
        conv_filters = [[input_ch, 32, 8, 4], [32, 64, 4, 2], [64, 128, 3, 2]]
    elif cfg.encoder_subtype == 'convnet_impala':
        conv_filters = [[input_ch, 16, 8, 4], [16, 32, 4, 2]]
    elif cfg.encoder_subtype == 'minigrid_convnet_tiny':
        conv_filters = [[3, 16, 3, 1], [16, 32, 2, 1], [32, 64, 2, 1]]
    else:
        raise NotImplementedError(f'Unknown encoder {cfg.encoder_subtype}')

    activation = nonlinearity(self.cfg)
    fc_layer_size = fc_after_encoder_size(self.cfg)
    encoder_extra_fc_layers = self.cfg.encoder_extra_fc_layers

    enc = self.ConvEncoderImpl(activation, conv_filters, fc_layer_size, encoder_extra_fc_layers, obs_shape)
    self.enc = torch.jit.script(enc)

    self.encoder_out_size = calc_num_elements(self.enc, obs_shape.obs)
    log.debug('Encoder output size: %r', self.encoder_out_size)
def _save_reward_shaping(self, policy_id):
    policy_reward_shaping_filename = policy_reward_shaping_file(self.cfg, policy_id)
    with open(policy_reward_shaping_filename, 'w') as json_file:
        log.debug('Saving policy-specific reward shaping %d to file %s', policy_id, policy_reward_shaping_filename)
        json.dump(self.policy_reward_shaping[policy_id], json_file)
def make_voxel_env(env_name, cfg=None, env_config=None, **kwargs):
    scenario_name = env_name.split('voxel_env_')[-1].casefold()
    log.debug('Using scenario %s', scenario_name)

    if 'multitask' in scenario_name:
        if env_config is not None and 'worker_index' in env_config:
            task_idx = env_config['worker_index']
        else:
            log.warning('Could not find information about task id. Use task_id=0. (It is okay if this message appears once)')
            task_idx = 0

        env = make_env_multitask(
            scenario_name,
            task_idx,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )
    else:
        env = VoxelEnv(
            scenario_name=scenario_name,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )

    env = Wrapper(env, cfg.voxel_increase_team_spirit, cfg.voxel_max_team_spirit_steps)
    return env
def __init__(self, cfg, obs_space, timing):
    super().__init__(cfg, timing)

    self.basic_encoder = create_standard_encoder(cfg, obs_space, timing)
    self.encoder_out_size = self.basic_encoder.encoder_out_size

    # same as IMPALA paper
    self.embedding_size = 20
    self.instructions_lstm_units = 64
    self.instructions_lstm_layers = 1

    padding_idx = 0
    self.word_embedding = nn.Embedding(
        num_embeddings=DMLAB_VOCABULARY_SIZE,
        embedding_dim=self.embedding_size,
        padding_idx=padding_idx,
    )

    self.instructions_lstm = nn.LSTM(
        input_size=self.embedding_size,
        hidden_size=self.instructions_lstm_units,
        num_layers=self.instructions_lstm_layers,
        batch_first=True,
    )

    # learnable initial state?
    # initial_hidden_values = torch.normal(0, 1, size=(self.instructions_lstm_units, ))
    # self.lstm_h0 = nn.Parameter(initial_hidden_values, requires_grad=True)
    # self.lstm_c0 = nn.Parameter(initial_hidden_values, requires_grad=True)

    self.encoder_out_size += self.instructions_lstm_units
    log.debug('Policy head output size: %r', self.encoder_out_size)

    self.cpu_device = torch.device('cpu')
def step(self, actions):
    if self.skip_frames > 1 or self.num_agents == 1:
        # not used in multi-agent mode due to VizDoom limitations
        # this means that we have only one agent (+ maybe some bots, which is why we're in multiplayer mode)
        return super().step(actions)

    self._ensure_initialized()

    actions_binary = self._convert_actions(actions)

    self.game.set_action(actions_binary)
    self.game.advance_action(1, self.update_state)
    self.timestep += 1

    if not self.update_state:
        return None, None, None, None

    state = self.game.get_state()
    reward = self.game.get_last_reward()
    done = self.game.is_episode_finished()

    if self.record_to is not None:
        # send 'stop recording' command 1 tick before the end of the episode
        # otherwise it does not get saved to disk
        if self.game.get_episode_time() + 1 == self.game.get_episode_timeout():
            log.debug('Calling stop recording command!')
            self.game.send_game_command('stop')

    observation, done, info = self._process_game_step(state, done, {})
    return observation, reward, done, info
def _perturb_param(self, param, param_name, default_param):
    # toss a coin whether we perturb the parameter at all
    if random.random() > self.cfg.pbt_mutation_rate:
        return param

    if param != default_param and random.random() < 0.05:
        # small chance to replace parameter with a default value
        log.debug('%s changed to default value %r', param_name, default_param)
        return default_param

    if param_name in SPECIAL_PERTURBATION:
        new_value = SPECIAL_PERTURBATION[param_name](param, self.cfg)
    elif type(param) is bool:
        new_value = not param
    elif isinstance(param, numbers.Number):
        perturb_amount = random.uniform(1.01, 1.5)
        new_value = perturb_float(float(param), perturb_amount=perturb_amount)
    else:
        raise RuntimeError('Unsupported parameter type')

    log.debug('Param %s changed from %.6f to %.6f', param_name, param, new_value)
    return new_value
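# Hedged sketch (an assumption about the helper used above, not necessarily its real body):
# `perturb_float` is expected to scale a numeric hyperparameter up or down by the given factor,
# chosen with equal probability. The name `perturb_float_sketch` is deliberately different from
# the real helper.
import random

def perturb_float_sketch(x, perturb_amount=1.2):
    # multiply or divide by the perturbation factor with a 50/50 coin toss
    if random.random() < 0.5:
        return x * perturb_amount
    return x / perturb_amount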
def register_custom_encoder(custom_encoder_name, encoder_cls):
    assert issubclass(encoder_cls, EncoderBase), 'Custom encoders must be derived from EncoderBase'
    assert custom_encoder_name not in ENCODER_REGISTRY

    log.debug('Adding model class %r to registry (with name %s)', encoder_cls, custom_encoder_name)
    ENCODER_REGISTRY[custom_encoder_name] = encoder_cls
def finish_initialization(self):
    """Wait until policy workers are fully initialized."""
    for policy_id, workers in self.policy_workers.items():
        for w in workers:
            log.debug('Waiting for policy worker %d-%d to finish initialization...', policy_id, w.worker_idx)
            w.init()
            log.debug('Policy worker %d-%d initialized!', policy_id, w.worker_idx)
def register_additional_doom_env(doom_spec):
    try:
        spec = doom_env_by_name(doom_spec.name)
        log.error('Doom env spec %s already exists', spec.name)
        return
    except RuntimeError:
        pass

    log.debug('Registering Doom environment %s...', doom_spec.name)
    DOOM_ENVS.append(doom_spec)
def finalize(self):
    try:
        self.report_queue.get_many_nowait()
    except Empty:
        pass

    log.debug('Joining worker processes...')
    for p in self.processes:
        p.join()
    log.debug('Done joining!')
def register_custom_encoder(custom_encoder_name, encoder_cls):
    if custom_encoder_name in ENCODER_REGISTRY:
        log.warning('Encoder %s already registered', custom_encoder_name)

    assert issubclass(encoder_cls, EncoderBase), 'Custom encoders must be derived from EncoderBase'

    log.debug('Adding model class %r to registry (with name %s)', encoder_cls, custom_encoder_name)
    ENCODER_REGISTRY[custom_encoder_name] = encoder_cls
def dbg_print(self):
    dbg_info = dict(
        entropy=self.entropy().mean(),
        min_logit=self.raw_logits.min(),
        max_logit=self.raw_logits.max(),
        min_prob=self.probs.min(),
        max_prob=self.probs.max(),
    )

    msg = ''
    for key, value in dbg_info.items():
        msg += f'{key}={value.cpu().item():.3f} '
    log.debug(msg)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', type=str, default=None, required=True)
    parser.add_argument('--demo_path', type=str, default=None, required=True)
    args = parser.parse_args()

    spec = doom_env_by_name(args.env)
    cfg = default_cfg(env=args.env)
    if spec.num_agents <= 1:
        env = make_doom_env(args.env, cfg=cfg, custom_resolution='1280x720')
    else:
        env = make_doom_env_impl(
            spec,
            cfg=cfg,
            custom_resolution='1280x720',
            player_id=0,
            num_agents=spec.num_agents,
            max_num_players=spec.num_agents,
            num_bots=spec.num_bots,
        )

    mode = 'replay'
    env.unwrapped.mode = mode
    env.unwrapped.initialize()
    game = env.unwrapped.game
    game.replay_episode(args.demo_path)

    frames_dir = args.demo_path + '_frames'
    if os.path.exists(frames_dir):
        shutil.rmtree(frames_dir)
    os.makedirs(frames_dir)

    frame_id = 0
    while not game.is_episode_finished():
        # Use advance_action instead of make_action.
        game.advance_action()
        img = env.render(mode='rgb_array')
        if img is not None:
            # convert RGB to BGR for OpenCV and save the frame to disk
            frame_name = f'{frame_id:05d}.png'
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(join(frames_dir, frame_name), img)
        frame_id += 1

        r = game.get_last_reward()
        log.debug('Reward %.3f at frame %d', r, frame_id)

    game.close()
def _actors_update_shaping_scheme(self, policy_id):
    log.debug('Sending latest reward scheme to actors for policy %d...', policy_id)
    for actor_worker in self.actor_workers:
        reward_scheme_task = (PbtTask.UPDATE_REWARD_SCHEME, (policy_id, self.policy_reward_shaping[policy_id]))
        task = (TaskType.PBT, reward_scheme_task)
        try:
            actor_worker.task_queue.put(task, timeout=0.1)
        except Full:
            log.warning('Could not add task %r to queue, it is likely that worker died', task)
def register_default_envs(env_registry):
    """
    Register default envs.
    For this set of env families we register a function that can later create an actual registry entry when required.
    This allows us to import only Python modules that we use.
    """

    def doom_funcs():
        from sample_factory.envs.doom.doom_utils import make_doom_env
        from sample_factory.envs.doom.doom_params import add_doom_env_args, doom_override_defaults
        return make_doom_env, add_doom_env_args, doom_override_defaults

    def atari_funcs():
        from sample_factory.envs.atari.atari_utils import make_atari_env
        from sample_factory.envs.atari.atari_params import atari_override_defaults
        return make_atari_env, None, atari_override_defaults

    def dmlab_funcs():
        from sample_factory.envs.dmlab.dmlab_env import make_dmlab_env
        from sample_factory.envs.dmlab.dmlab_params import add_dmlab_env_args, dmlab_override_defaults
        return make_dmlab_env, add_dmlab_env_args, dmlab_override_defaults

    def mujoco_funcs():
        from sample_factory.envs.mujoco.mujoco_utils import make_mujoco_env
        from sample_factory.envs.mujoco.mujoco_params import add_mujoco_env_args, mujoco_override_defaults
        return make_mujoco_env, add_mujoco_env_args, mujoco_override_defaults

    def minigrid_funcs():
        from sample_factory.envs.minigrid.minigrid_utils import make_minigrid_env
        from sample_factory.envs.minigrid.minigrid_params import minigrid_override_defaults
        return make_minigrid_env, None, minigrid_override_defaults

    def voxel_env_funcs():
        from sample_factory.envs.voxel_env.voxel_env_utils import make_voxel_env, add_voxel_env_args, voxel_env_override_defaults
        return make_voxel_env, add_voxel_env_args, voxel_env_override_defaults

    default_envs = {
        'doom_': doom_funcs,
        'atari_': atari_funcs,
        'dmlab_': dmlab_funcs,
        'mujoco_': mujoco_funcs,
        'MiniGrid': minigrid_funcs,
        'voxel_env_': voxel_env_funcs,
    }

    for envs_prefix, env_funcs in default_envs.items():
        env_registry.register_env_deferred(envs_prefix, env_funcs)

    log.debug('Default env families supported: %r', [f'{k}*' for k in default_envs.keys()])
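# Hedged usage sketch (hypothetical module and env-family names, not part of the original source):
# a custom env family can follow the same deferred-import pattern as the defaults above, so that
# heavy dependencies are only imported when the family is actually used.
def my_custom_funcs():
    from my_project.envs import make_my_env                               # hypothetical module
    from my_project.params import add_my_env_args, my_override_defaults   # hypothetical module
    return make_my_env, add_my_env_args, my_override_defaults

def register_my_envs_sketch(env_registry):
    # any env whose name starts with 'my_custom_' would then resolve to this family
    env_registry.register_env_deferred('my_custom_', my_custom_funcs)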
def __init__(self, env, initial_difficulty=None):
    super().__init__(env)

    self._min_difficulty = 0
    self._max_difficulty = 150
    self._difficulty_step = 10
    self._curr_difficulty = 20 if initial_difficulty is None else initial_difficulty
    self._difficulty_std = 10

    log.info('Starting with bot difficulty %d', self._curr_difficulty)

    self._adaptive_curriculum = True
    if initial_difficulty == self._max_difficulty:
        log.debug('Starting at max difficulty, disable adaptive skill curriculum')
        self._adaptive_curriculum = False
def register_env(
    self,
    env_name_prefix,
    make_env_func,
    add_extra_params_func=None,
    override_default_params_func=None,
):
    """
    A standard thing to do in RL frameworks is to just rely on unique environment names registered in Gym.
    SampleFactory supports a mechanism on top of that: we define "environment families", e.g. "atari" or "doom",
    and certain things can be defined per env family rather than for a specific environment or experiment
    (such as default hyperparameters and env command line arguments).

    For every supported family of environments we require four components:

    :param env_name_prefix: name prefix, e.g. atari_. This allows us to register a single entry per env family
    rather than per individual env. The prefix can also, of course, be the full name of the environment.

    :param make_env_func: factory function that creates an environment instance.
    This function is called like:
    make_my_env(full_env_name, cfg=cfg, env_config=env_config)
    where full_env_name is the name of the environment to be created, cfg is a namespace with all CLI arguments,
    and env_config is an auxiliary dictionary containing information such as the index of the worker on which the
    environment lives (some envs may require this information).

    :param add_extra_params_func: (optional) function that adds additional parameters to the argument parser.
    This is a very easy way to make your envs configurable through the command-line interface.

    :param override_default_params_func: (optional) function that can override the default command line arguments
    in the parser. Every environment demands its own unique set of model architectures and hyperparameters, so this
    mechanism allows us to specify these default parameters once per family of envs to avoid typing them every time
    we want to launch an experiment.

    See sample_factory_examples for the default envs; it's actually very simple.

    If you want to use a Gym env, just create an empty make_env_func that ignores other parameters and
    instantiates a copy of your Gym environment.
    """
    assert callable(make_env_func), 'make_env_func should be callable'

    entry = EnvRegistryEntry(env_name_prefix, make_env_func, add_extra_params_func, override_default_params_func)
    self.registry[env_name_prefix] = entry

    log.debug('Env registry entry created: %s', env_name_prefix)
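# Hedged usage sketch following the docstring above: registering a plain Gym environment.
# The env name 'CartPole-v1' and the `registry` variable are illustrative only; how the registry
# object is obtained may differ in your setup.
import gym

def make_gym_env(full_env_name, cfg=None, env_config=None):
    # ignore cfg/env_config and simply instantiate the Gym env by its full name
    return gym.make(full_env_name)

# registry.register_env('CartPole-v1', make_gym_env)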
def _ensure_initialized(self):
    if self.initialized:
        return

    self.workers = [
        MultiAgentEnvWorker(i, self.make_env_func, self.env_config, reset_on_init=self.reset_on_init)
        for i in range(self.num_agents)
    ]

    init_attempt = 0
    while True:
        init_attempt += 1
        try:
            port_to_use = udp_port_num(self.env_config)
            port = find_available_port(port_to_use, increment=1000)
            log.debug('Using port %d', port)
            init_info = dict(port=port)

            lock_file = doom_lock_file(max_parallel=20)
            lock = FileLock(lock_file)
            with lock.acquire(timeout=10):
                for i, worker in enumerate(self.workers):
                    worker.task_queue.put((init_info, TaskType.INIT))
                    if self.safe_init:
                        time.sleep(1.0)  # just in case
                    else:
                        time.sleep(0.05)

                for i, worker in enumerate(self.workers):
                    worker.result_queue.get(timeout=20)
        except filelock.Timeout:
            continue
        except Exception:
            raise RuntimeError('Critical error: worker stuck on initialization. Abort!')
        else:
            break

    log.debug('%d agent workers initialized for env %d!', len(self.workers), self.env_config.worker_index)
    self.initialized = True
def init(self, learner_workers, actor_workers):
    self.learner_workers = learner_workers
    self.actor_workers = actor_workers

    for policy_id in range(self.cfg.num_policies):
        # save the policy-specific configs if they don't exist, or else load them from files
        policy_cfg_filename = policy_cfg_file(self.cfg, policy_id)
        if os.path.exists(policy_cfg_filename):
            with open(policy_cfg_filename, 'r') as json_file:
                log.debug('Loading initial policy %d configuration from file %s', policy_id, policy_cfg_filename)
                json_params = json.load(json_file)
                self.policy_cfg[policy_id] = json_params
        else:
            self.policy_cfg[policy_id] = dict()
            for param_name in HYPERPARAMS_TO_TUNE:
                self.policy_cfg[policy_id][param_name] = self.cfg[param_name]

            if policy_id > 0:  # keep one policy with default settings in the beginning
                log.debug('Initial cfg mutation for policy %d', policy_id)
                self.policy_cfg[policy_id] = self._perturb_cfg(self.policy_cfg[policy_id])

    for policy_id in range(self.cfg.num_policies):
        # save the policy-specific reward shaping if it doesn't exist, or else load from file
        policy_reward_shaping_filename = policy_reward_shaping_file(self.cfg, policy_id)
        if os.path.exists(policy_reward_shaping_filename):
            with open(policy_reward_shaping_filename, 'r') as json_file:
                log.debug(
                    'Loading policy %d reward shaping from file %s',
                    policy_id, policy_reward_shaping_filename,
                )
                json_params = json.load(json_file)
                self.policy_reward_shaping[policy_id] = json_params
        else:
            self.policy_reward_shaping[policy_id] = copy.deepcopy(self.default_reward_shaping)
            if policy_id > 0:  # keep one policy with default settings in the beginning
                log.debug('Initial rewards mutation for policy %d', policy_id)
                self.policy_reward_shaping[policy_id] = self._perturb_reward(self.policy_reward_shaping[policy_id])

    # send initial configuration to the system components
    for policy_id in range(self.cfg.num_policies):
        self._save_cfg(policy_id)
        self._save_reward_shaping(policy_id)
        self._learner_update_cfg(policy_id)
        self._actors_update_shaping_scheme(policy_id)
def get_gpus_without_triggering_pytorch_cuda_initialization(envvars=None):
    if envvars is None:
        envvars = os.environ

    import subprocess
    out = subprocess.run(
        [sys.executable, '-m', 'sample_factory.utils.get_available_gpus'],
        capture_output=True, env=envvars,
    )
    text_output = out.stdout.decode()
    err_output = out.stderr.decode()
    returncode = out.returncode

    from sample_factory.utils.utils import log
    if returncode:
        log.error(
            'Querying available GPUs... return code %d, error: %s, stdout: %s',
            returncode, err_output, text_output,
        )

    log.debug('Queried available GPUs: %s', text_output)
    return text_output
def __init__(self, num_agents, make_env_func, env_config, skip_frames):
    gym.Env.__init__(self)
    RewardShapingInterface.__init__(self)

    self.num_agents = num_agents
    log.debug('Multi agent env, num agents: %d', self.num_agents)
    self.skip_frames = skip_frames  # number of frames to skip (1 = no skip)

    env = make_env_func(player_id=-1)  # temporary env just to query observation_space and stuff

    self.action_space = env.action_space
    self.observation_space = env.observation_space

    self.default_reward_shaping = get_default_reward_shaping(env)
    env.close()

    self.current_reward_shaping = [self.default_reward_shaping for _ in range(self.num_agents)]

    self.make_env_func = make_env_func

    self.safe_init = env_config is not None and env_config.get('safe_init', False)

    if self.safe_init:
        sleep_seconds = env_config.worker_index * 1.0
        log.info('Sleeping %.3f seconds to avoid creating all envs at once', sleep_seconds)
        time.sleep(sleep_seconds)
        log.info('Done sleeping at %d', env_config.worker_index)

    self.env_config = env_config
    self.workers = None

    # only needed when rendering
    self.enable_rendering = False
    self.last_obs = None

    self.reset_on_init = True
    self.initialized = False
def record_used_seed(self, level, seed):
    self.num_seeds_used_in_current_run[level].value += 1
    log.debug('Updated number of used seeds for level %s (%d)', level, self.num_seeds_used_in_current_run[level].value)

    used_lvl_seeds_dir = self.get_used_seeds_dir()
    used_seeds_filename = join(used_lvl_seeds_dir, level_to_filename(level))
    safe_ensure_dir_exists(os.path.dirname(used_seeds_filename))

    with open(used_seeds_filename, 'a') as fobj:
        fobj.write(f'{seed}\n')

    # this data structure is not shared across processes, but we mostly care about the initial
    # seeds anyway, which are initialized before the processes are forked
    if level not in self.used_seeds:
        self.used_seeds[level] = {seed}
    else:
        self.used_seeds[level].add(seed)
def __init__(self, cfg, obs_space, timing):
    super().__init__(cfg, timing)

    self.basic_encoder = create_standard_encoder(cfg, obs_space, timing)
    self.encoder_out_size = self.basic_encoder.encoder_out_size

    obs_shape = get_obs_shape(obs_space)

    self.measurements_head = None
    if 'measurements' in obs_shape:
        self.measurements_head = nn.Sequential(
            nn.Linear(obs_shape.measurements[0], 128),
            nonlinearity(cfg),
            nn.Linear(128, 128),
            nonlinearity(cfg),
        )
        measurements_out_size = calc_num_elements(self.measurements_head, obs_shape.measurements)
        self.encoder_out_size += measurements_out_size

    log.debug('Policy head output size: %r', self.get_encoder_out_size())