def error_checks(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None):
    """
    Checks that the passed trainer is compliant with NeMo and exp_manager's passed configuration. Checks that:
        - Throws error when hydra has changed the working directory. This causes issues with lightning's DDP
        - Throws error when trainer has loggers defined but create_tensorboard_logger or create_wandb_logger is True
        - Prints error messages when 1) run on multi-node and not Slurm, and 2) run on multi-gpu without DDP
    """
    if HydraConfig.initialized() and get_original_cwd() != os.getcwd():
        raise ValueError(
            "Hydra changed the working directory. This interferes with ExpManager's functionality. Please pass "
            "hydra.run.dir=. to your python script."
        )
    if trainer.logger is not None and (cfg.create_tensorboard_logger or cfg.create_wandb_logger):
        raise LoggerMisconfigurationError(
            "The pytorch lightning trainer that was passed to exp_manager contained a logger, and either "
            f"create_tensorboard_logger: {cfg.create_tensorboard_logger} or create_wandb_logger: "
            f"{cfg.create_wandb_logger} was set to True. These can only be used if trainer does not already have a"
            " logger."
        )
    if trainer.num_nodes > 1 and not check_slurm(trainer):
        logging.error(
            "You are running multi-node training without SLURM handling the processes."
            " Please note that this is not tested in NeMo and could result in errors."
        )
    if trainer.num_gpus > 1 and not isinstance(trainer.accelerator.training_type_plugin, DDPPlugin):
        logging.error(
            "You are running multi-gpu without ddp. Please note that this is not tested in NeMo and could result in "
            "errors."
        )

def get_class_dist(labels_file="78-classes_train.json", str2id_file="78-classes_labels.json"):
    labels_dir = "./datasets/20bn-sth-sth-v2/labels/"
    if HydraConfig.initialized():
        labels_dir = os.path.join(hydra.utils.get_original_cwd(), labels_dir)
        data_info_path = os.path.join(labels_dir, labels_file)
    else:
        labels_dir = os.path.join(os.getcwd(), labels_dir)
        data_info_path = os.path.abspath(os.path.join(labels_dir, labels_file))

    data_info_frame = pd.read_json(data_info_path)
    ids_frame = pd.read_json(os.path.join(labels_dir, str2id_file), typ='series')

    # strip the literal square brackets from the template strings (not regex patterns)
    data_info_frame["template"] = data_info_frame["template"].str.replace("[", "", regex=False)
    data_info_frame["template"] = data_info_frame["template"].str.replace("]", "", regex=False)

    # classes present in labels_file (78)
    classes = ids_frame[data_info_frame["template"]]
    unique, counts = np.unique(classes, return_counts=True)

    old2new = map_orig2new()
    # str: new id
    classes = ids_frame[unique].replace(old2new)
    unique_str = ids_frame[unique]
    unique_new_ids = [old2new[i] for i in unique]
    # new_ids: count
    new_ids_counts = dict(zip(unique_new_ids, counts))
    # str: count
    str_counts = dict(zip(unique_str, counts))
    # new_ids in order of the dataframe
    class_labels = list(classes[data_info_frame["template"]])
    return new_ids_counts, str_counts, class_labels

def hydra_instance() -> Union[Hydra, GlobalHydra]:
    "Provide Hydra/GlobalHydra instance for compose"
    if HydraConfig.initialized():
        yield GlobalHydra.instance()
    hydra_init = initialize(config_path="../peddet/conf")
    yield hydra_init
    GlobalHydra.instance().clear()

def __enter__(self):
    # do nothing if hydra is not initialized
    if not HydraConfig.initialized() or not self.input_dir:
        return

    self.hydra_out_dir = os.getcwd()
    original_dir = os.path.join(hydra.utils.get_original_cwd(), self.input_dir)
    print(f"Switching load directory to {original_dir}")
    os.chdir(original_dir)

def get_original_cwd() -> str:
    """
    :return: the original working directory the Hydra application was launched from
    """
    if not HydraConfig.initialized():
        raise ValueError(
            "get_original_cwd() must only be used after HydraConfig is initialized"
        )
    ret = HydraConfig.get().runtime.cwd
    assert ret is not None and isinstance(ret, str)
    return ret

def to_absolute_path(path: str) -> str:
    """
    converts the specified path to be absolute path.
    if the input path is relative, it's interpreted as relative to the original working directory
    if it's absolute, it's returned as is
    :param path: path to convert
    :return:
    """
    p = Path(path)
    if not HydraConfig.initialized():
        base = Path(os.getcwd())
    else:
        base = Path(get_original_cwd())
    if p.is_absolute():
        ret = p
    else:
        ret = base / p
    return str(ret)

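# Illustrative usage sketch (not taken from any of the snippets above): shows how
# get_original_cwd() and to_absolute_path() behave relative to Hydra's run directory.
# The function name `resolve_data_path` and the example path are assumptions made for
# illustration only.
import os

from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd, to_absolute_path


def resolve_data_path(relative_path: str = "data/train.json") -> str:
    if HydraConfig.initialized():
        # inside a Hydra app, os.getcwd() is the run directory Hydra created,
        # while relative paths should be resolved against the launch directory
        print("launched from:", get_original_cwd())
        print("hydra run dir:", os.getcwd())
    return to_absolute_path(relative_path)
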
def _hydra_main(cfg: FairseqConfig, **kwargs) -> float:
    add_defaults(cfg)

    if cfg.common.reset_logging:
        reset_logging()  # Hydra hijacks logging, fix that
    else:
        # check if directly called or called through hydra_main
        if HydraConfig.initialized():
            with open_dict(cfg):
                # make hydra logging work with ddp
                # (see https://github.com/facebookresearch/hydra/issues/1126)
                cfg.job_logging_cfg = OmegaConf.to_container(
                    HydraConfig.get().job_logging, resolve=True
                )

    with omegaconf_no_object_check():
        cfg = OmegaConf.create(
            OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)
        )
    OmegaConf.set_struct(cfg, True)

    try:
        if cfg.common.profile:
            with torch.cuda.profiler.profile():
                with torch.autograd.profiler.emit_nvtx():
                    distributed_utils.call_main(cfg, pre_main, **kwargs)
        else:
            distributed_utils.call_main(cfg, pre_main, **kwargs)
    except BaseException as e:
        if not cfg.common.suppress_crashes:
            raise
        else:
            logger.error("Crashed! " + str(e))

    # get best val and return - useful for sweepers
    try:
        best_val = metrics.get_smoothed_value(
            "valid", cfg.checkpoint.best_checkpoint_metric
        )
    except Exception:
        best_val = None

    if best_val is None:
        best_val = float("inf")

    return best_val

def _call_children_scripts(self):
    # bookkeeping of spawned processes
    assert self.local_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    # DDP Environment variables
    os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
    os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

    # allow the user to pass the node rank
    os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
    os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

    # create a temporary directory used to synchronize processes on deadlock.
    os.environ["PL_DDP_SYNC_TMPDIR"] = self._sync_dir = tempfile.mkdtemp()

    # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
    # See https://docs.python.org/3/reference/import.html#main-spec
    if __main__.__spec__ is None:  # pragma: no-cover
        # Script called as `python a/b/c.py`
        # when user is using hydra find the absolute path
        path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception:
            full_path = os.path.abspath(command[0])

        command[0] = full_path
        # use the same python interpreter and actually running
        command = [sys.executable] + command
    else:
        # Script called as `python -m a.b.c`
        command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called the device has already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
    # but forward the GPUs selected via environment variables
    if self.parallel_devices is None:
        raise MisconfigurationException(
            "you selected (distributed_backend = ddp) but did not set Trainer(gpus=?)"
        )

    os.environ["PL_IN_DDP_SUBPROCESS"] = "1"

    os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

    self.interactive_ddp_procs = []

    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = f"{local_rank}"

        if self.lightning_module.logger is not None:
            # spawned processes must reference the same log dir, prevent auto-increment version
            env_copy["PL_EXP_VERSION"] = str(self.lightning_module.logger.version)

        # remove env var if global seed not set
        if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
            del env_copy["PL_GLOBAL_SEED"]

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if _HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
                os_cwd = f'"{os.getcwd()}"'
                command += [
                    f'hydra.run.dir={os_cwd}',
                    f'hydra.job.name=train_ddp_process_{local_rank}'
                ]
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues with dataloaders;
        # delay each launch by 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

def spawn_ddp_children(self, model):
    port = os.environ['MASTER_PORT']

    master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
    os.environ['MASTER_PORT'] = f'{port}'
    os.environ['MASTER_ADDR'] = f'{master_address}'

    # allow the user to pass the node rank
    node_rank = '0'
    if 'NODE_RANK' in os.environ:
        node_rank = os.environ['NODE_RANK']
    if 'GROUP_RANK' in os.environ:
        node_rank = os.environ['GROUP_RANK']

    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when user is using hydra find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the commands used to run the script and resolve the abs file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception as e:
        full_path = abspath(command[0])

    command[0] = full_path

    # use the same python interpreter and actually running
    command = [sys.executable] + command

    # since this script sets the visible devices we replace the gpus flag with a number
    num_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',').__len__()

    if '--gpus' in command:
        gpu_flag_idx = command.index('--gpus')
        command[gpu_flag_idx + 1] = f'{num_gpus}'

    os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

    self.trainer.interactive_ddp_procs = []
    for local_rank in range(1, self.trainer.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.trainer.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues with dataloaders;
        # delay each launch by 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

    local_rank = 0
    results = self.ddp_train(local_rank, mp_queue=None, model=model, is_master=True)
    del os.environ['WORLD_SIZE']
    return results

def get_cwd():
    if HydraConfig.initialized():
        cwd = Path(get_original_cwd())
    else:
        cwd = Path.cwd()
    return cwd

def _call_children_scripts(self) -> None:
    # bookkeeping of spawned processes
    self._check_can_spawn_children()

    # DDP Environment variables
    os.environ["MASTER_ADDR"] = self.cluster_environment.main_address
    os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port)

    # allow the user to pass the node rank
    os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
    os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

    # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
    # See https://docs.python.org/3/reference/import.html#main-spec
    if __main__.__spec__ is None:  # pragma: no-cover
        # Script called as `python a/b/c.py`
        # when user is using hydra find the absolute path
        path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

        # pull out the commands used to run the script and resolve the abs file path
        command = sys.argv
        try:
            full_path = path_lib(command[0])
        except Exception:
            full_path = os.path.abspath(command[0])

        command[0] = full_path
        # use the same python interpreter and actually running
        command = [sys.executable] + command
    else:
        # Script called as `python -m a.b.c`
        command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]

    os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = f"{local_rank}"

        # remove env var if global seed not set
        if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
            del env_copy["PL_GLOBAL_SEED"]

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if _HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
                os_cwd = f'"{os.getcwd()}"'
                command += [
                    f"hydra.run.dir={os_cwd}",
                    f"hydra.job.name=train_ddp_process_{local_rank}"
                ]
        subprocess.Popen(command, env=env_copy, cwd=cwd)

        # starting all processes at once can cause issues with dataloaders;
        # delay each launch by 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

def __ddp_script_mode_setup(self):
    assert self.trainer.global_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
    os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))

    # allow the user to pass the node rank
    node_rank = '0'
    node_rank = os.environ.get('NODE_RANK', node_rank)
    node_rank = os.environ.get('GROUP_RANK', node_rank)
    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when user is using hydra find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the commands used to run the script and resolve the abs file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception as e:
        full_path = abspath(command[0])

    command[0] = full_path

    # use the same python interpreter and actually running
    command = [sys.executable] + command

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called the device has already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
    # but forward the GPUs selected via environment variables
    gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    if len(gpu_ids) == 1:
        gpu_ids = f'{gpu_ids},'

    num_gpus = max(1, len(gpu_ids.split(',')))

    # set the flag for ddp scripts
    os.environ['PL_TRAINER_GPUS'] = gpu_ids

    os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

    self.trainer.interactive_ddp_procs = []
    for local_rank in range(1, self.trainer.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.trainer.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues with dataloaders;
        # delay each launch by 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

    self.task_idx = 0

def _call_children_scripts(self):
    assert self.trainer.global_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
    os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))

    # allow the user to pass the node rank
    node_rank = '0'
    node_rank = os.environ.get('NODE_RANK', node_rank)
    node_rank = os.environ.get('GROUP_RANK', node_rank)
    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when user is using hydra find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the commands used to run the script and resolve the abs file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception as e:
        full_path = abspath(command[0])

    command[0] = full_path

    # use the same python interpreter and actually running
    command = [sys.executable] + command

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called the device has already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
    # but forward the GPUs selected via environment variables
    if self.trainer.data_parallel_device_ids is None:
        raise MisconfigurationException(
            'you selected (distributed_backend = ddp) but did not set Trainer(gpus=?)'
        )

    os.environ['PL_TRAINER_GPUS'] = ','.join([str(i) for i in self.trainer.data_parallel_device_ids])
    os.environ['PL_IN_DDP_SUBPROCESS'] = '1'

    if self.trainer.logger is not None:
        os.environ['PL_EXP_VERSION'] = str(self.trainer.logger.version)

    num_gpus = len(self.trainer.data_parallel_device_ids)
    os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

    self.interactive_ddp_procs = []
    for local_rank in range(1, self.trainer.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # remove env var if global seed not set
        if os.environ.get('PL_GLOBAL_SEED') is None and 'PL_GLOBAL_SEED' in env_copy:
            del env_copy['PL_GLOBAL_SEED']

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues with dataloaders;
        # delay each launch by 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

def _call_children_scripts(self):
    # bookkeeping of spawned processes
    assert self.global_rank == 0
    self._check_can_spawn_children()
    self._has_spawned_children = True

    # DDP Environment variables
    os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
    os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

    # allow the user to pass the node rank
    os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
    os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())

    # when user is using hydra find the absolute path
    path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path

    # pull out the commands used to run the script and resolve the abs file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception:
        full_path = os.path.abspath(command[0])

    command[0] = full_path

    # use the same python interpreter and actually running
    command = [sys.executable] + command

    # the visible devices tell us how many GPUs we want to use.
    # when the trainer script was called the device has already been scoped by the time
    # code reaches this point. so, to call the scripts, we need to leave cuda visible devices alone
    # but forward the GPUs selected via environment variables
    if self.parallel_devices is None:
        raise MisconfigurationException(
            "you selected (distributed_backend = ddp) but did not set Trainer(gpus=?)"
        )

    os.environ["PL_TRAINER_GPUS"] = ",".join([str(device.index) for device in self.parallel_devices])
    os.environ["PL_IN_DDP_SUBPROCESS"] = "1"

    if self.lightning_module.logger is not None:
        os.environ["PL_EXP_VERSION"] = str(self.lightning_module.logger.version)

    num_gpus = len(self.parallel_devices)
    os.environ["WORLD_SIZE"] = f"{num_gpus * self.num_nodes}"

    self.interactive_ddp_procs = []
    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy["LOCAL_RANK"] = f"{local_rank}"

        # remove env var if global seed not set
        if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
            del env_copy["PL_GLOBAL_SEED"]

        # start process
        # if hydra is available and initialized, make sure to set the cwd correctly
        cwd: Optional[str] = None
        if _HYDRA_AVAILABLE:
            if HydraConfig.initialized():
                cwd = get_original_cwd()
                os_cwd = f'"{os.getcwd()}"'
                command += [f'hydra.run.dir={os_cwd}', f'hydra.job.name=train_ddp_process_{local_rank}']
        proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues with dataloaders;
        # delay each launch by 1-5 seconds
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

def simulate(self, input_mappings):
    """Simulate multiple mappings.

    Args:
        input_mappings: input mappings

    Returns:
        list of the objects of the class `SimulationResult`. The length of
        the list is equal to the length of `input_mappings`.
    """
    # check inputs
    if len(input_mappings) == 0:
        log.warning("Trying to simulate an empty mapping list")
        return []

    time = process_time()
    if isinstance(input_mappings[0], Mapping):
        tup = [
            tuple(self.representation.toRepresentation(m))
            for m in input_mappings
        ]
        mappings = input_mappings
    else:
        # assume mappings are list type then; transform into tuples
        tup = [
            tuple(self.representation.approximate(np.array(m)))
            for m in input_mappings
        ]
        mappings = [self.representation.fromRepresentation(m) for m in tup]
    self.statistics.add_rep_time(process_time() - time)

    # first look up as many as possible:
    lookups = [self.lookup(t) for t in tup]
    num = len([m for m in lookups if m])
    log.info(f"{num} from cache.")
    self.statistics.mappings_cached(num)

    # if all were already cached, return them
    if num == len(tup):
        return lookups

    # create a list of simulations to be run.
    # each element is a tuple (simulation, hydra_configuration)
    simulations = []

    # Logging is not configured in the spawned processes on macOS.
    # As a workaround, suggested in
    # https://github.com/facebookresearch/hydra/issues/1005
    # we pass the hydra configuration to the child processes
    cfg_pickled = None
    if HydraConfig.initialized():
        config = HydraConfig.get()
        cfg_pickled = cloudpickle.dumps(config)

    for i, mapping in enumerate(mappings):
        # skip if this particular mapping is in the cache
        if lookups[i]:
            continue

        simulation = DataflowSimulation(self.platform, self.graph, mapping, self.trace)
        simulations.append((simulation, cfg_pickled))

    if self.parallel and len(simulations) > self.chunk_size:
        # since mappings are simulated in parallel, whole simulation time
        # is added later as offset
        for _ in simulations:
            self.statistics.mapping_evaluated(0)

        # run the simulations in parallel
        with mp.Pool(processes=self.jobs) as pool:
            to_simulate = pool.imap(
                run_simulation_logger_wrapper,
                simulations,
                chunksize=self.chunk_size,
            )
            if self.progress:
                import tqdm

                to_simulate = tqdm.tqdm(
                    to_simulate,
                    total=len(mappings),
                )
            simulated = list(to_simulate)
            time = sum([s[1] for s in simulated])
            simulated = [s[0] for s in simulated]
            self.statistics.add_offset(time)
    else:
        simulated = []
        # run the simulations sequentially
        for s in simulations:
            s, time = run_simulation(s[0])
            simulated.append(s)
            self.statistics.mapping_evaluated(time)

    # Collect the simulation results and store them
    sim_results = []
    sim_iter = iter(simulated)
    for i, mapping in enumerate(mappings):
        sim_res = lookups[i]
        if sim_res:
            sim_results.append(sim_res)
        else:
            s = next(sim_iter)
            sim_results.append(s.result)
            self.add_mapping_result(tup[i], s.result)
    return sim_results

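# Hedged sketch of what the child-process side of the logging workaround above might
# look like. The real `run_simulation_logger_wrapper` is not shown in these snippets;
# this is an assumption about its shape, following the workaround discussed in
# https://github.com/facebookresearch/hydra/issues/1005. It assumes `run_simulation`
# is the same helper used in the sequential branch of `simulate` and returns a
# (simulation, time) pair.
import cloudpickle
from hydra.core.utils import configure_log


def run_simulation_logger_wrapper(simulation_and_cfg):
    simulation, cfg_pickled = simulation_and_cfg
    if cfg_pickled is not None:
        # re-apply Hydra's job logging configuration inside the spawned process
        cfg = cloudpickle.loads(cfg_pickled)
        configure_log(cfg.job_logging, cfg.verbose)
    return run_simulation(simulation)
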
def __exit__(self, *args):
    # do nothing if hydra is not initialized
    if not HydraConfig.initialized() or not self.input_dir:
        return

    os.chdir(self.hydra_out_dir)

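# Minimal, self-contained sketch of the pattern formed by the __enter__/__exit__ pair
# above: temporarily chdir into a directory under the original (pre-Hydra) working
# directory and restore Hydra's output directory on exit. The class name
# `SwitchToOriginalCwd` is an assumption; the enclosing class is not shown in the
# snippets above.
import os

import hydra
from hydra.core.hydra_config import HydraConfig


class SwitchToOriginalCwd:
    def __init__(self, input_dir: str):
        self.input_dir = input_dir
        self.hydra_out_dir = None

    def __enter__(self):
        # do nothing if hydra is not initialized
        if not HydraConfig.initialized() or not self.input_dir:
            return
        self.hydra_out_dir = os.getcwd()
        os.chdir(os.path.join(hydra.utils.get_original_cwd(), self.input_dir))

    def __exit__(self, *args):
        if not HydraConfig.initialized() or not self.input_dir:
            return
        os.chdir(self.hydra_out_dir)


# typical usage inside a @hydra.main-decorated function:
#     with SwitchToOriginalCwd("checkpoints"):
#         ...  # relative paths here resolve against the launch directory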