def download_dataset(self):
    try:
        if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
            download_dataset_path = Path(__file__).parent.joinpath('download').joinpath('download_dataset.py')
            run_per_ip(f"{sys.executable} {str(download_dataset_path)} {self.args.dataset_path}",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            download_dataset.download_dataset_r(self.args.dataset_path)
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} download_dataset()") from exc
def prepare_results_path(self, results_dir):
    try:
        if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
            prepare_output_dir_path = Path(__file__).parent.parent.parent.parent.parent.joinpath(
                'central').joinpath('prepare_output_dir.py')
            run_per_ip(f"{sys.executable} {str(prepare_output_dir_path)} {results_dir}",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            prepare_output_dir.prepare_output_dir_r(results_dir)
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} prepare_results_path({results_dir})") from exc
def prepare_output_dir(self):
    try:
        if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
            prepare_output_dir_squad_path = Path(__file__).parent.joinpath('prepare_output_dir_squad.py')
            run_per_ip(f"{sys.executable} {str(prepare_output_dir_squad_path)} "
                       f"{self.args.output_dir} {self.batch_size} {self.max_seq_len}",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            prepare_output_dir_squad.prepare_output_dir_squad_r(self.args.output_dir, self.batch_size, self.max_seq_len)
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} prepare_output_dir()") from exc
def create_pretraining_data(self, seq_length, max_pred_per_seq):
    try:
        if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
            create_pt_data_path = Path(__file__).parent.joinpath(
                'data_preprocessing').joinpath('create_pretraining_data_overfit.py')
            run_per_ip(f"{sys.executable} {str(create_pt_data_path)} {self.dataset_path} "
                       f"{self.pretrained_model} {seq_length} {max_pred_per_seq}",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            create_pretraining_data_overfit.create_pretraining_data_overfit_r(
                self.dataset_path, self.pretrained_model, seq_length, max_pred_per_seq)
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} create_pretraining_data("
                           f"{self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq})") from exc
def download_pretrained_model(self, horovod_run):
    try:
        download_pretrained_model_path = Path(__file__).parent.joinpath(
            'download').joinpath('download_pretrained_model.py')
        if horovod_run and is_valid_multi_node_config():
            run_per_ip(f"{sys.executable} {str(download_pretrained_model_path)} "
                       f"{self.pretrained_url} {self.pretrained_model} False",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            run_cmd_as_subprocess(f"{sys.executable} {str(download_pretrained_model_path)} "
                                  f"{self.pretrained_url} {self.pretrained_model} False")
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} download_pretrained_model()") from exc
def check_dirs(self, largs):
    try:
        if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
            check_dirs_path = Path(__file__).parent.parent.parent.parent.joinpath(
                'central').joinpath('check_dirs.py')
            run_per_ip(f"{sys.executable} {str(check_dirs_path)} {largs}",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            check_dirs.check_dirs_r(largs.split())
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} check_dirs({largs})") from exc
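# Hypothetical usage sketch (the original call sites are not shown in this file): largs is a
# single space-separated string of directories, which check_dirs_r() receives as a list via
# split(). For illustration only, using attributes that appear elsewhere in this file:
#   self.check_dirs(f"{self.args.output_dir} {self.args.dataset_path}")
# The specific directories in this example are an assumption, not taken from the original code.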
def setup_config_env(self):
    print(f"self.world_size = {self.__world_size}")
    tmp_dir = '/tmp'
    __worker_per_node = self.__world_size
    gen_hcl_config = True
    # Don't generate the HCL config in the following scenarios:
    # - HCCL host NIC scaling is enabled, i.e. "HCCL_OVER_TCP" is 1/True
    # - HCCL libfabric host NIC scaling is enabled, i.e. "HCCL_OVER_OFI" is 1/True
    # - HCL_CONFIG_PATH is already set
    hccl_over_tcp = os.getenv("HCCL_OVER_TCP")
    hccl_over_ofi = os.getenv("HCCL_OVER_OFI")
    if hccl_over_tcp or hccl_over_ofi:
        if hccl_over_tcp:
            hccl_over_tcp = hccl_over_tcp.lower() in ["1", "true"]
        if hccl_over_ofi:
            hccl_over_ofi = hccl_over_ofi.lower() in ["1", "true"]
        print(f"HCCL_OVER_TCP={os.getenv('HCCL_OVER_TCP')}")
        print(f"HCCL_OVER_OFI={os.getenv('HCCL_OVER_OFI')}")
    if hccl_over_tcp or hccl_over_ofi:
        print("skipping HCL config generation")
        gen_hcl_config = False
    if os.getenv("HCL_CONFIG_PATH"):
        print("HCL_CONFIG_PATH is already set")
        print("skipping HCL config generation")
        gen_hcl_config = False
    if self.__multi_hls:
        __cnt = len(os.getenv("MULTI_HLS_IPS").split(','))
        gen_hcl_path = Path(__file__).parent.parent.parent.joinpath('central/generate_hcl_config.py')
        # Create HCL config on each remote IP.
        if gen_hcl_config:
            __worker_per_node = self.__world_size // __cnt
            run_per_ip((f"{sys.executable} {str(gen_hcl_path)} {tmp_dir} "
                        f"{__worker_per_node} {self.___hls_type}"),
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
    if gen_hcl_config and self.__world_size > 1:
        # HCL_CONFIG_PATH env var is set in generate_hcl_config_r()
        generate_hcl_config.generate_hcl_config_unless_hccl(
            f'{tmp_dir}', __worker_per_node, hls_type=self.___hls_type)
    print(f"HLS ({self.__world_size}): HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
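# A minimal standalone sketch (not part of the original runner) of the env-flag check performed
# in setup_config_env() above: an HCCL scaling variable counts as enabled only when it is set to
# "1" or "true" (case-insensitive). The helper name is hypothetical; `os` is assumed to be
# imported at module level, as it is used throughout this file.
def _hccl_env_flag_enabled(name):
    value = os.getenv(name)
    return value is not None and value.lower() in ["1", "true"]

# Illustrative example: if _hccl_env_flag_enabled("HCCL_OVER_TCP") or
# _hccl_env_flag_enabled("HCCL_OVER_OFI") is True, HCL config generation is skipped.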
def create_multi_hls_setup(self, tmp_dir):
    #
    # Multi-HLS Mode
    #
    gen_hcl_path = Path(__file__).parent.joinpath('generate_hcl_config.py')
    # Create HCL config on each remote IP.
    run_per_ip(f"{sys.executable} {str(gen_hcl_path)} {str(tmp_dir)} {self.num_workers_per_hls} {self.hls_type}",
               ['MULTI_HLS_IPS', 'PYTHONPATH', 'HOROVOD_HIERARCHICAL_ALLREDUCE'], False)

    # Set HCL_CONFIG_PATH in this script, so it can be propagated in self.mpirun_cmd to remote IPs.
    hcl_config_path = generate_hcl_config.generate_hcl_config_unless_hccl(
        str(tmp_dir), self.num_workers_per_hls, self.hls_type)

    multi_hls_nodes = get_multi_node_config_nodes()
    self.num_workers_total = len(multi_hls_nodes) * self.num_workers_per_hls
    print(f"self.num_workers_total = {self.num_workers_total}")
    print(f"++++++++++ Multi-HLS ({self.num_workers_total}-cards): effective HCL_CONFIG_PATH = {hcl_config_path}")

    mpi_hostfile_path = generate_mpi_hostfile(str(tmp_dir), self.num_workers_per_hls)
    assert mpi_hostfile_path != '', "Don't have a valid mpi_hostfile_path for MULTI_HLS_IPS scenario"
    print(f"mpi_hostfile_path = {mpi_hostfile_path} ->")
    print_file_contents(mpi_hostfile_path)

    self.mpirun_cmd += f" -np {self.num_workers_total}"
    if os.environ.get('DOCKER_SSHD_PORT'):
        portnum = os.environ.get('DOCKER_SSHD_PORT')
    else:
        portnum = 3022
    self.mpirun_cmd += f" --mca plm_rsh_args -p{portnum}"
    self.mpirun_cmd += f" --mca btl_tcp_if_include {get_mpi_tcp_include()}"
    self.mpirun_cmd += f" -hostfile {mpi_hostfile_path}"
    self.mpirun_cmd += " --prefix $MPI_ROOT"
    for env_var in get_relevant_env_vars():
        self.mpirun_cmd += f" -x {env_var}={shlex.quote(os.environ[env_var])}"
        # Note: the "=value" part above is not strictly necessary, but it provides vital
        # information when presented this way in the log file.
    return hcl_config_path
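# Illustrative only (assumed values, not output captured from this repository): with
# num_workers_total=16, DOCKER_SSHD_PORT unset (so the default port 3022 is used), a hostfile
# generated under $HOME/tmp, and "eth0" standing in for the result of get_mpi_tcp_include(),
# create_multi_hls_setup() extends self.mpirun_cmd with options roughly like:
#   -np 16 --mca plm_rsh_args -p3022 --mca btl_tcp_if_include eth0 \
#   -hostfile /root/tmp/hostfile --prefix $MPI_ROOT \
#   -x MULTI_HLS_IPS=... -x PYTHONPATH=...   (one -x entry per relevant env var)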
def create_multi_worker_setup(self):
    if not self.kubernetes_run:
        assert self.scaleout and self.num_workers_per_hls > 1, "Scaleout run requires at least 2 workers"
    tmp_dir = Path(os.path.expandvars(os.path.expanduser("$HOME/tmp/")))
    run_per_ip(f"mkdir -p {str(tmp_dir)}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False, self.kubernetes_run)

    hcl_config_path = ''
    if self.kubernetes_run:
        hcl_config_path = Path(os.environ.get('HCL_CONFIG_PATH'))
        # Printing env var HCL_CONFIG_PATH has been proven to be misleading.
        # print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
        print(f"Effective HCL_CONFIG_PATH = {hcl_config_path} ->")
        print_file_contents(hcl_config_path)
        return

    print(f"MULTI_HLS_IPS={os.environ.get('MULTI_HLS_IPS')}")

    output_file_name = str(tmp_dir.joinpath(self.output_filename))
    self.mpirun_cmd = self.create_mpi_cmdline(output_file_name)

    if is_valid_multi_node_config():
        hcl_config_path = self.create_multi_hls_setup(tmp_dir)
    else:
        hcl_config_path = self.create_single_hls_setup(tmp_dir)

    # Printing env var HCL_CONFIG_PATH has been proven to be misleading.
    # print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
    if hcl_config_path is not None:
        print(f"Effective HCL_CONFIG_PATH = {hcl_config_path} ->")
        print_file_contents(hcl_config_path)
    else:
        print("HCL Config is not used in this run.")

    print(f"{self.__class__.__name__} create_multi_worker_setup(): self.mpirun_cmd = {self.mpirun_cmd}")