Example #1
    def create_multi_worker_setup(self):
        assert self.use_horovod and self.num_workers_per_hls > 1, "Horovod run requires at least 2 workers"
        self.run_config_env_variables[
            'NUM_WORKERS_PER_HLS'] = f"{self.num_workers_per_hls}"
        tmp_dir = get_canonical_path("$HOME/tmp/")
        run_per_ip(f"mkdir -p {str(tmp_dir)}", ['MULTI_HLS_IPS', 'PYTHONPATH'],
                   False)
        print(f"MULTI_HLS_IPS={os.environ.get('MULTI_HLS_IPS')}")

        # OpenMPI process bind resource type.
        mpi_map_by = "socket"

        # Get lscpu
        cmd = 'lscpu | grep \"CPU(s):\"'
        lscpu_output = []
        with subprocess.Popen(cmd,
                              shell=True,
                              executable='/bin/bash',
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT) as proc:
            lscpu_output = proc.stdout.read()
        # Determine the optimal value of resources per process of OpenMPI binding based on local lscpu.
        if mpi_map_by == "socket":
            mpi_map_by_pe = int(
                lscpu_output.split()[1]) // self.num_workers_per_hls // 2
        elif mpi_map_by == "slot":
            mpi_map_by_pe = int(
                lscpu_output.split()[1]) // self.num_workers_per_hls
        else:
            raise Exception("mpi_map_by must be either 'socket' or 'slot'.")

        print(f"mpi_map_by_pe = {mpi_map_by_pe}")

        output_file_name = str(tmp_dir.joinpath("demo_bert_log/"))
        self.mpirun_cmd = "mpirun"
        self.mpirun_cmd += " --allow-run-as-root"
        self.mpirun_cmd += f" --tag-output --merge-stderr-to-stdout --output-filename {output_file_name}"

        if mpi_map_by_pe > 0:
            self.mpirun_cmd += f" --bind-to core --map-by {mpi_map_by}:PE={mpi_map_by_pe}"

        hcl_config_path = ''

        if is_valid_multi_node_config():
            hcl_config_path = self.create_multi_hls_setup(tmp_dir)
        else:
            hcl_config_path = self.create_single_hls_setup(tmp_dir)

        print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
        print(f"hcl_config_path = {hcl_config_path} ->")
        print_file_contents(hcl_config_path)

        os.environ['MPIRUN_CMD'] = self.mpirun_cmd
        print(
            f"{self.__class__.__name__} create_multi_worker_setup(): self.mpirun_cmd = {self.mpirun_cmd}"
        )
        print(
            f"{self.__class__.__name__} create_multi_worker_setup(): MPIRUN_CMD = {os.environ.get('MPIRUN_CMD')}"
        )
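
The binding arithmetic in Example #1 is easier to follow in isolation. Below is a minimal standalone sketch of the same calculation, assuming a hypothetical host with 96 logical CPUs and 8 workers per HLS (in the real code these values come from lscpu and self.num_workers_per_hls):

    # Standalone sketch of the PE-per-process calculation from Example #1.
    # total_cpus and num_workers_per_hls are assumed values for illustration.
    total_cpus = 96              # second token of `lscpu | grep "CPU(s):"`
    num_workers_per_hls = 8      # worker processes launched on one HLS box
    mpi_map_by = "socket"

    if mpi_map_by == "socket":
        # Assumes two sockets: divide the CPUs across workers, then halve per socket.
        mpi_map_by_pe = total_cpus // num_workers_per_hls // 2
    elif mpi_map_by == "slot":
        mpi_map_by_pe = total_cpus // num_workers_per_hls
    else:
        raise ValueError("mpi_map_by must be either 'socket' or 'slot'.")

    print(mpi_map_by_pe)  # 96 // 8 // 2 == 6 cores bound per MPI process
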
Example #2
    def prepare_results_path(self, results_dir):
        try:
            if self.use_horovod and is_valid_multi_node_config():
                prepare_output_dir_path = Path(__file__).parent.parent.parent.joinpath('common').joinpath('prepare_output_dir.py')
                run_per_ip(f"python3 {str(prepare_output_dir_path)} {results_dir}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
            else:
                prepare_output_dir.prepare_output_dir_r(results_dir)
        except Exception as exc:
            raise RuntimeError(f"Error in {self.__class__.__name__} prepare_results_path({results_dir})") from exc
Example #3
    def create_pretraining_data(self, seq_length, max_pred_per_seq):
        try:
            if self.use_horovod and is_valid_multi_node_config():
                create_pt_data_path = Path(__file__).parent.joinpath('create_pretraining_data_overfit.py')
                run_per_ip(f"python3 {str(create_pt_data_path)} {self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
            else:
                create_pretraining_data_overfit.create_pretraining_data_overfit_r(self.dataset_path, self.pretrained_model, seq_length, max_pred_per_seq)
        except Exception as exc:
            raise RuntimeError(f"Error in {self.__class__.__name__} create_pretraining_data({self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq})") from exc
Example #4
    def check_dirs(self, largs):
        try:
            if self.use_horovod and is_valid_multi_node_config():
                check_dirs_path = Path(__file__).parent.parent.parent.joinpath(
                    'common').joinpath('check_dirs.py')
                run_per_ip(f"python3 {str(check_dirs_path)} {largs}",
                           ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
            else:
                check_dirs.check_dirs_r(largs.split())
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} check_dirs(largs)"
            ) from exc
Example #5
    def download_dataset(self):
        try:
            if self.use_horovod and is_valid_multi_node_config():
                download_dataset_path = Path(__file__).parent.joinpath(
                    'download').joinpath('download_dataset.py')
                run_per_ip(
                    f"python3 {str(download_dataset_path)} {self.args.dataset_path}",
                    ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
            else:
                download_dataset.download_dataset_r(self.args.dataset_path)
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} download_dataset()"
            ) from exc
Example #6
    def prepare_output_dir(self):
        try:
            if self.use_horovod and is_valid_multi_node_config():
                prepare_output_dir_squad_path = Path(__file__).parent.joinpath(
                    'prepare_output_dir_squad.py')
                run_per_ip(
                    f"python3 {str(prepare_output_dir_squad_path)} {self.args.output_dir} {self.batch_size} {self.max_seq_len}",
                    ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
            else:
                prepare_output_dir_squad.prepare_output_dir_squad_r(
                    self.args.output_dir, self.batch_size, self.max_seq_len)
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} prepare_output_dir()"
            ) from exc
Example #7
    def download_pretrained_model(self, horovod_run):
        try:
            if horovod_run and is_valid_multi_node_config():
                download_pretrained_model_path = Path(
                    __file__).parent.joinpath('download').joinpath(
                        'download_pretrained_model.py')
                run_per_ip(
                    f"python3 {str(download_pretrained_model_path)} {self.pretrained_url} {self.pretrained_model} False",
                    ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
            else:
                download_pretrained_model.download_pretrained_model_r(
                    self.pretrained_url, self.pretrained_model, False)
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} download_pretrained_model()"
            ) from exc
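
Examples #2 through #7 all follow the same dispatch pattern: when Horovod is enabled and a multi-node (MULTI_HLS_IPS) configuration is valid, a helper script is executed on every node via run_per_ip; otherwise the corresponding local *_r helper is called directly. Here is a minimal, self-contained sketch of that pattern; the function bodies and the helper_script.py name are stubs standing in for the real helpers used above:

    # Self-contained sketch of the per-IP dispatch pattern shared by Examples #2-#7.
    # run_per_ip / is_valid_multi_node_config / do_work_locally are stubs here;
    # helper_script.py is a placeholder for the per-node helper being launched.
    import os

    def is_valid_multi_node_config():
        # The real check inspects the MULTI_HLS_IPS environment variable.
        return bool(os.environ.get('MULTI_HLS_IPS'))

    def run_per_ip(cmd, env_vars_to_copy, run_in_background):
        print(f"would run on every MULTI_HLS_IPS node: {cmd}")

    def do_work_locally(arg):
        print(f"single-node fallback for {arg}")

    def dispatch(use_horovod, arg):
        # Multi-node: push a helper script to every node; otherwise run locally.
        if use_horovod and is_valid_multi_node_config():
            run_per_ip(f"python3 helper_script.py {arg}",
                       ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        else:
            do_work_locally(arg)

    dispatch(use_horovod=True, arg="/path/to/results")
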
Example #8
    def create_multi_hls_setup(self, tmp_dir):
        #
        # Multi-HLS Mode
        #
        if os.environ.get('MPI_TPC_INCLUDE'):
            mpi_tpc_include = os.environ.get('MPI_TPC_INCLUDE')
        else:
            mpi_tpc_include = "enp3s0"
        print(f"mpi_tpc_include = {mpi_tpc_include}")

        gen_hcl_path = Path(__file__).parent.joinpath('generate_hcl_config.py')
        # Create HCL config on each remote IP.
        run_per_ip(
            f"python3 {str(gen_hcl_path)} {str(tmp_dir)} {self.num_workers_per_hls} \"HLS1\"",
            ['MULTI_HLS_IPS', 'PYTHONPATH'], False)

        # Set HCL_CONFIG_PATH in this script, so it can be propagated in self.mpirun_cmd to remote IPs.
        hcl_config_path = generate_hcl_config.generate_hcl_config_r(
            str(tmp_dir), self.num_workers_per_hls)

        multi_hls_nodes = get_multi_node_config_nodes()
        self.num_workers_total = len(
            multi_hls_nodes) * self.num_workers_per_hls
        print(f"self.num_workers_total = {self.num_workers_total}")
        print(
            f"++++++++++ Multi-HLS ({self.num_workers_total}-cards): HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}"
        )

        mpi_hostfile_path = generate_mpi_hostfile(str(tmp_dir))
        assert mpi_hostfile_path != '', "Don't have a valid mpi_hostfile_path for MULTI_HLS_IPS scenario"
        print(f"mpi_hostfile_path = {mpi_hostfile_path} ->")
        print_file_contents(mpi_hostfile_path)

        self.mpirun_cmd += f" -np {self.num_workers_total}"
        if os.environ.get('DOCKER_SSHD_PORT'):
            portnum = os.environ.get('DOCKER_SSHD_PORT')
        else:
            portnum = 3022
        self.mpirun_cmd += f" --mca plm_rsh_args -p{portnum}"
        self.mpirun_cmd += f" --mca btl_tcp_if_include {mpi_tpc_include}"
        self.mpirun_cmd += f" -hostfile {mpi_hostfile_path}"
        # in case you deployed a docker image
        self.mpirun_cmd += " --prefix /usr/lib/habanalabs/openmpi/"
        # in case you invoked build_horovod manually
        #self.mpirun_cmd += " --prefix $HOME/.openmpi/"
        self.mpirun_cmd += " -x HCL_CONFIG_PATH"

        self.mpirun_cmd += " -x HABANA_USE_PREALLOC_BUFFER_FOR_ALLREDUCE"
        self.mpirun_cmd += " -x TF_ENABLE_BF16_CONVERSION"
        self.mpirun_cmd += " -x TF_ALLOW_CONTROL_EDGES_IN_HABANA_OPS"
        self.mpirun_cmd += " -x HBN_TF_REGISTER_DATASETOPS"
        self.mpirun_cmd += " -x HABANA_USE_STREAMS_FOR_HCL"
        self.mpirun_cmd += " -x TF_PRELIMINARY_CLUSTER_SIZE"
        self.mpirun_cmd += " -x HABANA_INITIAL_WORKSPACE_SIZE_MB"
        self.mpirun_cmd += " -x RUN_TPC_FUSER"
        self.mpirun_cmd += " -x TF_DISABLE_SCOPED_ALLOCATOR"

        self.mpirun_cmd += " -x LD_PRELOAD"
        self.mpirun_cmd += " -x TF_MODULES_RELEASE_BUILD"
        self.mpirun_cmd += " -x PYTHONPATH"
        self.mpirun_cmd += " -x GC_KERNEL_PATH"
        self.mpirun_cmd += " -x HABANA_LOGS"
        self.mpirun_cmd += " -x VIRTUAL_ENV"
        self.mpirun_cmd += " -x PATH"
        self.mpirun_cmd += " -x LD_LIBRARY_PATH"
        return hcl_config_path
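
For orientation, this is roughly what the fully assembled command looks like once Example #1 and Example #8 have both run. The concrete values below are assumptions for illustration only: 96 logical CPUs giving PE=6, two HLS nodes with 8 workers each (hence -np 16), the default SSH port and TCP interface, and a hostfile written by generate_mpi_hostfile under $HOME/tmp; the trailing -x exports are truncated:

    mpirun --allow-run-as-root \
        --tag-output --merge-stderr-to-stdout --output-filename $HOME/tmp/demo_bert_log \
        --bind-to core --map-by socket:PE=6 \
        -np 16 \
        --mca plm_rsh_args -p3022 \
        --mca btl_tcp_if_include enp3s0 \
        -hostfile <hostfile generated under $HOME/tmp> \
        --prefix /usr/lib/habanalabs/openmpi/ \
        -x HCL_CONFIG_PATH -x PYTHONPATH -x PATH -x LD_LIBRARY_PATH ...

The string is then exported as MPIRUN_CMD (see Example #1), so the training launcher on each node can prepend it to the actual training command.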