def create_pretraining_data_overfit_r(dataset_path, pretrained_model, seq_length, max_pred_per_seq):
    """Generate the overfit pre-training TFRecord when the dataset directory is absent.

    If ``dataset_path`` already resolves to a directory nothing is done.
    Otherwise the directory is created and ``create_pretraining_data.py``
    (located next to this file) is run on ``sample_text.txt`` through bash.

    Args:
        dataset_path: Directory that will hold ``tf_examples.tfrecord``.
        pretrained_model: Directory expected to contain ``vocab.txt``.
        seq_length: Value forwarded as ``--max_seq_length``.
        max_pred_per_seq: Value forwarded as ``--max_predictions_per_seq``.

    Raises:
        Exception: wraps any failure, including a non-zero exit status of the
            generation script (reported as ``subprocess.CalledProcessError``
            in the exception chain).
    """
    host_name = socket.gethostname()
    try:
        ds_path = get_canonical_path(dataset_path)
        if os.path.isdir(ds_path):
            # An existing directory is treated as an already generated dataset.
            return

        os.makedirs(ds_path, mode=0o777, exist_ok=True)
        script_dir = Path(__file__).parent
        run_create_pretraining_path = script_dir.joinpath('create_pretraining_data.py')
        input_file_path = script_dir.joinpath("sample_text.txt")
        output_file_path = ds_path.joinpath("tf_examples.tfrecord")
        pretrained_model_path = get_canonical_path(pretrained_model)
        vocab_file_path = pretrained_model_path.joinpath("vocab.txt")

        command = (
            f"python3 {str(run_create_pretraining_path)}"
            f" --input_file={str(input_file_path)}"
            f" --output_file={str(output_file_path)}"
            f" --vocab_file={str(vocab_file_path)}"
            f" --do_lower_case=True"
            f" --max_seq_length={seq_length}"
            f" --max_predictions_per_seq={max_pred_per_seq}"
            f" --masked_lm_prob=0.15"
            f" --random_seed=12345"
            f" --dupe_factor=5"
        )
        print(f"{host_name}: {__file__}: create_pretraining_data_overfit_r() command = {command}")
        # Flush our own buffers first so the child's output is not interleaved
        # ahead of messages printed before it started.
        sys.stdout.flush()
        sys.stderr.flush()
        with subprocess.Popen(command, shell=True, executable='/bin/bash') as proc:
            return_code = proc.wait()
        # Previously the exit status was discarded, so a failed generation left
        # an empty dataset directory that later runs would accept as valid.
        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, command)
    except Exception as exc:
        raise Exception(f"{host_name}: Error in {__file__} create_pretraining_data_overfit_r({dataset_path}, {pretrained_model}, {seq_length}, {max_pred_per_seq})") from exc
    def build_command(self):
        """Assemble the MRPC fine-tuning command line and store it in ``self.command``.

        The command runs ``run_classifier.py`` (located next to this file),
        optionally prefixed by ``self.mpirun_cmd``, with the vocabulary,
        config and checkpoint taken from ``self.pretrained_model``.

        Raises:
            RuntimeError: wraps any exception raised while building the command.
        """
        try:
            classifier_script = Path(__file__).parent.joinpath('run_classifier.py')
            model_dir = get_canonical_path(self.pretrained_model)
            horovod_flag = "true" if self.use_horovod else "false"
            vocab_file = str(model_dir.joinpath("vocab.txt"))
            config_file = str(model_dir.joinpath("bert_config.json"))
            checkpoint = str(model_dir.joinpath("bert_model.ckpt"))

            print(f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}")

            # An empty mpirun_cmd means a plain single-process launch.
            launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
            pieces = [
                f"{launcher} python3 {classifier_script}",
                f"--task_name=MRPC --do_train=true --do_eval=true --data_dir={get_canonical_path_str(self.args.dataset_path)}",
                f"--vocab_file={vocab_file}",
                f"--bert_config_file={config_file}",
                f"--init_checkpoint={checkpoint}",
                f"--max_seq_length={self.max_seq_len}",
                f"--train_batch_size={self.batch_size}",
                f"--learning_rate={self.args.learning_rate}",
                f"--num_train_epochs={self.epochs}",
                f"--output_dir={get_canonical_path_str(self.args.output_dir)}",
                f"--use_horovod={horovod_flag}",
            ]
            self.command = " ".join(pieces)
            print('bert_mrpc_utils::self.command = ', self.command)
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} build_command()") from exc
Esempio n. 3
0
    def __init__(self, args):
        """Record the run arguments and prepare a single- or multi-worker setup.

        Args:
            args: Parsed arguments; ``args.use_horovod`` is read here. When it
                is not ``None`` its value is used as the number of workers per
                HLS and the Horovod (multi-worker) path is taken.
        """
        self.args = args
        self.hls_ips = ''
        self.mpirun_cmd = ''
        self.run_config_env_variables = {}

        workers_requested = self.args.use_horovod
        horovod_enabled = workers_requested is not None
        self.use_horovod = horovod_enabled
        self.scaleout = horovod_enabled
        # Without Horovod exactly one worker is used.
        self.num_workers_per_hls = workers_requested if horovod_enabled else 1
        self.num_workers_total = self.num_workers_per_hls

        print(
            f"use_horovod = {self.use_horovod}, num_workers_per_hls = {self.num_workers_per_hls}"
        )

        # Scratch directory used by the worker setup below.
        scratch_dir = get_canonical_path("$HOME/tmp/")
        os.makedirs(scratch_dir, mode=0o777, exist_ok=True)

        if not self.use_horovod:
            self.create_single_worker_setup()
        else:
            self.create_multi_worker_setup()
Esempio n. 4
0
    def create_multi_worker_setup(self):
        """Build the ``mpirun`` prefix and HCL configuration for a Horovod run.

        Stores the prefix in ``self.mpirun_cmd`` and in the ``MPIRUN_CMD``
        environment variable, and records ``NUM_WORKERS_PER_HLS`` in
        ``self.run_config_env_variables``.

        Raises:
            AssertionError: if Horovod is disabled or fewer than 2 workers
                per HLS are configured.
        """
        assert self.use_horovod and self.num_workers_per_hls > 1, "Horovod run requires at least 2 workers"
        self.run_config_env_variables['NUM_WORKERS_PER_HLS'] = f"{self.num_workers_per_hls}"
        tmp_dir = get_canonical_path("$HOME/tmp/")
        run_per_ip(f"mkdir -p {str(tmp_dir)}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
        print(f"MULTI_HLS_IPS={os.environ.get('MULTI_HLS_IPS')}")

        # OpenMPI process bind resource type.
        mpi_map_by = "socket"

        # Ask lscpu for its "CPU(s):" lines; the second whitespace-separated
        # token of the output is parsed as the CPU count below.
        with subprocess.Popen('lscpu | grep "CPU(s):"',
                              shell=True,
                              executable='/bin/bash',
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT) as proc:
            lscpu_output = proc.stdout.read()

        # Resources per OpenMPI process, derived from the local CPU count.
        if mpi_map_by not in ("socket", "slot"):
            raise Exception("mpi_map_by must be either 'socket' or 'slot'.")
        cpu_count = int(lscpu_output.split()[1])
        mpi_map_by_pe = cpu_count // self.num_workers_per_hls
        if mpi_map_by == "socket":
            # NOTE(review): the extra halving presumably assumes two sockets - confirm.
            mpi_map_by_pe //= 2

        print(f"mpi_map_by_pe = {mpi_map_by_pe}")

        output_file_name = str(tmp_dir.joinpath("demo_bert_log/"))
        mpirun_parts = [
            "mpirun",
            "--allow-run-as-root",
            f"--tag-output --merge-stderr-to-stdout --output-filename {output_file_name}",
        ]
        if mpi_map_by_pe > 0:
            mpirun_parts.append(
                f"--bind-to core --map-by {mpi_map_by}:PE={mpi_map_by_pe}")
        # Assigned before the HCL setup helpers run, as they may use it.
        self.mpirun_cmd = " ".join(mpirun_parts)

        if is_valid_multi_node_config():
            hcl_setup = self.create_multi_hls_setup
        else:
            hcl_setup = self.create_single_hls_setup
        hcl_config_path = hcl_setup(tmp_dir)

        print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
        print(f"hcl_config_path = {hcl_config_path} ->")
        print_file_contents(hcl_config_path)

        os.environ['MPIRUN_CMD'] = self.mpirun_cmd
        class_name = self.__class__.__name__
        print(
            f"{class_name} create_multi_worker_setup(): self.mpirun_cmd = {self.mpirun_cmd}"
        )
        print(
            f"{class_name} create_multi_worker_setup(): MPIRUN_CMD = {os.environ.get('MPIRUN_CMD')}"
        )
def generate_mpi_hostfile(file_path):
    """Write an MPI hostfile named ``hostfile`` into ``file_path``.

    One ``"<node> slots=8"`` line is written per node returned by
    ``get_multi_node_config_nodes()``. Any existing hostfile is removed first.

    Args:
        file_path: Directory in which the hostfile is created (the directory
            is created if missing).

    Returns:
        The path of the written hostfile, or ``''`` when
        ``is_valid_multi_node_config()`` is false.
    """
    if not is_valid_multi_node_config():
        return ''

    multi_hls_nodes = get_multi_node_config_nodes()
    print("Generating MPI hostfile...")
    hostfile_dir = get_canonical_path(file_path)
    os.makedirs(hostfile_dir, mode=0o777, exist_ok=True)
    mpi_hostfile_path = hostfile_dir.joinpath("hostfile")
    if os.path.exists(mpi_hostfile_path):
        # Removal goes through the shell helper, as in the original code.
        run_cmd_as_subprocess(f"rm -f {str(mpi_hostfile_path)}")
    print(f"Path: {mpi_hostfile_path}")

    config_str = "".join(f"{node} slots=8\n" for node in multi_hls_nodes)
    print(f"MPI hostfile: \n{config_str}")
    # The context manager closes the handle even if the write fails; the
    # previous bare open()/close() pair leaked it in that case.
    with open(mpi_hostfile_path, 'a') as out_fid:
        out_fid.write(config_str)
    return mpi_hostfile_path
def prepare_output_dir_r(output_dir):
    """Recreate ``output_dir`` as an empty directory.

    An existing directory is removed with ``rm -rf`` through
    ``run_cmd_as_subprocess`` and then created again.

    Raises:
        Exception: wraps any failure, prefixed with the local host name.
    """
    host_name = socket.gethostname()
    try:
        target_dir = get_canonical_path(output_dir)
        already_there = os.path.isdir(target_dir)
        if already_there:
            print(f"{host_name}: *** Cleaning existing {target_dir}...\n\n")
            # Shell removal is used here instead of shutil.rmtree.
            run_cmd_as_subprocess(f"rm -rf {get_canonical_path_str(target_dir)}")
        os.makedirs(target_dir, mode=0o777, exist_ok=True)
    except Exception as exc:
        raise Exception(f"{host_name}: Error in {__file__} prepare_output_dir_r({output_dir})") from exc
Esempio n. 7
0
def prepare_output_dir_squad_r(output_dir, batch_size, max_seq_len):
    """Prepare the SQuAD output directory and mark it with the current config.

    A marker file ``last_config_<batch_size>_<max_seq_len>`` records the
    configuration of the previous run:

    * directory missing: it is created;
    * directory present with a matching marker: everything except
      ``*.tf_record`` files is removed;
    * directory present without a matching marker: the whole directory is
      removed and recreated.

    In every case the marker file is (re)created at the end.

    Raises:
        Exception: wraps any failure, prefixed with the local host name.
    """
    host_name = socket.gethostname()
    try:
        od_path = get_canonical_path(output_dir)
        marker_name = f"last_config_{batch_size}_{max_seq_len}"

        if not os.path.isdir(od_path):
            os.makedirs(od_path, exist_ok=True)
        elif os.path.exists(os.fspath(od_path) + "/" + marker_name):
            print(
                f"{host_name}: *** Cleaning temp directory content in {output_dir}... (except *.tf_record files) \n\n"
            )
            with os.scandir(od_path) as it:
                for entry in it:
                    if entry.is_file():
                        # Files with the .tf_record suffix are kept.
                        if Path(entry.name).suffix != '.tf_record':
                            run_cmd_as_subprocess(
                                f"rm -f {get_canonical_path_str(entry.path)}")
                    elif entry.is_dir():
                        run_cmd_as_subprocess(
                            f"rm -rf {get_canonical_path_str(entry.path)}")
        else:
            print(
                f"{host_name}: *** Cleaning temp directory content in {output_dir}... \n\n"
            )
            # shutil.rmtree throws an exception when remote hosts share the
            # same file system paths, hence the shell removal.
            run_cmd_as_subprocess(f"rm -rf {get_canonical_path_str(od_path)}")
            os.makedirs(od_path, exist_ok=True)

        # os.open returns a raw descriptor; close it right away so creating
        # the marker does not leak one descriptor per call.
        marker_fd = os.open(get_canonical_path_str(output_dir) + "/" + marker_name,
                            os.O_CREAT,
                            mode=0o644)
        os.close(marker_fd)
    except Exception as exc:
        raise Exception(
            f"{host_name}: Error in {__file__} prepare_output_dir_squad_r({output_dir}, {batch_size}, {max_seq_len})"
        ) from exc
Esempio n. 8
0
def download_pretrained_model_r(pretrained_url,
                                pretrained_model,
                                flatten_archive=False):
    """Download and unpack a pre-trained model archive if it is not present.

    Works relative to the grandparent directory of this file and restores the
    previous working directory before returning or raising.

    Args:
        pretrained_url: URL prefix; ``pretrained_model + ".zip"`` is appended.
        pretrained_model: Name of the model directory (and archive stem).
        flatten_archive: When true, archive members are extracted into the
            working directory using their own paths instead of into a
            ``pretrained_model`` sub-directory.

    Raises:
        Exception: wraps any failure, prefixed with the local host name.
    """
    import shutil  # local import: only needed for the streamed download

    host_name = socket.gethostname()
    this_dir = get_canonical_path(os.curdir)
    try:
        os.chdir(Path(__file__).parent.parent)
        if os.path.isdir(pretrained_model):
            print(
                f"{host_name}: Reusing existing pre-trained model directory '{pretrained_model}'"
            )
            return

        archive_name = pretrained_model + ".zip"
        needs_download = not os.path.exists(archive_name)
        if not needs_download and os.path.getsize(archive_name) == 0:
            print(f"{host_name}: *** Broken file, needs download ...\n\n")
            needs_download = True

        if needs_download:
            print(f"{host_name}: *** Downloading pre-trained model...\n\n")
            # Both handles are closed by the context manager, and the archive
            # is streamed to disk instead of being read fully into memory.
            with urllib.request.urlopen(pretrained_url + archive_name) as inf, \
                    open(archive_name, "wb") as outf:
                shutil.copyfileobj(inf, outf)

        print(f"{host_name}: *** Extracting pre-trained model...\n\n")
        with zipfile.ZipFile(archive_name, 'r') as zip_ref:
            if flatten_archive:
                # large model is zipped with subdirectory, flatten archive tree structure
                for member in zip_ref.infolist():
                    if not member.is_dir():
                        zip_ref.extract(member)
            else:
                zip_ref.extractall(pretrained_model)

        # Only an archive downloaded by this call is removed afterwards.
        if needs_download:
            run_cmd_as_subprocess(f"rm -f {pretrained_model}.zip")
    except Exception as exc:
        raise Exception(
            f"{host_name}: Error in {__file__} download_pretrained_model_r({pretrained_url}, {pretrained_model}, {flatten_archive})"
        ) from exc
    finally:
        # Single restore point for success, early return and failure.
        os.chdir(this_dir)
    def __init__(self, modelname, filename):
        """Store the model name and load its configuration file.

        Args:
            modelname: Stored as ``self.model``.
            filename: Path of the configuration file, stored as
                ``self.hb_config``.

        Raises:
            OSError: if the resolved configuration path is not an existing file.
        """
        self.model, self.hb_config = modelname, filename

        # Initialised empty here; presumably filled in by
        # process_config_file() - verify against that method.
        self.parsed_config = None
        self.env_variables = None
        self.model_parameters = None
        self.model_parameters_store_true = None

        config_path = get_canonical_path(self.hb_config)
        if not config_path.is_file():
            raise OSError(
                f"hb_config has to be existing yaml file, but there is no file {config_path}"
            )

        self.process_config_file(config_path)
def download_dataset_r(dataset_path):
    """Download the MRPC dataset when ``dataset_path`` does not exist yet.

    Runs ``download_glue_data.py`` (located next to this file) through bash
    with ``--data_dir`` set to the parent of ``dataset_path``.

    Args:
        dataset_path: Directory expected to hold the dataset; an existing
            directory is treated as already downloaded.

    Raises:
        Exception: wraps any failure, including a non-zero exit status of the
            download script (reported as ``subprocess.CalledProcessError`` in
            the exception chain).
    """
    host_name = socket.gethostname()
    try:
        ds_path = get_canonical_path(dataset_path)
        if os.path.isdir(ds_path):
            return

        print(f"{host_name}: *** Downloading dataset...\n\n")
        os.makedirs(ds_path, exist_ok=True)
        download_script = Path(__file__).parent.joinpath(
            "download_glue_data.py")
        command = f"python3 {str(download_script)} --data_dir {str(ds_path.parent)} --tasks MRPC"
        # Flush our own buffers so the child's output is not printed ahead of
        # messages emitted before it started.
        sys.stdout.flush()
        sys.stderr.flush()
        with subprocess.Popen(command,
                              shell=True,
                              executable='/bin/bash') as proc:
            return_code = proc.wait()
        # The exit status used to be ignored, so a failed download left an
        # empty directory that later calls would accept as a valid dataset.
        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, command)
    except Exception as exc:
        raise Exception(
            f"{host_name}: Error in {__file__} download_dataset_r({dataset_path})"
        ) from exc
    def build_for_pretraining_lamb_phase2(self):
        """Build the phase-2 LAMB pre-training command and store it in ``self.command``.

        Phase 2 starts from the final phase-1 checkpoint
        (``<results_dir>/phase_1/model.ckpt-<p1_steps>``) and writes to
        ``<results_dir>/phase_2``. Unless ``args.fast_perf_only`` equals 1,
        ``self.p2_steps`` is rescaled in place by the ratio of the phase-1 to
        phase-2 global batch sizes.

        Raises:
            RuntimeError: wraps any exception raised while building the command.
        """
        try:
            model_dir = get_canonical_path(self.pretrained_model)
            bert_config = str(model_dir.joinpath("bert_config.json"))
            precision_args = self.set_PREC()
            horovod_str = "" if self.args.use_horovod is None else "--horovod"

            # Phase 1 values: global batch size and final checkpoint.
            gbs_phase1 = self.p1_batch_size * self.num_acc_steps_phase1
            phase1_ckpt = get_canonical_path(self.results_dir).joinpath(
                "phase_1").joinpath(f"model.ckpt-{self.p1_steps}")

            # Phase 2 values.
            seq_len = self.p2_max_seq_len
            max_pred_per_seq = 80
            gbs_phase2 = self.p2_batch_size * self.num_acc_steps_phase2

            if self.args.fast_perf_only != 1:
                # Adjust for batch size
                self.p2_steps = int((self.p2_steps * gbs_phase1) / gbs_phase2)

            results_dir_phase2 = self.results_dir + "/" + "phase_2"
            output_path = get_canonical_path(results_dir_phase2)
            self.prepare_output_dir(results_dir_phase2)
            corpus_path = get_canonical_path(
                self.args.dataset_path).joinpath(
                    f"seq_len_{seq_len}").joinpath("books_wiki_en_corpus/")

            # Space-separated list of paths handed to check_dirs().
            required_paths = [
                str(corpus_path),
                str(results_dir_phase2),
                bert_config,
                f"{phase1_ckpt}.meta",
            ]
            self.check_dirs(" ".join(required_paths))

            input_files_dir = str(corpus_path.joinpath("training"))
            eval_files_dir = str(corpus_path.joinpath("test"))
            dllog_path = str(output_path.joinpath("bert_dllog.json"))
            script_path = Path(__file__).parent.joinpath(
                "pretraining").joinpath('run_pretraining.py')

            print(
                f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}"
            )
            # An empty mpirun_cmd means a plain single-process launch.
            launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
            arguments = [
                f"{launcher} python3 {script_path}",
                f"--input_files_dir={input_files_dir}",
                f"--init_checkpoint={phase1_ckpt}",
                f"--eval_files_dir={eval_files_dir}",
                f"--output_dir={output_path}",
                f"--bert_config_file={bert_config}",
                "--do_train=True",
                "--do_eval=False",
                f"--train_batch_size={self.p2_batch_size}",
                f"--eval_batch_size={self.eval_batch_size}",
                f"--max_seq_length={seq_len}",
                f"--max_predictions_per_seq={max_pred_per_seq}",
                f"--num_train_steps={self.p2_steps}",
                f"--num_accumulation_steps={self.num_acc_steps_phase2}",
                f"--num_warmup_steps={self.p2_warmup}",
                f"--save_checkpoints_steps={self.save_checkpoints_steps}",
                f"--learning_rate={self.learning_rate_phase2}",
                f"{horovod_str} {precision_args}",
                "--allreduce_post_accumulation=True",
                f"--dllog_path={dllog_path}",
            ]
            self.command = " ".join(arguments)

            banner = (
                "-------------------------------------------------------------------------\n",
                "Running the Pre-Training :: Phase 2: Next Sentence Prediction\n",
                "-------------------------------------------------------------------------",
            )
            for banner_line in banner:
                print(banner_line)
            print(
                'bert_pretraining_bookswiki_utils::self.command for Phase2 = ',
                self.command)
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} build_for_pretraining_lamb_phase2()"
            ) from exc
    def build_for_pretraining_lamb_phase1(self):
        """Build the phase-1 LAMB pre-training command and store it in ``self.command``.

        Phase 1 reads the corpus for ``self.p1_max_seq_len`` under
        ``args.dataset_path`` and writes to ``<results_dir>/phase_1``. The
        output directory is prepared and the required paths are checked
        before the command is assembled.

        Raises:
            RuntimeError: wraps any exception raised while building the command.
        """
        try:
            model_dir = get_canonical_path(self.pretrained_model)
            bert_config = str(model_dir.joinpath("bert_config.json"))
            precision_args = self.set_PREC()
            horovod_str = "" if self.args.use_horovod is None else "--horovod"

            # Phase 1 values (the global batch size is computed but not used
            # further in this method).
            gbs_phase1 = self.p1_batch_size * self.num_acc_steps_phase1
            seq_len = self.p1_max_seq_len
            max_pred_per_seq = 20
            results_dir_phase1 = self.results_dir + "/" + "phase_1"
            output_path = get_canonical_path(results_dir_phase1)
            self.prepare_output_dir(results_dir_phase1)
            corpus_path = get_canonical_path(
                self.args.dataset_path).joinpath(
                    f"seq_len_{seq_len}").joinpath("books_wiki_en_corpus/")

            # Space-separated list of paths handed to check_dirs().
            required_paths = [str(corpus_path), results_dir_phase1, bert_config]
            self.check_dirs(" ".join(required_paths))

            input_files_dir = str(corpus_path.joinpath("training"))
            eval_files_dir = str(corpus_path.joinpath("test"))
            dllog_path = str(output_path.joinpath("bert_dllog.json"))
            script_path = Path(__file__).parent.joinpath(
                "pretraining").joinpath('run_pretraining.py')

            print(
                f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}"
            )
            # An empty mpirun_cmd means a plain single-process launch.
            launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
            arguments = [
                f"{launcher} python3 {script_path}",
                f"--input_files_dir={input_files_dir}",
                f"--eval_files_dir={eval_files_dir}",
                f"--output_dir={output_path}",
                f"--bert_config_file={bert_config}",
                "--do_train=True",
                "--do_eval=False",
                f"--train_batch_size={self.p1_batch_size}",
                f"--eval_batch_size={self.eval_batch_size}",
                f"--max_seq_length={seq_len}",
                f"--max_predictions_per_seq={max_pred_per_seq}",
                f"--num_train_steps={self.p1_steps}",
                f"--num_accumulation_steps={self.num_acc_steps_phase1}",
                f"--num_warmup_steps={self.p1_warmup}",
                f"--save_checkpoints_steps={self.save_checkpoints_steps}",
                f"--learning_rate={self.learning_rate_phase1}",
                f"{horovod_str} {precision_args}",
                "--allreduce_post_accumulation=True",
                f"--dllog_path={dllog_path}",
            ]
            self.command = " ".join(arguments)

            banner = (
                "-------------------------------------------------------------------------\n",
                "Running the Pre-Training :: Phase 1: Masked Language Model\n",
                "-------------------------------------------------------------------------",
            )
            for banner_line in banner:
                print(banner_line)
            print(
                'bert_pretraining_bookswiki_utils::self.command for Phase1 = ',
                self.command)
        except Exception as exc:
            raise RuntimeError(
                f"Error in {self.__class__.__name__} build_for_pretraining_lamb_phase1()"
            ) from exc
    def build_command(self):
        """Build the overfit pre-training command and store it in ``self.command``.

        Generates the pre-training data, prepares the results path, derives
        the learning rate from the batch size and worker count, and verifies
        that the initial checkpoint's ``model.ckpt-0.meta`` exists.

        Raises:
            RuntimeError: wraps any exception raised while building the
                command, including the missing-checkpoint error.
        """
        try:
            seq_length = self.p1_max_seq_len
            # Known sequence lengths and their predictions-per-sequence value.
            max_pred_per_seq = {128: 20, 512: 80}.get(seq_length)
            if max_pred_per_seq is None:
                print(f"Warning: Unsupported max_sequence_length {seq_length}. Setting max_predictions_per_seq to floor(0.15*max_sequence_length). Please see -s parameter for details")
                max_pred_per_seq = math.floor(0.15 * seq_length)

            self.create_pretraining_data(seq_length, max_pred_per_seq)
            sys.stdout.flush()
            sys.stderr.flush()

            horovod_str = "" if self.args.use_horovod is None else "--horovod"

            self.prepare_results_path(self.results_dir)

            # Learning rate scaled by the global batch size relative to 65536.
            base_lr = 0.006
            num_acc_steps = 1
            global_batch = self.p1_batch_size * self.num_workers_total * num_acc_steps
            learning_rate = float(base_lr * global_batch / 65536)
            print(f"learning_rate = {learning_rate}")

            ds_path = str(get_canonical_path(self.dataset_path))
            results_path = get_canonical_path(self.results_dir)
            model_dir = get_canonical_path(self.pretrained_model)
            bert_config = str(model_dir.joinpath("bert_config.json"))

            variant_dir = get_canonical_path(
                self.args.init_checkpoint_path).joinpath(
                    f"{self.args.model_variant}")
            init_checkpoint_path = variant_dir.joinpath("model.ckpt-0.meta")
            init_checkpoint = str(variant_dir) + "/" + "model.ckpt-0"
            if not os.path.exists(init_checkpoint_path):
                raise Exception(f"Error: init_checkpoint_path {str(init_checkpoint_path)} file or directory missing. Please mount correctly")

            dllog_path = str(results_path.joinpath("bert_dllog.json"))
            script_path = Path(__file__).parent.joinpath(
                "pretraining").joinpath('run_pretraining.py')

            print(f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}")
            # An empty mpirun_cmd means a plain single-process launch.
            launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
            arguments = [
                f"{launcher} python3 {script_path}",
                f"--input_files_dir={ds_path}",
                f"--eval_files_dir={ds_path}",
                f"--output_dir={results_path}",
                "--do_train=True",
                "--do_eval=True",
                f"--bert_config_file={bert_config}",
                f"--init_checkpoint={init_checkpoint}",
                f"--train_batch_size={self.p1_batch_size}",
                f"--eval_batch_size={self.eval_batch_size}",
                f"--max_seq_length={seq_length}",
                f"--max_predictions_per_seq={max_pred_per_seq}",
                f"--num_train_steps={self.p1_steps}",
                f"--num_accumulation_steps={num_acc_steps}",
                f"--num_warmup_steps={self.p1_warmup}",
                f"--dllog_path={dllog_path}",
                f"--learning_rate={learning_rate}",
                f"{horovod_str}",
                "--amp=False",
                "--use_xla=False",
            ]
            self.command = " ".join(arguments)
            print('bert_pretraining_overfit_utils build_command(): self.command = ', self.command)
        except Exception as exc:
            raise RuntimeError(f"Error in {self.__class__.__name__} build_command()") from exc