Esempio n. 1
0
import os
import sys

from runtime.arguments import parse_args
from central.training_run_config import TrainingRunHWConfig


# Help text handed to the argument parser of this launcher.
description = "This script is a distributed launcher for unet2d.py " \
              "and accepts the same arguments as original unet2d.py script.\n" \
              "In case argument --hvd_workers > 1 is passed, " \
              "it runs 'unet2d.py [ARGS] --use_horovod' via mpirun with generated HCL config.\n"
# Only params.hvd_workers and params.kubernetes_run are read here; the raw
# sys.argv is forwarded unchanged to unet2d.py further down.
params = parse_args(description, distributed_launcher=True)

# unet2d.py is resolved relative to this launcher, not the working directory.
script_to_run = str(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "unet2d.py")))
command_to_run = [sys.executable, script_to_run]
# Prepare mpi command prefix for multinode run
if params.hvd_workers > 1:
    hw_config = TrainingRunHWConfig(scaleout=True,
                                    num_workers_per_hls=params.hvd_workers,
                                    kubernetes_run=params.kubernetes_run,
                                    output_filename="demo_unet2d")
    mpirun_cmd = hw_config.mpirun_cmd.split(" ")
    command_to_run = mpirun_cmd + command_to_run + ["--use_horovod"]
command_to_run += sys.argv[1:]
command_str = ' '.join(command_to_run)

print(f"Running: {command_str}", flush=True)
# os.system() returns the command's status instead of raising, so a failed
# training run has to be turned into a non-zero launcher exit explicitly.
wait_status = os.system(command_str)
if wait_status != 0:
    # sys.exit() with a string prints it to stderr and exits with code 1.
    sys.exit(f"Command failed (os.system status {wait_status}): {command_str}")
Esempio n. 2
0
def main():
    """Launch imagenet_main.py, optionally on several workers via mpirun.

    Only ``--num_workers_per_hls`` and ``--kubernetes_run`` are parsed here;
    every other command-line argument is forwarded untouched to
    imagenet_main.py. When ``--num_workers_per_hls`` is greater than 1, the
    script is started with ``--use_horovod`` behind the prefix taken from
    ``TrainingRunHWConfig.mpirun_cmd``.

    Raises:
        SystemExit: with code 0 after printing the help text, or with the
            exit code of the launched command when that command fails.
    """

    def str_to_bool(value):
        # argparse's type=bool treats every non-empty string (even "False")
        # as True, so the textual value is interpreted explicitly instead.
        normalized = value.strip().lower()
        if normalized in ("1", "true", "t", "yes", "y", "on"):
            return True
        if normalized in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(add_help=False, usage=argparse.SUPPRESS)
    parser.add_argument("--num_workers_per_hls", default=1, type=int)
    # nargs='?' keeps "--kubernetes_run True/False" working and additionally
    # accepts the bare "--kubernetes_run" form shown in the help text.
    parser.add_argument("--kubernetes_run",
                        nargs='?',
                        const=True,
                        default=False,
                        type=str_to_bool)
    args, unknown_args = parser.parse_known_args()
    script_to_run = str(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), "imagenet_main.py")))

    # The parser is built with add_help=False, so help requests arrive in
    # unknown_args and are answered with the hand-written text below.
    if '--help' in unknown_args or '-h' in unknown_args:
        print(
            """\ndemo_resnext.py is a distributed launcher for imagenet_main.py.
        \nusage: python demo_resnext.py [arguments]
        \noptional arguments:\n

        -dt <data_type>,   --dtype <data_type>                  Data type, possible values: fp32, bf16. Defaults to fp32
        -dlit <data_type>, --data_loader_image_type <data_type> Data loader images output. Should normally be set to the same data_type as the '--dtype' param
        -bs <batch_size>,  --batch_size <batch_size>            Batch size, defaults to 256
        -rs <size>,        --resnet_size <size>                 The size of the ResNet model to use. Defaults to 101.
        -te <epochs>,      --train_epochs <epochs>              Number of training epochs, defaults to 1
        -dd <data_dir>,    --data_dir <data_dir>                Data dir, defaults to `/data/tensorflow_datasets/imagenet/tf_records/`.
                                                                Needs to be specified if the above does not exists.
        -md <model_dir>,   --model_dir <model_dir>              Model dir, defaults to /tmp/resnet
                           --clean                              If set, model_dir will be removed if it exists. Unset by default.
                                                                Important: --clean may return errors in distributed environments. If that happens, try again
        -mts <steps>,      --max_train_steps <steps>            Max train steps
                           --log_steps <steps>                  How often display step status, defaults to 100
        -ebe <epochs>      --epochs_between_evals <epochs>      Number of training epochs between evaluations, defaults to 1.
                                                                To achieve fastest 'time to train', set to the same number as '--train_epochs' to only run one evaluation after the training.
                           --experimental_preloading            Enables support for 'data.experimental.prefetch_to_device' TensorFlow operator.
                                                                Enabled by default - pass --experimental_preloading=False to disable.
                           --num_workers_per_hls <num_workers>  Number of Horovod workers per node. Defaults to 1.
                                                                In case num_workers_per_hls>1, it runs 'imagenet_main.py [ARGS] --use_horovod' via mpirun with generated HCL config.
                           --kubernetes_run                     Setup kubernetes run for multi HLS training
        \nexamples:\n
        python demo_resnext.py -bs 64 -rs 50 --clean
        python demo_resnext.py -bs 128 -dt bf16 -te 90
        python demo_resnext.py -bs 128 -dt bf16 -dlit bf16 -te 90 --num_workers_per_hls 8
        \nIn order to see all possible arguments to imagenet_main.py, run "python imagenet_main.py --helpfull"
        """)
        sys.exit(0)

    # libjemalloc for better allocations
    setup_jemalloc()

    # Preloading is set up unless it was explicitly switched off on the
    # command line with --experimental_preloading=False/false/0.
    forwarded_args = ' '.join(map(str, unknown_args))
    if re.search(r'--experimental_preloading=([fF]alse|0)',
                 forwarded_args) is None:
        setup_preloading()

    if args.num_workers_per_hls > 1:
        hw_config = TrainingRunHWConfig(
            scaleout=True,
            num_workers_per_hls=args.num_workers_per_hls,
            kubernetes_run=args.kubernetes_run,
            output_filename="demo_resnext_log")
        cmd = hw_config.mpirun_cmd.split(" ") + [
            sys.executable, script_to_run, "--use_horovod"
        ]
    else:
        cmd = [sys.executable, script_to_run]

    cmd.extend(unknown_args)
    cmd_str = ' '.join(map(str, cmd))
    print(f"Running: {cmd_str}", flush=True)
    # The command is still handed to bash as a single string, as before.
    with subprocess.Popen(cmd_str, shell=True, executable='/bin/bash') as proc:
        return_code = proc.wait()
    # Surface a failed run to the caller instead of silently exiting with 0.
    if return_code != 0:
        sys.exit(return_code)
Esempio n. 3
0
def main():
    """Launch resnet_ctl_imagenet_main.py, optionally on several workers.

    Only ``--num_workers_per_hls`` and ``--kubernetes_run`` are parsed here;
    every other command-line argument is forwarded untouched to
    resnet_ctl_imagenet_main.py. When ``--num_workers_per_hls`` is greater
    than 1, the script is started behind the prefix taken from
    ``TrainingRunHWConfig.mpirun_cmd`` and one of ``--use_horovod`` or
    ``--distribution_strategy`` must be among the forwarded arguments.

    Raises:
        RuntimeError: if more than one worker is requested without
            ``--use_horovod`` or ``--distribution_strategy``.
        SystemExit: with code 0 after printing the help text, or with the
            exit code of the launched command when that command fails.
    """

    def str_to_bool(value):
        # argparse's type=bool treats every non-empty string (even "False")
        # as True, so the textual value is interpreted explicitly instead.
        normalized = value.strip().lower()
        if normalized in ("1", "true", "t", "yes", "y", "on"):
            return True
        if normalized in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(add_help=False, usage=argparse.SUPPRESS)
    parser.add_argument("--num_workers_per_hls", default=1, type=int)
    # nargs='?' keeps "--kubernetes_run True/False" working and additionally
    # accepts the bare "--kubernetes_run" form shown in the help text.
    parser.add_argument("--kubernetes_run",
                        nargs='?',
                        const=True,
                        default=False,
                        type=str_to_bool)
    args, unknown_args = parser.parse_known_args()
    script_to_run = str(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         "resnet_ctl_imagenet_main.py")))

    # The parser is built with add_help=False, so help requests arrive in
    # unknown_args and are answered with the hand-written text below.
    if '--help' in unknown_args or '-h' in unknown_args:
        print(
            """\ndemo_resnet_keras.py is a distributed launcher for resnet_ctl_imagenet_main.py.
        \nusage: python demo_resnet_keras.py [arguments]
        \noptional arguments:\n

        -dt <data_type>,   --dtype <data_type>                     Data type, possible values: fp32, bf16. Defaults to fp32
        -dlit <data_type>, --data_loader_image_type <data_type>    Data loader images output. Should normally be set to the same data_type as the '--dtype' param
        -bs <batch_size>,  --batch_size <batch_size>               Batch size, defaults to 256
        -te <epochs>,      --train_epochs <epochs>                 Number of training epochs, defaults to 1
        -dd <data_dir>,    --data_dir <data_dir>                   Data dir, defaults to `/data/tensorflow_datasets/imagenet/tf_records/`
                                                                   Needs to be specified if the above does not exists
        -md <model_dir>,   --model_dir <model_dir>                 Model dir, defaults to /tmp/resnet
                           --clean                                 If set, model_dir will be removed if it exists. Unset by default
                           --train_steps <steps>                   The maximum number of steps per epoch. Ignored if larger than the number of steps needed to process the training set.
                           --log_steps <steps>                     How often display step status, defaults to 100
                           --steps_per_loop <steps>                Number of steps per training loop. Will be capped at steps per epoch, defaults to 50
                           --enable_checkpoint_and_export          Enables checkpoint callbacks and exports the saved model
                           --enable_tensorboard                    Enables Tensorboard callbacks
        -ebe <epochs>      --epochs_between_evals <epochs>         Number of training epochs between evaluations, defaults to 1.
                                                                   To achieve fastest 'time to train', set to the same number as '--train_epochs' to only run one evaluation after the training.
                           --experimental_preloading               Enables support for 'data.experimental.prefetch_to_device' TensorFlow operator.
                                                                   Enabled by default - pass --experimental_preloading=False to disable.
                           --optimizer <optimizer_type>            Name of optimizer preset, possible values: SGD, LARS. Defaults to SGD
                           --num_workers_per_hls <num_workers>     Number of workers per node. Defaults to 1.
                                                                   In case num_workers_per_hls > 1, it runs 'resnet_ctl_imagenet_main.py [ARGS]' via mpirun with generated HCL config.
                                                                   Must be used together with --use_horovod either --distribution_strategy
                           --use_horovod                           Enable horovod for multicard scenarios
                           --distribution_strategy <strategy>      The Distribution Strategy to use for training. Defaults to off
                           --kubernetes_run                        Setup kubernetes run for multi HLS training
                           --use_keras_mixed_precision             Use native keras mixed precision policy instead of Habana bf16 conversion pass

        \nexamples:\n
        python demo_resnet_keras.py
        python demo_resnet_keras.py -dt bf16 -dlit bf16 -bs 256 -te 90 -ebe 90
        python3 demo_resnet_keras.py --dtype bf16 -dlit bf16 --use_horovod --num_workers_per_hls 8 -te 40 -ebe 40 --optimizer LARS -bs 256
        python3 demo_resnet_keras.py --dtype bf16 -dlit bf16 --distribution_strategy hpu --num_workers_per_hls 8 -bs 256
        \nIn order to see all possible arguments to resnet_ctl_imagenet_main.py, run "python resnet_ctl_imagenet_main.py --helpfull"
        """)
        sys.exit(0)

    # libjemalloc for better allocations
    setup_jemalloc()

    # The forwarded arguments are inspected as one space-joined string.
    forwarded_args = ' '.join(map(str, unknown_args))

    # Preloading is set up unless it was explicitly switched off on the
    # command line with --experimental_preloading=False/false/0.
    if re.search(r'--experimental_preloading=([fF]alse|0)',
                 forwarded_args) is None:
        setup_preloading()

    is_lars_optimizer_set = re.search(r"--optimizer[= ]*LARS",
                                      forwarded_args) is not None
    if is_lars_optimizer_set:
        set_lars_hyperparams(unknown_args, args)

    if '--horovod_hierarchical_allreduce' in unknown_args:
        os.environ['HOROVOD_HIERARCHICAL_ALLREDUCE'] = "1"

    if args.num_workers_per_hls > 1:
        # Decide which extra flags the multi-worker run needs before the
        # hardware config is built; both supported modes share that config.
        if '--use_horovod' in unknown_args:
            extra_flags = []
        elif any("--distribution_strategy" in s for s in unknown_args):
            extra_flags = ['--use_tf_while_loop=False']
        else:
            raise RuntimeError(
                'You need to pass either --use_horovod or --distribution_strategy hpu if num_workers_per_hls>1'
            )
        hw_config = TrainingRunHWConfig(
            scaleout=True,
            num_workers_per_hls=args.num_workers_per_hls,
            kubernetes_run=args.kubernetes_run,
            output_filename="demo_resnet_keras_log")
        cmd = hw_config.mpirun_cmd.split(" ") + [
            sys.executable, script_to_run
        ] + extra_flags
    else:
        cmd = [sys.executable, script_to_run]

    cmd.extend(unknown_args)
    cmd_str = ' '.join(map(str, cmd))
    print(f"Running: {cmd_str}", flush=True)
    # The command is still handed to bash as a single string, as before.
    with subprocess.Popen(cmd_str, shell=True, executable='/bin/bash') as proc:
        return_code = proc.wait()
    # Surface a failed run to the caller instead of silently exiting with 0.
    if return_code != 0:
        sys.exit(return_code)
Esempio n. 4
0
import sys

from central.training_run_config import TrainingRunHWConfig
from TensorFlow.common.common import setup_jemalloc
from TensorFlow.computer_vision.SSD_ResNet34.argparser import SSDArgParser


# Parse the launcher command line with the project's SSD argument parser.
parser = SSDArgParser(is_demo=True)
args = parser.parse_args()
# ssd.py is resolved relative to this launcher, not the working directory.
script_to_run = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "ssd.py"))

setup_jemalloc()     # libjemalloc for better allocations

if args.hvd_workers > 1:
    # NOTE(review): this branch is gated on args.hvd_workers, but the worker
    # count passed on is args.num_workers_per_hls - confirm both attributes
    # exist on the parsed namespace and are meant to differ.
    hw_config = TrainingRunHWConfig(
        scaleout=True,
        num_workers_per_hls=args.num_workers_per_hls,
        kubernetes_run=args.kubernetes_run,
        output_filename="demo_ssd"
    )
    cmd = hw_config.mpirun_cmd.split(" ") + \
        [sys.executable, str(script_to_run), "--use_horovod"]
else:
    cmd = [sys.executable, str(script_to_run)]
# Every argument given to the launcher is forwarded unchanged to ssd.py.
cmd += sys.argv[1:]
cmd_str = ' '.join(map(str, cmd))
print(f"Running: {cmd_str}", flush=True)
with subprocess.Popen(cmd_str, shell=True, executable='/bin/bash') as proc:
    return_code = proc.wait()
# Surface a failed run to the caller instead of silently exiting with 0.
if return_code != 0:
    sys.exit(return_code)
Esempio n. 5
0
        "demo_densenet.py is a distributed launcher for train.py. "
        "It accepts the same arguments as train.py. In case of num_workers_per_hls > 1, "
        "it runs 'train.py [ARGS]' via mpirun with generated HCL config."))
    # special arguments for multi-node/device training
    parser.add_argument('--num_workers_per_hls',
                        type=int,
                        default=1,
                        help="number of workers per HLS")
    parser.add_argument("--hls_type",
                        default="HLS1",
                        type=str,
                        help="type of HLS")
    parser.add_argument("--kubernetes_run",
                        action='store_true',
                        help="whether it's kubernetes run")
    args = parser.parse_args()

    cmd = []
    if args.num_workers_per_hls > 1:
        hw_config = TrainingRunHWConfig(
            scaleout=True,
            num_workers_per_hls=args.num_workers_per_hls,
            hls_type=args.hls_type,
            kubernetes_run=args.kubernetes_run,
            output_filename="demo_densenet_log")
        cmd += hw_config.mpirun_cmd.split(" ")

    cmd += [sys.executable, str(script_to_run)]
    cmd += sys.argv[1:]
    subprocess.run(cmd).check_returncode()
                summary.add_scalar('BLEU', bleu, step)
                print('BLEU:', bleu, flush=True)
            except ValueError:
                print(f'Error when calculating BLEU score for step {step}!', flush=True)

        summary.add_scalar('accuracy', best_bleu, 0)

if __name__ == "__main__":
    # Script entry point: runs the stages named in args.schedule
    # ('train' and/or 'calc_bleu'), in that order.
    args = get_args()
    # os.path.dirname() yields '' when the script is started by bare file
    # name; fall back to the current directory in that case.
    script_dir = os.path.dirname(__file__) or '.'

    if args.no_hpu:
        print('Running on CPU/GPU')
    else:
        print('Running on HPU')

    if 'train' in args.schedule:
        from central.training_run_config import TrainingRunHWConfig
        hw_config = TrainingRunHWConfig(
            scaleout=args.hvd_workers > 1,
            num_workers_per_hls=args.hvd_workers,
            kubernetes_run=False,
            output_filename="demo_transformer_log")
        cmd = hw_config.mpirun_cmd.split(" ") + [
            sys.executable, get_training_cmd(script_dir, args)
        ]
        cmd_str = ' '.join(map(str, cmd))
        cleanup_output_dir(args)
        print(f"Running: {cmd_str}", flush=True)
        with subprocess.Popen(cmd_str, shell=True, executable='/bin/bash') as proc:
            return_code = proc.wait()
        # Stop here when training failed: exit with the child's code rather
        # than continuing with the rest of the schedule.
        if return_code != 0:
            sys.exit(return_code)

    if 'calc_bleu' in args.schedule:
        calc_bleu(script_dir, args)
###############################################################################
# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company
###############################################################################

import os
import subprocess
import sys

from central.training_run_config import TrainingRunHWConfig
from arguments import CycleGANArgParser

if __name__ == '__main__':
    # Launcher entry point: start cycle_gan.py with the arguments given to
    # this script, behind an mpirun prefix when several workers are requested.
    script_to_run = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "cycle_gan.py"))
    parser = CycleGANArgParser(is_demo=True)
    args = parser.parse_args()

    # Single-worker defaults: no mpirun prefix and no extra worker flags.
    mpirun_prefix = []
    worker_flags = []
    if args.hvd_workers > 1:
        hw_config = TrainingRunHWConfig(
            scaleout=True,
            num_workers_per_hls=args.hvd_workers,
            hls_type=args.hls_type,
            kubernetes_run=args.kubernetes_run,
            output_filename="cycle_gan",
        )
        mpirun_prefix = hw_config.mpirun_cmd.split(" ")
        worker_flags = ["--use_horovod"]

    cmd = [
        *mpirun_prefix,
        sys.executable,
        str(script_to_run),
        *worker_flags,
        *sys.argv[1:],
    ]
    # check=True raises CalledProcessError when the command exits non-zero.
    subprocess.run(cmd, check=True)
Esempio n. 8
0
        required=False,
        type=int,
        help=
        'Use Horovod for training. num_workers_per_hls parameter is optional and defaults to 8'
    )
    parser.add_argument("--kubernetes_run",
                        default=False,
                        type=bool,
                        help="Kubernetes run")

    args, unknown = parser.parse_known_args()
    script_to_run = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "main.py"))

    if args.help:
        parser.print_help()
        print("main scrpit flags: ")
        print("=" * 30)

    num_workers_per_hls = args.use_horovod if args.use_horovod is not None else 1
    use_horovod = args.use_horovod is not None
    hw_config = TrainingRunHWConfig(scaleout=use_horovod,
                                    num_workers_per_hls=num_workers_per_hls,
                                    kubernetes_run=args.kubernetes_run,
                                    output_filename="demo_efficientdet")
    cmd = hw_config.mpirun_cmd.split(" ") if args.use_horovod else []
    cmd += [sys.executable, str(script_to_run)]
    cmd += sys.argv[1:]

    print(f"Running: {' '.join(map(str, cmd))}")
    subprocess.run(cmd)