def main():
    """Parse command-line arguments and submit a distributed job.

    Clusters that need no hostfile (yarn/local/sge) and hostfile-based
    clusters (ssh/mpi) are dispatched to the matching dmlc_tracker backend.

    Raises:
        RuntimeError: if the resolved cluster type is not recognised.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        # fixed typo: "specificed" -> "specified"
                        help="if specified, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    # Unrecognised options are appended to the command so users can pass
    # flags straight through to the launched program.
    args, unknown = parser.parse_known_args()
    args.command += unknown
    # Default: one server per worker.
    if args.num_servers is None:
        args.num_servers = args.num_workers
    args = dmlc_opts(args)  # translate to the dmlc_tracker option namespace
    if args.host_file is None or args.host_file == 'None':
        # Hostfile-less launchers.
        if args.cluster == 'yarn':
            from dmlc_tracker import yarn
            yarn.submit(args)
        elif args.cluster == 'local':
            from dmlc_tracker import local
            local.submit(args)
        elif args.cluster == 'sge':
            from dmlc_tracker import sge
            sge.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
    else:
        # Hostfile-based launchers.
        if args.cluster == 'ssh':
            from dmlc_tracker import ssh
            ssh.submit(args)
        elif args.cluster == 'mpi':
            from dmlc_tracker import mpi
            mpi.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
def main():
    """Build the launcher CLI, fill in defaults and hand the job off to the
    dmlc_tracker backend matching the chosen cluster type."""
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        help="if specificed, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    opts, extra = parser.parse_known_args()
    # Anything argparse did not recognise belongs to the launched command.
    opts.command += extra
    if opts.num_servers is None:
        opts.num_servers = opts.num_workers
    opts = dmlc_opts(opts)
    # Pick the tracker module first, then submit through a single call site.
    hostfile_given = not (opts.host_file is None or opts.host_file == 'None')
    if hostfile_given:
        if opts.cluster == 'ssh':
            from dmlc_tracker import ssh as tracker
        elif opts.cluster == 'mpi':
            from dmlc_tracker import mpi as tracker
        else:
            raise RuntimeError('Unknown submission cluster type %s' % opts.cluster)
    else:
        if opts.cluster == 'yarn':
            from dmlc_tracker import yarn as tracker
        elif opts.cluster == 'local':
            from dmlc_tracker import local as tracker
        elif opts.cluster == 'sge':
            from dmlc_tracker import sge as tracker
        else:
            raise RuntimeError('Unknown submission cluster type %s' % opts.cluster)
    tracker.submit(opts)
def main():
    """Parse command-line arguments and submit a distributed job.

    This variant additionally supports the byteps launcher (delegated to
    ``byteps_launcher``), a separate server hostfile (-SH), per-role
    environment overrides, and P3 distributed training.

    Raises:
        RuntimeError: if the resolved cluster type is not recognised.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher. '
                             'When -SH is set, the file provided by -H will '
                             'be used to recognize worker machines only. Otherwise, '
                             '-H is used for both server and worker machines.')
    parser.add_argument('-SH', '--server-hostfile', type=str,
                        help='the hostfile of server machines which will run '
                             'the job. Required for byteps multi-machine launching.')
    parser.add_argument('--sync-dst-dir', type=str,
                        # fixed typo: "specificed" -> "specified"
                        help="if specified, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    bps_group = parser.add_argument_group('byteps-backend')
    bps_group.add_argument('--byteps', action='store_true',
                           help='Whether use byteps launcher to launch')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the server processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the worker processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their '
                             'values from current system to all workers and servers. '
                             'Not necessary when launcher is local as in that case '
                             'all environment variables which are set are copied.')
    parser.add_argument('--p3', action='store_true', default=False,
                        help='Use P3 distributed training')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    # Unrecognised options are appended to the launched command.
    args, unknown = parser.parse_known_args()
    args.command += unknown
    if args.byteps:
        # byteps ships its own launcher; hand everything over and stop.
        import byteps_launcher as bpsl
        bpsl.submit(args)
        return
    # Default: one server per worker.
    if args.num_servers is None:
        args.num_servers = args.num_workers
    if args.p3:
        # Enable the P3 van type by prefixing the command with the required
        # environment-variable assignments.
        args.command = ['DMLC_PS_VAN_TYPE=p3 DMLC_PS_WATER_MARK=10'] + args.command
    args = dmlc_opts(args)  # translate to the dmlc_tracker option namespace
    if args.host_file is None or args.host_file == 'None':
        # Hostfile-less launchers.
        if args.cluster == 'yarn':
            from dmlc_tracker import yarn
            yarn.submit(args)
        elif args.cluster == 'local':
            from dmlc_tracker import local
            local.submit(args)
        elif args.cluster == 'sge':
            from dmlc_tracker import sge
            sge.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
    else:
        # Hostfile-based launchers.
        if args.cluster == 'ssh':
            from dmlc_tracker import ssh
            ssh.submit(args)
        elif args.cluster == 'mpi':
            from dmlc_tracker import mpi
            mpi.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
def main():
    """Assemble the launcher CLI (including per-role environment overrides),
    resolve defaults, and submit via the matching dmlc_tracker backend."""
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        help="if specificed, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the server processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the worker processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their '
                             'values from current system to all workers and servers. '
                             'Not necessary when launcher is local as in that case '
                             'all environment variables which are set are copied.')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    opts, extra = parser.parse_known_args()
    # Unknown flags are forwarded as part of the launched command.
    opts.command += extra
    if opts.num_servers is None:
        opts.num_servers = opts.num_workers
    opts = dmlc_opts(opts)
    # Resolve the tracker module once, then submit through one call site.
    hostfile_given = not (opts.host_file is None or opts.host_file == 'None')
    if hostfile_given:
        if opts.cluster == 'ssh':
            from dmlc_tracker import ssh as tracker
        elif opts.cluster == 'mpi':
            from dmlc_tracker import mpi as tracker
        else:
            raise RuntimeError('Unknown submission cluster type %s' % opts.cluster)
    else:
        if opts.cluster == 'yarn':
            from dmlc_tracker import yarn as tracker
        elif opts.cluster == 'local':
            from dmlc_tracker import local as tracker
        elif opts.cluster == 'sge':
            from dmlc_tracker import sge as tracker
        else:
            raise RuntimeError('Unknown submission cluster type %s' % opts.cluster)
    tracker.submit(opts)
def main():
    """CLI entry point: parse launcher options and submit the distributed
    job to the dmlc_tracker backend selected by the cluster type."""
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        help="if specificed, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the server processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the worker processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their '
                             'values from current system to all workers and servers. '
                             'Not necessary when launcher is local as in that case '
                             'all environment variables which are set are copied.')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    parsed, leftover = parser.parse_known_args()
    parsed.command += leftover  # forward unrecognised flags to the command
    if parsed.num_servers is None:
        parsed.num_servers = parsed.num_workers
    parsed = dmlc_opts(parsed)
    cluster = parsed.cluster
    # Early-return dispatch: one flat branch per launcher.
    if parsed.host_file is None or parsed.host_file == 'None':
        if cluster == 'yarn':
            from dmlc_tracker import yarn
            yarn.submit(parsed)
            return
        if cluster == 'local':
            from dmlc_tracker import local
            local.submit(parsed)
            return
        if cluster == 'sge':
            from dmlc_tracker import sge
            sge.submit(parsed)
            return
        raise RuntimeError('Unknown submission cluster type %s' % cluster)
    if cluster == 'ssh':
        from dmlc_tracker import ssh
        ssh.submit(parsed)
        return
    if cluster == 'mpi':
        from dmlc_tracker import mpi
        mpi.submit(parsed)
        return
    raise RuntimeError('Unknown submission cluster type %s' % cluster)
def main():
    """Parse command-line arguments and submit a (possibly elastic)
    distributed job.

    Adds elastic-training options on top of the classic dmlc launcher: an
    instance pool new workers are drawn from, and a --launch-worker mode
    that ssh-launches a single additional worker on --host.

    Raises:
        RuntimeError: if the resolved cluster type is not recognised.
        SystemExit: via ``parser.error`` on inconsistent elastic options.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        # fixed typo: "specificed" -> "specified"
                        help="if specified, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the server processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the worker processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their '
                             'values from current system to all workers and servers. '
                             'Not necessary when launcher is local as in that case '
                             'all environment variables which are set are copied.')
    # NOTE: these two flags previously used type=bool, which treats ANY
    # non-empty string (including "False") as True; they are now plain
    # store_true flags.
    parser.add_argument('--elastic-training-enabled', action='store_true',
                        default=False,
                        help='if this option is set, elastic training is enabled. '
                             'You should specify which instance pool to use via '
                             '--instance-pool')
    parser.add_argument('--instance-pool', type=str, default='DEFAULT',
                        help="You can use [reservedInstancePoolId | 'spotInstance' | 'DEFAULT']. "
                             'In case of DEFAULT a file will be created in the same folder '
                             "where --hostfile lives. The default worker filename will be "
                             "'default_worker_file'")
    parser.add_argument('--max-elastic-instances', type=int, default=0,
                        help='if instance pool is reserved or spotInstance, up to '
                             'max-elastic-instances can be added to existing cluster')
    parser.add_argument('--launch-worker', action='store_true', default=False,
                        help='whether this script should only launch worker instances')
    parser.add_argument('--host', type=str,
                        help='host name or ip of new worker host to launch')
    parser.add_argument('--port', type=str, default='22',
                        help='port number of new worker for ssh command to run by')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    args.command += unknown
    # Consistency checks for the elastic-training option combinations.
    if args.launch_worker and not args.host:
        parser.error('--launch-worker requires --host')
    if args.elastic_training_enabled:
        if args.instance_pool == 'DEFAULT' and args.max_elastic_instances:
            parser.error('--max-elastic-instances is only valid with a '
                         'reserved or spotInstance --instance-pool')
        if args.instance_pool != 'DEFAULT' and args.max_elastic_instances <= 0:
            parser.error('--max-elastic-instances must be positive for a '
                         'reserved or spotInstance --instance-pool')
    logging.info('parsed args: %s', args)
    if args.num_servers is None:
        args.num_servers = args.num_workers
    args = dmlc_opts(args)  # translate to the dmlc_tracker option namespace
    logging.info('args after dmlc_opts: %s', args)
    if args.host_file is None or args.host_file == 'None':
        if args.cluster == 'yarn':
            from dmlc_tracker import yarn
            yarn.submit(args)
        elif args.cluster == 'local':
            from dmlc_tracker import local
            local.submit(args)
        elif args.cluster == 'sge':
            from dmlc_tracker import sge
            sge.submit(args)
        elif args.cluster == 'ssh' and args.launch_worker:
            # Single-worker elastic launch: no hostfile, target given by --host.
            from dmlc_tracker import ssh
            logging.info('submitting single worker via dmlc_tracker ssh: %s', args)
            ssh.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
    else:
        if args.cluster == 'ssh':
            from dmlc_tracker import ssh
            logging.info('submitting via dmlc_tracker ssh: %s', args)
            ssh.submit(args)
        elif args.cluster == 'mpi':
            from dmlc_tracker import mpi
            mpi.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
def main():
    """Parse command-line arguments and submit a (possibly elastic)
    distributed job, starting the elastic-training management thread when
    the required environment variables are present.

    Raises:
        RuntimeError: if the resolved cluster type is not recognised.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, '
                             'in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run '
                             'the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        # fixed typo: "specificed" -> "specified"
                        help="if specified, it will sync the current "
                             "directory into slave machines's SYNC_DST_DIR if ssh "
                             "launcher is used")
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the server processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of '
                             'environment variable for the worker processes. This overrides values of '
                             'those environment variable on the machine where this script is run from. '
                             'Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their '
                             'values from current system to all workers and servers. '
                             'Not necessary when launcher is local as in that case '
                             'all environment variables which are set are copied.')
    # NOTE: these two flags previously used type=bool, which treats ANY
    # non-empty string (including "False") as True; they are now plain
    # store_true flags.
    parser.add_argument('--elastic-training-enabled', action='store_true',
                        default=False,
                        help='if this option is set, elastic training is enabled. '
                             'You should specify which instance pool to use via '
                             '--instance-pool')
    parser.add_argument('--launch-worker', action='store_true', default=False,
                        help='whether this script should only launch worker instances')
    parser.add_argument('--host', type=str,
                        help='host name or ip of new worker host to launch')
    parser.add_argument('--port', type=str, default='22',
                        help='port number of new worker for ssh command to run by')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    args.command += unknown
    logging.info("BEGIN args %s", args)
    if args.num_servers is None:
        args.num_servers = args.num_workers
    args = dmlc_opts(args)  # translate to the dmlc_tracker option namespace
    logging.info("args after dmlc_opts %s", args)
    # The management thread only runs on the launcher node (not in
    # --launch-worker mode) and only when the elastic-training environment
    # is fully configured.
    if (os.getenv('WORKER_LAUNCH_TEMPLATE_ID') is not None
            and os.getenv('ELASTIC_WORKER_TAG') is not None
            and not args.launch_worker):
        logging.info("Found launch template id and elastic worker tag in "
                     "environment variable. Will start ET Management thread")
        thread = Thread(target=manage_elastic_instance,
                        args=(args.worker_host_file, args.num_workers))
        # Daemonize so the management thread never blocks interpreter exit;
        # Thread.setDaemon() is deprecated since Python 3.10.
        thread.daemon = True
        thread.start()
    if args.host_file is None or args.host_file == 'None':
        if args.cluster == 'yarn':
            from dmlc_tracker import yarn
            yarn.submit(args)
        elif args.cluster == 'local':
            from dmlc_tracker import local
            local.submit(args)
        elif args.cluster == 'sge':
            from dmlc_tracker import sge
            sge.submit(args)
        elif args.cluster == 'ssh' and args.launch_worker:
            # Single-worker elastic launch: no hostfile, target given by --host.
            from dmlc_tracker import ssh
            logging.info("dmlc_tracker ssh %s", args)
            ssh.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
    else:
        if args.cluster == 'ssh':
            from dmlc_tracker import ssh
            logging.info("dmlc_tracker ssh %s", args)
            ssh.submit(args)
        elif args.cluster == 'mpi':
            from dmlc_tracker import mpi
            mpi.submit(args)
        else:
            raise RuntimeError('Unknown submission cluster type %s' % args.cluster)