def _run(args):
    """Fill in missing launch settings on ``args`` and dispatch the run.

    ``args`` is mutated in place:

    * Under LSF, ``args.np`` and ``args.hosts`` fall back to values reported
      by ``lsf.LSFUtils`` when they were not supplied.
    * Otherwise, when neither hosts nor a host discovery script is given,
      ``args.hosts`` comes from ``args.hostfile`` if set, else ``localhost``.
    * ``args.nics`` is converted from a comma-separated string to a set
      (or ``None`` when empty).

    Returns the result of ``_run_elastic(args)`` when ``_is_elastic(args)``
    is true, otherwise the result of ``_run_static(args)``.
    """
    lsf_utils = lsf.LSFUtils

    # If LSF is used, use default values from the job config.
    if lsf_utils.using_lsf():
        if not args.np:
            args.np = lsf_utils.get_num_processes()
        if not (args.hosts or args.hostfile or args.host_discovery_script):
            host_slots = [
                '{}:{}'.format(host, lsf_utils.get_num_gpus())
                for host in lsf_utils.get_compute_hosts()
            ]
            args.hosts = ','.join(host_slots)

    # Hosts still unspecified: parse them from the hostfile, or default to
    # localhost with `np` slots.
    if not (args.hosts or args.host_discovery_script):
        args.hosts = (
            hosts.parse_host_files(args.hostfile)
            if args.hostfile
            else 'localhost:{}'.format(args.np)
        )

    # Convert the comma-separated nics string into a set.
    if args.nics:
        args.nics = set(args.nics.split(','))
    else:
        args.nics = None

    runner = _run_elastic if _is_elastic(args) else _run_static
    return runner(args)
def test_horovodrun_hostfile(self):
    """A hostfile with ``<host> slots=<n>`` lines parses to ``host:n,host:n``."""
    hostfile_lines = [
        '172.31.32.7 slots=8\n',
        '172.31.33.9 slots=8\n',
    ]
    expected = '172.31.32.7:8,172.31.33.9:8'

    with temppath() as host_filename:
        with open(host_filename, 'w+') as fp:
            fp.writelines(hostfile_lines)

        # Parse only after the file is closed so the contents are flushed.
        hostnames = hosts.parse_host_files(host_filename)
        self.assertEqual(hostnames, expected)
def _run(args):
    """Fill in missing launch settings on ``args`` and dispatch the run.

    ``args`` is mutated in place:

    * Under LSF, ``args.num_proc`` and ``args.hosts`` fall back to values
      reported by ``lsf.LSFUtils`` when they were not supplied.
    * Otherwise, when neither hosts nor a host discovery script is given,
      ``args.hosts`` comes from ``args.hostfile`` if set, else ``localhost``.

    Returns the result of ``_run_elastic(args)`` when ``_is_elastic(args)``
    is true, otherwise the result of ``_run_static(args)``.
    """
    lsf_utils = lsf.LSFUtils

    # If LSF is used, use default values from the job config.
    if lsf_utils.using_lsf():
        if not args.num_proc:
            args.num_proc = lsf_utils.get_num_processes()
        if not (args.hosts or args.hostfile or args.host_discovery_script):
            host_slots = []
            for host in lsf_utils.get_compute_hosts():
                host_slots.append(f'{host}:{lsf_utils.get_num_gpus()}')
            args.hosts = ','.join(host_slots)

    # Hosts still unspecified: parse them from the hostfile, or default to
    # localhost with `num_proc` slots.
    if not (args.hosts or args.host_discovery_script):
        if args.hostfile:
            args.hosts = hosts.parse_host_files(args.hostfile)
        else:
            args.hosts = f'localhost:{args.num_proc}'

    if _is_elastic(args):
        return _run_elastic(args)
    return _run_static(args)