def _capture(args: List[str]): api_caller = ApiCaller("https://api.lab-ml.com/api/v1/track?", {'run_uuid': generate_uuid()}) api_logs = ApiLogs() data = {'name': 'Capture', 'comment': ' '.join(args), 'time': time.time()} def _started(url): if url is None: return None logger.log([('Monitor experiment at ', Text.meta), (url, Text.link)]) webbrowser.open(url) api_caller.has_data(SimpleApiDataSource(data, callback=_started)) api_logs.set_api(api_caller, frequency=0) thread = ExecutorThread(' '.join(args), api_logs) thread.start() thread.join() data = { 'rank': 0, 'status': 'completed', 'details': None, 'time': time.time() } api_caller.has_data( SimpleApiDataSource({ 'status': data, 'time': time.time() })) api_caller.stop()
def start_jobs():
    n_nodes = len([s for s in SERVERS])
    run_uuid = experiment.generate_uuid()
    master_addr = None
    world_size = n_nodes * PROC_PER_NODE

    for node_rank, server in enumerate(SERVERS):
        if master_addr is None:
            master_addr = SERVERS[server].conf.hostname
        for local_rank in range(PROC_PER_NODE):
            rank = node_rank * PROC_PER_NODE + local_rank
            env_vars = {'GLOO_SOCKET_IFNAME': 'enp1s0',
                        'RUN_UUID': run_uuid,
                        'MASTER_ADDR': master_addr,
                        'MASTER_PORT': f'{1234}',
                        'WORLD_SIZE': f'{world_size}',
                        'RANK': f'{rank}',
                        'LOCAL_RANK': f'{local_rank}'}
            if PROC_PER_NODE > 1:
                env_vars['OMP_NUM_THREADS'] = '1'
            cmd = ['python', 'mnist.py']
            tags = TAGS.copy()
            if node_rank == 0 and local_rank == 0:
                tags += ['master']
            JOBS.create(server, ' '.join(cmd), env_vars, tags).start()
            time.sleep(1)
def _launch(args: List[str]):
    import sys
    import os

    if 'RUN_UUID' not in os.environ:
        os.environ['RUN_UUID'] = experiment.generate_uuid()

    # Make the project root and its src/ directory importable in the child processes
    cwd = os.getcwd()
    if 'PYTHONPATH' in os.environ:
        python_path = os.environ['PYTHONPATH']
        print(python_path)
        os.environ['PYTHONPATH'] = f"{python_path}:{cwd}:{cwd}/src"
    else:
        os.environ['PYTHONPATH'] = f"{cwd}:{cwd}/src"

    # Forward the arguments to torch.distributed.launch in a subprocess
    cmd = [sys.executable, '-u', '-m', 'torch.distributed.launch', *args]
    print(cmd)
    try:
        process = subprocess.Popen(cmd, env=os.environ)
        process.wait()
    except Exception as e:
        logger.log('Error starting launcher', Text.danger)
        raise e

    if process.returncode != 0:
        logger.log('Launcher failed', Text.danger)
        raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
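# Hypothetical invocation of _launch(); the argument values are illustrative and mirror
# the torch.distributed.launch command lines shown later in this document.
_launch(['--nproc_per_node=2',
         '--nnodes=2',
         '--node_rank=0',
         '--master_addr=104.171.200.181',
         '--master_port=1234',
         'mnist.py'])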
def launch(python_cmd: str, *,
           tags: List[str],
           n_proc_per_node: int,
           use_env: bool = False,
           master_port: int = 1234,
           env_vars: Optional[Dict[str, str]] = None):
    n_nodes = len([s for s in SERVERS])
    run_uuid = experiment.generate_uuid()
    master_addr = None
    world_size = n_nodes * n_proc_per_node
    if env_vars is None:
        env_vars = {}

    for node_rank, server in enumerate(SERVERS):
        if master_addr is None:
            master_addr = SERVERS[server].conf.hostname
        for local_rank in range(n_proc_per_node):
            rank = node_rank * n_proc_per_node + local_rank
            proc_env_vars = {
                'RUN_UUID': run_uuid,
                'MASTER_ADDR': master_addr,
                'MASTER_PORT': f'{master_port}',
                'WORLD_SIZE': f'{world_size}',
                'NODE_RANK': f'{node_rank}',
                'RANK': f'{rank}',
                'LOCAL_RANK': f'{local_rank}'
            }
            if n_proc_per_node > 1:
                proc_env_vars['OMP_NUM_THREADS'] = '1'
            proc_env_vars.update(env_vars)

            cmd = ['python', python_cmd]
            if not use_env:
                cmd += [f'--local_rank={local_rank}']

            proc_tags = tags.copy()
            if node_rank == 0 and local_rank == 0:
                proc_tags += ['master']

            JOBS.create(server, ' '.join(cmd), proc_env_vars, proc_tags).start()
            time.sleep(1)
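# Hypothetical usage of launch(); the script name, tags, and network interface below are
# illustrative (not taken from the source), chosen to match the MNIST example.
launch('mnist.py',
       tags=['mnist'],
       n_proc_per_node=2,
       use_env=False,
       master_port=1234,
       env_vars={'GLOO_SOCKET_IFNAME': 'enp1s0'})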
def _capture(args: List[str]): api_caller = ApiCaller("https://api.labml.ai/api/v1/track?", {'run_uuid': generate_uuid()}, timeout_seconds=120) api_logs = ApiLogs() data = { 'name': 'Capture', 'comment': ' '.join(args), 'time': time.time() } api_caller.add_handler(ApiUrlHandler(True, 'Monitor output at ')) api_caller.has_data(SimpleApiDataSource(data)) api_logs.set_api(api_caller, frequency=0) logger.log('Start capturing...', Text.meta) if args: thread = ExecutorThread(' '.join(args), api_logs) thread.start() thread.join() else: buffer = '' stdin = sys.stdin while stdin.readable(): data = stdin.read(1) if len(data) == 0: break print(data, end='') buffer += data if '\n' in buffer or len(buffer) > 100: api_logs.outputs(stdout_=buffer) buffer = '' if len(buffer) > 0: api_logs.outputs(stdout_=buffer) data = { 'rank': 0, 'status': 'completed', 'details': None, 'time': time.time() } api_caller.has_data(SimpleApiDataSource({ 'status': data, 'time': time.time() })) api_caller.stop()
def main():
    if 'RUN_UUID' not in os.environ:
        os.environ['RUN_UUID'] = experiment.generate_uuid()

    logger.log(str(sys.argv), Text.danger)
    cmd = [
        sys.executable, '-u', '-m', 'torch.distributed.launch',
        *sys.argv[1:]
    ]
    # print(cmd)
    try:
        process = subprocess.Popen(cmd, env=os.environ)
        # print('wait')
        process.wait()
    except Exception as e:
        logger.log('Error starting launcher', Text.danger)
        raise e

    if process.returncode != 0:
        logger.log('Launcher failed', Text.danger)
        raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
def _launcher():
    import os

    # Distributed-training configuration is passed in through environment variables
    world_size = int(os.environ['WORLD_SIZE'])
    run_uuid = os.environ['RUN_UUID']
    local_rank = int(os.environ['LOCAL_RANK'])
    rank = int(os.environ['RANK'])
    inspect(world_size=os.environ['WORLD_SIZE'],
            run_uuid=os.environ['RUN_UUID'],
            local_rank=os.environ['LOCAL_RANK'],
            rank=os.environ['RANK'],
            master_addr=os.environ['MASTER_ADDR'],
            master_port=os.environ['MASTER_PORT'])

    main(local_rank, rank, world_size, run_uuid, 'env://')


def spawned(rank, world_size, uuid):
    main(rank, rank, world_size, uuid)


if __name__ == '__main__':
    # Run on a single GPU
    # main(0, 1, experiment.generate_uuid())

    # Spawn multiple GPU processes
    torch.multiprocessing.spawn(spawned,
                                args=(2, experiment.generate_uuid()),
                                nprocs=2,
                                join=True)

    # Run with `labml launch`, same arguments as `torch.distributed.launch`
    # _launcher()
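# A minimal sketch (an assumption, not the sample's actual main()) of how the values read
# by _launcher() are typically consumed: join the default process group and pin the device.
import torch
import torch.distributed as dist


def main_sketch(local_rank: int, rank: int, world_size: int, run_uuid: str,
                init_method: str = 'env://'):
    # With 'env://', PyTorch reads MASTER_ADDR and MASTER_PORT from the environment
    dist.init_process_group(backend='gloo',
                            init_method=init_method,
                            rank=rank,
                            world_size=world_size)
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
    # ... create the model, wrap it in DistributedDataParallel, train,
    #     and report metrics under run_uuid ...
    dist.destroy_process_group()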
RUN_UUID=fba9eb202cb211eba5beacde48001122 PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/src" python -m torch.distributed.launch --nproc_per_node=2 --nnodes=2 --node_rank=0 --master_addr=104.171.200.181 --master_port=1234 labml_samples/pytorch/ddp/mnist.py

RUN_UUID=fba9eb202cb211eba5beacde48001122 PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/src" python -m torch.distributed.launch --nproc_per_node=2 --nnodes=2 --node_rank=1 --master_addr=104.171.200.181 --master_port=1234 labml_samples/pytorch/ddp/mnist.py

RUN_UUID=fba9eb202cb211eba5beacde48001122 PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/src" python -m torch.distributed.launch --nproc_per_node=2 labml_samples/pytorch/ddp/mnist.py
"""

import time

from labml import experiment
from labml_remote.job import JOBS
from labml_remote.server import SERVERS

PROC_PER_NODE = 1
N_NODES = len([s for s in SERVERS])

run_uuid = experiment.generate_uuid()
master_addr = None
for i, server in enumerate(SERVERS):
    if master_addr is None:
        master_addr = SERVERS[server].conf.hostname
    cmd = f'python -m torch.distributed.launch ' \
          f'--nproc_per_node={PROC_PER_NODE} ' \
          f'--nnodes={N_NODES} ' \
          f'--node_rank={i} ' \
          f'--master_addr={master_addr} --master_port=1234 ' \
          f'mnist.py'
    env_vars = {'GLOO_SOCKET_IFNAME': 'enp1s0',
                'RUN_UUID': run_uuid}
    tags = ['mnist']
    # Presumably each command is then submitted as a job, as in start_jobs() above:
    # JOBS.create(server, cmd, env_vars, tags).start()