def launcher():
    """Launch a single 'gpubox' instance on AWS and benchmark sequential read
    throughput of its root volume with fio."""
    # make the parent directory importable for the backend helper modules
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    # requested zone must belong to the region implied by AWS_DEFAULT_REGION
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    install_script = ''
    ami = args.ami

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    job.run('source activate mxnet_p36')
    job.run('sudo apt install -y fio')
    # assumes /dev/xvda1 is the root volume on this AMI -- TODO confirm
    job.run('volume=/dev/xvda1')
    # sequential read over the whole device; doubles as volume pre-warming
    job.run(
        'time sudo fio --filename=$volume --rw=read --bs=128k --iodepth=32 --ioengine=libaio --direct=1 --name=volume-initialize'
    )
def main():
    """Terminate (or, with --soft, stop) EC2 instances whose Name tag contains
    the global `fragment`, after an interactive confirmation.

    Honors the global `args` flags: skip_tensorboard, skip_stopped,
    limit_to_key, yes, delay, soft.
    """
    ec2 = u.create_ec2_resource()  # ec2 resource
    ec2_client = u.create_ec2_client()  # ec2 client
    instances = list(ec2.instances.all())  # todo: use filter?
    region = u.get_region()

    instances_to_kill = []
    for i in instances:
        name = u.get_name(i.tags)
        state = i.state['Name']
        if fragment not in name:
            continue
        if args.skip_tensorboard and '.tb.' in name:
            continue
        if args.skip_stopped and state == 'stopped':
            continue
        if args.limit_to_key and USER_KEY_NAME not in i.key_name:
            continue
        if state == 'terminated':
            continue
        instances_to_kill.append(i)
        print(u.get_name(i), i.instance_type, i.key_name,
              state if state == 'stopped' else '')

    # print extra info if couldn't find anything to kill
    if not instances_to_kill:
        valid_names = sorted(
            list(
                set("%s,%s" % (u.get_name(i), u.get_state(i))
                    for i in instances)))
        from pprint import pprint as pp
        print("Current instances:")
        pp(valid_names)
        print("No running instances found for: Name '%s', key '%s'" %
              (fragment, USER_KEY_NAME))
        if args.skip_tensorboard:
            print("skipping tensorboard")
        return

    action = 'soft terminate' if args.soft else 'terminate'
    if args.yes:
        answer = 'y'
    else:
        answer = input("%d instances found, %s in %s? (y/N) " %
                       (len(instances_to_kill), action, region))
    if not answer:
        answer = "n"

    if answer.lower() == "y" or args.yes:
        instance_ids = [i.id for i in instances_to_kill]
        if args.delay:
            print(f"Sleeping for {args.delay} seconds")
            time.sleep(args.delay)
        if args.soft:
            response = ec2_client.stop_instances(InstanceIds=instance_ids)
            # bugfix: print() was given logging-style args ("%s", response),
            # which prints a tuple and never substitutes %s
            print("soft terminating, got response: %s" % (response, ))
        else:
            response = ec2_client.terminate_instances(
                InstanceIds=instance_ids)
            print("terminating, got response: %s" % (response, ))
    else:
        print("Didn't get y, doing nothing")
def launch_aws(backend, install_script):
    """Launch a Ray cluster (head task + GPU workers) plus a tensorboard
    instance on AWS, then start the distributed benchmark on the head node."""
    region = u.get_region()
    ami = custom_ami_dict[region]

    # one head/client task + gradient workers + parameter servers
    num_tasks = 1 + args.num_workers + args.num_ps
    run = backend.make_run(args.name,
                           install_script=install_script,
                           ami=ami,
                           availability_zone=args.zone)
    ray_job = run.make_job('worker',
                           num_tasks,
                           instance_type=args.gpu_instance_type)
    tb_job = run.make_job('tb', 1, instance_type=args.tb_instance_type)
    ray_job.wait_until_ready()
    tb_job.wait_until_ready()
    ray_job.run('source activate mxnet_p36')
    tb_job.run('source activate mxnet_p36')

    # task 0 is ray head node, also it is client node where main script runs
    head_task = ray_job.tasks[0]
    head_task.run('ray stop || echo "ray not started, ignoring"')
    # head gets 0 GPUs; huge --num-cpus so scheduling never blocks on the head
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 \
--num-cpus=10000 --num-workers=10" % (REDIS_PORT, ))

    # remaining tasks join the head's redis as single-GPU workers
    for task in ray_job.tasks[1:]:
        task.run('ray stop || echo "ray not started, ignoring"')
        task.run(
            "ray start --redis-address %s:%d --num-gpus=1 --num-cpus=1 --num-workers=0"
            % (head_task.ip, REDIS_PORT))

    head_task.upload(SCRIPT_NAME)
    #  head_task.upload('../util.py')
    head_task.run_async("python {script} \
--redis-address={redis_ip}:{redis_port} \
--num-workers={num_workers} \
--num-parameter-servers={num_ps} \
--dim={dim} \
--real-model \
--logdir={logdir}".format(script=SCRIPT_NAME,
                          redis_ip=head_task.ip,
                          redis_port=REDIS_PORT,
                          num_workers=args.num_workers,
                          logdir=run.logdir,
                          num_ps=args.num_ps,
                          dim=args.dim))

    print("Connect to head node:")
    print(head_task.connect_instructions)
    print("Other nodes:")
    for (i, task) in enumerate(ray_job.tasks[1:]):
        print(i, task.connect_instructions)

    # 6006 is assumed open in the security group -- TODO confirm
    tb_cmd = "tensorboard --logdir={logdir} --port=6006".format(
        logdir=run.logdir)
    tb_job.run(tb_cmd, sync=False)
    print("See tensorboard at http://%s:%s" % (tb_job.public_ip, 6006))
def launcher():
    """Launch a 2-task MPI test job (locally under tmux or on AWS) and start
    rank 0 / rank 1 workers pointed at rank 0's internal IP."""
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # NOTE(review): placement group is named after args.run, not args.name as
    # in sibling scripts -- confirm which attribute is intended
    if args.placement:
        placement_group = args.run
    else:
        placement_group = ''

    if args.run_local:
        backend = tmux_backend
        run = backend.make_run(args.name)
    else:
        create_resources_lib.create_resources()
        region = u.get_region()
        # hard-coded, region-specific AMI -- TODO confirm it matches args.zone
        ami = 'ami-e580c79d'
        backend = aws_backend
        run = backend.make_run(args.name, ami=ami, availability_zone=args.zone)

    job = run.make_job('mpi',
                       instance_type=args.instance,
                       num_tasks=2,
                       placement_group=placement_group)
    job.wait_until_ready()

    print(
        "Job ready for connection, to connect to most recent task, run the following:"
    )
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    print("Task internal IPs")
    for task in job.tasks:
        print(task.ip)

    job.upload(__file__)
    if not args.run_local:
        job.run('killall python || echo failed')  # kill previous run
        job.run('source activate pytorch_p36')

    # rank 0 is the MPI master; both ranks connect to its address
    job.tasks[0].run(
        'python launch_mpi_test.py --role=worker --rank=0 --size=2 --master-addr='
        + job.tasks[0].ip,
        sync=False)
    job.tasks[1].run(
        'python launch_mpi_test.py --role=worker --rank=1 --size=2 --master-addr='
        + job.tasks[0].ip,
        sync=False)
def main():
    """Launch two worker + two ps instances and measure network bandwidth
    between the two workers with iperf3 (worker 0 = server, worker 1 = client).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # pick install script + AMI mapping for the requested linux flavor
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    worker_job = run.make_job('worker',
                              instance_type=args.instance_type,
                              num_tasks=2)
    ps_job = run.make_job('ps', instance_type=args.instance_type, num_tasks=2)
    worker_job.wait_until_ready()
    ps_job.wait_until_ready()

    # worker 0 runs the iperf3 server in the background; worker 1 drives it
    # (10 parallel streams, 60 s, port 6006 which is open in the sec group)
    worker_job.tasks[0].run_async('sudo iperf3 -s -p 6006')
    worker_job.tasks[1].run('sudo iperf3 -c %s -P 10 -i 1 -t 60 -V -p 6006' %
                            (worker_job.tasks[0].ip, ))

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    # bugfix: was `job.connect_instructions` -- `job` is undefined in this
    # function (NameError); the intended target is worker_job
    print(worker_job.connect_instructions)
    print()
    print()
    print()
    print()
def mount_efs(self):
    """Mount the shared EFS filesystem at /efs on this task, world-writable."""
    aws_region = u.get_region()
    fs_id = u.get_efs_dict()[u.RESOURCE_NAME]
    mount_dns = "{efs_id}.efs.{region}.amazonaws.com".format(
        efs_id=fs_id, region=aws_region)
    for prep_cmd in ('sudo mkdir -p /efs', 'sudo chmod 777 /efs'):
        self.run(prep_cmd)
    # error on remount
    self.run(
        "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs"
        % (mount_dns, ),
        ignore_errors=True)
def create_resources():
    """Create (or reuse) the AWS resources this project needs: VPC + security
    group, keypair, an EFS filesystem, and one EFS mount target per subnet."""
    region = u.get_region()
    print("Creating %s resources in region %s" % (
        DEFAULT_NAME,
        region,
    ))

    vpc, security_group = network_setup()
    keypair = keypair_setup()  # saves private key locally to keypair_fn

    # create EFS
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    if not efs_id:
        print("Creating EFS " + DEFAULT_NAME)
        efs_id = u.create_efs(DEFAULT_NAME)
    else:
        print("Reusing EFS " + DEFAULT_NAME)

    efs_client = u.create_efs_client()

    # create mount target for each subnet in the VPC
    # added retries because efs is not immediately available
    MAX_FAILURES = 10
    RETRY_INTERVAL_SEC = 1
    for subnet in vpc.subnets.all():
        for retry_attempt in range(MAX_FAILURES):
            try:
                sys.stdout.write("Creating efs mount target for %s ... " %
                                 (subnet.availability_zone, ))
                sys.stdout.flush()
                response = efs_client.create_mount_target(
                    FileSystemId=efs_id,
                    SubnetId=subnet.id,
                    SecurityGroups=[security_group.id])
                if u.is_good_response(response):
                    print("success")
                    break
            except Exception as e:
                # a mount target from a previous run is fine -- treat as done
                if 'already exists' in str(
                        e):  # ignore "already exists" errors
                    print('already exists')
                    break

                # Takes couple of seconds for EFS to come online, with
                # errors like this:
                # Creating efs mount target for us-east-1f ... Failed with An error occurred (IncorrectFileSystemLifeCycleState) when calling the CreateMountTarget operation: None, retrying in 1 sec
                print("Got %s, retrying in %s sec" %
                      (str(e), RETRY_INTERVAL_SEC))
                time.sleep(RETRY_INTERVAL_SEC)
        else:
            # for/else: every retry attempt failed without a break
            print("Giving up.")
def main(): fragment = args.fragment # TODO: prevent CTRL+c/CTRL+d from killing session if not args.skip_tmux: print("Launching into TMUX session, use CTRL+b d to exit") region = u.get_region() client = u.create_ec2_client() ec2 = u.create_ec2_resource() response = client.describe_instances() username = os.environ.get("USERNAME", "ubuntu") print("Using username '%s'" % (username, )) instance_list = [] for instance in ec2.instances.all(): if instance.state['Name'] != 'running': continue name = u.get_name(instance.tags) if (fragment in name or fragment in str(instance.public_ip_address) or fragment in str(instance.id) or fragment in str(instance.private_ip_address)): instance_list.append((u.toseconds(instance.launch_time), instance)) from tzlocal import get_localzone # $ pip install tzlocal filtered_instance_list = u.get_instances(fragment) if not filtered_instance_list: print("no instance id contains fragment '%s'" % (fragment, )) return # connect to most recent instance print(filtered_instance_list) instance = filtered_instance_list[0] print("Connecting to ", u.get_name(instance), " launched ", instance.launch_time.astimezone(get_localzone())) cmd = '' keypair_fn = u.get_keypair_fn() cmd = make_cmd(keypair_fn, username, instance.public_ip_address) print(cmd) result = os.system(cmd) if username == 'ubuntu': username = '******' elif username == 'ec2-user': username = '******' if result != 0: print("ssh failed with code %d, trying username %s" % (result, username)) cmd = make_cmd(keypair_fn, username, instance.public_ip_address) os.system(cmd)
def _mount_efs(self):
    """Mount the shared EFS filesystem at /efs on this task."""
    self.log("Mounting EFS")
    mount_dns = "{efs_id}.efs.{region}.amazonaws.com".format(
        region=u.get_region(),
        efs_id=u.get_efs_dict()[u.get_resource_name()])
    self.run('sudo mkdir -p /efs')
    self.run('sudo chmod 777 /efs')
    mount_cmd = (
        "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs"
        % (mount_dns, ))
    # ignore error on remount
    self.run(mount_cmd, ignore_errors=True)
def main():
    """Reset initialization marker files on every running instance whose
    name/IP/id contains the optional command-line fragment, forcing their
    setup scripts to re-run on next boot/launch."""
    fragment = ''
    if len(sys.argv) > 1:
        fragment = sys.argv[1]

    def get_name(instance_response):
        """Extract the Name tag value from a raw describe-instances entry."""
        names = [
            entry['Value'] for entry in instance_response.get('Tags', [])
            if entry['Key'] == 'Name'
        ]
        if not names:
            names = ['']
        assert len(names) == 1
        return names[0]

    region = u.get_region()
    client = boto3.client('ec2', region_name=region)
    ec2 = boto3.resource('ec2', region_name=region)
    response = client.describe_instances()

    username = os.environ.get("EC2_USER", "ubuntu")
    print("Using username '%s'" % (username, ))
    instance_list = []
    for instance in ec2.instances.all():
        if instance.state['Name'] != 'running':
            continue
        name = u.get_name(instance.tags)
        # bugfix: wrap attributes in str() -- public_ip_address /
        # private_ip_address can be None, and `fragment in None` raises
        # TypeError (matches the guard used by the connect script)
        if (fragment in name or fragment in str(instance.public_ip_address)
                or fragment in str(instance.id)
                or fragment in str(instance.private_ip_address)):
            print("Uninitializing %s %s %s" %
                  (name, instance.public_ip_address,
                   instance.private_ip_address))
            key_file = u.get_keypair_fn(instance.key_name)
            ssh_client = u.SshClient(hostname=instance.public_ip_address,
                                     ssh_key=key_file,
                                     username=username)
            ssh_client.run('rm /tmp/is_initialized || echo "failed 1"')
            ssh_client.run('rm /tmp/nv_setup_complete || echo "failed 2"')
            ssh_client.run('rm *.sh')  # remove install scripts
def list_instances():
    """Print running/terminating instances (age in hours, name, type, IPs,
    id), oldest first, restricted to key names containing LIMIT_TO_KEY."""
    print("Region", u.get_region())
    ec2 = u.create_ec2_resource()
    timestamped = [(u.seconds_from_datetime(inst.launch_time), inst)
                   for inst in ec2.instances.all()]
    for launch_seconds, inst in sorted(timestamped, key=itemgetter(0)):
        if inst.state['Name'] not in ('running', 'terminating'):
            continue
        if LIMIT_TO_KEY not in inst.key_name:
            continue
        hours_ago = (time.time() - launch_seconds) / 3600
        hours_ago += 8  # adjust for time being in UTC
        print("%4s %20s %10s %20s %s %s" %
              (int(hours_ago), u.get_name(inst.tags), inst.instance_type,
               inst.public_ip_address, inst.private_ip_address, inst.id))
def _mount_efs(self):
    """Mount the shared EFS filesystem at /efs and force it world-writable,
    retrying the chmod until the permission change is visible."""
    self.log("Mounting EFS")
    region = u.get_region()
    efs_id = u.get_efs_dict()[u.get_resource_name()]
    dns = "{efs_id}.efs.{region}.amazonaws.com".format(**locals())
    self.run('sudo mkdir -p /efs')

    # ignore error on remount (efs already mounted)
    self.run(
        "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs"
        % (dns, ),
        ignore_errors=True)

    # make sure chmod is successful, hack to fix occasional permission errors
    self.run('sudo chmod 777 /efs')
    while 'drwxrwxrwx' not in self.run_and_capture_output('ls -ld /efs'):
        print(f"chmod 777 /efs didn't take, retrying in {TIMEOUT_SEC}")
        time.sleep(TIMEOUT_SEC)
        self.run('sudo chmod 777 /efs')
def make_job(self,
             role_name,
             num_tasks=1,
             skip_existing_job_validation=False,
             **kwargs):
    """Create (or reuse) an AWS job named <role_name>.<run_name>.

    skip_existing_job_validation: if True, doesn't check that existing job on
    server has same number of tasks as requested.

    Recognized kwargs (merged with run-level defaults): instance_type
    (required), ami / ami_name (exactly one), availability_zone (falls back
    to $ZONE), placement_group / use_placement_group, install_script,
    skip_efs_mount, linux_type, user_data, ebs, use_spot, monitoring.
    """
    #    u.maybe_create_resources()
    assert num_tasks >= 0

    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instance_type = kwargs['instance_type']
    instances = u.lookup_aws_instances(job_name, instance_type=instance_type)
    kwargs = u.merge_kwargs(kwargs, self.kwargs)
    ami = kwargs.get('ami', '')
    ami_name = kwargs.get('ami_name', '')
    availability_zone = kwargs.get('availability_zone', '')
    if not availability_zone:
        availability_zone = os.environ['ZONE']
    placement_group = kwargs.get('placement_group', '')

    # automatically generated placement_group_name
    use_placement_group = kwargs.get('use_placement_group', False)
    assert use_placement_group == False or placement_group == ''
    if use_placement_group:
        placement_group = self.placement_group_name

    install_script = kwargs.get('install_script', '')
    skip_efs_mount = kwargs.get('skip_efs_mount', False)
    linux_type = kwargs.get('linux_type', 'ubuntu')
    # TODO: use heuristics to tell linux type from AMI name
    user_data = kwargs.get('user_data', '')
    if user_data:
        assert user_data.startswith('#!/bin/bash')
    ebs = kwargs.get('ebs', '')
    use_spot = kwargs.get('use_spot', False)
    monitoring = kwargs.get('monitoring', True)

    # always install tmux on Amazon linux types
    # TODO: has no effect for some reason
    # https://console.aws.amazon.com/support/v1?region=us-west-2#/case/?displayId=5256445351&language=en
    if linux_type == 'amazon':
        user_data += 'sudo yum install tmux -y'

    # marker file lets tasks detect that user-data finished running
    if user_data:
        user_data += '\necho userdata_ok >> /tmp/is_initialized\n'
    #    print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if instances:
        if not skip_existing_job_validation:
            assert len(instances) == num_tasks, (
                "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                % (job_name, instances[0].state, len(instances), num_tasks))

        print("Found existing job " + job_name)
        starting_instances = False
        for i in instances:
            if i.state['Name'] == 'stopped':
                i.start()
                starting_instances = True

        # TODO: replace with proper wait loop
        # NOTE(review): this loop never exits -- it only informs the operator
        if starting_instances:
            while True:
                print("Waiting forever for instances to start")
                time.sleep(10)
        print(instances)
    else:
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))
        assert not (
            ami and ami_name
        ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
        assert ami or ami_name, "Must specify at least one of ami and ami_name"
        if ami_name:
            ami = u.lookup_ami_id(ami_name).id
        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
            availability_zone, region, ', '.join(
                subnet_dict.keys()), availability_zone[:-1])
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)

        self.log("Requesting %d %s" % (num_tasks, instance_type))

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # storage setup
        if ebs:
            args['BlockDeviceMappings'] = ebs

        # network setup
        # TODO: get rid of zone? Zone seems to be required for constructor
        # that allows to enable AssociatePublicIpAddress field
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg
        if monitoring:
            args['Monitoring'] = {'Enabled': True}
        args['UserData'] = user_data

        if use_spot:
            instances = u.create_spot_instances(args)
        else:
            try:
                instances = ec2.create_instances(**args)
            except Exception as e:
                print(f"Instance creation failed with ({e})")
                print("Account number: ", u.get_account_number())
                print("Region: ", u.get_region())
                sys.exit()

        assert instances
        assert len(instances) == num_tasks

        # TODO: make instances match their launch indices. This way
        # tasks can figure out which # they are
        for (task_num, instance) in enumerate(instances):
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    #          task_name = u.format_task_name(instance.ami_launch_index, role_name,
                    #                                         self.name)
                    task_name = u.format_task_name(task_num, job_name)
                    instance.create_tags(Tags=u.make_name(task_name))
                    break
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def server_job(name,
               num_tasks=1,
               instance_type=None,
               install_script='',
               placement_group='',
               ami='',
               availability_zone='',
               linux_type=DEFAULT_LINUX_TYPE):
    """Creates a job on AWS cluster with publicly facing ports.

    Reuse requires that the job launched previously under the same name had
    identical settings (number of tasks / instance type / placement group).
    """

    global SSH_KEY_PATH

    DEFAULT_NAME = u.RESOURCE_NAME
    security_group = u.get_security_group_dict()[DEFAULT_NAME]
    keypair = u.get_keypair_dict()[DEFAULT_NAME]

    # get availability zone -> subnet dictionary
    vpc = u.get_vpc_dict()[DEFAULT_NAME]
    subnet_dict = {}
    for subnet in vpc.subnets.all():
        zone = subnet.availability_zone
        assert zone not in subnet_dict, "More than one subnet in %s, why?" % (
            zone, )
        subnet_dict[zone] = subnet
    subnet = subnet_dict[availability_zone]

    global ROOT_INSTALL_SCRIPT
    if linux_type == 'ubuntu':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_UBUNTU
    elif linux_type == 'amazon':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_AMAZON
    else:
        # bugfix: message previously had a bare '%s' with no substitution, so
        # the offending value was never shown
        assert False, "Unknown linux type '%s', expected 'ubuntu' or 'amazon'." % (
            linux_type, )

    if instance_type is None:
        instance_type = 'c5.large'
    instances = lookup_aws_instances(name)

    # todo: get rid of this global variable?
    SSH_KEY_PATH = "%s/%s-%s.pem" % (
        os.environ["HOME"],
        DEFAULT_NAME,
        u.get_region(),
    )

    if instances:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number"
            " of tasks %d doesn't match requested %d, kill job manually." %
            (len(instances), num_tasks))
        print("Found existing job " + name)
    else:
        print("Launching new job %s into VPC %s" % (name, DEFAULT_NAME))

        ec2 = boto3.resource('ec2')
        if placement_group:
            _maybe_create_placement_group(placement_group)

        print("Requesting %d %s" % (num_tasks, instance_type))

        if not ami:
            ami = os.environ.get('AMI', '')
        assert ami, "No AMI specified, need AMI env-var or explicit parameter"

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg

        instances = ec2.create_instances(**args)

        # todo: use task index in name
        for instance in instances:
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    tag = ec2.create_tags(Resources=[instance.id],
                                          Tags=[{
                                              'Key': 'Name',
                                              'Value': name
                                          }])
                    break
                except Exception as e:
                    # bugfix: was self.log(...) -- `self` does not exist in
                    # this module-level function (NameError on the retry path)
                    print(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    assert len(instances) == num_tasks
    print('{} Instances created'.format(len(instances)))

    job = Job(name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type)
    return job
def launcher():
    """Launch a 'gpubox' instance bootstrapped via a bash user-data script
    (EFS dir, pytorch env, ray install), then start a nightly-TF worker on it.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    ami_dict = ami_dict_ubuntu
    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # cloud-init script executed as root on first boot; the test*.txt files
    # are breadcrumbs for debugging whether user-data actually ran
    user_data = """#!/bin/bash
sudo mkdir -p /efs
sudo chmod 777 /efs
echo 'Running user-data!'
echo 'test' > /home/ubuntu/test.txt
echo 'activating pytorch_p36'
source /home/ubuntu/anaconda3/bin/activate pytorch_p36
echo $PS1
echo $PS1 > /home/ubuntu/test2.txt
pip install ray
echo 'INSTALLED ray'
echo 'INSTALLED ray' > /home/ubuntu/test3.txt
"""

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script='',
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type,
                               user_data=user_data)
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')

    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    #  job.run('pip install tensorflow-gpu')
    #  job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    job.run(
        'pip install -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl'
    )
    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python %s --role=worker' % (os.path.basename(__file__)))
def _create_ec2_client():
    """Return a boto3 EC2 client bound to the current default region."""
    return boto3.client('ec2', region_name=u.get_region())
def launcher():
    """Launch a single 'worker' instance and run tf_numpy_benchmark.py on it
    with tcmalloc preloaded."""
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # pick install script + AMI mapping for the requested linux flavor
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('worker', instance_type=args.instance_type)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate tensorflow_p36')
    job.run('pip install cython')
    job.run('pip install ray')

    # below can fail on
    # E: Could not get lock /var/lib/dpkg/lock - open (11: Resource temporarily unavailable)
    job.run('sudo apt install htop')
    job.run('yes | sudo apt-get install google-perftools')
    # NOTE(review): the exported var only affects later commands if the
    # backend reuses one shell session across run() calls -- confirm
    job.run('export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"')
    job.upload(__file__)
    job.upload('tf_numpy_benchmark.py')
    job.run('killall python || echo failed')  # kill previous run
    job.run('python tf_numpy_benchmark.py')
def launch(backend, install_script='', init_cmd=''):
    """Launch a distributed TF cluster (workers + ps + tensorboard), either on
    AWS or locally, wire up TF_CONFIG / sparse cluster specs per task, and
    start tf_adder.py everywhere."""
    if args.placement:
        placement_group = args.name
    else:
        placement_group = ''

    if backend.__name__ == 'aws_backend':
        ami = ami_dict_ubuntu[u.get_region()]
        run = backend.make_run(args.name,
                               user_data=install_script,
                               ami=ami,
                               availability_zone=args.zone)
        worker_job = run.make_job('worker',
                                  num_tasks=args.workers,
                                  instance_type=args.instance,
                                  placement_group=placement_group)
        #    ps_job = run.make_job('ps', num_tasks=args.ps, instance_type=args.instance, placement_group=placement_group)
        tb_job = run.make_job('tb', instance_type='t2.large')
    else:  # local mode
        run = backend.make_run(args.name, install_script=install_script)
        worker_job = run.make_job('worker', args.workers)
        #    ps_job = run.make_job('ps', args.ps)
        tb_job = run.make_job('tb')

    for job in run.jobs:
        job.wait_until_ready()

    run.upload('tf_adder.py')
    run.upload('../util.py')

    def tf_env_setup(task, dense_cluster_spec, task_spec):
        """Helper method to initialize clusterspec for a task."""
        task_type = task_spec['type']
        task_id = task_spec['index']

        # full cluster spec (needed for estimator)
        dense_cluster_config = {
            'cluster': dense_cluster_spec,
            'task': task_spec
        }
        # NOTE(review): dumps the bare spec, not dense_cluster_config built
        # just above (which is unused) -- confirm which one TF_CONFIG needs
        TF_CONFIG = json.dumps(dense_cluster_spec)
        task.run("export TF_CONFIG='%s'" % (TF_CONFIG, ))

        # construct sparse cluster spec
        # every worker needs its own location
        sparse_cluster_spec = defaultdict(dict)
        host = dense_cluster_spec[task_type][task_id]
        sparse_cluster_spec[task_type][task_id] = host
        # gradient workers know about all ps workers
        if task_type == 'worker':
            sparse_cluster_spec['ps'] = dense_cluster_spec['ps']
        # ps workers know about all gradient workers
        if task_type == 'ps':
            sparse_cluster_spec['worker'] = dense_cluster_spec['worker']
            # the following spec is required for ps, why?
            sparse_cluster_spec['ps'] = dense_cluster_spec['ps']

        sparse_cluster_config = {
            'cluster': sparse_cluster_spec,
            'task': task_spec
        }
        task.log('sparse_cluster_config %s', sparse_cluster_config)

        # sparse cluster spec
        pickle_string = pickle.dumps(sparse_cluster_config)
        pickle_string_encoded = base64.b16encode(pickle_string)
        pickle_string_encoded = pickle_string_encoded.decode('ascii')
        task.run("export TF_PICKLE_BASE16=%s" % (pickle_string_encoded, ))

    worker_hosts = [
        "%s:%d" % (task.ip, task.port) for task in worker_job.tasks
    ]
    ps_hosts = ["%s:%d" % (task.ip, task.port) for task in ps_job.tasks]
    cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

    # Launch tensorflow tasks.
    run.run(init_cmd)

    tf_cmd = "python tf_adder.py --logdir={logdir} --profile={profile} --ps={ps}".format(
        logdir=run.logdir, profile=args.profile, ps=args.ps)

    # ps tasks go first because tensorboard doesn't support multiple processes
    # creating events in same directory locally (only shows latest created
    # event file)
    # NOTE(review): ps_job is used here but its make_job call above is
    # commented out -- this will NameError unless ps_job is defined elsewhere
    for task in ps_job.tasks:
        task_spec = {'type': 'ps', 'index': task.id}
        tf_env_setup(task, cluster_spec, task_spec)
        task.run(tf_cmd + ' --label=' + task.job.name + ':' + str(task.id),
                 sync=False)

    for task in worker_job.tasks:
        task_spec = {'type': 'worker', 'index': task.id}
        tf_env_setup(task, cluster_spec, task_spec)
        task.run(tf_cmd + ' --label=' + task.job.name + ':' + str(task.id),
                 sync=False)

    # todo: for local runs need to do task.port because multiple tb's
    # 6006 is hardwired because it's open through the security group
    tb_port = tb_job.public_port  #6006
    tb_job.run("tensorboard --logdir={logdir} --port={port}".format(
        logdir=run.logdir, port=tb_port),
               sync=False)
    print("*" * 80)
    print("See tensorboard at http://%s:%s" % (tb_job.public_ip, tb_port))
    print("*" * 80)
    print(" " * 80)
    print("Streaming log.txt of worker[0]")
    worker_job.tasks[0].stream_file('log.txt')
def make_job(self, role_name, num_tasks=1, **kwargs):
    """Create (or reuse) an AWS job named <role_name>.<run_name> with
    num_tasks instances; kwargs override the run-level defaults.

    Recognized kwargs: ami, instance_type, availability_zone (all required
    after merging), placement_group, install_script, skip_efs_mount,
    linux_type, user_data.
    """
    assert num_tasks >= 0

    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instances = u.lookup_aws_instances(job_name)
    kwargs = u.merge_kwargs(kwargs, self.kwargs)
    ami = kwargs['ami']
    instance_type = kwargs['instance_type']
    availability_zone = kwargs['availability_zone']
    placement_group = kwargs.get('placement_group', '')
    install_script = kwargs.get('install_script', '')
    skip_efs_mount = kwargs.get('skip_efs_mount', False)
    linux_type = kwargs.get('linux_type', 'ubuntu')
    user_data = kwargs.get('user_data', '')
    if user_data:
        # marker file lets tasks detect that user-data finished running
        user_data += '\necho userdata_ok >> /tmp/is_initialized\n'
    #    print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if instances:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
            % (len(instances), num_tasks))
        print("Found existing job " + job_name)
    else:
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))

        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
            availability_zone, region, ', '.join(subnet_dict.keys()))
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)

        self.log("Requesting %d %s" % (num_tasks, instance_type))

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg
        args['UserData'] = user_data

        instances = ec2.create_instances(**args)
        assert len(instances) == num_tasks

        # assign proper names to tasks
        for instance in instances:
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    task_name = u.format_task_name(
                        instance.ami_launch_index, role_name, self.name)
                    # TODO: use instance.create_tags instead like in create_resources.py
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=u.make_name(task_name))
                    break
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def launcher():
    """Provision a single GPU box on AWS and start the worker script on it."""
    # The backends live one directory up; make them importable.
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()

    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # Select install script and AMI table for the requested distro.
    distro_settings = {
        'ubuntu': (INSTALL_SCRIPT_UBUNTU, ami_dict_ubuntu),
        'amazon': (INSTALL_SCRIPT_AMAZON, ami_dict_amazon),
    }
    assert args.linux_type in distro_settings, "Unknown linux type " + args.linux_type
    install_script, ami_dict = distro_settings[args.linux_type]

    if args.ami:
        print("Warning, using provided AMI, make sure that --linux-type argument "
              "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    for _ in range(4):
        print()

    job.run('source activate mxnet_p36')
    # NOTE: as of Jan 2018 the official tensorflow-gpu release raised an
    # incompatible-numpy error, so a pinned nightly wheel was installed here
    # previously; installation is now handled by the AMI / install script.

    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python launch.py --role=worker')
def launch(backend, install_script='', init_cmd=''):
    """Bring up a Ray cluster via `backend` and run the ray_adder benchmark.

    Args:
      backend: tmux_backend (local) or aws_backend (EC2) module.
      install_script: setup script passed to the run (user-data on AWS).
      init_cmd: command executed on every task before starting Ray.
    """
    placement_group = args.name if args.placement else ''
    num_tasks = args.workers + args.ps
    run_local = backend.__name__ != 'aws_backend'

    if run_local:
        run = backend.make_run(args.name, install_script=install_script)
        job = run.make_job('worker', num_tasks)
    else:
        region = u.get_region()
        assert args.zone.startswith(
            region
        ), "Your specified zone is %s but your region (from AWS_DEFAULT_REGION) is %s, please specify zone correctly, such as --zone=%sa" % (
            args.zone, region, region)
        create_resources_lib.create_resources()
        ami = ami_dict_ubuntu[u.get_region()]
        run = backend.make_run(args.name,
                               user_data=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               skip_efs_mount=True)
        job = run.make_job('worker',
                           num_tasks=num_tasks,
                           instance_type=args.instance,
                           placement_group=placement_group)

    for job in run.jobs:
        job.wait_until_ready()

    head_task = job.tasks[0]  # worker 0 is also the head node
    head_task.upload('ray_adder.py')
    head_task.upload('../util.py')  # just in case?

    # todo: use task.port instead of DEFAULT_PORT
    run.run(init_cmd)
    run.run('ray stop || echo "ignoring error"')
    if args.omp_threads:
        run.run('export OMP_NUM_THREADS=' + str(args.omp_threads))

    # Start Ray on the head node. A local run puts every worker on the same
    # machine, so advertise extra GPUs there.
    gpu_flag = ' --num-gpus=10' if run_local else ' --num-gpus=1'
    head_cmd = "ray start --head --redis-port=%d --num-workers=0" % (
        DEFAULT_PORT, ) + gpu_flag
    head_task.run(head_cmd)

    # Leaf nodes join the head's redis server (AWS only).
    if not run_local:
        leaf_cmd = "ray start --redis-address %s:%d --num-gpus=1 --num-workers=0" % (
            head_task.ip, DEFAULT_PORT)
        for leaf in job.tasks[1:]:
            leaf.run(leaf_cmd)

    # Assemble the benchmark client command piecewise.
    pieces = [
        'python ray_adder.py --redis-address %s:%d --size-mb %d' % (
            head_task.ip, DEFAULT_PORT, args.size_mb),
        ' --iters %d --workers %d --ps %d' % (args.iters, args.workers,
                                              args.ps),
    ]
    if args.memcpy_threads:
        pieces.append(' --memcpy-threads %d' % (args.memcpy_threads, ))
    if not run_local:
        pieces.append(' --enforce-different-ips=1')
    client_cmd = ''.join(pieces)

    head_task.run('rm log.txt || echo nevermind')
    head_task.run(client_cmd, sync=False)

    log("Streaming log.txt of task[0]")
    job.tasks[0].stream_file('log.txt')
def launcher():
    """Launch the multi-machine benchmark: tmux locally, or EC2 when --zone is given."""
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')  # aws_backend.py is one level up
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    placement_group = args.name if args.placement else ''

    if not args.zone:
        # No zone requested: run everything inside local tmux sessions.
        backend = tmux_backend
        run = backend.make_run(args.name)
    else:
        region = u.get_region()
        print("Using region", region)
        assert args.zone.startswith(
            region
        ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
            args.zone, region)
        if args.ami:
            print("Warning, using provided AMI, make sure that --linux-type argument "
                  "is set correctly")
            ami = args.ami
        else:
            assert region in ami_dict, "Define proper AMI mapping for this region."
            ami = ami_dict[region]
        create_resources_lib.create_resources()
        region = u.get_region()
        backend = aws_backend
        run = backend.make_run(args.name,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)

    job = run.make_job('worker',
                       instance_type=args.instance_type,
                       num_tasks=args.num_machines,
                       placement_group=placement_group)
    job.wait_until_ready()

    print("Job ready for connection, to connect to most recent task, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    for _ in range(4):
        print()

    print("Task internal IPs")
    for task in job.tasks:
        print(task.ip)

    job.upload(__file__)
    if args.zone:
        job.run('killall python || echo failed')  # kill previous run
    job.run('source activate pytorch_p36')

    # Start one worker process per machine, all pointed at task 0 as master.
    script_name = os.path.basename(__file__)
    master_ip = job.tasks[0].ip
    for rank in range(args.num_machines):
        worker_cmd = 'python %s --role=worker --rank=%d --data-size-mb=%d --num-machines=%d --master-addr=%s' % (
            script_name, rank, args.data_size_mb, args.num_machines, master_ip)
        job.tasks[rank].run(worker_cmd, sync=False)
def _create_ec2_resource():
    """Return a boto3 EC2 resource bound to the current default region."""
    return boto3.resource('ec2', region_name=u.get_region())