Example #1
def launcher():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    install_script = ''

    ami = args.ami

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)

    job.wait_until_ready()

    job.run('source activate mxnet_p36')
    job.run('sudo apt install -y fio')
    job.run('volume=/dev/xvda1')
    job.run(
        'time sudo fio --filename=$volume --rw=read --bs=128k --iodepth=32 --ioengine=libaio --direct=1 --name=volume-initialize'
    )
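These launchers reference a module-level args namespace that the listing does not show. A minimal argparse sketch matching the flags used above (flag names come from the code; defaults are illustrative guesses):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--name', default='gpubox', help='run name')
parser.add_argument('--zone', default='us-east-1a',
                    help='availability zone; must lie in AWS_DEFAULT_REGION')
parser.add_argument('--ami', default='', help='optional AMI id override')
parser.add_argument('--instance', default='p2.xlarge', help='instance type')
parser.add_argument('--linux-type', default='ubuntu',
                    help="'ubuntu' or 'amazon'")
args = parser.parse_args()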
Example #2
def main():
    ec2 = u.create_ec2_resource()  # ec2 resource
    ec2_client = u.create_ec2_client()  # ec2 client
    instances = list(ec2.instances.all())  # todo: use filter?
    region = u.get_region()

    instances_to_kill = []
    for i in instances:
        name = u.get_name(i.tags)
        state = i.state['Name']
        if fragment not in name:
            continue
        if args.skip_tensorboard and '.tb.' in name:
            continue
        if args.skip_stopped and state == 'stopped':
            continue
        if args.limit_to_key and USER_KEY_NAME not in i.key_name:
            continue
        if state == 'terminated':
            continue
        instances_to_kill.append(i)
        print(u.get_name(i), i.instance_type, i.key_name,
              state if state == 'stopped' else '')

    # print extra info if couldn't find anything to kill
    if not instances_to_kill:
        valid_names = sorted(
            list(
                set("%s,%s" % (u.get_name(i), u.get_state(i))
                    for i in instances)))
        from pprint import pprint as pp
        print("Current instances:")
        pp(valid_names)
        print("No running instances found for: Name '%s', key '%s'" %
              (fragment, USER_KEY_NAME))
        if args.skip_tensorboard:
            print("skipping tensorboard")
        return

    action = 'soft terminate' if args.soft else 'terminate'
    if args.yes:
        answer = 'y'
    else:
        answer = input("%d instances found, %s in %s? (y/N) " %
                       (len(instances_to_kill), action, region))
    if not answer:
        answer = "n"
    if answer.lower() == "y" or args.yes:
        instance_ids = [i.id for i in instances_to_kill]
        if args.delay:
            print(f"Sleeping for {args.delay} seconds")
            time.sleep(args.delay)
        if args.soft:
            response = ec2_client.stop_instances(InstanceIds=instance_ids)
            print("soft terminating, got response: %s", response)
        else:
            response = ec2_client.terminate_instances(InstanceIds=instance_ids)
            print("terminating, got response: %s", response)
    else:
        print("Didn't get y, doing nothing")
Example #3
def launch_aws(backend, install_script):
    region = u.get_region()
    ami = custom_ami_dict[region]

    num_tasks = 1 + args.num_workers + args.num_ps
    run = backend.make_run(args.name,
                           install_script=install_script,
                           ami=ami,
                           availability_zone=args.zone)
    ray_job = run.make_job('worker',
                           num_tasks,
                           instance_type=args.gpu_instance_type)
    tb_job = run.make_job('tb', 1, instance_type=args.tb_instance_type)
    ray_job.wait_until_ready()
    tb_job.wait_until_ready()

    ray_job.run('source activate mxnet_p36')
    tb_job.run('source activate mxnet_p36')

    # task 0 is ray head node, also it is client node where main script runs
    head_task = ray_job.tasks[0]
    head_task.run('ray stop || echo "ray not started, ignoring"')
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 \
                           --num-cpus=10000 --num-workers=10" % (REDIS_PORT, ))

    for task in ray_job.tasks[1:]:
        task.run('ray stop || echo "ray not started, ignoring"')
        task.run(
            "ray start --redis-address %s:%d --num-gpus=1 --num-cpus=1 --num-workers=0"
            % (head_task.ip, REDIS_PORT))

    head_task.upload(SCRIPT_NAME)
    #  head_task.upload('../util.py')
    head_task.run_async("python {script} \
                    --redis-address={redis_ip}:{redis_port} \
                    --num-workers={num_workers} \
                    --num-parameter-servers={num_ps} \
                    --dim={dim} \
                    --real-model \
                    --logdir={logdir}".format(script=SCRIPT_NAME,
                                              redis_ip=head_task.ip,
                                              redis_port=REDIS_PORT,
                                              num_workers=args.num_workers,
                                              logdir=run.logdir,
                                              num_ps=args.num_ps,
                                              dim=args.dim))
    print("Connect to head node:")
    print(head_task.connect_instructions)

    print("Other nodes:")
    for (i, task) in enumerate(ray_job.tasks[1:]):
        print(i, task.connect_instructions)

    tb_cmd = "tensorboard --logdir={logdir} --port=6006".format(
        logdir=run.logdir)
    tb_job.run(tb_cmd, sync=False)
    print("See tensorboard at http://%s:%s" % (tb_job.public_ip, 6006))
Example #4
def launcher():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    if args.placement:
        placement_group = args.run
    else:
        placement_group = ''

    if args.run_local:
        backend = tmux_backend
        run = backend.make_run(args.name)
    else:
        create_resources_lib.create_resources()
        region = u.get_region()
        ami = 'ami-e580c79d'
        backend = aws_backend
        run = backend.make_run(args.name, ami=ami, availability_zone=args.zone)
    job = run.make_job('mpi',
                       instance_type=args.instance,
                       num_tasks=2,
                       placement_group=placement_group)

    job.wait_until_ready()

    print(
        "Job ready for connection, to connect to most recent task, run the following:"
    )
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    print("Task internal IPs")
    for task in job.tasks:
        print(task.ip)

    job.upload(__file__)
    if not args.run_local:
        job.run('killall python || echo failed')  # kill previous run
        job.run('source activate pytorch_p36')

    job.tasks[0].run(
        'python launch_mpi_test.py --role=worker --rank=0 --size=2 --master-addr='
        + job.tasks[0].ip,
        sync=False)
    job.tasks[1].run(
        'python launch_mpi_test.py --role=worker --rank=1 --size=2 --master-addr='
        + job.tasks[0].ip,
        sync=False)
Example #5
def main():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    worker_job = run.make_job('worker',
                              instance_type=args.instance_type,
                              num_tasks=2)
    ps_job = run.make_job('ps', instance_type=args.instance_type, num_tasks=2)
    worker_job.wait_until_ready()
    ps_job.wait_until_ready()

    worker_job.tasks[0].run_async('sudo iperf3 -s -p 6006')
    worker_job.tasks[1].run('sudo iperf3 -c %s -P 10 -i 1 -t 60 -V -p 6006' %
                            (worker_job.tasks[0].ip, ))
    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(worker_job.connect_instructions)
    print()
    print()
    print()
    print()
Example #6
    def mount_efs(self):
        region = u.get_region()
        efs_id = u.get_efs_dict()[u.RESOURCE_NAME]
        dns = "{efs_id}.efs.{region}.amazonaws.com".format(**locals())
        self.run('sudo mkdir -p /efs')
        self.run('sudo chmod 777 /efs')
        self.run(
            "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs"
            % (dns, ),
            ignore_errors=True)  # error on remount
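u.get_efs_dict is not shown; a sketch under the assumption that it maps EFS names to filesystem ids via describe_file_systems (which returns a Name field for named filesystems):

def get_efs_dict():
    # map EFS Name -> FileSystemId
    efs_client = u.create_efs_client()
    result = {}
    for fs in efs_client.describe_file_systems()['FileSystems']:
        if fs.get('Name'):
            result[fs['Name']] = fs['FileSystemId']
    return result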
Example #7
def create_resources():

    region = u.get_region()
    print("Creating %s resources in region %s" % (
        DEFAULT_NAME,
        region,
    ))

    vpc, security_group = network_setup()
    keypair = keypair_setup()  # saves private key locally to keypair_fn

    # create EFS
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    if not efs_id:
        print("Creating EFS " + DEFAULT_NAME)
        efs_id = u.create_efs(DEFAULT_NAME)
    else:
        print("Reusing EFS " + DEFAULT_NAME)

    efs_client = u.create_efs_client()

    # create mount target for each subnet in the VPC

    # added retries because efs is not immediately available
    MAX_FAILURES = 10
    RETRY_INTERVAL_SEC = 1
    for subnet in vpc.subnets.all():
        for retry_attempt in range(MAX_FAILURES):
            try:
                sys.stdout.write("Creating efs mount target for %s ... " %
                                 (subnet.availability_zone, ))
                sys.stdout.flush()
                response = efs_client.create_mount_target(
                    FileSystemId=efs_id,
                    SubnetId=subnet.id,
                    SecurityGroups=[security_group.id])
                if u.is_good_response(response):
                    print("success")
                    break
            except Exception as e:
                if 'already exists' in str(
                        e):  # ignore "already exists" errors
                    print('already exists')
                    break

                # Takes couple of seconds for EFS to come online, with
                # errors like this:
                # Creating efs mount target for us-east-1f ... Failed with An error occurred (IncorrectFileSystemLifeCycleState) when calling the CreateMountTarget operation: None, retrying in 1 sec

                print("Got %s, retrying in %s sec" %
                      (str(e), RETRY_INTERVAL_SEC))
                time.sleep(RETRY_INTERVAL_SEC)
        else:
            print("Giving up.")
Example #8
def main():
    fragment = args.fragment

    # TODO: prevent CTRL+c/CTRL+d from killing session
    if not args.skip_tmux:
        print("Launching into TMUX session, use CTRL+b d to exit")

    region = u.get_region()
    client = u.create_ec2_client()
    ec2 = u.create_ec2_resource()
    response = client.describe_instances()

    username = os.environ.get("USERNAME", "ubuntu")
    print("Using username '%s'" % (username, ))

    # note: this manual scan is superseded by u.get_instances(fragment) below
    instance_list = []
    for instance in ec2.instances.all():
        if instance.state['Name'] != 'running':
            continue

        name = u.get_name(instance.tags)
        if (fragment in name or fragment in str(instance.public_ip_address)
                or fragment in str(instance.id)
                or fragment in str(instance.private_ip_address)):
            instance_list.append((u.toseconds(instance.launch_time), instance))

    from tzlocal import get_localzone  # $ pip install tzlocal

    filtered_instance_list = u.get_instances(fragment)
    if not filtered_instance_list:
        print("no instance id contains fragment '%s'" % (fragment, ))
        return

    # connect to most recent instance
    print(filtered_instance_list)
    instance = filtered_instance_list[0]
    print("Connecting to ", u.get_name(instance), " launched ",
          instance.launch_time.astimezone(get_localzone()))
    cmd = ''
    keypair_fn = u.get_keypair_fn()
    cmd = make_cmd(keypair_fn, username, instance.public_ip_address)

    print(cmd)
    result = os.system(cmd)
    if username == 'ubuntu':
        username = 'ec2-user'  # assumed fallback; the original literal was masked
    elif username == 'ec2-user':
        username = 'ubuntu'  # assumed fallback; the original literal was masked

    if result != 0:
        print("ssh failed with code %d, trying username %s" %
              (result, username))
        cmd = make_cmd(keypair_fn, username, instance.public_ip_address)
        os.system(cmd)
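make_cmd is not shown in the listing. A plausible sketch matching how it is called above, plain ssh with the downloaded keypair (an assumption, not the script's actual helper):

def make_cmd(keypair_fn, username, public_ip):
    # skip the host-key prompt, since fresh instances always present new keys
    return ("ssh -i %s -o StrictHostKeyChecking=no %s@%s" %
            (keypair_fn, username, public_ip))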
Example #9
    def _mount_efs(self):
        self.log("Mounting EFS")
        region = u.get_region()
        efs_id = u.get_efs_dict()[u.get_resource_name()]
        dns = "{efs_id}.efs.{region}.amazonaws.com".format(**locals())
        self.run('sudo mkdir -p /efs')
        self.run('sudo chmod 777 /efs')
        # ignore error on remount
        self.run(
            "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs"
            % (dns, ),
            ignore_errors=True)
Example #10
def main():
    fragment = ''
    if len(sys.argv) > 1:
        fragment = sys.argv[1]

    # note: this local helper is unused; u.get_name is called below instead
    def get_name(instance_response):
        names = [
            entry['Value'] for entry in instance_response.get('Tags', [])
            if entry['Key'] == 'Name'
        ]
        if not names:
            names = ['']
        assert len(names) == 1
        return names[0]

    region = u.get_region()
    client = boto3.client('ec2', region_name=region)
    ec2 = boto3.resource('ec2', region_name=region)
    response = client.describe_instances()

    username = os.environ.get("EC2_USER", "ubuntu")
    print("Using username '%s'" % (username, ))

    instance_list = []
    for instance in ec2.instances.all():
        if instance.state['Name'] != 'running':
            continue

        name = u.get_name(instance.tags)
        if (fragment in name or fragment in str(instance.public_ip_address)
                or fragment in str(instance.id)
                or fragment in str(instance.private_ip_address)):

            print("Uninitializing %s %s %s" %
                  (name, instance.public_ip_address,
                   instance.private_ip_address))

            key_file = u.get_keypair_fn(instance.key_name)
            ssh_client = u.SshClient(hostname=instance.public_ip_address,
                                     ssh_key=key_file,
                                     username=username)
            ssh_client.run('rm /tmp/is_initialized || echo "failed 1"')
            ssh_client.run('rm /tmp/nv_setup_complete || echo "failed 2"')
            ssh_client.run('rm *.sh')  # remove install scripts
Example #11
def list_instances():
    print("Region", u.get_region())
    ec2 = u.create_ec2_resource()
    instances = [(u.seconds_from_datetime(i.launch_time), i)
                 for i in ec2.instances.all()]
    sorted_instances = sorted(instances, key=itemgetter(0))

    for (seconds, instance) in sorted_instances:
        hours_ago = (time.time() - seconds) / 3600
        hours_ago += 8  # adjust for time being in UTC
        if instance.state['Name'] not in ('running', 'terminating'):
            continue
        if LIMIT_TO_KEY not in instance.key_name:
            continue

        print("%4s %20s %10s %20s %s %s" %
              (int(hours_ago), u.get_name(instance.tags),
               instance.instance_type, instance.public_ip_address,
               instance.private_ip_address, instance.id))
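u.seconds_from_datetime converts boto3's timezone-aware launch_time to Unix seconds for sorting. One way to write it (a sketch; the +8-hour fudge above hints that the original converted via local time instead):

import calendar

def seconds_from_datetime(dt):
    # launch_time is timezone-aware; utctimetuple avoids local-time skew
    return calendar.timegm(dt.utctimetuple())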
Example #12
    def _mount_efs(self):
        self.log("Mounting EFS")
        region = u.get_region()
        efs_id = u.get_efs_dict()[u.get_resource_name()]
        dns = "{efs_id}.efs.{region}.amazonaws.com".format(**locals())
        self.run('sudo mkdir -p /efs')

        # ignore error on remount (efs already mounted)
        self.run(
            "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs"
            % (dns, ),
            ignore_errors=True)

        # make sure chmod is successful, hack to fix occasional permission errors
        self.run('sudo chmod 777 /efs')
        while 'drwxrwxrwx' not in self.run_and_capture_output('ls -ld /efs'):
            print(f"chmod 777 /efs didn't take, retrying in {TIMEOUT_SEC}")
            time.sleep(TIMEOUT_SEC)
            self.run('sudo chmod 777 /efs')
Example #13
    def make_job(self,
                 role_name,
                 num_tasks=1,
                 skip_existing_job_validation=False,
                 **kwargs):
        """skip_existing_job_validation: if True, doesn't check that existing job on server has same number of tasks as requested."""

        #    u.maybe_create_resources()

        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        instance_type = kwargs['instance_type']
        instances = u.lookup_aws_instances(job_name,
                                           instance_type=instance_type)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs.get('ami', '')
        ami_name = kwargs.get('ami_name', '')
        availability_zone = kwargs.get('availability_zone', '')
        if not availability_zone:
            availability_zone = os.environ['ZONE']
        placement_group = kwargs.get('placement_group', '')

        # automatically generated placement_group_name
        use_placement_group = kwargs.get('use_placement_group', False)
        assert not (use_placement_group and placement_group)
        if use_placement_group:
            placement_group = self.placement_group_name

        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        # TODO: use heuristics to tell linux type from AMI name
        user_data = kwargs.get('user_data', '')

        if user_data:
            assert user_data.startswith('#!/bin/bash')

        ebs = kwargs.get('ebs', '')
        use_spot = kwargs.get('use_spot', False)
        monitoring = kwargs.get('monitoring', True)

        # always install tmux on Amazon linux types
        # TODO: has no effect for some reason
        # https://console.aws.amazon.com/support/v1?region=us-west-2#/case/?displayId=5256445351&language=en
        if linux_type == 'amazon':
            user_data += '\nsudo yum install tmux -y\n'

        if user_data:
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        #    print("Using user_data", user_data)

        # TODO: also make sure instance type is the same
        if instances:
            if not skip_existing_job_validation:
                assert len(instances) == num_tasks, (
                    "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                    %
                    (job_name, instances[0].state, len(instances), num_tasks))

            print("Found existing job " + job_name)
            starting_instances = False
            for i in instances:
                if i.state['Name'] == 'stopped':
                    i.start()
                    starting_instances = True

            # TODO: replace with proper wait loop
            if starting_instances:
                while True:
                    print("Waiting forever for instances to start")
                    time.sleep(10)

            print(instances)
        else:
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            assert not (
                ami and ami_name
            ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
            assert ami or ami_name, "Must specify at least one of ami and ami_name"
            if ami_name:
                ami = u.lookup_ami_id(ami_name).id
            security_group = u.get_security_group_dict()[u.get_resource_name()]

            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
                availability_zone, region, ', '.join(
                    subnet_dict.keys()), availability_zone[:-1])
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # storage setup
            if ebs: args['BlockDeviceMappings'] = ebs
            # network setup
            # TODO: get rid of zone? Zone seems to be required for constructor
            # that allows to enable AssociatePublicIpAddress field
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}
            if placement_group: placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg

            if monitoring: args['Monitoring'] = {'Enabled': True}
            args['UserData'] = user_data

            if use_spot: instances = u.create_spot_instances(args)
            else:
                try:
                    instances = ec2.create_instances(**args)
                except Exception as e:
                    print(f"Instance creation failed with ({e})")
                    print("Account number: ", u.get_account_number())
                    print("Region: ", u.get_region())
                    sys.exit()

            assert instances
            assert len(instances) == num_tasks

            # TODO: make instances match their launch indices. This way
            # tasks can figure out which # they are
            for (task_num, instance) in enumerate(instances):
                while True:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        # task_name = u.format_task_name(instance.ami_launch_index, role_name,
                        #                                self.name)
                        task_name = u.format_task_name(task_num, job_name)
                        instance.create_tags(Tags=u.make_name(task_name))

                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job
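u.merge_kwargs combines per-call kwargs with the run-level defaults; a sketch assuming call-site values win:

def merge_kwargs(kwargs, default_kwargs):
    # per-call kwargs take precedence over run-level defaults
    merged = dict(default_kwargs)
    merged.update(kwargs)
    return merged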
Example #14
def server_job(name,
               num_tasks=1,
               instance_type=None,
               install_script='',
               placement_group='',
               ami='',
               availability_zone='',
               linux_type=DEFAULT_LINUX_TYPE):
    """Creates a job on AWS cluster with publicly facing ports.

  Reuse requires that a job previously launched under the same name has
  identical settings (number of tasks/instance type/placement group).
  """

    global SSH_KEY_PATH

    DEFAULT_NAME = u.RESOURCE_NAME
    security_group = u.get_security_group_dict()[DEFAULT_NAME]
    keypair = u.get_keypair_dict()[DEFAULT_NAME]
    # get availability zone -> subnet dictionary
    vpc = u.get_vpc_dict()[DEFAULT_NAME]
    subnet_dict = {}
    for subnet in vpc.subnets.all():
        zone = subnet.availability_zone
        assert zone not in subnet_dict, "More than one subnet in %s, why?" % (
            zone, )
        subnet_dict[zone] = subnet
    subnet = subnet_dict[availability_zone]

    global ROOT_INSTALL_SCRIPT
    if linux_type == 'ubuntu':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_UBUNTU
    elif linux_type == 'amazon':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_AMAZON
    else:
        assert False, "Unknown linux type '%s', expected 'ubuntu' or 'amazon'."

    if instance_type is None:
        instance_type = 'c5.large'
    instances = lookup_aws_instances(name)

    # todo: get rid of this global variable?
    SSH_KEY_PATH = "%s/%s-%s.pem" % (
        os.environ["HOME"],
        DEFAULT_NAME,
        u.get_region(),
    )

    if instances:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number"
            " of tasks %d doesn't match requested %d, kill job manually." %
            (len(instances), num_tasks))
        print("Found existing job " + name)
    else:
        print("Launching new job %s into VPC %s" % (name, DEFAULT_NAME))

        ec2 = boto3.resource('ec2')
        if placement_group:
            _maybe_create_placement_group(placement_group)

        print("Requesting %d %s" % (num_tasks, instance_type))

        if not ami:
            ami = os.environ.get('AMI', '')

        assert ami, "No AMI specified, need AMI env-var or explicit parameter"

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group: placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg

        instances = ec2.create_instances(**args)

        # todo: use task index in name
        for instance in instances:
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    tag = ec2.create_tags(Resources=[instance.id],
                                          Tags=[{
                                              'Key': 'Name',
                                              'Value': name
                                          }])
                    break
                except Exception as e:
                    # server_job is a module-level function, so no self.log here
                    print("create_tags failed with %s, retrying in %d seconds" %
                          (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

        assert len(instances) == num_tasks
        print('{} Instances created'.format(len(instances)))

    job = Job(name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type)
    return job
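lookup_aws_instances scans for live instances whose Name tag matches the job; a minimal sketch of the assumed behavior:

def lookup_aws_instances(job_name):
    # return non-terminated instances tagged with this job name
    ec2 = u.create_ec2_resource()
    return [i for i in ec2.instances.all()
            if i.state['Name'] != 'terminated'
            and u.get_name(i.tags) == job_name]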
Example #15
def launcher():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    ami_dict = ami_dict_ubuntu

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    user_data = """#!/bin/bash
sudo mkdir -p /efs
sudo chmod 777 /efs
echo 'Running user-data!'
echo 'test' > /home/ubuntu/test.txt
echo 'activating pytorch_p36'
source /home/ubuntu/anaconda3/bin/activate pytorch_p36
echo $PS1
echo $PS1 > /home/ubuntu/test2.txt
pip install ray
echo 'INSTALLED ray'
echo 'INSTALLED ray' > /home/ubuntu/test3.txt
"""

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script='',
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type,
                               user_data=user_data)

    job = run.make_job('gpubox', instance_type=args.instance)

    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')
    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    # job.run('pip install tensorflow-gpu')
    #  job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    job.run(
        'pip install -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl'
    )

    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python %s --role=worker' % (os.path.basename(__file__)))
Example #16
def _create_ec2_client():
    REGION = u.get_region()
    return boto3.client('ec2', region_name=REGION)
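The assertion messages throughout these examples state that the default region comes from the AWS_DEFAULT_REGION environment variable, so u.get_region is presumably a thin wrapper around it; a minimal sketch:

import os

def get_region():
    region = os.environ.get('AWS_DEFAULT_REGION', '')
    assert region, "Set the AWS_DEFAULT_REGION environment variable"
    return region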
Example #17
def launcher():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('worker', instance_type=args.instance_type)

    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate tensorflow_p36')
    job.run('pip install cython')
    job.run('pip install ray')
    # below can fail on
    # E: Could not get lock /var/lib/dpkg/lock - open (11: Resource temporarily unavailable)
    job.run('sudo apt install -y htop')

    job.run('yes | sudo apt-get install google-perftools')
    job.run('export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"')

    job.upload(__file__)
    job.upload('tf_numpy_benchmark.py')
    job.run('killall python || echo failed')  # kill previous run
    job.run('python tf_numpy_benchmark.py')
Example #18
def launch(backend, install_script='', init_cmd=''):
    if args.placement:
        placement_group = args.name
    else:
        placement_group = ''

    if backend.__name__ == 'aws_backend':
        ami = ami_dict_ubuntu[u.get_region()]
        run = backend.make_run(args.name,
                               user_data=install_script,
                               ami=ami,
                               availability_zone=args.zone)
        worker_job = run.make_job('worker',
                                  num_tasks=args.workers,
                                  instance_type=args.instance,
                                  placement_group=placement_group)  #
        ps_job = run.make_job('ps',
                              num_tasks=args.ps,
                              instance_type=args.instance,
                              placement_group=placement_group)
        tb_job = run.make_job('tb', instance_type='t2.large')
    else:  # local mode
        run = backend.make_run(args.name, install_script=install_script)
        worker_job = run.make_job('worker', args.workers)  #
        ps_job = run.make_job('ps', args.ps)
        tb_job = run.make_job('tb')

    for job in run.jobs:
        job.wait_until_ready()

    run.upload('tf_adder.py')
    run.upload('../util.py')

    def tf_env_setup(task, dense_cluster_spec, task_spec):
        """Helper method to initialize clusterspec for a task."""

        task_type = task_spec['type']
        task_id = task_spec['index']

        # full cluster spec (needed for estimator)
        dense_cluster_config = {
            'cluster': dense_cluster_spec,
            'task': task_spec
        }
        TF_CONFIG = json.dumps(dense_cluster_config)
        task.run("export TF_CONFIG='%s'" % (TF_CONFIG, ))

        # construct sparse cluster spec
        # every worker needs its own location
        sparse_cluster_spec = defaultdict(dict)
        host = dense_cluster_spec[task_type][task_id]
        sparse_cluster_spec[task_type][task_id] = host

        # gradient workers know about all ps workers
        if task_type == 'worker':
            sparse_cluster_spec['ps'] = dense_cluster_spec['ps']

        # ps workers know about all gradient workers
        if task_type == 'ps':
            sparse_cluster_spec['worker'] = dense_cluster_spec['worker']
            # the following spec is required for ps, why?
            sparse_cluster_spec['ps'] = dense_cluster_spec['ps']

        sparse_cluster_config = {
            'cluster': sparse_cluster_spec,
            'task': task_spec
        }
        task.log('sparse_cluster_config %s', sparse_cluster_config)

        # sparse cluster spec
        pickle_string = pickle.dumps(sparse_cluster_config)
        pickle_string_encoded = base64.b16encode(pickle_string)
        pickle_string_encoded = pickle_string_encoded.decode('ascii')
        task.run("export TF_PICKLE_BASE16=%s" % (pickle_string_encoded, ))

    worker_hosts = [
        "%s:%d" % (task.ip, task.port) for task in worker_job.tasks
    ]
    ps_hosts = ["%s:%d" % (task.ip, task.port) for task in ps_job.tasks]
    cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

    # Launch tensorflow tasks.
    run.run(init_cmd)
    tf_cmd = "python tf_adder.py --logdir={logdir} --profile={profile} --ps={ps}".format(
        logdir=run.logdir, profile=args.profile, ps=args.ps)

    # ps tasks go first because tensorboard doesn't support multiple processes
    # creating events in same directory locally (only shows latest created
    # event file)
    for task in ps_job.tasks:
        task_spec = {'type': 'ps', 'index': task.id}
        tf_env_setup(task, cluster_spec, task_spec)
        task.run(tf_cmd + ' --label=' + task.job.name + ':' + str(task.id),
                 sync=False)

    for task in worker_job.tasks:
        task_spec = {'type': 'worker', 'index': task.id}
        tf_env_setup(task, cluster_spec, task_spec)
        task.run(tf_cmd + ' --label=' + task.job.name + ':' + str(task.id),
                 sync=False)

    # todo: for local runs need to do task.port because multiple tb's
    # 6006 is hardwired because it's open through the security group
    tb_port = tb_job.public_port  #6006
    tb_job.run("tensorboard --logdir={logdir} --port={port}".format(
        logdir=run.logdir, port=tb_port),
               sync=False)
    print("*" * 80)
    print("See tensorboard at http://%s:%s" % (tb_job.public_ip, tb_port))
    print("*" * 80)
    print(" " * 80)

    print("Streaming log.txt of worker[0]")
    worker_job.tasks[0].stream_file('log.txt')
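On the worker side, the exported TF_PICKLE_BASE16 variable would be decoded back into the sparse cluster config. A sketch of that consumer, mirroring the encoding in tf_env_setup above:

import base64
import os
import pickle

def read_sparse_cluster_config():
    # inverse of the base16-encoded pickle exported by tf_env_setup
    encoded = os.environ['TF_PICKLE_BASE16']
    return pickle.loads(base64.b16decode(encoded))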
Example #19
    def make_job(self, role_name, num_tasks=1, **kwargs):
        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        instances = u.lookup_aws_instances(job_name)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs['ami']
        instance_type = kwargs['instance_type']
        availability_zone = kwargs['availability_zone']
        placement_group = kwargs.get('placement_group', '')
        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        user_data = kwargs.get('user_data', '')

        if user_data:
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        #    print("Using user_data", user_data)

        # TODO: also make sure instance type is the same
        if instances:
            assert len(instances) == num_tasks, (
                "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
                % (len(instances), num_tasks))
            print("Found existing job " + job_name)
        else:
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            security_group = u.get_security_group_dict()[u.get_resource_name()]
            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
                availability_zone, region, ', '.join(subnet_dict.keys()))
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # network setup
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}

            if placement_group: placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg
            args['UserData'] = user_data

            instances = ec2.create_instances(**args)
            assert len(instances) == num_tasks

            # assign proper names to tasks
            for instance in instances:
                while True:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        task_name = u.format_task_name(
                            instance.ami_launch_index, role_name, self.name)
                        # TODO: use instance.create_tags instead like in create_resources.py
                        ec2.create_tags(Resources=[instance.id],
                                        Tags=u.make_name(task_name))
                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job
Example #20
def launcher():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)

    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')
    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    # job.run('pip install tensorflow-gpu')
    #  job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    #  job.run('pip install --default-timeout=100 -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl')

    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python launch.py --role=worker')
Example #21
def launch(backend, install_script='', init_cmd=''):
    if args.placement:
        placement_group = args.name
    else:
        placement_group = ''

    num_tasks = args.workers + args.ps
    run_local = backend.__name__ != 'aws_backend'

    if run_local:
        run = backend.make_run(args.name, install_script=install_script)
        job = run.make_job('worker', num_tasks)
    else:
        region = u.get_region()
        assert args.zone.startswith(
            region
        ), "Your specified zone is %s but your region (from AWS_DEFAULT_REGION) is %s, please specify zone correctly, such as --zone=%sa" % (
            args.zone, region, region)
        create_resources_lib.create_resources()
        ami = ami_dict_ubuntu[u.get_region()]
        run = backend.make_run(args.name,
                               user_data=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               skip_efs_mount=True)
        job = run.make_job('worker',
                           num_tasks=num_tasks,
                           instance_type=args.instance,
                           placement_group=placement_group)

    for job in run.jobs:
        job.wait_until_ready()

    head_task = job.tasks[0]  # worker 0 is also the head node
    head_task.upload('ray_adder.py')
    head_task.upload('../util.py')  # just in case?

    # todo: use task.port instead of DEFAULT_PORT
    run.run(init_cmd)
    run.run('ray stop || echo "ignoring error"')
    if args.omp_threads:
        run.run('export OMP_NUM_THREADS=' + str(args.omp_threads))

    # Ray start for head node. When running locally, specify more gpus since
    # all workers go on same machine
    ray_cmd = "ray start --head --redis-port=%d --num-workers=0" % (
        DEFAULT_PORT, )
    if run_local:
        ray_cmd += ' --num-gpus=10'
    else:
        ray_cmd += ' --num-gpus=1'

    head_task.run(ray_cmd)

    # Ray start command for leaf nodes
    if not run_local:
        ray_cmd = "ray start --redis-address %s:%d --num-gpus=1 --num-workers=0" % (
            head_task.ip, DEFAULT_PORT)
        for task in job.tasks[1:]:
            task.run(ray_cmd)

    client_cmd = 'python ray_adder.py --redis-address %s:%d --size-mb %d' % (
        head_task.ip, DEFAULT_PORT, args.size_mb)
    client_cmd += ' --iters %d --workers %d --ps %d' % (args.iters,
                                                        args.workers, args.ps)
    if args.memcpy_threads:
        client_cmd += ' --memcpy-threads %d' % (args.memcpy_threads, )

    if not run_local:
        client_cmd += ' --enforce-different-ips=1'
    head_task.run('rm log.txt || echo nevermind')
    head_task.run(client_cmd, sync=False)

    log("Streaming log.txt of task[0]")
    job.tasks[0].stream_file('log.txt')
Example #22
def launcher():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')  # aws_backend.py is one level up
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    if args.placement:
        placement_group = args.name
    else:
        placement_group = ''

    if not args.zone:
        backend = tmux_backend
        run = backend.make_run(args.name)
    else:
        region = u.get_region()
        print("Using region", region)
        assert args.zone.startswith(
            region
        ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
            args.zone, region)

        if args.ami:
            print(
                "Warning, using provided AMI, make sure that --linux-type argument "
                "is set correctly")
            ami = args.ami
        else:
            assert region in ami_dict, "Define proper AMI mapping for this region."
            ami = ami_dict[region]

        create_resources_lib.create_resources()
        region = u.get_region()
        backend = aws_backend
        run = backend.make_run(args.name,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)

    job = run.make_job('worker',
                       instance_type=args.instance_type,
                       num_tasks=args.num_machines,
                       placement_group=placement_group)
    job.wait_until_ready()

    print(
        "Job ready for connection, to connect to most recent task, run the following:"
    )
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    print("Task internal IPs")
    for task in job.tasks:
        print(task.ip)

    job.upload(__file__)
    if args.zone:
        job.run('killall python || echo failed')  # kill previous run
        job.run('source activate pytorch_p36')

    script_name = os.path.basename(__file__)
    for worker_idx in range(args.num_machines):
        cmd = 'python %s --role=worker --rank=%d --data-size-mb=%d --num-machines=%d --master-addr=%s' % (
            script_name, worker_idx, args.data_size_mb, args.num_machines,
            job.tasks[0].ip)
        job.tasks[worker_idx].run(cmd, sync=False)
Example #23
def _create_ec2_resource():
    REGION = u.get_region()
    return boto3.resource('ec2', region_name=REGION)