Example 1
def start_instances(compute, node_list):

    req_cnt = 0
    curr_batch = 0
    batch_list = []
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=start_instances_cb))

    for node in node_list:
        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=start_instances_cb))

        pid = util.get_pid(node)
        batch_list[curr_batch].add(compute.instances().start(
            project=cfg.project, zone=cfg.partitions[pid].zone, instance=node),
                                   request_id=node)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            batch.execute()
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in start batch: ")
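
Note: util.get_pid is not defined in any of these Slurm-GCP examples. Judging from the node-name filter {cfg.compute_node_prefix}-{i}-* used in Example 12, a plausible sketch is the following (the real helper lives in util.py and may differ):

def get_pid(node_name):
    # Hypothetical: return the partition/instance-def index encoded in a
    # node name such as 'mycluster-compute-3-0' -> 3.
    return int(node_name.rsplit('-', 2)[1])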
Example 2
def main(arg_nodes, arg_job_id):
    log.debug(f"Bursting out: {arg_nodes} {arg_job_id}")
    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True,
                         get_stdout=True).stdout
    node_list = sorted(nodes_str.splitlines(), key=util.get_pid)

    placement_groups = None
    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't create from calls by PrologSlurmctld
        return

    nodes_by_pid = {
        k: tuple(nodes)
        for k, nodes in groupby(node_list, util.get_pid)
    }

    if not arg_job_id:
        for pid in [
                pid for pid in nodes_by_pid if cfg.instance_defs[pid].exclusive
        ]:
            # Node was created by PrologSlurmctld, skip for ResumeProgram.
            del nodes_by_pid[pid]

    if (arg_job_id and cfg.instance_defs[pid].enable_placement):
        if cfg.instance_defs[pid].machine_type.split('-')[0] != "c2":
            msg = "Unsupported placement policy configuration. Please use a c2 machine type."
            log.error(msg)
            hold_job(arg_job_id, msg)
            os._exit(1)

        elif len(node_list) > 1:
            log.debug(f"creating placement group for {arg_job_id}")
            placement_groups = create_placement_groups(
                arg_job_id, len(node_list), cfg.instance_defs[pid].region)

    def chunks(lst, pg_names):
        """Group lst into chunks of at most 1000 nodes (or PLACEMENT_MAX_CNT
        when placement group names are given), pairing each chunk with its
        placement group name."""
        n = 1000
        if pg_names:
            n = PLACEMENT_MAX_CNT

        pg_index = 0
        for i in range(0, len(lst), n):
            chunk = dict(nodes=lst[i:i + n])
            if pg_names:
                chunk['pg'] = pg_names[pg_index]
                pg_index += 1
            yield chunk

    # concurrently add nodes grouped by instance_def (pid), max 1000
    with ThreadPoolExecutor() as exe:
        node_chunks = chain.from_iterable(
            map(partial(chunks, pg_names=placement_groups),
                nodes_by_pid.values()))
        exe.map(add_instances, node_chunks)

    log.info(f"done adding instances: {arg_nodes} {arg_job_id}")
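
For illustration, a standalone copy of the chunks() generator above, run against made-up node names; PLACEMENT_MAX_CNT = 22 is assumed from Example 7:

PLACEMENT_MAX_CNT = 22  # assumed, matching Example 7

def chunks(lst, pg_names):
    n = PLACEMENT_MAX_CNT if pg_names else 1000
    pg_index = 0
    for i in range(0, len(lst), n):
        chunk = dict(nodes=lst[i:i + n])
        if pg_names:
            chunk['pg'] = pg_names[pg_index]
            pg_index += 1
        yield chunk

nodes = [f"g1-compute-0-{i}" for i in range(50)]
print([(len(c['nodes']), c.get('pg')) for c in chunks(nodes, ['pg-1', 'pg-2', 'pg-3'])])
# [(22, 'pg-1'), (22, 'pg-2'), (6, 'pg-3')]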
Example 3
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute',
                                              'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True,
                         get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # "down" is currently the only way to clear the power_up flag from the
        # node (followed by a power_down) if PrologSlurmctld fails with a
        # non-zero exit code.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=down reason='{arg_job_id} finishing'"
        )
        # Power down nodes in slurm, so that they will become available again.
        util.run(f"{SCONTROL} update node={arg_nodes} state=power_down")

    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})".format(len(retry_list),
                                                      ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
                # now that the instance is gone, resume to put back in service
                util.run(f"{SCONTROL} update node={arg_nodes} state=resume")
            except Exception:
                log.exception(f"Error deleting {operation['name']}")

    log.debug("done deleting instances")

    if (arg_job_id and cfg.instance_defs[pid].enable_placement
            and cfg.instance_defs[pid].machine_type.split('-')[0] == "c2"
            and len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")
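
The batch callbacks (delete_instances_cb, start_instances_cb) and the module-level retry_list and operations containers are never shown in these examples. A plausible sketch, based on the googleapiclient batch callback signature and on how the retry loop above consumes them (the real scripts may differ):

retry_list = []   # nodes whose request failed and should be retried
operations = {}   # request_id -> operation resource, later passed to wait_for_operation

def delete_instances_cb(request_id, response, exception):
    # googleapiclient batch callbacks receive (request_id, response, exception)
    if exception is not None:
        log.error(f"delete request for {request_id} failed: {exception}")
        retry_list.append(request_id)
    else:
        operations[request_id] = response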
Example 4
def add_instances(node_chunk):

    node_list = node_chunk['nodes']
    pg_name = None
    if 'pg' in node_chunk:
        pg_name = node_chunk['pg']
    log.debug(f"node_list:{node_list} pg:{pg_name}")

    auth_http = None
    if not cfg.google_app_cred_path:
        http = set_user_agent(httplib2.Http(),
                              "Slurm_GCP_Scripts/1.2 (GPN:SchedMD)")
        creds = compute_engine.Credentials()
        auth_http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
    compute = googleapiclient.discovery.build('compute',
                                              'v1',
                                              http=auth_http,
                                              cache_discovery=False)
    pid = util.get_pid(node_list[0])
    instance_def = cfg.instance_defs[pid]

    try:
        operation = create_instance(compute, instance_def, node_list, pg_name)
    except googleapiclient.errors.HttpError as e:
        log.error(
            f"failed to add {node_list[0]}*{len(node_list)} to slurm, {e}")
        if instance_def.exclusive:
            os._exit(1)
        down_nodes(node_list, e)
        return

    result = util.wait_for_operation(compute, cfg.project, operation)
    if not result or 'error' in result:
        grp_err_msg = result['error']['errors'][0]['message']
        log.error(f"group operation failed: {grp_err_msg}")
        if instance_def.exclusive:
            os._exit(1)

        group_ops = util.get_group_operations(compute, cfg.project, result)
        failed_nodes = {}
        for op in group_ops['items']:
            if op['operationType'] != 'insert':
                continue
            if 'error' in op:
                err_msg = op['error']['errors'][0]['message']
                failed_node = op['targetLink'].split('/')[-1]
                if err_msg not in failed_nodes:
                    failed_nodes[err_msg] = [failed_node]
                else:
                    failed_nodes[err_msg].append(failed_node)
        if failed_nodes:
            log.error(f"insert requests failed: {failed_nodes}")
            for msg, nodes in failed_nodes.items():
                down_nodes(nodes, msg)
Example 5
def delete_instances(compute, node_list, arg_job_id):

    batch_list = []
    curr_batch = 0
    req_cnt = 0
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=delete_instances_cb))

    for node_name in node_list:

        pid = util.get_pid(node_name)
        if (not arg_job_id and cfg.instance_defs[pid].exclusive):
            # Node was deleted by EpilogSlurmctld, skip for SuspendProgram
            continue

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=delete_instances_cb))

        zone = None
        if cfg.instance_defs[pid].regional_capacity:
            node_find = util.ensure_execute(
                compute.instances().aggregatedList(
                    project=cfg.project, filter=f'name={node_name}'))
            for key, zone_value in node_find['items'].items():
                if 'instances' in zone_value:
                    zone = zone_value['instances'][0]['zone'].split('/')[-1]
                    break
            if zone is None:
                log.error(f"failed to find regional node '{node_name}' to delete")
                continue
        else:
            zone = cfg.instance_defs[pid].zone

        batch_list[curr_batch].add(
            compute.instances().delete(project=cfg.project,
                                       zone=zone,
                                       instance=node_name),
            request_id=node_name)
        req_cnt += 1

    try:
        for i, batch in enumerate(batch_list):
            util.ensure_execute(batch)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in batch:")
Example 6
def delete_instances(compute, node_list, arg_job_id):

    batch_list = []
    curr_batch = 0
    req_cnt = 0
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=delete_instances_cb))

    def_list = {
        pid: cfg.instance_defs[pid]
        for pid, nodes in groupby(node_list, util.get_pid)
    }
    regional_instances = util.get_regional_instances(compute, cfg.project,
                                                     def_list)

    for node_name in node_list:

        pid = util.get_pid(node_name)
        if (not arg_job_id and cfg.instance_defs[pid].exclusive):
            # Node was deleted by EpilogSlurmctld, skip for SuspendProgram
            continue

        zone = None
        if cfg.instance_defs[pid].regional_capacity:
            instance = regional_instances.get(node_name, None)
            if instance is None:
                log.debug("Regional node not found. Already deleted?")
                continue
            zone = instance['zone'].split('/')[-1]
        else:
            zone = cfg.instance_defs[pid].zone

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=delete_instances_cb))

        batch_list[curr_batch].add(compute.instances().delete(
            project=cfg.project, zone=zone, instance=node_name),
                                   request_id=node_name)
        req_cnt += 1

    try:
        for i, batch in enumerate(batch_list):
            util.ensure_execute(batch)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in batch:")
Example 7
def delete_placement_groups(compute, node_list, arg_job_id):
    PLACEMENT_MAX_CNT = 22
    pg_ops = []
    pg_index = 0
    pid = util.get_pid(node_list[0])

    for i in range(len(node_list)):
        if i % PLACEMENT_MAX_CNT:
            continue
        pg_index += 1
        pg_name = f'{cfg.cluster_name}-{arg_job_id}-{pg_index}'
        pg_ops.append(compute.resourcePolicies().delete(
            project=cfg.project, region=cfg.instance_defs[pid].region,
            resourcePolicy=pg_name).execute())
    for operation in pg_ops:
        util.wait_for_operation(compute, cfg.project, operation)
    log.debug("done deleting pg")
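
The loop above issues one delete per block of PLACEMENT_MAX_CNT nodes, so it assumes the groups were created with the same naming scheme: ceil(len(node_list) / PLACEMENT_MAX_CNT) policies named {cluster_name}-{job_id}-{index}. A small illustrative helper (not part of the original code) that reproduces that naming:

import math

def placement_group_names(cluster_name, job_id, node_count, max_cnt=22):
    # One placement group per block of max_cnt nodes, 1-based index.
    return [f"{cluster_name}-{job_id}-{i + 1}"
            for i in range(math.ceil(node_count / max_cnt))]

print(placement_group_names("g1", 1234, 50))
# ['g1-1234-1', 'g1-1234-2', 'g1-1234-3']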
Example 8
def update_slurm_node_addrs(compute):
    for node_name, operation in operations.items():
        try:
            # Wait for each instance's insert operation to finish, then update
            # its address in Slurm.
            wait_for_operation(compute, cfg.project, operation)

            pid = util.get_pid(node_name)
            my_fields = 'networkInterfaces(name,network,networkIP,subnetwork)'
            instance_networks = compute.instances().get(
                project=cfg.project, zone=cfg.partitions[pid].zone,
                instance=node_name, fields=my_fields).execute()
            instance_ip = instance_networks['networkInterfaces'][0]['networkIP']

            util.run(
                f"{SCONTROL} update node={node_name} nodeaddr={instance_ip}")

            log.info("Instance " + node_name + " is now up")
        except Exception:
            log.exception(f"Error in adding {node_name} to slurm")
Example 9
def get_source_image(compute, node_name):

    images = get_source_image.images
    pid = util.get_pid(node_name)
    if pid not in images:
        image_name = f"{cfg.compute_node_prefix}-{pid}-image"
        family = (cfg.partitions[pid].compute_image_family
                  or f"{image_name}-family")
        try:
            image_response = compute.images().getFromFamily(
                project=cfg.project, family=family).execute()
            if image_response['status'] != 'READY':
                raise Exception("Image not ready")
            source_disk_image = image_response['selfLink']
        except Exception as e:
            log.error(f"Image {family} unavailable: {e}")
            sys.exit()

        images[pid] = source_disk_image
    return images[pid]
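
get_source_image.images is used here as a function-attribute cache, so it presumably needs a one-time initialization near the function definition, e.g.:

get_source_image.images = {}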
Example 10
def start_instances(compute, node_list, gcp_nodes):

    req_cnt = 0
    curr_batch = 0
    batch_list = []
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=start_instances_cb))

    for node in node_list:

        pid = util.get_pid(node)
        zone = cfg.instance_defs[pid].zone

        if cfg.instance_defs[pid].regional_capacity:
            g_node = gcp_nodes.get(node, None)
            if not g_node:
                log.error(f"Didn't find regional GCP record for '{node}'")
                continue
            zone = g_node['zone'].split('/')[-1]

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=start_instances_cb))

        batch_list[curr_batch].add(
            compute.instances().start(project=cfg.project, zone=zone,
                                      instance=node),
            request_id=node)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            util.ensure_execute(batch)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in start batch: ")
Example 11
def add_instances(compute, node_list):

    batch_list = []
    curr_batch = 0
    req_cnt = 0
    batch_list.insert(
        curr_batch, compute.new_batch_http_request(callback=added_instances_cb))

    for node_name in node_list:

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=added_instances_cb))

        source_disk_image = get_source_image(compute, node_name)

        pid = util.get_pid(node_name)
        batch_list[curr_batch].add(
            create_instance(compute, cfg.partitions[pid].zone,
                            cfg.partitions[pid].machine_type, node_name,
                            source_disk_image),
            request_id=node_name)
        req_cnt += 1

    try:
        for i, batch in enumerate(batch_list):
            batch.execute(http=http)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in add batch")

    if cfg.update_node_addrs:
        update_slurm_node_addrs(compute)
Example 12
def main():
    compute = googleapiclient.discovery.build('compute',
                                              'v1',
                                              cache_discovery=False)

    try:
        s_nodes = dict()
        cmd = (f"{SCONTROL} show nodes | "
               r"grep -oP '^NodeName=\K(\S+)|State=\K(\S+)' | "
               "paste -sd',\n'")
        nodes = util.run(cmd, shell=True, check=True, get_stdout=True).stdout
        if nodes:
            # result is a list of tuples like:
            # (nodename, (base='base_state', flags=<set of state flags>))
            # from 'nodename,base_state+flag1+flag2'
            # state flags include: CLOUD, COMPLETING, DRAIN, FAIL, POWER,
            #   POWERING_DOWN
            # Modifiers on base state still include: @ (reboot), $ (maint),
            #   * (nonresponsive), # (powering up)
            StateTuple = collections.namedtuple('StateTuple', 'base,flags')

            def make_state_tuple(state):
                return StateTuple(state[0], set(state[1:]))

            s_nodes = [(node, make_state_tuple(args.split('+')))
                       for node, args in map(lambda x: x.split(','),
                                             nodes.rstrip().splitlines())
                       if 'CLOUD' in args]

        g_nodes = []
        for i, part in enumerate(cfg.partitions):
            page_token = ""
            while True:
                resp = compute.instances().list(
                    project=cfg.project,
                    zone=part.zone,
                    pageToken=page_token,
                    filter=f"name={cfg.compute_node_prefix}-{i}-*").execute()

                if "items" in resp:
                    g_nodes.extend(resp['items'])
                if "nextPageToken" in resp:
                    page_token = resp['nextPageToken']
                    continue

                break

        to_down = []
        to_idle = []
        to_start = []
        for s_node, s_state in s_nodes:
            g_node = next((item for item in g_nodes if item["name"] == s_node),
                          None)
            pid = util.get_pid(s_node)

            if (('POWER' not in s_state.flags)
                    and ('POWERING_DOWN' not in s_state.flags)):
                # slurm nodes that aren't in power_save and are stopped in GCP:
                #   mark down in slurm
                #   start them in gcp
                if g_node and (g_node['status'] == "TERMINATED"):
                    if not s_state.base.startswith('DOWN'):
                        to_down.append(s_node)
                    if (cfg.partitions[pid].preemptible_bursting):
                        to_start.append(s_node)

                # can't check if the node doesn't exist in GCP while the node
                # is booting because it might not have been created yet by the
                # resume script.
                # This should catch the completing states as well.
                if (g_node is None and "#" not in s_state.base
                        and not s_state.base.startswith('DOWN')):
                    to_down.append(s_node)

            elif g_node is None:
                # find nodes that are down~ in slurm and don't exist in gcp:
                #   mark idle~
                if s_state.base.startswith(
                        'DOWN') and 'POWER' in s_state.flags:
                    to_idle.append(s_node)
                elif 'POWERING_DOWN' in s_state.flags:
                    to_idle.append(s_node)
                elif s_state.base.startswith('COMPLETING'):
                    to_down.append(s_node)

        if len(to_down):
            log.info("{} stopped/deleted instances ({})".format(
                len(to_down), ",".join(to_down)))
            log.info("{} instances to start ({})".format(
                len(to_start), ",".join(to_start)))

            # Write the hosts to a file that can be passed to scontrol to get a
            # Slurm hostlist, since the number of hosts could be large.
            tmp_file = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
            tmp_file.writelines("\n".join(to_down))
            tmp_file.close()
            log.debug("tmp_file = {}".format(tmp_file.name))

            hostlist = util.run(f"{SCONTROL} show hostlist {tmp_file.name}",
                                check=True,
                                get_stdout=True).stdout
            log.debug("hostlist = {}".format(hostlist))
            os.remove(tmp_file.name)

            util.run(f"{SCONTROL} update nodename={hostlist} state=down "
                     "reason='Instance stopped/deleted'")

            while True:
                start_instances(compute, to_start)
                if not len(retry_list):
                    break

                log.debug("got {} nodes to retry ({})".format(
                    len(retry_list), ','.join(retry_list)))
                to_start = list(retry_list)
                del retry_list[:]

        if len(to_idle):
            log.info("{} instances to resume ({})".format(
                len(to_idle), ','.join(to_idle)))

            # Write the hosts to a file that can be passed to scontrol to get a
            # Slurm hostlist, since the number of hosts could be large.
            tmp_file = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
            tmp_file.writelines("\n".join(to_idle))
            tmp_file.close()
            log.debug("tmp_file = {}".format(tmp_file.name))

            hostlist = util.run(f"{SCONTROL} show hostlist {tmp_file.name}",
                                check=True,
                                get_stdout=True).stdout
            log.debug("hostlist = {}".format(hostlist))
            os.remove(tmp_file.name)

            util.run(f"{SCONTROL} update nodename={hostlist} state=resume")

    except Exception:
        log.exception("failed to sync instances")
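
For illustration, a standalone sketch of the node-state parsing used above, run against made-up scontrol output lines of the form 'nodename,base_state+flag1+flag2':

import collections

StateTuple = collections.namedtuple('StateTuple', 'base,flags')

def make_state_tuple(state):
    return StateTuple(state[0], set(state[1:]))

lines = ["g1-compute-0-0,IDLE+CLOUD+POWER", "g1-compute-0-1,DOWN+CLOUD", "login0,IDLE"]
s_nodes = [(node, make_state_tuple(args.split('+')))
           for node, args in (line.split(',') for line in lines)
           if 'CLOUD' in args]
# login0 is filtered out; the other two become
# ('g1-compute-0-0', StateTuple(base='IDLE', flags={'CLOUD', 'POWER'})) and
# ('g1-compute-0-1', StateTuple(base='DOWN', flags={'CLOUD'}))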
Example 13
def create_instance(compute, zone, machine_type, instance_name,
                    source_disk_image):

    pid = util.get_pid(instance_name)
    # Configure the machine
    machine_type_path = f'zones/{zone}/machineTypes/{machine_type}'
    disk_type = 'projects/{}/zones/{}/diskTypes/{}'.format(
        cfg.project, zone, cfg.partitions[pid].compute_disk_type)

    config = {
        'name': instance_name,
        'machineType': machine_type_path,

        # Specify the boot disk and the image to use as a source.
        'disks': [{
            'boot': True,
            'autoDelete': True,
            'initializeParams': {
                'sourceImage': source_disk_image,
                'diskType': disk_type,
                'diskSizeGb': cfg.partitions[pid].compute_disk_size_gb
            }
        }],

        # Specify a network interface
        'networkInterfaces': [{
            'subnetwork': (
                "projects/{}/regions/{}/subnetworks/{}".format(
                    cfg.shared_vpc_host_project or cfg.project,
                    cfg.partitions[pid].region,
                    (cfg.partitions[pid].vpc_subnet
                     or f'{cfg.cluster_name}-{cfg.partitions[pid].region}'))
            ),
        }],

        # Allow the instance to access cloud storage and logging.
        'serviceAccounts': [{
            'email': cfg.compute_node_service_account,
            'scopes': cfg.compute_node_scopes
        }],

        'tags': {'items': ['compute']},

        'metadata': {
            'items': [
                {'key': 'enable-oslogin',
                 'value': 'TRUE'},
                {'key': 'VmDnsSetting',
                 'value': 'GlobalOnly'}
            ]
        }
    }

    shutdown_script_path = Path('/apps/slurm/scripts/compute-shutdown')
    if shutdown_script_path.exists():
        config['metadata']['items'].append({
            'key': 'shutdown-script',
            'value': shutdown_script_path.read_text()
        })

    if cfg.partitions[pid].gpu_type:
        accel_type = ('https://www.googleapis.com/compute/v1/projects/{}/zones/{}/acceleratorTypes/{}'
                      .format(cfg.project, zone,
                              cfg.partitions[pid].gpu_type))
        config['guestAccelerators'] = [{
            'acceleratorCount': cfg.partitions[pid].gpu_count,
            'acceleratorType': accel_type
        }]

        config['scheduling'] = {'onHostMaintenance': 'TERMINATE'}

    if cfg.partitions[pid].preemptible_bursting:
        config['scheduling'] = {
            'preemptible': True,
            'onHostMaintenance': 'TERMINATE',
            'automaticRestart': False
        }

    if cfg.partitions[pid].compute_labels:
        config['labels'] = cfg.partitions[pid].compute_labels

    if cfg.partitions[pid].cpu_platform:
        config['minCpuPlatform'] = cfg.partitions[pid].cpu_platform

    if cfg.external_compute_ips:
        config['networkInterfaces'][0]['accessConfigs'] = [
            {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'}
        ]

    return compute.instances().insert(
        project=cfg.project,
        zone=zone,
        body=config)
Example 14
def main():
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    try:
        s_nodes = dict()
        cmd = (f"{SCONTROL} show nodes | "
               r"grep -oP '^NodeName=\K(\S+)|State=\K(\S+)' | "
               "paste -sd',\n'")
        nodes = util.run(cmd, shell=True, check=True, get_stdout=True).stdout
        if nodes:
            # result is a list of tuples like:
            # (nodename, (base='base_state', flags=<set of state flags>))
            # from 'nodename,base_state+flag1+flag2'
            # state flags include: CLOUD, COMPLETING, DRAIN, FAIL, POWERED_DOWN,
            #   POWERING_DOWN
            # Modifiers on base state still include: @ (reboot), $ (maint),
            #   * (nonresponsive), # (powering up)
            StateTuple = collections.namedtuple('StateTuple', 'base,flags')

            def make_state_tuple(state):
                return StateTuple(state[0], set(state[1:]))
            s_nodes = {node: make_state_tuple(args.split('+'))
                       for node, args in
                       map(lambda x: x.split(','), nodes.rstrip().splitlines())
                       if 'CLOUD' in args}

        g_nodes = util.get_regional_instances(compute, cfg.project,
                                              cfg.instance_defs)
        for pid, part in cfg.instance_defs.items():
            page_token = ""
            while True:
                if not part.regional_capacity:
                    resp = util.ensure_execute(
                        compute.instances().list(
                            project=cfg.project, zone=part.zone,
                            fields='items(name,zone,status),nextPageToken',
                            pageToken=page_token, filter=f"name={pid}-*"))

                    if "items" in resp:
                        g_nodes.update({instance['name']: instance
                                       for instance in resp['items']})
                    if "nextPageToken" in resp:
                        page_token = resp['nextPageToken']
                        continue

                break

        to_down = []
        to_idle = []
        to_start = []
        for s_node, s_state in s_nodes.items():
            g_node = g_nodes.get(s_node, None)
            pid = util.get_pid(s_node)

            if (('POWERED_DOWN' not in s_state.flags) and
                    ('POWERING_DOWN' not in s_state.flags)):
                # slurm nodes that aren't powered down and are stopped in GCP:
                #   mark down in slurm
                #   start them in gcp
                if g_node and (g_node['status'] == "TERMINATED"):
                    if not s_state.base.startswith('DOWN'):
                        to_down.append(s_node)
                    if cfg.instance_defs[pid].preemptible_bursting != 'false':
                        to_start.append(s_node)

                # can't check if the node doesn't exist in GCP while the node
                # is booting because it might not have been created yet by the
                # resume script.
                # This should catch the completing states as well.
                if (g_node is None and "POWERING_UP" not in s_state.flags and
                        not s_state.base.startswith('DOWN')):
                    to_down.append(s_node)

            elif g_node is None:
                # find nodes that are down~ in slurm and don't exist in gcp:
                #   mark idle~
                if s_state.base.startswith('DOWN') and 'POWERED_DOWN' in s_state.flags:
                    to_idle.append(s_node)
                elif 'POWERING_DOWN' in s_state.flags:
                    to_idle.append(s_node)
                elif s_state.base.startswith('COMPLETING'):
                    to_down.append(s_node)

        if len(to_down):
            log.info("{} stopped/deleted instances ({})".format(
                len(to_down), ",".join(to_down)))
            log.info("{} instances to start ({})".format(
                len(to_start), ",".join(to_start)))
            hostlist = to_hostlist(to_down)

            util.run(f"{SCONTROL} update nodename={hostlist} state=down "
                     "reason='Instance stopped/deleted'")

            while True:
                start_instances(compute, to_start, g_nodes)
                if not len(retry_list):
                    break

                log.debug("got {} nodes to retry ({})"
                          .format(len(retry_list), ','.join(retry_list)))
                to_start = list(retry_list)
                del retry_list[:]

        if len(to_idle):
            log.info("{} instances to resume ({})".format(
                len(to_idle), ','.join(to_idle)))

            hostlist = to_hostlist(to_idle)
            util.run(f"{SCONTROL} update nodename={hostlist} state=resume")

        orphans = [
            inst for inst, info in g_nodes.items()
            if info['status'] == 'RUNNING' and (
                inst not in s_nodes or 'POWERED_DOWN' in s_nodes[inst].flags
            )
        ]
        if orphans:
            if args.debug:
                for orphan in orphans:
                    info = g_nodes.get(orphan)
                    state = s_nodes.get(orphan, None)
                    log.debug(f"orphan {orphan}: status={info['status']} state={state}")
            hostlist = to_hostlist(orphans)
            log.info(f"{len(orphans)} orphan instances found to terminate: {hostlist}")
            util.run(f"{SCRIPTS_DIR}/suspend.py {hostlist}")

    except Exception:
        log.exception("failed to sync instances")
Example 15
    def find_webviews(self, dev):
        if self.appname is None:
            raise Exception("WebGrabber.find_webviews without appname")
        app_pid = util.get_pid(dev, self.appname)
        if app_pid is None:
            return []

        logger.info(
            "forwarding tcp:%d to localabstract:webview_devtools_remote_%s" %
            (fwd_port, app_pid))
        dev.run_adb_cmd(
            "forward", "tcp:%d localabstract:webview_devtools_remote_%s" %
            (fwd_port, app_pid))

        count = 0
        rets = []
        try:
            req = urllib.request.urlopen("http://127.0.0.1:%d/json" % fwd_port)
            ret = json.loads(req.read().decode('utf-8'))
        except Exception:
            logger.warning("fail to connect to webview")
            return []
        for item in ret:
            try:
                desc = json.loads(item['description'])
            except Exception:
                logger.exception("fail to parse description %s",
                                 item['description'])
                raise
            if not desc['attached']:
                continue
            count += 1
            base_x = desc['screenX']
            base_y = desc['screenY']
            if 'width' in desc:
                base_w = desc['width']
                base_h = desc['height']
            else:
                base_w = base_h = 0
            if 'empty' in desc:
                empty = desc['empty']
            else:
                empty = True
            logger.info("found %s %s", item['url'].split('?')[0],
                        item['title'])
            url = item['url']
            ws_url = item['webSocketDebuggerUrl']
            title = item['title']

            rets.append({
                'base_x': base_x,
                'base_y': base_y,
                'base_w': base_w,
                'base_h': base_h,
                'url': url,
                'ws_url': ws_url,
                'title': title,
                'empty': empty
            })
        logger.info("captured %d webviews", count)
        return rets
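
Note that util.get_pid here is a different helper from the Slurm one in the earlier examples: it resolves an Android app's process id on a device. A hypothetical sketch using plain adb (the real helper is not shown and may differ):

import subprocess

def get_app_pid(serial, appname):
    # Hypothetical: ask the device for the app's pid; returns None if not running.
    out = subprocess.run(["adb", "-s", serial, "shell", "pidof", appname],
                         capture_output=True, text=True).stdout.strip()
    return out or None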
Example 16
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True, get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    # Get static node list
    exc_nodes_hostlist = util.run(
        f"{SCONTROL} show config | "
        "awk '/SuspendExcNodes.*=/{print $3}'", shell=True,
        get_stdout=True).stdout
    nodes_exc_str = util.run(f"{SCONTROL} show hostnames {exc_nodes_hostlist}",
                             check=True, get_stdout=True).stdout
    node_exc_list = sorted(nodes_exc_str.splitlines(), key=util.get_pid)

    # Generate new arg_nodes without static nodes
    dynamic_nodes = list(set(node_list) - set(node_exc_list))
    node_list = dynamic_nodes
    arg_nodes = util.to_hostlist(SCONTROL, dynamic_nodes)

    if len(node_list) == 0:
        log.debug("Static nodes removed from request. No nodes remain in request.")
        return

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Note: If PrologSlurmctld fails with a non-zero exit code,
        # "powering_up" flag would get stuck on the node. In 20.11 and prior:
        # state=down followed by state=power_down could clear it. In 21.08,
        # state=power_down_force can clear it.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=power_down_force")

    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})"
                  .format(len(retry_list), ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
            except Exception:
                log.exception(f"Error deleting {operation['name']}")
        # now that the instances are gone, resume to put back in service
        util.run(f"{SCONTROL} update node={arg_nodes} state=resume")

    log.debug("done deleting instances")

    if (arg_job_id and
            cfg.instance_defs[pid].enable_placement and
            cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and
            len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")