Ejemplo n.º 1
0
def do_run_install(args):
    """Regenerate the install artifacts for a config and run the install.

    Reads ``args.config_file`` and ``args.step``. An existing
    ``azhpc_install_<name>`` directory is deleted, the host lists and install
    scripts are generated into it again, and ``azinstall.run`` is invoked
    with ``args.step`` forwarded as its last argument.
    """
    config_file = azconfig.ConfigFile()
    config_file.open(args.config_file)
    config = config_file.preprocess()

    # The working directory is named after the config file with its last
    # five characters removed (the ".json" extension).
    config_name = os.path.basename(args.config_file)
    tmpdir = f"azhpc_install_{config_name[:-5]}"
    log.debug(f"tmpdir = {tmpdir}")
    if os.path.isdir(tmpdir):
        log.debug("removing existing tmp directory")
        shutil.rmtree(tmpdir)

    # Key file names are derived from the admin user name.
    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = private_key_file + ".pub"

    start_step = args.step

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)
    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file)

    # The looked-up value is not used further in this function.
    resource_group = config_file.read_value("resource_group")
    fqdn = config_file.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(
        config, tmpdir, adminuser, private_key_file, public_key_file, fqdn,
        start_step)
Ejemplo n.º 2
0
def do_build(args):
    """Deploy the resources described by a config file and run the install.

    Steps, in order: reset the ``azhpc_install_<name>`` working directory,
    load and preprocess the config, make sure the admin key pair exists,
    write the ARM template to ``deploy_<config file name>`` in the current
    directory, create the resource group (tagged with creator, creation time
    and any ``resource_tags`` from the config), deploy the template, wait for
    the deployment, then generate the host lists / install scripts and run
    them.

    Args:
        args: parsed command-line namespace; ``config_file`` and ``no_vnet``
            are read.
    """
    log.debug(f"reading config file ({args.config_file})")
    # Both generated names are based on the file name only, so a config given
    # with a directory component still produces valid local paths.
    config_basename = os.path.basename(args.config_file)
    # Drop the last five characters of the name (the ".json" extension).
    tmpdir = "azhpc_install_" + config_basename[:-5]
    log.debug(f"tmpdir = {tmpdir}")
    if os.path.isdir(tmpdir):
        log.debug("removing existing tmp directory")
        shutil.rmtree(tmpdir)

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config = c.preprocess()

    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"
    _create_private_key(private_key_file, public_key_file)

    tpl = arm.ArmTemplate()
    tpl.read(config, not args.no_vnet)

    # Previously "deploy_" + args.config_file, which produced a path such as
    # "deploy_dir/config.json" when the config was given with a directory.
    output_template = "deploy_" + config_basename

    log.info("writing out arm template to " + output_template)
    with open(output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("creating resource group " + config["resource_group"])

    # Fixed bookkeeping tags first, followed by the user-defined ones.
    resource_tags = config.get("resource_tags", {})
    tags = [{
        "key": "CreatedBy",
        "value": os.getenv("USER")
    }, {
        "key": "CreatedOn",
        "value": datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    }]
    tags += [{
        "key": key,
        "value": value
    } for key, value in resource_tags.items()]
    azutil.create_resource_group(config["resource_group"], config["location"],
                                 tags)
    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], output_template)
    log.debug(f"deployment name: {deployname}")

    _wait_for_deployment(config["resource_group"], deployname)

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)
    log.info("building install scripts")
    azinstall.generate_install(config, tmpdir, adminuser, private_key_file,
                               public_key_file)

    # NOTE(review): the value is unused; the call is kept in case read_value
    # validates or resolves the key -- confirm and remove if it does not.
    resource_group = c.read_value("resource_group")
    fqdn = c.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(config, tmpdir, adminuser, private_key_file, public_key_file,
                  fqdn)
Ejemplo n.º 3
0
def do_slurm_resume(args):
    """Deploy the nodes named in ``args.nodes`` and run the install on them.

    ``args.nodes`` is either a single node name ending in a 4-digit index
    (``name0001``) or a bracketed list (``name[0001-0003,0007]``). The
    ``resources`` entry named by the prefix is used as a template: its type
    is set to ``"vm"``, its ``instances`` key is removed, and it is registered
    under every expanded node name. The resulting ARM template is written,
    deployed and waited for, after which the host lists and install scripts
    are generated and run.

    Args:
        args: parsed command-line namespace; ``config_file`` and ``nodes``
            are read.

    Exits with status 1 when the resource is not present in the config.
    """
    log.debug(f"reading config file ({args.config_file})")
    config_basename = os.path.basename(args.config_file)
    # Pick a working directory that does not exist yet; the timestamp has
    # one-second resolution, so waiting and retrying yields a new name.
    while True:
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        tmpdir = "azhpc_install_" + config_basename[:-5] + "_" + timestamp
        if not os.path.isdir(tmpdir):
            break
        log.warning(
            f"{tmpdir} already exists, sleeping for 5 seconds and retrying")
        time.sleep(5)
    log.debug(f"tmpdir = {tmpdir}")

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config = c.preprocess()

    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"

    log.info(f"slurm resume for {args.nodes}")
    # first get the resource name: everything before an optional "[...]"
    # list of indices / index ranges
    resource_name, brackets = re.search(r'([^[]*)\[?([\d\-\,]*)\]?',
                                        args.nodes).groups(0)
    resource_list = []
    if brackets:
        for part in brackets.split(","):
            if "-" in part:
                lo, hi = part.split("-")
                assert len(lo) == 4, "expecting number width of 4"
                assert len(hi) == 4, "expecting number width of 4"
                for i in range(int(lo), int(hi) + 1):
                    resource_list.append(f"{resource_name}{i:04d}")
            else:
                assert len(part) == 4, "expecting number width of 4"
                resource_list.append(f"{resource_name}{part}")
    else:
        # A single node name: the trailing four characters are its index.
        resource_list.append(resource_name)
        resource_name = resource_name[:-4]

    template_resource = config.get("resources", {}).get(resource_name)
    if not template_resource:
        # The message used to reference an undefined name (``res``), which
        # raised NameError instead of reporting the missing resource.
        log.error(f"{resource_name} resource not found in config")
        sys.exit(1)
    if template_resource.get("type") != "slurm_partition":
        # NOTE(review): only logged; execution continues -- confirm whether
        # this should abort.
        log.error("invalid resource type for scaling")

    template_resource["type"] = "vm"
    del template_resource["instances"]

    log.info(f"resource_name= {resource_name}")
    log.info("resource_list= " + ",".join(resource_list))

    # Replace the configured resources with one entry per requested node;
    # all entries refer to the same template dict.
    config["resources"] = {}
    for rname in resource_list:
        config["resources"][rname] = template_resource

    tpl = arm.ArmTemplate()
    tpl.read_resources(config, False)

    # Use the file name only so a config given with a directory component
    # still yields a valid path in the current directory.
    output_template = f"deploy_{config_basename}_{timestamp}"

    log.info("writing out arm template to " + output_template)
    with open(output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], output_template)
    log.debug(f"deployment name: {deployname}")

    _wait_for_deployment(config["resource_group"], deployname)

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)
    log.info("building install scripts")
    azinstall.generate_install(config, tmpdir, adminuser, private_key_file,
                               public_key_file)

    # NOTE(review): both values are unused; the calls are kept in case
    # read_value validates or resolves the keys -- confirm and remove.
    jumpbox = c.read_value("install_from")
    resource_group = c.read_value("resource_group")
    fqdn = c.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(config, tmpdir, adminuser, private_key_file, public_key_file,
                  fqdn)
Ejemplo n.º 4
0
def do_slurm_resume(args):
    """Deploy the nodes listed in ``args.nodes`` and run the install on them.

    The node list is expanded with ``_nodelist_expand``. For every resource
    name it returns, the matching ``resources`` entry of the preprocessed
    config is turned into a ``"vm"`` template (``instances`` removed) and
    registered under each node whose name, up to its last hyphen, equals the
    resource name. The resulting config is rendered to an ARM template,
    deployed and waited for; afterwards only ``jumpbox_script`` install steps
    are kept and the install is generated and run.

    Reads ``args.config_file`` and ``args.nodes``. Exits with status 1 when a
    resource is not present in the config.
    """
    log.debug(f"reading config file ({args.config_file})")

    def _timestamped_names():
        # Build a (timestamp, working directory) pair for the current second.
        stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        stem = os.path.basename(args.config_file)[:-5]
        return stamp, "azhpc_install_" + stem + "_" + stamp

    # Retry until the timestamped directory name is not taken.
    timestamp, tmpdir = _timestamped_names()
    while os.path.isdir(tmpdir):
        log.warning(
            f"{tmpdir} already exists, sleeping for 5 seconds and retrying")
        time.sleep(5)
        timestamp, tmpdir = _timestamped_names()
    log.debug(f"tmpdir = {tmpdir}")

    config_file = azconfig.ConfigFile()
    config_file.open(args.config_file)
    base_config = config_file.preprocess()

    adminuser = base_config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = private_key_file + ".pub"

    log.info(f"slurm resume for {args.nodes}")
    resource_names, resource_list = _nodelist_expand(args.nodes)

    # The deployment config starts as a full copy of the preprocessed config
    # with an empty resource table that is filled in below.
    config = copy.deepcopy(base_config)
    config["resources"] = {}

    for resource in resource_names:
        template_resource = base_config.get("resources", {}).get(resource)
        if not template_resource:
            log.error(f"{resource} resource not found in config")
            sys.exit(1)
        if template_resource.get("type") != "slurm_partition":
            log.error("invalid resource type for scaling")

        # Turn the entry into a plain VM template.
        template_resource["type"] = "vm"
        del template_resource["instances"]

        log.info(f"resource= {resource}")
        log.info("resource_list= " + ",".join(resource_list))

        # A node belongs to this resource when the part of its name before
        # the last hyphen equals the resource name.
        for node_name in resource_list:
            if node_name.rsplit('-', 1)[0] == resource:
                config["resources"][node_name] = template_resource

    template = arm.ArmTemplate()
    template.read_resources(config, False)

    output_template = "deploy_{}_{}".format(args.config_file, timestamp)

    log.info("writing out arm template to " + output_template)
    with open(output_template, "w") as out:
        out.write(template.to_json())

    log.info("deploying arm template")
    resource_group = config["resource_group"]
    deployname = azutil.deploy(resource_group, output_template)
    log.debug(f"deployment name: {deployname}")

    _wait_for_deployment(config["resource_group"], deployname)

    # Keep only the steps whose type is "jumpbox_script" (the default when a
    # step has no explicit type).
    jumpbox_steps = []
    for step in config["install"]:
        if step.get("type", "jumpbox_script") == "jumpbox_script":
            jumpbox_steps.append(step)
    config["install"] = jumpbox_steps

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)
    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file)

    # When already running on the install host, use its name directly.
    local_host = socket.gethostname()
    install_from = config["install_from"]
    if local_host == install_from:
        fqdn = install_from
    else:
        fqdn = config_file.get_install_from_destination()

    log.debug(f"running script from : {fqdn}")
    azinstall.run(
        config, tmpdir, adminuser, private_key_file, public_key_file, fqdn)
Ejemplo n.º 5
0
def do_build(args):
    """Deploy the resources described by a config file and run the install.

    Steps, in order: reset the ``azhpc_install_<name>`` working directory,
    load and preprocess the config, create the admin RSA key pair if either
    key file is missing, write the ARM template to ``args.output_template``,
    create the resource group (tagged with creator, creation time and any
    ``resource_tags`` from the config), deploy the template and poll its
    status every five seconds while printing per-resource progress. On
    success the host lists and install scripts are generated and, when the
    config sets ``install_from``, run from that host.

    Args:
        args: parsed command-line namespace; ``config_file`` and
            ``output_template`` are read.

    Exits with status 1, after printing the per-resource errors, when the
    deployment does not end in the "Succeeded" state.
    """
    log.debug(f"reading config file ({args.config_file})")
    # Remove a trailing ".json" only. str.strip(".json") would instead strip
    # any of the characters '.', 'j', 's', 'o', 'n' from both ends of the
    # name (e.g. "simple_hpc_pbs.json" -> "imple_hpc_pb").
    config_name = os.path.basename(args.config_file)
    if config_name.endswith(".json"):
        config_name = config_name[:-len(".json")]
    tmpdir = "azhpc_install_" + config_name
    log.debug(f"tmpdir = {tmpdir}")
    if os.path.isdir(tmpdir):
        log.debug("removing existing tmp directory")
        shutil.rmtree(tmpdir)

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config = c.preprocess()

    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"
    if not (os.path.exists(private_key_file)
            and os.path.exists(public_key_file)):
        # create ssh keys
        key = rsa.generate_private_key(backend=crypto_default_backend(),
                                       public_exponent=65537,
                                       key_size=2048)
        private_key = key.private_bytes(
            crypto_serialization.Encoding.PEM,
            crypto_serialization.PrivateFormat.TraditionalOpenSSL,
            crypto_serialization.NoEncryption())
        public_key = key.public_key().public_bytes(
            crypto_serialization.Encoding.OpenSSH,
            crypto_serialization.PublicFormat.OpenSSH)
        # Permissions are tightened before any key material is written.
        with open(private_key_file, "wb") as f:
            os.chmod(private_key_file, 0o600)
            f.write(private_key)
        with open(public_key_file, "wb") as f:
            os.chmod(public_key_file, 0o644)
            f.write(public_key + b'\n')

    tpl = arm.ArmTemplate()
    tpl.read(config)

    log.info("writing out arm template to " + args.output_template)
    with open(args.output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("creating resource group " + config["resource_group"])

    # Fixed bookkeeping tags first, followed by the user-defined ones.
    resource_tags = config.get("resource_tags", {})
    tags = [{
        "key": "CreatedBy",
        "value": os.getenv("USER")
    }, {
        "key": "CreatedOn",
        "value": datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    }]
    tags += [{
        "key": key,
        "value": value
    } for key, value in resource_tags.items()]
    azutil.create_resource_group(config["resource_group"], config["location"],
                                 tags)
    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], args.output_template)
    log.debug(f"deployment name: {deployname}")

    building = True
    success = True
    # Number of lines printed by the previous poll; the cursor is moved up by
    # that many lines so each status table overwrites the last one.
    del_lines = 1
    while building:
        time.sleep(5)
        res = azutil.get_deployment_status(config["resource_group"],
                                           deployname)
        log.debug(res)

        print("\033[F" * del_lines)
        del_lines = 1

        for i in res:
            props = i["properties"]
            status_code = props["statusCode"]
            if props.get("targetResource", None):
                resource_name = props["targetResource"]["resourceName"]
                resource_type = props["targetResource"]["resourceType"]
                del_lines += 1
                print(
                    f"{resource_name:15} {resource_type:47} {status_code:15}")
            else:
                # An entry without a target resource ends the polling; its
                # provisioning state decides success or failure.
                provisioning_state = props["provisioningState"]
                del_lines += 1
                building = False
                if provisioning_state != "Succeeded":
                    success = False

    if success:
        log.info("Provising succeeded")
    else:
        log.error("Provisioning failed")
        # Print the error reported for every resource that has one, wrapping
        # long messages at 60 columns.
        for i in res:
            props = i["properties"]
            if props.get("targetResource", None):
                resource_name = props["targetResource"]["resourceName"]
                if props.get("statusMessage", None):
                    if "error" in props["statusMessage"]:
                        error_code = props["statusMessage"]["error"]["code"]
                        error_message = textwrap.TextWrapper(width=60).wrap(
                            text=props["statusMessage"]["error"]["message"])
                        error_target = props["statusMessage"]["error"].get(
                            "target", None)
                        error_target_str = ""
                        if error_target:
                            error_target_str = f"({error_target})"
                        print(
                            f"  Resource : {resource_name} - {error_code} {error_target_str}"
                        )
                        print(f"  Message  : {error_message[0]}")
                        for line in error_message[1:]:
                            print(f"             {line}")
        sys.exit(1)

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)
    log.info("building install scripts")
    azinstall.generate_install(config, tmpdir, adminuser, private_key_file,
                               public_key_file)

    jumpbox = config.get("install_from", None)
    if jumpbox:
        # The public IP resource is looked up as "<install_from>pip".
        fqdn = azutil.get_fqdn(config["resource_group"], jumpbox + "pip")
        log.info("running install scripts")
        azinstall.run(config, tmpdir, adminuser, private_key_file,
                      public_key_file, fqdn)
    else:
        log.info("nothing to install ('install_from' is not set)")