def do_run_install(args):
    """Regenerate the install artefacts for a config file and run them.

    Loads and preprocesses ``args.config_file``, recreates the
    ``azhpc_install_<config name>`` working directory from scratch,
    regenerates the host lists and install scripts into it, and finally
    hands everything to ``azinstall.run`` together with ``args.step``.

    Args:
        args: Parsed command-line namespace; ``config_file`` and ``step``
            are read here.
    """
    cfg = azconfig.ConfigFile()
    cfg.open(args.config_file)
    config = cfg.preprocess()

    # The working directory is named after the config file; the slice drops
    # the last five characters (presumably a ".json" extension -- confirm).
    config_name = os.path.basename(args.config_file)
    tmpdir = "azhpc_install_" + config_name[:-5]
    log.debug(f"tmpdir = {tmpdir}")

    # Always start from an empty working directory.
    if os.path.isdir(tmpdir):
        log.debug("removing existing tmp directory")
        shutil.rmtree(tmpdir)

    # Both key file names are derived from the admin user name.
    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = private_key_file + ".pub"
    first_step = args.step

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)

    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file
    )

    # The returned value is not used below; the call itself is retained
    # because its side effects (if any) are not visible from this function.
    cfg.read_value("resource_group")

    fqdn = cfg.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(
        config,
        tmpdir,
        adminuser,
        private_key_file,
        public_key_file,
        fqdn,
        first_step,
    )
def do_build(args):
    """Deploy the resources described by a config file and run the install.

    Steps, in order:

    1. Remove any previous ``azhpc_install_<config name>`` working directory.
    2. Load and preprocess ``args.config_file``.
    3. Call ``_create_private_key`` for the admin user's key file names.
    4. Write the ARM template to ``deploy_<config file name>`` in the
       current directory.
    5. Create the resource group (tagged with ``CreatedBy``, ``CreatedOn``
       and any ``resource_tags`` from the config), deploy the template and
       wait for the deployment via ``_wait_for_deployment``.
    6. Generate host lists and install scripts, then run them.

    Args:
        args: Parsed command-line namespace; ``config_file`` and ``no_vnet``
            are read here.
    """
    log.debug(f"reading config file ({args.config_file})")

    # Everything written locally is named after the config *file name*, so
    # that a config given with a directory part (e.g. "examples/foo.json")
    # does not turn into a path below a non-existent "deploy_examples/".
    config_name = os.path.basename(args.config_file)

    # The slice drops the last five characters (presumably ".json").
    tmpdir = "azhpc_install_" + config_name[:-5]
    log.debug(f"tmpdir = {tmpdir}")
    if os.path.isdir(tmpdir):
        log.debug("removing existing tmp directory")
        shutil.rmtree(tmpdir)

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config = c.preprocess()

    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"
    _create_private_key(private_key_file, public_key_file)

    tpl = arm.ArmTemplate()
    tpl.read(config, not args.no_vnet)

    output_template = "deploy_" + config_name
    log.info("writing out arm template to " + output_template)
    with open(output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("creating resource group " + config["resource_group"])
    resource_tags = config.get("resource_tags", {})
    tags = [
        # NOTE: os.getenv returns None when USER is not set.
        {"key": "CreatedBy", "value": os.getenv("USER")},
        {
            "key": "CreatedOn",
            "value": datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
        },
    ]
    tags += [{"key": key, "value": value} for key, value in resource_tags.items()]
    azutil.create_resource_group(
        config["resource_group"], config["location"], tags
    )

    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], output_template)
    log.debug(f"deployment name: {deployname}")
    _wait_for_deployment(config["resource_group"], deployname)

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)

    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file
    )

    # The returned value is not used below; the call itself is retained
    # because its side effects (if any) are not visible from this function.
    c.read_value("resource_group")

    fqdn = c.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(
        config, tmpdir, adminuser, private_key_file, public_key_file, fqdn
    )
def do_slurm_resume(args):
    """Deploy the nodes named in ``args.nodes`` and run the install on them.

    ``args.nodes`` is either a bracketed expression such as
    ``name[0001-0003,0007]`` (every number must be exactly four characters
    wide) or a single node name whose last four characters are the node
    index.  The part before the index is looked up in the config's
    ``resources`` section and used as the template for every node: its
    ``type`` is forced to ``"vm"`` and its ``instances`` entry is removed.
    The resulting config is deployed, then host lists and install scripts
    are generated into a timestamped working directory and run.

    Args:
        args: Parsed command-line namespace; ``config_file`` and ``nodes``
            are read here.

    Raises:
        SystemExit: With status 1 if a node number is not four characters
            wide or the resource is not present in the config.
    """
    log.debug(f"reading config file ({args.config_file})")

    # Local artefacts are named after the config file name only, so that a
    # config given with a directory part does not produce an unusable path.
    config_name = os.path.basename(args.config_file)

    # Pick a timestamped working directory that does not exist yet.
    while True:
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # The slice drops the last five characters (presumably ".json").
        tmpdir = "azhpc_install_" + config_name[:-5] + "_" + timestamp
        if not os.path.isdir(tmpdir):
            break
        log.warning(
            f"{tmpdir} already exists, sleeping for 5 seconds and retrying")
        time.sleep(5)
    log.debug(f"tmpdir = {tmpdir}")

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config = c.preprocess()

    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"

    log.info(f"slurm resume for {args.nodes}")

    # first get the resource name
    # Group 1 is everything before an optional "[", group 2 the bracket
    # contents (digits, "-" and ","); both groups always participate.
    resource_name, brackets = re.search(
        r'([^[]*)\[?([\d\-\,]*)\]?', args.nodes).groups(0)

    resource_list = []
    if brackets:
        for part in brackets.split(","):
            bounds = part.split("-")
            # Explicit validation instead of assert, which is stripped
            # when Python runs with -O.
            if len(bounds) > 2 or any(len(bound) != 4 for bound in bounds):
                log.error(
                    f"expecting number width of 4 (got '{part}' in {args.nodes})")
                sys.exit(1)
            if len(bounds) == 2:
                lo, hi = bounds
                resource_list.extend(
                    f"{resource_name}{i:04d}"
                    for i in range(int(lo), int(hi) + 1)
                )
            else:
                resource_list.append(f"{resource_name}{part}")
    else:
        # Single node: the name itself is the node, and the resource name
        # is the node name without its trailing four-character index.
        resource_list.append(resource_name)
        resource_name = resource_name[:-4]

    template_resource = config.get("resources", {}).get(resource_name)
    if not template_resource:
        # The original message referenced an undefined name and raised
        # NameError instead of reporting the missing resource.
        log.error(f"{resource_name} resource not found in config")
        sys.exit(1)
    if template_resource.get("type") != "slurm_partition":
        # NOTE(review): this is logged but not fatal; execution continues
        # and the type is overwritten below. Confirm whether it should exit.
        log.error("invalid resource type for scaling")
    template_resource["type"] = "vm"
    # pop() rather than del: tolerate a template without "instances".
    template_resource.pop("instances", None)

    log.info(f"resource_name= {resource_name}")
    log.info("resource_list= " + ",".join(resource_list))

    # Replace the resources section with one entry per requested node; all
    # entries reference the same template dict.
    config["resources"] = {}
    for rname in resource_list:
        config["resources"][rname] = template_resource

    tpl = arm.ArmTemplate()
    tpl.read_resources(config, False)

    output_template = f"deploy_{config_name}_{timestamp}"
    log.info("writing out arm template to " + output_template)
    with open(output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], output_template)
    log.debug(f"deployment name: {deployname}")
    _wait_for_deployment(config["resource_group"], deployname)

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)

    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file
    )

    # The returned values are not used below; the calls themselves are
    # retained because their side effects (if any) are not visible here.
    c.read_value("install_from")
    c.read_value("resource_group")

    fqdn = c.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(
        config, tmpdir, adminuser, private_key_file, public_key_file, fqdn
    )
def do_slurm_resume(args):
    """Deploy the nodes named in ``args.nodes`` and run the install on them.

    ``args.nodes`` is expanded by ``_nodelist_expand`` into resource names
    and node names.  For every resource name the matching entry of the
    config's ``resources`` section becomes the template for its nodes: its
    ``type`` is forced to ``"vm"`` and its ``instances`` entry is removed.
    A deep copy of the config whose ``resources`` hold one entry per node is
    deployed; afterwards only install steps of type ``jumpbox_script`` (the
    default when a step has no ``type``) are kept, and host lists and
    install scripts are generated into a timestamped directory and run.

    Args:
        args: Parsed command-line namespace; ``config_file`` and ``nodes``
            are read here.

    Raises:
        SystemExit: With status 1 if a resource is not present in the
            config.
    """
    log.debug(f"reading config file ({args.config_file})")

    # Local artefacts are named after the config file name only, so that a
    # config given with a directory part does not produce an unusable path.
    config_name = os.path.basename(args.config_file)

    # Pick a timestamped working directory that does not exist yet.
    while True:
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # The slice drops the last five characters (presumably ".json").
        tmpdir = "azhpc_install_" + config_name[:-5] + "_" + timestamp
        if not os.path.isdir(tmpdir):
            break
        log.warning(
            f"{tmpdir} already exists, sleeping for 5 seconds and retrying")
        time.sleep(5)
    log.debug(f"tmpdir = {tmpdir}")

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config_orig = c.preprocess()

    adminuser = config_orig["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"

    log.info(f"slurm resume for {args.nodes}")

    # first get the resource name
    resource_names, resource_list = _nodelist_expand(args.nodes)

    # Create a copy of the configuration to use as template
    # for the final deployment configuration
    config = copy.deepcopy(config_orig)
    config["resources"] = {}

    # Loop over all resources
    for resource in resource_names:
        template_resource = config_orig.get("resources", {}).get(resource)
        if not template_resource:
            log.error(f"{resource} resource not found in config")
            sys.exit(1)
        if template_resource.get("type") != "slurm_partition":
            # NOTE(review): this is logged but not fatal; execution continues
            # and the type is overwritten below. Confirm whether it should
            # exit.
            log.error("invalid resource type for scaling")
        template_resource["type"] = "vm"
        # pop() rather than del: tolerate a template without "instances"
        # (for example one that was already processed for a repeated name).
        template_resource.pop("instances", None)

        log.info(f"resource= {resource}")
        log.info("resource_list= " + ",".join(resource_list))

        # Iterate over all nodes which name starts with the resource name
        # NOTE: It is assumed that in the nodename the resource name is
        # separated by a hyphen from the node index!
        for rname in resource_list:
            if rname.rsplit("-", 1)[0] == resource:
                config["resources"][rname] = template_resource

    tpl = arm.ArmTemplate()
    tpl.read_resources(config, False)

    output_template = f"deploy_{config_name}_{timestamp}"
    log.info("writing out arm template to " + output_template)
    with open(output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], output_template)
    log.debug(f"deployment name: {deployname}")
    _wait_for_deployment(config["resource_group"], deployname)

    # remove local scripts
    # A config without an "install" section is treated as having no steps.
    config["install"] = [
        step
        for step in config.get("install", [])
        if step.get("type", "jumpbox_script") == "jumpbox_script"
    ]

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)

    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file
    )

    # When this process already runs on the "install_from" host, use that
    # name directly instead of asking the config for the destination.
    if socket.gethostname() == config.get("install_from"):
        fqdn = config["install_from"]
    else:
        fqdn = c.get_install_from_destination()
    log.debug(f"running script from : {fqdn}")
    azinstall.run(
        config, tmpdir, adminuser, private_key_file, public_key_file, fqdn
    )
def do_build(args):
    """Deploy the resources described by a config file and run the install.

    Steps, in order:

    1. Remove any previous ``azhpc_install_<config name>`` working directory.
    2. Load and preprocess ``args.config_file``.
    3. Generate an RSA key pair for the admin user unless both
       ``<admin_user>_id_rsa`` and ``<admin_user>_id_rsa.pub`` already exist.
    4. Write the ARM template to ``args.output_template``.
    5. Create the resource group (tagged with ``CreatedBy``, ``CreatedOn``
       and any ``resource_tags`` from the config) and deploy the template.
    6. Poll the deployment status every 5 seconds, printing one line per
       targeted resource, until an entry without a target resource appears.
    7. On success, generate host lists and install scripts and, when
       ``install_from`` is set in the config, run them.

    Args:
        args: Parsed command-line namespace; ``config_file`` and
            ``output_template`` are read here.

    Raises:
        SystemExit: With status 1 when the final provisioning state is not
            ``"Succeeded"``; the per-resource errors are printed first.
    """
    log.debug(f"reading config file ({args.config_file})")

    # str.strip(".json") removes any of the characters ".", "j", "s", "o",
    # "n" from BOTH ends (e.g. "simple_hpc_pbs.json" -> "imple_hpc_pb"), so
    # the extension is removed with splitext instead.
    config_stem = os.path.splitext(os.path.basename(args.config_file))[0]
    tmpdir = "azhpc_install_" + config_stem
    log.debug(f"tmpdir = {tmpdir}")
    if os.path.isdir(tmpdir):
        log.debug("removing existing tmp directory")
        shutil.rmtree(tmpdir)

    c = azconfig.ConfigFile()
    c.open(args.config_file)
    config = c.preprocess()

    adminuser = config["admin_user"]
    private_key_file = adminuser + "_id_rsa"
    public_key_file = adminuser + "_id_rsa.pub"
    if not (os.path.exists(private_key_file)
            and os.path.exists(public_key_file)):
        # create ssh keys
        key = rsa.generate_private_key(backend=crypto_default_backend(),
                                       public_exponent=65537,
                                       key_size=2048)
        private_key = key.private_bytes(
            crypto_serialization.Encoding.PEM,
            crypto_serialization.PrivateFormat.TraditionalOpenSSL,
            crypto_serialization.NoEncryption())
        public_key = key.public_key().public_bytes(
            crypto_serialization.Encoding.OpenSSH,
            crypto_serialization.PublicFormat.OpenSSH)
        # Permissions are tightened before any key material is written.
        with open(private_key_file, "wb") as f:
            os.chmod(private_key_file, 0o600)
            f.write(private_key)
        with open(public_key_file, "wb") as f:
            os.chmod(public_key_file, 0o644)
            f.write(public_key + b'\n')

    tpl = arm.ArmTemplate()
    tpl.read(config)

    log.info("writing out arm template to " + args.output_template)
    with open(args.output_template, "w") as f:
        f.write(tpl.to_json())

    log.info("creating resource group " + config["resource_group"])
    resource_tags = config.get("resource_tags", {})
    tags = [
        # NOTE: os.getenv returns None when USER is not set.
        {"key": "CreatedBy", "value": os.getenv("USER")},
        {
            "key": "CreatedOn",
            "value": datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
        },
    ]
    tags += [{"key": key, "value": value} for key, value in resource_tags.items()]
    azutil.create_resource_group(
        config["resource_group"], config["location"], tags
    )

    log.info("deploying arm template")
    deployname = azutil.deploy(config["resource_group"], args.output_template)
    log.debug(f"deployment name: {deployname}")

    # Poll until the status list contains an entry without a target
    # resource; that entry's provisioning state decides success.
    building = True
    success = True
    del_lines = 1
    while building:
        time.sleep(5)
        res = azutil.get_deployment_status(config["resource_group"], deployname)
        log.debug(res)

        # "\033[F" is the ANSI "cursor to previous line" sequence: move back
        # over the lines printed by the previous poll so they are redrawn.
        print("\033[F" * del_lines)
        del_lines = 1
        for entry in res:
            props = entry["properties"]
            status_code = props["statusCode"]
            target = props.get("targetResource", None)
            del_lines += 1
            if target:
                resource_name = target["resourceName"]
                resource_type = target["resourceType"]
                print(
                    f"{resource_name:15} {resource_type:47} {status_code:15}")
            else:
                building = False
                if props["provisioningState"] != "Succeeded":
                    success = False

    if success:
        log.info("Provising succeeded")
    else:
        log.error("Provisioning failed")
        # Report the error attached to each targeted resource, if any.
        for entry in res:
            props = entry["properties"]
            target = props.get("targetResource", None)
            if not target:
                continue
            status_message = props.get("statusMessage", None)
            if not status_message or "error" not in status_message:
                continue

            error = status_message["error"]
            resource_name = target["resourceName"]
            error_code = error["code"]
            error_message = textwrap.TextWrapper(width=60).wrap(
                text=error["message"])
            error_target = error.get("target", None)
            error_target_str = f"({error_target})" if error_target else ""

            print(
                f" Resource : {resource_name} - {error_code} {error_target_str}"
            )
            # wrap() returns an empty list for an empty message.
            first_line = error_message[0] if error_message else ""
            print(f" Message : {first_line}")
            for line in error_message[1:]:
                print(f" {line}")
        sys.exit(1)

    log.info("building host lists")
    azinstall.generate_hostlists(config, tmpdir)

    log.info("building install scripts")
    azinstall.generate_install(
        config, tmpdir, adminuser, private_key_file, public_key_file
    )

    jumpbox = config.get("install_from", None)
    if jumpbox:
        # The public IP resource is looked up as "<install_from>pip".
        fqdn = azutil.get_fqdn(config["resource_group"], jumpbox + "pip")
        log.info("running install scripts")
        azinstall.run(
            config, tmpdir, adminuser, private_key_file, public_key_file, fqdn
        )
    else:
        log.info("nothing to install ('install_from' is not set)")