def create_availability_set(config, args):
    """Create any Azure availability sets referenced by the config that do
    not already exist in the cluster's resource group.

    Collects the cluster-level "availability_set" plus every per-machine
    "availability_set", subtracts the sets already present in Azure, then
    issues one `az vm availability-set create` per missing set (joined with
    ';' into a single invocation).
    """
    subscription = "--subscription \"{}\"".format(
        config["azure_cluster"]["subscription"]
    ) if "subscription" in config["azure_cluster"] else ""

    availability_sets = set()
    if "availability_set" in config["azure_cluster"]:
        availability_sets.add(config["azure_cluster"]["availability_set"])
    for vmname, spec in config["machines"].items():
        if "availability_set" in spec:
            availability_sets.add(spec["availability_set"])

    listcmd = "az vm availability-set list --resource-group {} --query \"[].name\"".format(
        config["azure_cluster"]["resource_group"])
    as_res = execute_or_dump_locally(listcmd, args.verbose, False, args.output)
    try:
        existing_as = set(json.loads(as_res))
        availability_sets -= existing_as
    except (TypeError, ValueError):
        # Fix: narrowed from a bare `except:`. json.loads raises ValueError
        # (JSONDecodeError) on malformed az output and TypeError when the
        # list command returned None (e.g. the command was only dumped).
        print("no existing availability sets found")

    cmd = ';'.join([
        """az vm availability-set create --name {} --resource-group {} --location {} {} """.format(
            avs, config["azure_cluster"]["resource_group"],
            config["azure_cluster"]["azure_location"], subscription)
        for avs in availability_sets
    ])
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def delete_az_vm(config, args, vm_name):
    """Delete one Azure VM by name from the cluster's resource group.

    config: cluster configuration (reads azure_cluster.resource_group).
    args:   CLI namespace providing verbose / dryrun / output flags.
    vm_name: the VM to remove.
    """
    # TODO try delete with resource delete, if possible, remove this function
    group = config["azure_cluster"]["resource_group"]
    verbosity_flag = '--verbose' if args.verbose else ''
    cmd = 'az vm delete -g {} -n {} --yes {}'.format(group, vm_name,
                                                     verbosity_flag)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def remove_nsg_rule_whitelist(config, args, ips):
    """Remove the given IPs from the NSG "whitelist" rule's source prefixes.

    ips: comma-separated string of prefixes to drop. If removing them would
    leave the whitelist empty, nothing is changed and the user is told to
    use the delete command instead.
    """
    source_address_prefixes = whitelist_source_address_prefixes(config)

    # Assume ips is a comma separated string if valid.
    # Fix: previously a None value crashed the membership test below with
    # "argument of type 'NoneType' is not iterable"; normalize missing or
    # empty input to "remove nothing".
    if ips is not None and ips != "":
        ips = ips.split(",")
    else:
        ips = []

    resource_group = config["azure_cluster"]["resource_group"]
    nsg_name = config["azure_cluster"]["nsg_name"]

    new_source_address_prefixes = [
        prefix for prefix in source_address_prefixes if prefix not in ips
    ]
    if len(new_source_address_prefixes) == 0:
        print("Nothing will be left in whitelist, please use delete command!")
        return

    cmd = """ az network nsg rule update \
        --resource-group %s \
        --nsg-name %s \
        --name whitelist \
        --source-address-prefixes %s """ % (
        resource_group, nsg_name, " ".join(new_source_address_prefixes))
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_main_nsg(config, args):
    """ create main nsg, which infra and worker nodes follow """
    # create service tag to allow corpnet machines
    rg = config["azure_cluster"]["resource_group"]
    nsg = config["azure_cluster"]["nsg_name"]
    rules_cfg = config["cloud_config_nsg_rules"]
    dev_ports = rules_cfg["corpnet_dev_ports"]
    user_ports = rules_cfg["corpnet_user_ports"]
    tags = rules_cfg["service_tags"]

    execute_or_dump_locally(
        """az network nsg create --resource-group {} --name {}""".format(
            rg, nsg), args.verbose, args.dryrun, args.output)

    # dev rules start at priority 1500, user rules at 1600; one rule per tag
    for offset, tag in enumerate(tags):
        create_nsg_rule(rg, nsg, 1500 + offset,
                        "Main-Allow-Dev-{}".format(tag), dev_ports, tag, args)
    for offset, tag in enumerate(tags):
        create_nsg_rule(rg, nsg, 1600 + offset,
                        "Main-Allow-User-{}".format(tag), user_ports, tag,
                        args)
def create_group(config, args):
    """Create the Azure resource group for the cluster, optionally under an
    explicit subscription."""
    cluster = config["azure_cluster"]
    subscription = ""
    if "subscription" in cluster:
        subscription = "--subscription \"{}\"".format(cluster["subscription"])
    cmd = """az group create --name {} --location {} {} """.format(
        cluster["resource_group"], cluster["azure_location"], subscription)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def delete_nsg_rule(resource_group, nsg_name, rule_name, args):
    """Delete a named rule from a network security group."""
    delete_cmd = """ az network nsg rule delete \
        --resource-group {} \
        --nsg-name {} \
        --name {} """.format(resource_group, nsg_name, rule_name)
    execute_or_dump_locally(delete_cmd, args.verbose, args.dryrun, args.output)
def delete_nsg_rule_with_service_tag(resource_group, nsg_name, service_tag, args):
    """Delete the per-service-tag NSG rule named allow_<service_tag>."""
    # rule name mirrors the one built by create_nsg_rule_with_service_tag
    rule_name = "allow_%s" % service_tag
    cmd = """ az network nsg rule delete \
        --resource-group %s \
        --nsg-name %s \
        --name %s """ % (resource_group, nsg_name, rule_name)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_logging_container(config, args):
    """Create the Azure blob container used for cluster logging."""
    container_name = gen_logging_container_name(config)
    connection_string = get_connection_string(config, args)
    # NOTE(review): this echoes the storage connection string (a secret) to
    # stdout — consider redacting it.
    print("Creating logging container %s with connection string %s" %
          (container_name, connection_string))
    cmd = """ az storage container create \
        --name %s \
        --connection-string '%s' """ % (container_name, connection_string)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def delete_nsg_rule_whitelist(config, args):
    """Delete the "whitelist" rule from the main cluster NSG."""
    cluster = config["azure_cluster"]
    cmd = """ az network nsg rule delete \
        --resource-group %s \
        --nsg-name %s \
        --name whitelist """ % (cluster["resource_group"],
                                cluster["nsg_name"])
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_vnet(config, args):
    """Create the cluster virtual network with a single subnet.

    NOTE(review): the subnet prefix equals the vnet address prefix, i.e. one
    subnet spans the entire vnet — presumably intentional; confirm.
    """
    cluster = config["azure_cluster"]
    rules = config["cloud_config_nsg_rules"]
    cmd = """az network vnet create \
        --resource-group {} \
        --name {} \
        --address-prefix {} \
        --subnet-name {} \
        --subnet-prefix {} """.format(cluster["resource_group"],
                                      cluster["vnet_name"],
                                      rules["vnet_range"],
                                      cluster["subnet_name"],
                                      rules["vnet_range"])
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def delete_logging_storage_account(config, args):
    """Delete the storage account that backs cluster logging."""
    account = gen_logging_storage_account_name(config)
    group = config["azure_cluster"]["resource_group"]
    print("Deleting storage account %s in resource group %s" %
          (account, group))
    cmd = """ az storage account delete \
        --name %s \
        --resource-group %s \
        --yes """ % (account, group)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_nfs_nsg(config, args):
    """Create the NFS network security group and its per-service-tag rules.

    NOTE(review): this definition is shadowed by a later function of the
    same name in this file; at import time only the later binding survives.
    Consider removing or renaming one of them.
    """
    nfs_nsg_name = config["azure_cluster"]["nfs_nsg_name"]
    resource_group = config["azure_cluster"]["resource_group"]
    nfs_ports = config["cloud_config_nsg_rules"]["nfs_ports"]
    # the helper returns (nodes, possibly-updated config); config is rebound
    nfs_nodes, config = load_node_list_by_role_from_config(config, ["nfs"])
    infra_nodes, config = load_node_list_by_role_from_config(config, ["infra"])
    # only act when at least one NFS node is not also an infra node —
    # presumably infra nodes are already covered by the main NSG (verify)
    if len(set(nfs_nodes) - set(infra_nodes)):
        cmd = """az network nsg create --resource-group {} --name {}""".format(
            resource_group, nfs_nsg_name)
        execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
        # NOTE(review): nesting reconstructed from mangled source — the rule
        # loop is assumed to belong inside the guard above; confirm.
        priority = 1700
        # set nsg rules for devs, (and samba, since samba machines are all in corpnet)
        for tag in config["cloud_config_nsg_rules"]["service_tags"]:
            create_nsg_rule(resource_group, nfs_nsg_name, priority,
                            "NFS-Allow-Dev-{}".format(tag), nfs_ports, tag,
                            args)
            priority += 1
def create_logging_storage_account(config, args):
    """Create a hot-tier StorageV2 account (Standard_RAGRS) for cluster
    logging."""
    account = gen_logging_storage_account_name(config)
    group = config["azure_cluster"]["resource_group"]
    region = config["azure_cluster"]["azure_location"]
    print("Creating storage account %s in resource group %s" %
          (account, group))
    cmd = """ az storage account create \
        --name %s \
        --resource-group %s \
        --access-tier Hot \
        --kind StorageV2 \
        --sku Standard_RAGRS \
        --location %s """ % (account, group, region)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_nsg_rule_with_service_tag(resource_group, nsg_name, priority, port_ranges, service_tag, args, protocol="tcp"):
    """Create an allow rule named allow_<service_tag> whose source is an
    Azure service tag (e.g. a corpnet tag) on the given ports/protocol."""
    rule_name = "allow_%s" % service_tag
    cmd = """ az network nsg rule create \
        --resource-group %s \
        --nsg-name %s \
        --name %s \
        --protocol %s \
        --priority %s \
        --destination-port-ranges %s \
        --source-address-prefixes %s \
        --access allow """ % (resource_group, nsg_name, rule_name, protocol,
                              priority, port_ranges, service_tag)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_nfs_nsg(config, args):
    """Create the NFS NSG, an SSH allow rule, and per-service-tag rules.

    NOTE(review): duplicate definition — an earlier create_nfs_nsg exists in
    this file and is shadowed by this one; consider removing one of them.
    """
    assert "source_addresses_prefixes" in config["cloud_config_nsg_rules"][
        "dev_network"], "Please \
setup source_addresses_prefixes in config.yaml, otherwise, your cluster cannot be accessed"
    source_addresses_prefixes = config["cloud_config_nsg_rules"][
        "dev_network"]["source_addresses_prefixes"]
    # only needed when standalone NFS nodes are configured
    if int(config["azure_cluster"]["nfs_node_num"]) > 0:
        cmd = """az network nsg create \
            --resource-group %s \
            --name %s """ % (config["azure_cluster"]["resource_group"],
                             config["azure_cluster"]["nfs_nsg_name"])
        execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
        # collapse overlapping CIDRs so az gets a minimal prefix list
        merged_ip = keep_widest_subnet(
            config["cloud_config_nsg_rules"]["nfs_ssh"]["source_ips"] +
            source_addresses_prefixes)
        cmd = """az network nsg rule create \
            --resource-group %s \
            --nsg-name %s \
            --name allow_ssh\
            --priority 1200 \
            --destination-port-ranges %s \
            --source-address-prefixes %s \
            --access allow """ % (
            config["azure_cluster"]["resource_group"],
            config["azure_cluster"]["nfs_nsg_name"],
            config["cloud_config_nsg_rules"]["nfs_ssh"]["port"],
            " ".join(merged_ip),
        )
        execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
        # one rule per configured service tag, priorities 1300, 1301, ...
        # NOTE(review): nesting reconstructed from mangled source — this loop
        # is assumed to be inside the nfs_node_num guard; confirm.
        for i, service_tag in enumerate(config["cloud_config_nsg_rules"].get(
                "service_tags", [])):
            create_nsg_rule_with_service_tag(
                config["azure_cluster"]["resource_group"],
                config["azure_cluster"]["nfs_nsg_name"], 1300 + i,
                config["cloud_config_nsg_rules"].get("tcp_port_ranges",
                                                     "\'*\'"), service_tag,
                args)
def add_nsg_rule_whitelist(config, args, ips):
    """Create/refresh the NSG "whitelist" rule (priority 1005).

    Starts from the current whitelist prefixes (falling back to the dev
    network's source_addresses_prefixes), appends the optional
    comma-separated `ips`, collapses overlapping CIDRs, and writes the rule.
    """
    # Replicating dev_network access for whitelisting users
    prefixes = whitelist_source_address_prefixes(config)
    if len(prefixes) == 0:
        dev_network = config["cloud_config_nsg_rules"]["dev_network"]
        prefixes = dev_network.get("source_addresses_prefixes")
        if prefixes is None:
            print("Please setup source_addresses_prefixes in config.yaml")
            exit()
        if isinstance(prefixes, str):
            prefixes = prefixes.split(" ")

    # Assume ips is a comma separated string if valid
    if ips is not None and ips != "":
        prefixes = prefixes + ips.split(",")

    # Safe guard against overlapping IP range
    prefixes = keep_widest_subnet(prefixes)
    prefix_arg = " ".join(list(set(prefixes)))

    resource_group = config["azure_cluster"]["resource_group"]
    nsg_name = config["azure_cluster"]["nsg_name"]
    tcp_port_ranges = config["cloud_config_nsg_rules"]["tcp_port_ranges"]
    cmd = """ az network nsg rule create \
        --resource-group %s \
        --nsg-name %s \
        --name whitelist \
        --protocol tcp \
        --priority 1005 \
        --destination-port-ranges %s \
        --source-address-prefixes %s \
        --access allow """ % (resource_group, nsg_name, tcp_port_ranges,
                              prefix_arg)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def get_connection_string(config, args):
    """Return the logging storage account's connection string via az CLI."""
    account = gen_logging_storage_account_name(config)
    group = config["azure_cluster"]["resource_group"]
    cmd = """ az storage account show-connection-string \
        --name %s \
        --resource-group %s \
        --query 'connectionString' \
        --output tsv """ % (account, group)
    raw = execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
    # az tsv output carries a trailing newline
    return raw.strip("\n")
def run_kubectl(config, args, commands, need_output=False, dump_to_file=''):
    """Run a kubectl command against the API server of a random infra node.

    commands: list of kubectl argument strings, joined with spaces.
    need_output: when True, capture and return the output (optionally
    dumping the command to dump_to_file); otherwise run via os.system.
    """
    if not os.path.exists("./deploy/bin/kubectl"):
        print(
            "please make sure ./deploy/bin/kubectl exists. One way is to use ./ctl.py download"
        )
        exit(-1)

    joined = " ".join(commands)
    infra_nodes, _ = load_node_list_by_role_from_config(config, ["infra"],
                                                        False)
    target = random.choice(infra_nodes)
    kube_command = "./deploy/bin/kubectl --server=https://{}:{} --certificate-authority={} --client-key={} --client-certificate={} {}".format(
        config["machines"][target]["fqdns"], config["k8sAPIport"],
        "./deploy/ssl/ca/ca.pem", "./deploy/ssl/kubelet/apiserver-key.pem",
        "./deploy/ssl/kubelet/apiserver.pem", joined)

    if not need_output:
        os.system(kube_command)
        return

    # we may want to dump command to another file instead of args.output,
    # when we don't want to mix k8s commands with others
    output = utils.execute_or_dump_locally(kube_command, args.verbose,
                                           args.dryrun, dump_to_file)
    if not args.verbose:
        print(output)
    return output
def vm_interconnects(config, args):
    """Wire up NSG rules so cluster VMs can reach each other.

    Reads VM public IPs from STATUS_YAML and, in one chained az invocation:
    (1) allows inter-VM tcp traffic on the main NSG (priority 850),
    (2) allows infra nodes to reach the NFS NSG (priority 1400), and
    (3) updates the allowalltcp rule's source prefixes.
    """
    with open(STATUS_YAML) as f:
        vminfo = yaml.safe_load(f)
    # every VM's public IP as a /32; infra node IPs tracked separately
    ip_list, infra_ip_list = [], []
    for name, onevm in vminfo["machines"].items():
        ip_list.append(onevm["public_ip"] + "/32")
        if 'infra' in onevm['role']:
            infra_ip_list.append(onevm["public_ip"] + "/32")
    allowed_incoming_ips = " ".join(ip_list)
    # rule 1: all VMs may talk to each other on the inter-connect tcp ports
    cmd = """ az network nsg rule create \
        --resource-group %s \
        --nsg-name %s \
        --name tcpinterconnect \
        --protocol tcp \
        --priority 850 \
        --destination-port-ranges %s \
        --source-address-prefixes %s \
        --access allow """ % (config["azure_cluster"]["resource_group"],
                              config["azure_cluster"]["nsg_name"],
                              config["cloud_config_nsg_rules"]["inter_connect"]
                              ["tcp_port_ranges"], allowed_incoming_ips)
    allowed_incoming_infra_ips = " ".join(infra_ip_list)
    # rule 2: infra nodes may reach NFS nodes on the nfs_allow_master ports
    cmd += """ ; az network nsg rule create \
        --resource-group %s \
        --nsg-name %s \
        --name nfs_allow_master \
        --protocol tcp \
        --priority 1400 \
        --destination-port-ranges %s \
        --source-address-prefixes %s \
        --access allow """ % (config["azure_cluster"]["resource_group"],
                              config["azure_cluster"]["nfs_nsg_name"],
                              config["cloud_config_nsg_rules"]
                              ["nfs_allow_master"]["tcp_port_ranges"],
                              allowed_incoming_infra_ips)
    # rule 3: restrict allowalltcp sources; default is '*' (open). When a
    # restricted list is configured, infra IPs are always included and
    # overlapping CIDRs are collapsed.
    restricted_source_address_prefixes = "'*'"
    if "restricted_source_address_prefixes" in config[
            "cloud_config_nsg_rules"]:
        restricted_source_address_prefixes = config["cloud_config_nsg_rules"][
            "restricted_source_address_prefixes"]
        if isinstance(restricted_source_address_prefixes, list):
            restricted_source_address_prefixes = " ".join(
                keep_widest_subnet(
                    infra_ip_list +
                    list(set(restricted_source_address_prefixes))))
    cmd += """ ; az network nsg rule update \
        --resource-group %s \
        --nsg-name %s \
        --name allowalltcp \
        --source-address-prefixes %s \
        --access allow """ % (config["azure_cluster"]["resource_group"],
                              config["azure_cluster"]["nsg_name"],
                              restricted_source_address_prefixes)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def add_machine(vmname, spec, verbose, dryrun, output_file):
    """Create one Azure VM from a machine spec via `az vm create`.

    Builds the full az command from the spec (auth, private IP, NSG choice,
    availability set, cloud-init, managed disks, VM size), runs or dumps it,
    and returns the command collapsed onto a single line.

    NOTE(review): relies on a module-level `config` dict (not a parameter).
    NOTE(review): `vm_size` can be referenced while unbound when the spec
    has no "vm_size" and the role is neither infra nor nfs — confirm callers
    always provide one in that case.
    """
    # roles of which a machine may hold at most one
    multual_exclusive_roles = set(
        ["infra", "worker", "elasticsearch", "mysqlserver", "lustre"])
    mul_ex_role_in_spec = list(set(spec["role"]) & multual_exclusive_roles)
    assert len(
        mul_ex_role_in_spec
    ) <= 1, "We don't allow role overlapping between these roles:{}.".format(
        ",".join(list(multual_exclusive_roles)))

    # authentication: explicit password if provided, else the cluster SSH key
    if "pwd" in spec:
        auth = "--authentication-type password --admin-password '{}' ".format(
            spec["pwd"])
    else:
        auth = "--generate-ssh-keys --authentication-type ssh --ssh-key-value '{}' ".format(
            config["azure_cluster"]["sshkey"])

    # if just want to update private IP, then keep vmname unchanged, and only update IP.
    priv_ip = ""
    if "private_ip" in spec:
        priv_ip = "--private-ip-address {} ".format(spec["private_ip"])
    else:
        assert (not 'nfs' in spec["role"]), "Must specify IP address for NFS node!"

    # standalone NFS nodes get the NFS NSG, everything else the main NSG
    nsg = "nfs_nsg_name" if is_independent_nfs(spec["role"]) else "nsg_name"

    # availability set: per-machine override, else cluster default for workers
    availability_set = ""
    if "availability_set" in spec:
        availability_set = "--availability-set '{}'".format(
            spec["availability_set"])
    elif "worker" in spec["role"] and "availability_set" in config[
            "azure_cluster"]:
        availability_set = "--availability-set '{}'".format(
            config["azure_cluster"]["availability_set"])

    cloud_init = ""
    # by default, if this is a unique machine, then itself would have a cloud-init file
    cldinit_appendix = "cloud_init_{}.txt".format(vmname)
    # we support heterogeneous cluster that has several different types of worker nodes;
    # if later there are differences other than vm_size, we can consider adding a field
    # called "spec_name" for a spec. As for now, workers are different only in vm_size.
    if "worker" in spec["role"]:
        cldinit_appendix = "cloud_init_worker_{}.txt".format(spec["vm_size"])
    elif len(mul_ex_role_in_spec) == 1 and "lustre" not in spec["role"]:
        cldinit_appendix = "cloud_init_{}.txt".format(mul_ex_role_in_spec[0])
    cloud_init_file = spec.get(
        "cloud_init_file", 'deploy/cloud-config/{}'.format(cldinit_appendix))
    if os.path.exists(cloud_init_file):
        cloud_init = "--custom-data {}".format(cloud_init_file)

    # default sku and size by role
    storage_sku, os_disk_size_gb, data_disk_sizes_gb, disk_id = "", "", "", 0
    if "managed_disks" in spec:
        for st in spec["managed_disks"]:
            if "is_os" in st and st["is_os"]:
                assert st["disk_num"] == 1, "Could have only 1 OS disk!"
                storage_sku += "os={}".format(
                    st.get("sku", config["azure_cluster"]["os_storage_sku"]))
                os_disk_size_gb = "--os-disk-size-gb " + \
                    str(st.get("size_gb", config["azure_cluster"]["os_storage_sz"]))
            elif len(mul_ex_role_in_spec) == 1:
                # data disks for the exclusive role; sku entries are keyed by
                # a running disk index
                storage_sku += " " + " ".join([
                    "{}={}".format(
                        dsk_id,
                        st.get(
                            "sku",
                            config["azure_cluster"]["vm_local_storage_sku"]))
                    for dsk_id in range(disk_id, disk_id + st["disk_num"])
                ])
                data_disk_sizes_gb += " " + \
                    " ".join([str(st.get("size_gb", config["azure_cluster"]
                                         ["{}_local_storage_sz".format(mul_ex_role_in_spec[0])]))] * st["disk_num"])
            elif "nfs" in spec["role"]:
                storage_sku += " " + " ".join([
                    "{}={}".format(
                        dsk_id,
                        st.get("sku",
                               config["azure_cluster"]["nfs_data_disk_sku"]))
                    for dsk_id in range(disk_id, disk_id + st["disk_num"])
                ])
                data_disk_sizes_gb += " " + \
                    " ".join([str(st.get("size_gb", config["azure_cluster"]
                                         ["nfs_data_disk_sz"]))] * st["disk_num"])
            disk_id += st["disk_num"]
    else:
        # no explicit managed_disks: fall back to role-based defaults
        if len(mul_ex_role_in_spec) == 1:
            data_disk_sizes_gb += " " + \
                str(config["azure_cluster"]
                    ["{}_local_storage_sz".format(mul_ex_role_in_spec[0])])
            storage_sku = config["azure_cluster"]["vm_local_storage_sku"]
        if "nfs" in spec["role"]:
            nfs_dd_sz, nfs_dd_num = config["azure_cluster"][
                "nfs_data_disk_sz"], config["azure_cluster"][
                    "nfs_data_disk_num"]
            data_disk_sizes_gb += " " + " ".join([str(nfs_dd_sz)] * nfs_dd_num)
            storage_sku = storage_sku if "infra" in spec["role"] else config[
                "azure_cluster"]["nfs_data_disk_sku"]

    if "vm_size" in spec:
        vm_size = spec["vm_size"]
    else:
        if "infra" in spec["role"]:
            vm_size = config["azure_cluster"]["{}_vm_size".format(
                mul_ex_role_in_spec[0])]
        elif "nfs" in spec["role"]:
            vm_size = config["azure_cluster"]["nfs_vm_size"]

    cmd = """ az vm create --resource-group {} \
        --name {} \
        --tags {} \
        --image {} \
        {} \
        --public-ip-address-dns-name {} \
        --location {} \
        --size {} \
        --vnet-name {} \
        --subnet {} \
        --nsg {} \
        --admin-username {} \
        {} \
        --storage-sku {}\
        {} \
        --data-disk-sizes-gb {}\
        {} \
        {} \
        """.format(config["azure_cluster"]["resource_group"], vmname,
                   "role=" + '-'.join(spec["role"]),
                   spec.get("vm_image", config["azure_cluster"]["vm_image"]),
                   priv_ip, vmname,
                   config["azure_cluster"]["azure_location"], vm_size,
                   config["azure_cluster"]["vnet_name"],
                   config["azure_cluster"]["subnet_name"],
                   config["azure_cluster"][nsg],
                   config["cloud_config_nsg_rules"]["default_admin_username"],
                   cloud_init, storage_sku, os_disk_size_gb,
                   data_disk_sizes_gb, auth, availability_set)

    if "other_params" in spec:
        # NOTE(review): unpacking `k, v` while iterating spec["other_params"]
        # directly only works if it is a sequence of pairs; if it is a dict
        # this needs .items() — confirm the expected config shape.
        for k, v in spec["other_params"]:
            cmd += " --{} {}".format(k, v)

    execute_or_dump_locally(cmd, verbose, dryrun, output_file)
    # collapse all whitespace so the returned command is a single line
    cmd = ' '.join(cmd.split())
    return cmd
def delete_az_resource(config, args, resource_name, resource_type):
    """Delete an arbitrary Azure resource by name and type from the
    cluster's resource group."""
    group = config["azure_cluster"]["resource_group"]
    verbosity_flag = '--verbose' if args.verbose else ''
    cmd = 'az resource delete -g {} -n {} --resource-type {} {}'.format(
        group, resource_name, resource_type, verbosity_flag)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
def create_nsg(config, args):
    """Create the main cluster NSG and its baseline rules.

    Rules: per-service-tag allows (via helper), allowalltcp (priority 1000)
    and allowalludp (priority 1010) when the corresponding port ranges are
    configured, and allowdevtcp (priority 900) for the dev network.
    """
    assert "source_addresses_prefixes" in config["cloud_config_nsg_rules"][
        "dev_network"], "Please \
setup source_addresses_prefixes in config.yaml, otherwise, your cluster cannot be accessed"
    source_addresses_prefixes = config["cloud_config_nsg_rules"][
        "dev_network"]["source_addresses_prefixes"]
    if isinstance(source_addresses_prefixes, list):
        # de-duplicate and flatten to the space-separated form az expects
        source_addresses_prefixes = " ".join(
            list(set(source_addresses_prefixes)))
    # default: allow from anywhere unless a restricted prefix list is given
    restricted_source_address_prefixes = "'*'"
    if "restricted_source_address_prefixes" in config[
            "cloud_config_nsg_rules"]:
        restricted_source_address_prefixes = config["cloud_config_nsg_rules"][
            "restricted_source_address_prefixes"]
        if isinstance(restricted_source_address_prefixes, list):
            restricted_source_address_prefixes = " ".join(
                list(set(restricted_source_address_prefixes)))
    cmd = """az network nsg create \
        --resource-group %s \
        --name %s """ % (config["azure_cluster"]["resource_group"],
                         config["azure_cluster"]["nsg_name"])
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
    create_nsg_rules_with_service_tags(config, args)
    if "tcp_port_ranges" in config["cloud_config_nsg_rules"]:
        cmd = """az network nsg rule create \
            --resource-group %s \
            --nsg-name %s \
            --name allowalltcp \
            --protocol tcp \
            --priority 1000 \
            --destination-port-ranges %s \
            --source-address-prefixes %s \
            --access allow """ % (
            config["azure_cluster"]["resource_group"],
            config["azure_cluster"]["nsg_name"],
            config["cloud_config_nsg_rules"]["tcp_port_ranges"],
            restricted_source_address_prefixes)
        execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
    if "udp_port_ranges" in config["cloud_config_nsg_rules"]:
        cmd = """az network nsg rule create \
            --resource-group %s \
            --nsg-name %s \
            --name allowalludp \
            --protocol udp \
            --priority 1010 \
            --destination-port-ranges %s \
            --source-address-prefixes %s \
            --access allow """ % (
            config["azure_cluster"]["resource_group"],
            config["azure_cluster"]["nsg_name"],
            config["cloud_config_nsg_rules"]["udp_port_ranges"],
            restricted_source_address_prefixes)
        execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
    # dev network gets its own higher-priority (900) tcp allow rule
    cmd = """az network nsg rule create \
        --resource-group %s \
        --nsg-name %s \
        --name allowdevtcp \
        --protocol tcp \
        --priority 900 \
        --destination-port-ranges %s \
        --source-address-prefixes %s \
        --access allow """ % (config["azure_cluster"]["resource_group"],
                              config["azure_cluster"]["nsg_name"],
                              config["cloud_config_nsg_rules"]["dev_network"]
                              ["tcp_port_ranges"], source_addresses_prefixes)
    execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)