def get_keypath() -> str:
    """Return the key file path stored in the environment variable named by KEY_ENV.

    Reports a failure through command.fail when the variable is unset, or when
    the path it holds is not an existing regular file.
    """
    keypath = os.environ.get(KEY_ENV)
    if keypath is None:
        command.fail("no key specified in env var $HOMEWORLD_DISASTER")
    if not os.path.isfile(keypath):
        command.fail("no key found in file specified by $HOMEWORLD_DISASTER")
    return keypath
def download_and_verify_package_list(
    baseurl: str, dist: str = "homeworld", keyring_resource: str = "homeworld-archive-keyring.gpg"
) -> (str, dict):
    """Fetch the package index of an apt distribution and check it against its signed Release file.

    baseurl: root URL of the apt repository (trailing slashes are ignored)
    dist: distribution name under <baseurl>/dists/
    keyring_resource: keyring name handed to fetch_signed_url for the signature check

    Returns a tuple of (baseurl without trailing slashes, packages parsed by
    parse_apt_kv_list keyed on "Package").
    """
    baseurl = baseurl.rstrip("/")
    dist_url = "/".join([baseurl, "dists", dist])

    # the Release file is the signed root of trust; it lists the hashes of the other index files
    release_url = dist_url + "/Release"
    release = fetch_signed_url(release_url, release_url + ".gpg", keyring_resource)

    release_fields = parse_apt_kvs(release.decode())
    if "SHA256" not in release_fields:
        command.fail("cannot find section for sha256 hashes")
    sha256_by_path = parse_apt_hash_list(release_fields["SHA256"])

    packages_relpath = "main/binary-amd64/Packages"
    if packages_relpath not in sha256_by_path:
        command.fail("could not find hash for %s" % packages_relpath)

    # the Packages file itself is unsigned, so it is only trusted once its hash matches the Release entry
    packages_raw = fetch_url_and_check_hash(dist_url + "/" + packages_relpath, sha256_by_path[packages_relpath])
    return baseurl, parse_apt_kv_list(packages_raw.decode(), "Package")
def populate() -> None:
    """Copy the bundled setup.yaml resource into the project directory.

    The project directory is created if it is missing; an existing setup.yaml
    is never overwritten (a failure is reported through command.fail instead).
    """
    project_dir = get_project(create_dir_if_missing=True)
    destination = os.path.join(project_dir, "setup.yaml")
    if os.path.exists(destination):
        command.fail("setup.yaml already exists")
    resource.copy_to("setup.yaml", destination)
    print("filled out setup.yaml")
def export_keytab(node, keytab_file):
    "decrypt and export the keytab for a particular server"
    # node: name used to locate keytab.<node>.crypt inside the project directory
    # keytab_file: path the decrypted keytab is written to
    project_dir = configuration.get_project()
    encrypted_keytab = os.path.join(project_dir, "keytab.%s.crypt" % node)
    if not os.path.exists(encrypted_keytab):
        command.fail("no keytab for node %s" % node)
    keycrypt.gpg_decrypt_file(encrypted_keytab, keytab_file)
def generate() -> None:
    """Generate the cluster authorities with keygen and pack them into the authorities tarball.

    Files produced by keygen that end in .pub or .pem are copied as-is; every
    other file is gpg-encrypted before being packed. Intermediate files are
    shredded before the temporary directory is removed. Refuses to run if the
    tarball already exists.
    """
    tarball = get_targz_path(check_exists=False)
    if os.path.exists(tarball):
        command.fail("authorities.tgz already exists")

    # tempfile.TemporaryDirectory() creates a directory that only the current user can access,
    # which protects the private keys while they are on disk
    with tempfile.TemporaryDirectory() as workdir:
        certdir = os.path.join(workdir, "certdir")
        keyserver_yaml = os.path.join(workdir, "keyserver.yaml")
        util.writefile(keyserver_yaml, configuration.get_keyserver_yaml().encode())
        os.mkdir(certdir)

        print("generating authorities...")
        try:
            # TODO: avoid having these touch disk
            subprocess.check_call(["keygen", keyserver_yaml, certdir, "supervisor-nodes"])
        except FileNotFoundError as e:
            # only a missing keygen binary gets the friendly message; anything else propagates
            if e.filename != "keygen":
                raise
            command.fail("could not find keygen binary. is the homeworld-keyserver dependency installed?")

        print("encrypting authorities...")
        cryptdir = os.path.join(workdir, "cryptdir")
        os.mkdir(cryptdir)
        for name in os.listdir(certdir):
            plain_path = os.path.join(certdir, name)
            packed_path = os.path.join(cryptdir, name)
            if name.endswith((".pub", ".pem")):
                # public keys; copy over without encryption
                util.copy(plain_path, packed_path)
            else:
                # private keys; encrypt when copying
                keycrypt.gpg_encrypt_file(plain_path, packed_path)
        # overwrite the plaintext material now that encrypted copies exist
        subprocess.check_call(["shred", "--"] + os.listdir(certdir), cwd=certdir)

        print("packing authorities...")
        subprocess.check_call(["tar", "-C", cryptdir, "-czf", tarball, "."])
        subprocess.check_call(["shred", "--"] + os.listdir(cryptdir), cwd=cryptdir)
def setup_keyserver(ops: Operations) -> None:
    """Queue the operations that provision the keyserver on every supervisor node.

    For each supervisor: create the target directories, upload every decrypted
    authority, upload the cluster config / machine list / keyserver config, then
    enable and restart keyserver.service.
    """
    config = configuration.get_config()
    for node in (candidate for candidate in config.nodes if candidate.kind == "supervisor"):
        ops.ssh_mkdir("create directories on @HOST", node, AUTHORITY_DIR, STATICS_DIR, CONFIG_DIR)

        for name, data in authority.iterate_keys_decrypted():
            # TODO: keep these keys in memory
            if "/" in name:
                command.fail("found key in upload list with invalid filename")
            # TODO: avoid keeping these keys in memory for this long
            ops.ssh_upload_bytes("upload authority %s to @HOST" % name, node, data, os.path.join(AUTHORITY_DIR, name))

        # (description, producer of the file contents, remote destination); the producer is
        # only called when its upload is queued, so contents are generated in upload order
        static_uploads = [
            ("upload cluster config to @HOST", configuration.get_cluster_conf, STATICS_DIR + "/cluster.conf"),
            ("upload machine list to @HOST", configuration.get_machine_list_file, STATICS_DIR + "/machine.list"),
            ("upload keyserver config to @HOST", configuration.get_keyserver_yaml, CONFIG_DIR + "/keyserver.yaml"),
        ]
        for description, render, destination in static_uploads:
            ops.ssh_upload_bytes(description, node, render().encode(), destination)

        ops.ssh("enable keyserver on @HOST", node, "systemctl", "enable", "keyserver.service")
        ops.ssh("start keyserver on @HOST", node, "systemctl", "restart", "keyserver.service")
def boot_install(self, bootstrap_token):
    """Create the disk, then boot the installer with this node's IP and the bootstrap token.

    The boot line is fed to the "install" boot after a fixed delay; a truthy
    result from waiting on the boot is reported through command.fail.
    """
    self.create_disk()
    boot_line = "install netcfg/get_ipaddress=%s homeworld/asktoken=%s\n" % (self.node.ip, bootstrap_token)
    # TODO: do something better than a two-second delay to detect "boot:" prompt
    vm_process = self.boot_with_io("install", text=boot_line.encode(), delay=2.0)
    if vm_process.wait():
        command.fail("qemu virtual machine failed")
def check_dns_function():
    """Run an nslookup inside a container on a randomly chosen worker and check its final output line.

    Reports a failure through command.fail if there are no worker nodes or if
    the last line of output does not end with the expected address.
    """
    config = configuration.Config.load_from_project()
    workers = [node for node in config.nodes if node.kind == "worker"]
    if not workers:
        command.fail("expected at least one worker node")
    worker = random.choice(workers)
    print("trying dns functionality test with", worker)

    container_command = "nslookup kubernetes.default.svc.hyades.local 172.28.0.2"
    remote_command = [
        "rkt", "run", "homeworld.mit.edu/debian",
        "--exec", "/bin/bash", "--", "-c", setup.escape_shell(container_command),
    ]
    ssh_target = "root@%s.%s" % (worker.hostname, config.external_domain)
    output = subprocess.check_output(["ssh", ssh_target, "--"] + remote_command)

    # normalize line endings and drop NUL bytes before picking out the final line
    cleaned = output.replace(b"\r\n", b"\n").replace(b"\0", b'').strip()
    final_line = cleaned.split(b"\n")[-1]
    if not final_line.endswith(b"Address: 172.28.0.1"):
        command.fail("unexpected last line: %s" % repr(final_line.decode()))
    print("dns-addon seems to work!")
def access_ssh(no_add_to_agent: bool = False):
    """
    request SSH access to the cluster

    no_add_to_agent: do not add the resulting ssh key to ssh-agent
    """
    keypath = renew_ssh_cert()

    # show the freshly issued certificate to the user
    print("===== v CERTIFICATE DETAILS v =====")
    subprocess.check_call(["ssh-keygen", "-L", "-f", keypath + "-cert.pub"])
    print("===== ^ CERTIFICATE DETAILS ^ =====")

    if no_add_to_agent:
        return

    # TODO: clear old identities
    try:
        agent_output = subprocess.check_output(["ssh-add", "--", keypath], stderr=subprocess.STDOUT)
        # if the user is using gnome, gnome-keyring might
        # masquerade as ssh-agent and provide a zero exit
        # code despite failing to add the certificate
        if b"add failed" in agent_output:
            command.fail("*** ssh-add failed! ***",
                         "do you have an ssh-agent?\n(gnome-keyring does not count)")
    except subprocess.CalledProcessError:
        command.fail("*** ssh-add failed! ***",
                     "ssh-add returned non-zero exit code. do you have an ssh-agent?")
def fetch_url_and_check_hash(url: str, sha256hash: str) -> bytes:
    """Download url and return its contents, after comparing their SHA-256 against sha256hash.

    A mismatch is reported through command.fail, naming both hashes and the URL.
    """
    payload = fetch_url(url)
    actual_hash = util.sha256sum_data(payload)
    if actual_hash != sha256hash:
        mismatch = (sha256hash, actual_hash, url)
        command.fail("wrong hash: expected %s but got %s from url %s" % mismatch)
    return payload
def get_known_hosts_path() -> str:
    """Return the path of .ssh/known_hosts under the directory named by $HOME.

    Reports a failure through command.fail when $HOME is not set.
    """
    home = os.environ.get("HOME")
    if home is None:
        command.fail("could not determine home directory, so could not find ~/.ssh/known_hosts")
    return os.path.join(home, ".ssh", "known_hosts")
def check_flannel():
    "verify that the flannel addon is functioning"
    config = configuration.get_config()
    # number of nodes whose kind is not "supervisor"; the expected total for the per-node checks below
    node_count = sum(1 for node in config.nodes if node.kind != "supervisor")

    expect_prometheus_query_exact(
        'sum(kube_daemonset_status_number_ready{daemonset="kube-flannel-ds"})',
        node_count, "flannel pods are ready")

    expect_prometheus_query_bool(
        "sum(flannel_collect_enum_check)",
        "flannel metrics collector is failing enumeration")
    expect_prometheus_query_bool(
        "sum(flannel_collect_enum_dup_check)",
        "flannel metrics collector is encountering duplication")

    # each of these sums is expected to equal the node count exactly
    per_node_checks = [
        ('sum(flannel_collect_check)', "flannel metrics monitors are collecting"),
        ('sum(flannel_duplicate_check)', "flannel metrics monitors are avoiding duplication"),
        ('sum(flannel_monitor_check)', "flannel metrics monitors are monitoring successfully"),
    ]
    for query, description in per_node_checks:
        expect_prometheus_query_exact(query, node_count, description)

    # the least recently updated monitor must have reported within the last 60 (units of time())
    staleness = float(pull_prometheus_query('time() - min(flannel_monitor_recency)'))
    if staleness > 60:
        command.fail("flannel metrics monitors have not updated recently enough")

    # one successful ping is expected per ordered pair of nodes
    expect_prometheus_query_exact('sum(flannel_talk_check)', node_count * node_count,
                                  "flannel pings are successful")
    print("flannel seems to work!")
def edit() -> None:
    "open $EDITOR (defaults to nano) to edit the project's setup.yaml"
    target = os.path.join(get_project(), "setup.yaml")
    if not os.path.exists(target):
        command.fail("setup.yaml does not exist (run spire config populate first?)")
    editor = get_editor()
    # "--" stops the editor from treating the path as an option
    subprocess.check_call([editor, "--", target])
def expect_prometheus_query_exact(query, expected, description):
    """Check that the integer result of a prometheus query equals expected.

    query: query string passed to pull_prometheus_query
    expected: the count the query result must match
    description: phrase of the form 'X are Y', used to build the failure message
    """
    actual = int(pull_prometheus_query(query))
    if actual > expected:
        command.fail("too many %s" % description)
    elif actual < expected:
        command.fail("only %d/%d %s" % (actual, expected, description))
def listen():
    """Start a container on worker_listener, publish its address, and keep it alive for a while.

    The first output line of the container (from `ip -o addr show`) is parsed
    for an address ending in /24; the address without its prefix length is
    stored in found_address[0] and event is set. event is always set on exit,
    even on failure, so a waiter is never left blocked. Returns True.
    """
    try:
        container_command = "ip -o addr show dev eth0 to 172.18/16 primary && sleep 15"
        rkt_invocation = [
            "rkt", "run", "--net=rkt.kubernetes.io", "homeworld.mit.edu/debian",
            "--", "-c", setup.escape_shell(container_command),
        ]
        ssh_invocation = ssh.build_ssh(worker_listener, *rkt_invocation)
        with subprocess.Popen(ssh_invocation, stdout=subprocess.PIPE, bufsize=1,
                              universal_newlines=True) as process:
            first_line = process.stdout.readline()
            if "scope" not in first_line:
                command.fail("could not find scope line in ip addr output (%s)" % repr(first_line))
            fields = first_line.split(" ")
            if "inet" not in fields:
                command.fail("could not find inet address in ip addr output")
            # the field right after "inet" is the address with its prefix length
            address = fields[fields.index("inet") + 1]
            if not address.endswith("/24"):
                command.fail("expected address that ended in /24, not '%s'" % address)
            address = address[:-len("/24")]
            if address.count(".") != 3:
                command.fail("expected valid IPv4 address, not '%s'" % address)
            if not address.replace(".", "").isdigit():
                command.fail("expected valid IPv4 address, not '%s'" % address)
            found_address[0] = address
            event.set()
            # let the container finish its sleep (bounded by the timeout)
            process.communicate(timeout=20)
    finally:
        event.set()
    return True
def check_kube_health():
    """Compare prometheus' view of the kubernetes cluster against the configured nodes.

    Runs check_kube_init first, then checks node counts, that exactly the master
    nodes are unschedulable, that all nodes are ready, and that the standard
    namespaces are active.
    """
    check_kube_init()
    config = configuration.get_config()
    kube_node_count = sum(1 for node in config.nodes if node.kind != "supervisor")
    master_node_count = sum(1 for node in config.nodes if node.kind == "master")

    expect_prometheus_query_exact('sum(kube_node_info)', kube_node_count,
                                  "kubernetes nodes are online")

    master_hostnames = [node.hostname for node in config.nodes if node.kind == "master"]
    master_regex = "|".join(master_hostnames)
    # hostnames are spliced into a query regex below, so only letters, digits and dashes are allowed
    for hostname in master_hostnames:
        if not hostname.replace("-", "").isalnum():
            command.fail("invalid hostname for inclusion in prometheus monitoring rules: %s" % hostname)

    # unschedulable among masters == total unschedulable == number of masters,
    # i.e. every master and nothing else is unschedulable
    expect_prometheus_query_exact(
        'sum(kube_node_spec_unschedulable{node=~"%s"})' % master_regex,
        master_node_count, "master nodes are unschedulable")
    expect_prometheus_query_exact('sum(kube_node_spec_unschedulable)', master_node_count,
                                  "kubernetes nodes are unschedulable")
    expect_prometheus_query_exact(
        'sum(kube_node_status_condition{condition="Ready",status="true"})',
        kube_node_count, "kubernetes nodes are ready")

    namespaces = ["default", "kube-public", "kube-system"]
    namespace_query = 'sum(kube_namespace_status_phase{phase="Active",namespace=~"%s"})' % "|".join(namespaces)
    expect_prometheus_query_exact(namespace_query, len(namespaces), "namespaces are set up")
    print("kubernetes cluster passed cursory inspection!")
def check_user_grant():
    "verify that user-grant and its kubeconfigs work"
    config = configuration.get_config()

    # there is no load balancing yet, so the test has to run from *inside the cluster*: look up the
    # user-grant service IP, upload the local user cert to a master node, and authenticate to
    # user-grant with curl from there.
    # TODO: once load balancing is ready, make this whole thing much simpler

    # a master node is used so that we are confident we aren't connecting from the node hosting
    # user-grant. nothing else requires it; a worker would normally be preferred to keep extra code
    # off the masters, but this only runs against non-production test clusters.
    proxy = config.get_any_node("master")
    service_ip = get_service_ip("user-grant")
    local_key, local_cert = authority.get_local_grant_user_paths()
    remote_key = "/etc/homeworld/testing/usergrant.key"
    remote_cert = "/etc/homeworld/testing/usergrant.pem"

    # clear out leftovers from earlier runs, then upload the credentials
    ssh.check_ssh(proxy, "rm", "-f", remote_key, remote_cert)
    ssh.check_ssh(proxy, "mkdir", "-p", "/etc/homeworld/testing")
    ssh.check_scp_up(proxy, local_key, remote_key)
    ssh.check_scp_up(proxy, local_cert, remote_cert)

    # temporarily point the user-grant domain at the service IP; always undone afterwards
    setup.modify_temporary_dns(proxy, {config.user_grant_domain: service_ip})
    try:
        grant_url = "https://%s/" % config.user_grant_domain
        kubeconfig = ssh.check_ssh_output(
            proxy, "curl", "--key", remote_key, "--cert", remote_cert, grant_url).decode()
    finally:
        setup.modify_temporary_dns(proxy, {})

    magic_phrase = "it allows authenticating to the Hyades cluster as you"
    if magic_phrase not in kubeconfig:
        command.fail("invalid kubeconfig: did not see phrase " + repr(magic_phrase),
                     "kubeconfig received read as follows: " + repr(kubeconfig))
    print("successfully retrieved kubeconfig from user-grant!")

    # we now hold a kubeconfig generated by user-grant; confirm that it works by checking that the
    # auto-created rolebinding passes the sniff test.
    with tempfile.TemporaryDirectory() as workdir:
        kubeconfig_path = os.path.join(workdir, "granted-kubeconfig")
        util.writefile(kubeconfig_path, kubeconfig.encode())
        kubectl_output = subprocess.check_output([
            "hyperkube", "kubectl", "--kubeconfig", kubeconfig_path, "-o", "json",
            "get", "rolebindings", "auto-grant-" + authority.UPSTREAM_USER_NAME,
        ])
        rolebinding = json.loads(kubectl_output.decode())
    role_name = rolebinding.get("roleRef", {}).get("name")
    if role_name != "admin":
        command.fail("rolebinding for user was not admin in %s" % repr(rolebinding))
    print("autogenerated rolebinding for user", repr(authority.UPSTREAM_USER_NAME), "passed basic check!")
def check_keystatics():
    """Check that the cluster.conf served at /static/cluster.conf matches the locally generated one.

    A difference (as judged by compare_multiline) is reported through command.fail.
    """
    served = query.get_keyurl_data("/static/cluster.conf")
    expected = configuration.get_cluster_conf()
    matches = compare_multiline(served, expected)
    if not matches:
        command.fail("MISMATCH: cluster.conf")
    print("pass: keyserver serving correct static files")
def import_keytab(node, keytab_file):
    "import and encrypt a keytab for a particular server"
    # node: must be a node known to the configuration; it names the stored keytab.<node>.crypt
    # keytab_file: path of the plaintext keytab to encrypt
    config = configuration.get_config()
    if not config.has_node(node):
        command.fail("no such node: %s" % node)
    project_dir = configuration.get_project()
    encrypted_keytab = os.path.join(project_dir, "keytab.%s.crypt" % node)
    keycrypt.gpg_encrypt_file(keytab_file, encrypted_keytab)
def get_project() -> str:
    """Return the project directory named by the HOMEWORLD_DIR environment variable.

    Reports a failure through command.fail when the variable is unset or does
    not name an existing directory.
    """
    location = os.environ.get("HOMEWORLD_DIR")
    if location is None:
        command.fail("no HOMEWORLD_DIR environment variable declared")
    if not os.path.isdir(location):
        command.fail("HOMEWORLD_DIR (%s) is not a directory that exists" % location)
    return location
def hosts_up(hosts):
    """Append one tab-separated "ip hostname" line per entry of hosts to /etc/hosts.

    hosts: mapping of hostname -> ip address

    Every hostname and ip is checked for tab characters before anything is
    written, since a tab would corrupt the field layout of the appended lines.
    Violations are reported through command.fail.
    """
    entries = []
    for hostname, ip in hosts.items():
        if "\t" in hostname:
            command.fail("expected no tabs in hostname %s" % repr(hostname))
        # an explicit check rather than `assert`: asserts are removed under `python -O`,
        # which would silently drop this validation of data written to /etc/hosts
        if "\t" in str(ip):
            command.fail("expected no tabs in ip address %s" % repr(ip))
        entries.append("%s\t%s" % (ip, hostname))
    # nothing is appended until every entry has been validated
    sudo_append_to_file("/etc/hosts", entries)
def populate() -> None:
    "initialize the cluster's setup.yaml with the template"
    # the project directory is created on demand; an existing setup.yaml is never overwritten
    project_dir = get_project(create_dir_if_missing=True)
    destination = os.path.join(project_dir, "setup.yaml")
    if os.path.exists(destination):
        command.fail("setup.yaml already exists")
    resource.extract("//spire/resources:setup.yaml", destination)
    print("filled out setup.yaml")
def get_apiserver_default() -> str:
    """Return the https URL (port 443) of the first configured master node.

    Reports a failure through command.fail when no master nodes are configured.
    """
    # TODO: this should be eliminated, because nothing should be specific to this one apiserver
    config = Config.load_from_project()
    masters = [node for node in config.nodes if node.kind == "master"]
    if len(masters) == 0:
        command.fail("no apiserver to select, because no master nodes were configured")
    chosen = masters[0]
    return "https://%s:443" % chosen.ip
def get_service_ip(service_name: str) -> str:
    """Return the clusterIP that kubectl reports for the named service.

    The value must look like a dotted quad (exactly three dots, digits
    otherwise); anything else is reported through command.fail.
    """
    kubectl_args = ["get", "service", "-o=jsonpath={.spec.clusterIP}", "--", service_name]
    raw_output = access.call_kubectl(kubectl_args, return_result=True)
    cluster_ip = raw_output.decode().strip()

    has_three_dots = cluster_ip.count(".") == 3
    if not has_three_dots or not cluster_ip.replace(".", "").isdigit():
        command.fail("invalid clusterIP for %s service: %s" % (service_name, repr(cluster_ip)))
    return cluster_ip
def ssh_foreach(ops: setup.Operations, node_kind: str, *params: str):
    """Queue an ssh command on every node of the given kind.

    ops: operations object the ssh commands are queued on
    node_kind: one of configuration.Node.VALID_NODE_KINDS, or "node" to match every node
    params: the command and its arguments
    """
    config = configuration.get_config()
    valid_node_kinds = configuration.Node.VALID_NODE_KINDS
    every_node = node_kind == "node"
    if not every_node and node_kind not in valid_node_kinds:
        command.fail("usage: spire foreach {node," + ",".join(valid_node_kinds) + "} command")
    for node in config.nodes:
        if every_node or node.kind == node_kind:
            ops.ssh("run command on @HOST", node, *params)
def export_https(name, keyout, certout):
    """Export the stored HTTPS key and certificate for the registry hostname.

    name: must equal setup.REGISTRY_HOSTNAME; any other host is rejected
    keyout: path the decrypted private key is written to
    certout: path the certificate is copied to
    """
    if name != setup.REGISTRY_HOSTNAME:
        command.fail("unexpected https host: %s" % name)
    encrypted_key = os.path.join(configuration.get_project(), "https.%s.key.crypt" % name)
    certificate = os.path.join(configuration.get_project(), "https.%s.pem" % name)
    # only the key is stored encrypted; the certificate is copied as-is
    keycrypt.gpg_decrypt_file(encrypted_key, keyout)
    util.copy(certificate, certout)
def fetch_signed_url(url: str, signature_url: str) -> bytes:
    """Download url and return its contents, after checking its detached GPG signature.

    The signature is fetched from signature_url and verified against the
    //upload:keyring.gpg resource; a failed verification is reported through
    command.fail.
    """
    signature = fetch_url(signature_url)
    payload = fetch_url(url)
    keyring = resource.get("//upload:keyring.gpg")
    verified = verify_gpg_signature(payload, signature, keyring)
    if not verified:
        command.fail("signature verification FAILED on %s!" % url)
    return payload
def check_pem_type(filepath, expect):
    """Check that the first line of a file is a PEM header of the expected type.

    filepath: file to inspect (only its first line is read)
    expect: the required type, i.e. the text between "-----BEGIN " and "-----"

    A missing or differently-typed header is reported through command.fail.
    """
    prefix, suffix = "-----BEGIN ", "-----"
    with open(filepath, "r") as f:
        header = f.readline()
    stripped = header.rstrip()
    if not (header.startswith(prefix) and stripped.endswith(suffix)):
        command.fail("not a PEM file: %s" % filepath)
    found_type = stripped[len(prefix):-len(suffix)]
    if found_type != expect:
        command.fail("incorrect PEM header: expected %s, not %s" % (expect, found_type))
def parse_apt_hash_list(section):
    """Parse a hash section into a dict mapping path -> hash.

    section: newline-separated lines, each made of exactly three
    single-space-separated fields: hash, an ignored middle field, and path.

    A line with any other number of fields is reported through command.fail.
    """
    hash_for_path = {}
    for entry in section.split("\n"):
        fields = entry.split(" ")
        # exactly two separating spaces means exactly three fields
        if len(fields) != 3:
            command.fail("found incorrectly formatted sha256 section")
        digest, _, path = fields
        hash_for_path[path] = digest
    return hash_for_path
def get_project(create_dir_if_missing=False) -> str:
    """Return the project directory named by the HOMEWORLD_DIR environment variable.

    create_dir_if_missing: when true, a missing directory is created with
    os.mkdir instead of being treated as an error.

    Reports a failure through command.fail when the variable is unset, or when
    the directory is missing and create_dir_if_missing is false.
    """
    location = os.environ.get("HOMEWORLD_DIR")
    if location is None:
        command.fail("no HOMEWORLD_DIR environment variable declared")
    if os.path.isdir(location):
        return location
    if create_dir_if_missing:
        os.mkdir(location)
    else:
        command.fail("HOMEWORLD_DIR (%s) is not a directory that exists" % location)
    return location