def _delete_data_bucket(self):
    logger.info("Deleting applatix-data bucket contents for cluster %s ...", self._name_id)
    data_bucket = Cloud().get_bucket(
        AXClusterDataPath(name_id=self._name_id).bucket(),
        aws_profile=self._aws_profile,
        region=self._aws_region)
    cluster_name = AXClusterId(name=self._name_id).get_cluster_name()
    prefix = cluster_name + "/"
    logger.info("Deleting objects for cluster %s from bucket %s. This may take a while.",
                cluster_name, data_bucket.get_bucket_name())
    data_bucket.delete_all(obj_prefix=prefix)
    logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                cluster_name, data_bucket.get_bucket_name())

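# A minimal sketch of what a prefix-scoped delete such as delete_all(obj_prefix=...)
# plausibly does under the hood with plain boto3. The helper name and batching are
# assumptions for illustration, not the Cloud() bucket implementation.
import boto3

def delete_objects_with_prefix(bucket_name, prefix):
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        objects = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if objects:
            # delete_objects accepts at most 1000 keys per call; one page of
            # list_objects_v2 output is also capped at 1000 keys.
            s3.delete_objects(Bucket=bucket_name, Delete={"Objects": objects})
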
def __init__(self, containername, customer_image, namespace, version):
    s = SoftwareInfo()
    super(ArtifactsContainer, self).__init__(
        containername, "{}/{}/artifacts:{}".format(s.registry, namespace, version))

    # artifacts scratch space
    self._artifacts_scratch = ContainerVolume(
        "artifacts-scratch", ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    self._artifacts_scratch.set_type("EMPTYDIR")
    self.add_volume(self._artifacts_scratch)

    # Create a hostpath for the docker socket. This is used for running docker inspect.
    socket_hostpath = ContainerVolume("docker-socket-file", "/var/run/docker.sock")
    socket_hostpath.set_type("HOSTPATH", "/var/run/docker.sock")
    self.add_volume(socket_hostpath)

    # emptydir for copying static binaries from the init container
    # so that they are available in the main container
    self._static_bins = ContainerVolume("static-bins", "/copyto")
    self._static_bins.set_type("EMPTYDIR")
    self.add_volume(self._static_bins)

    # add environment vars needed for artifacts
    self.add_env("AX_TARGET_CLOUD", value=Cloud().target_cloud())
    self.add_env("AX_CLUSTER_NAME_ID", value=AXClusterId().get_cluster_name_id())
    self.add_env("AX_CUSTOMER_ID", value=AXCustomerId().get_customer_id())
    self.add_env("AX_CUSTOMER_IMAGE_NAME", value=customer_image)
    self.add_env("AX_ARTIFACTS_SCRATCH", value=ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    self.add_env("AX_POD_NAME", value_from="metadata.name")
    self.add_env("AX_POD_IP", value_from="status.podIP")
    self.add_env("AX_POD_NAMESPACE", value_from="metadata.namespace")
    self.add_env("AX_NODE_NAME", value_from="spec.nodeName")
    self.add_env("ARGO_LOG_BUCKET_NAME", value=os.getenv("ARGO_LOG_BUCKET_NAME", ""))

    annotation_vol = ContainerVolume("annotations", "/etc/axspec")
    annotation_vol.set_type("DOWNWARDAPI", "metadata.annotations")
    self.add_volume(annotation_vol)

    # AA-3175: CPU and memory are set to the lowest possible values so that pod
    # requests are kept at a minimum.
    self.add_resource_constraints("cpu_cores", 0.001)
    self.add_resource_constraints("mem_mib", 4)

def validate(self):
    all_errs = []
    all_errs += self._validate_critical_directories()

    if not self.cluster_name:
        all_errs.append("Please provide cluster name to pause the cluster")

    if self.cloud_provider not in Cloud.VALID_TARGET_CLOUD_INPUT:
        all_errs.append("Cloud provider {} not supported. Please choose from {}".format(
            self.cloud_provider, Cloud.VALID_TARGET_CLOUD_INPUT))
    else:
        # Cloud singleton should be instantiated during validation stage so
        # we can ensure customer ID
        Cloud(target_cloud=self.cloud_provider)

    return all_errs

def download_kube_key(self):
    """Get kube ssh key from S3 and save it in a file."""
    if Cloud().target_cloud_gcp():
        return
    logger.info("Downloading cluster ssh key from s3 ...")
    data = self._bucket.get_object(self._s3_cluster_ssh_key)
    assert data is not None, "No kube ssh key at {}/{}".format(self._bucket_name, self._s3_cluster_ssh_key)
    key_dir = os.path.dirname(self._key_file)
    if not os.path.exists(key_dir):
        os.makedirs(key_dir)
    with open(self._key_file, "w") as f:
        f.write(data)
    os.chmod(self._key_file, 0o0600)
    logger.info("Downloaded kube ssh key from %s/%s to %s",
                self._bucket_name, self._s3_cluster_ssh_key, self._key_file)
    return self._key_file

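# Hedged aside: writing the key first and chmod-ing afterwards leaves a brief
# window where the file is world-readable. A minimal standard-library sketch
# that creates the file with 0600 permissions from the start (the helper name
# is hypothetical):
import os

def write_private_key(path, data):
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        f.write(data)
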
def platform(self, args):
    from ax.platform.platform import AXPlatform
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv

    Cloud().set_target_cloud(args.target_cloud)
    assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"
    name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()
    if args.subcommand == 'start':
        AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile, debug=args.debug).start()
    elif args.subcommand == 'stop':
        AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile).stop()
    else:
        # The format string takes three arguments: color prefix, the bad
        # subcommand, and color reset.
        logger.error("%sInvalid command '%s'%s", COLOR_RED, args.subcommand, COLOR_NORM)
        sys.exit(1)

def _update_data_bucket(self):
    data_bucket = Cloud().get_bucket(
        AXClusterDataPath(name_id=self._name_id).bucket(),
        aws_profile=self._aws_profile,
        region=self._aws_region)

    if not data_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(data_bucket.get_bucket_name()))

    if self.cluster_config.get_cluster_provider() != ClusterProvider.USER:
        # Update CORS config for the data bucket too.
        logger.info("Checking CORS config for %s.", data_bucket.get_bucket_name())
        data_bucket.put_cors(DATA_CORS_CONFIG)

    logger.info("Created %s bucket ... DONE", data_bucket.get_bucket_name())

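# A minimal sketch of what put_cors(DATA_CORS_CONFIG) plausibly maps to in plain
# boto3. The rule shown is illustrative only; the actual contents of
# DATA_CORS_CONFIG are not shown in this snippet.
import boto3

def put_data_bucket_cors(bucket_name):
    boto3.client("s3").put_bucket_cors(
        Bucket=bucket_name,
        CORSConfiguration={
            "CORSRules": [{
                "AllowedMethods": ["GET"],
                "AllowedOrigins": ["*"],
                "AllowedHeaders": ["*"],
                "MaxAgeSeconds": 3000,
            }]
        })
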
def __init__(self, cfg):
    cfg.cluster_size = AXClusterSize.CLUSTER_USER_PROVIDED
    cfg.cloud_profile = "default"
    cfg.cluster_type = "standard"
    cfg.vpc_id = None
    cfg.vpc_cidr_base = None
    cfg.subnet_mask_size = None
    cfg.trusted_cidrs = ClusterInstallDefaults.TRUSTED_CIDR
    cfg.user_on_demand_nodes = None
    cfg.spot_instances_option = "none"
    cfg.cluster_autoscaling_scan_interval = None
    cfg.support_object_store_name = ""
    cfg.enable_sandbox = None
    cfg.software_version_info = None

    self.cluster_size = cfg.cluster_size

    if cfg.cloud_provider == "minikube":
        self.service_manifest_root = "/ax/config/service/argo-wfe"
        self.platform_bootstrap_config = "/ax/config/service/config/argo-wfe-platform-bootstrap.cfg"
        Cloud(target_cloud="aws")
    else:
        self.service_manifest_root = "/ax/config/service/argo-all"
        self.platform_bootstrap_config = "/ax/config/service/config/argo-all-platform-bootstrap.cfg"

    super(PlatformOnlyInstallConfig, self).__init__(cfg)

    self.install_config = ClusterInstallConfig(cfg=cfg)
    self.install_config.validate()

    self.cluster_bucket = cfg.cluster_bucket
    self.kube_config = cfg.kubeconfig

    # These attributes are optional on cfg; fall back to None when absent.
    try:
        self.bucket_endpoint = cfg.endpoint
        self.access_key = cfg.access_key
        self.secret_key = cfg.secret_key
    except AttributeError:
        self.bucket_endpoint = None
        self.access_key = None
        self.secret_key = None

    # Overwrite the manifest_root and bootstrap_config.
    self.install_config.manifest_root = self.service_manifest_root
    self.install_config.bootstrap_config = self.platform_bootstrap_config

def __init__(self, cluster_name_id, kube_config=None, key_file=None, metadata=None, aws_profile=None):
    """
    Config file initialization

    :param cluster_name_id: Cluster name_id in the format name-uuid, e.g. lcj-cluster-515d9828-7515-11e6-9b3e-a0999b1b4e15
    :param kube_config: kubernetes saved config file.
    :param key_file: cluster ssh key path
    :param metadata: path to cluster metadata
    :param aws_profile: AWS profile to access S3.
    """
    assert AXEnv().is_in_pod() or cluster_name_id, "Must specify cluster name from outside cluster"
    self._aws_profile = aws_profile
    self._cluster_name_id = cluster_name_id
    self._config = AXClusterConfig(cluster_name_id=cluster_name_id, aws_profile=aws_profile)

    self._kube_config = kube_config if kube_config else self.default_config_path.format(cluster_name_id)
    self._key_file = key_file if key_file else self.default_key_path.format(cluster_name_id)
    self._metadata_file = metadata if metadata else self.default_cluster_meta_path

    config_path = AXClusterConfigPath(name_id=cluster_name_id)
    self._bucket_name = config_path.bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=aws_profile)
    self._s3_kube_config_key = config_path.kube_config()
    self._s3_cluster_ssh_key = config_path.kube_ssh()
    self._s3_cluster_state_before_pause = config_path.state_before_pause()
    self._s3_cluster_meta = config_path.cluster_metadata()
    self._s3_cluster_software_info = config_path.versions()
    self._s3_platform_manifest_dir = config_path.platform_manifest_dir()
    self._s3_platform_config = config_path.platform_config()
    self._s3_master_config_prefix = config_path.master_config_dir()
    self._s3_master_attributes_path = config_path.master_attributes_path()
    self._s3_master_user_data_path = config_path.master_user_data_path()

    # For cluster staging info, stage1 and stage2 can be uploaded, downloaded, and deleted
    # with AXClusterInfo. stage0 can only be downloaded with AXClusterInfo; it is uploaded
    # during cluster information initialization (i.e. uploading cluster id and cluster config)
    # and deleted during cluster information clean-up (i.e. during axinstaller uninstall).
    self._staging_info = {
        "stage0": config_path.cluster_install_stage0_key(),
        "stage1": config_path.cluster_install_stage1_key(),
        "stage2": config_path.cluster_install_stage2_key()
    }

def __init__(self, size_in_mb):
    super(SidecarDockerDaemon, self).__init__(DIND_CONTAINER_NAME, "argoproj/dind:1.12.6")

    # Add lib modules for dind to load the aufs module.
    libmodule_hostpath = ContainerVolume("kernel-lib-module", "/lib/modules")
    libmodule_hostpath.set_type("HOSTPATH", "/lib/modules")
    self.add_volume(libmodule_hostpath)

    # Add per-node docker graph storage to the sidecar.
    dgs_vol = ContainerVolume("docker-graph-storage", "/var/lib/docker")
    if Cloud().own_cloud() == Cloud.CLOUD_AWS:
        dgs_vol.set_type("DOCKERGRAPHSTORAGE", size_in_mb)
    else:
        dgs_vol.set_type("EMPTYDIR")
    self.add_volume(dgs_vol)

    # The dind daemon needs to be privileged!
    self.privileged = True

def validate(self):
    all_errs = []
    all_errs += self._validate_critical_directories()

    # Because we have strict validation during installation, we can assume the
    # cluster has a valid name and cluster config.
    if not self.cluster_name:
        all_errs.append("Please provide cluster name to pause the cluster")

    if self.cloud_provider not in Cloud.VALID_TARGET_CLOUD_INPUT:
        all_errs.append("Cloud provider {} not supported. Please choose from {}".format(
            self.cloud_provider, Cloud.VALID_TARGET_CLOUD_INPUT))
    else:
        # Cloud singleton should be instantiated during validation stage so
        # we can ensure customer ID
        Cloud(target_cloud=self.cloud_provider)

    all_errs += validate_software_info(self.target_software_info)
    return all_errs

def install_argo_only(self, args):
    logger.info("Installing Argo platform ...")
    try:
        assert args.cluster_name
    except Exception:
        print("--cluster-name needs to be specified")
        sys.exit(1)

    if args.cloud_provider == "minikube" and not args.bucket_endpoint:
        Cloud(target_cloud="aws")
        args.cluster_bucket = "argo"  # TODO: revisit
        # Access key and secret are required by code in aws_s3;
        # use a dummy access key and secret for s3proxy.
        args.access_key = "fake-access-key"
        args.secret_key = "fake-secret-key"
        self._install_s3_proxy(args.kubeconfig)
        args.bucket_endpoint = self._get_s3_proxy_endpoint(args.kubeconfig)
        # Create bucket
        self._create_s3_proxy_bucket(args.bucket_endpoint, args.cluster_bucket)
    elif args.cloud_provider == "aws":
        assert args.cluster_bucket, "--cluster-bucket is required"
        assert args.cloud_region, "--cloud-region is required"
    elif args.cloud_provider == "gke":
        assert args.cluster_bucket, "--cluster-bucket is required"

    logger.info("s3 bucket endpoint: %s", args.bucket_endpoint)

    os.environ["AX_CUSTOMER_ID"] = "user-customer-id"
    os.environ["ARGO_LOG_BUCKET_NAME"] = args.cluster_bucket
    os.environ["ARGO_DATA_BUCKET_NAME"] = args.cluster_bucket
    os.environ["ARGO_KUBE_CONFIG_PATH"] = args.kubeconfig
    os.environ["AX_TARGET_CLOUD"] = Cloud.CLOUD_AWS
    self._set_env_if_present(args)

    platform_install_config = PlatformOnlyInstallConfig(cfg=args)
    PlatformOnlyInstaller(platform_install_config).run()

def _upgrade_kube(self):
    """
    This function calls our script to upgrade Kubernetes and cluster nodes.
    :return:
    """
    env = {
        "CLUSTER_NAME_ID": self._name_id,
        "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
        "OLD_KUBE_VERSION": self._current_software_info.kube_version,
        "NEW_KUBE_VERSION": self._cfg.target_software_info.kube_version,
        "NEW_CLUSTER_INSTALL_VERSION": self._cfg.target_software_info.kube_installer_version,
        "ARGO_AWS_REGION": self._cluster_config.get_region(),
        "AX_TARGET_CLOUD": Cloud().target_cloud()
    }
    if self._cfg.cloud_profile:
        env["ARGO_AWS_PROFILE"] = self._cfg.cloud_profile

    logger.info("Upgrading Kubernetes with environments %s", pformat(env))
    # Merge in the current process environment. Note that dict.update gives
    # os.environ precedence on key collisions.
    env.update(os.environ)
    subprocess.check_call(["upgrade-kubernetes"], env=env)

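# Hedged aside on the merge order above: because env.update(os.environ) runs
# last, inherited process variables win on key collisions. If the computed
# values had to take precedence instead, the merge would be reversed, as in
# this sketch (the helper name is hypothetical):
import os

def merged_env(computed):
    env = dict(os.environ)
    env.update(computed)  # computed values override inherited ones
    return env
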
def _get_bucket_region_from_aws(self):
    # We assume the cluster does not access any resource outside its partition, e.g.
    # clusters in partition "aws" will not access resources in partition "aws-us-gov".
    instance_region = Cloud().meta_data().get_region()

    s3 = boto3.Session(
        profile_name=self._aws_profile,
        region_name=instance_region
    ).client("s3", config=Config(signature_version='s3v4'))

    logger.debug("Finding region for bucket %s with initial region %s", self._name, instance_region)
    try:
        response = s3.head_bucket(Bucket=self._name)
        logger.debug("Head_bucket returned OK %s", response)
    except ClientError as e:
        if "Not Found" in str(e):
            return None
        response = getattr(e, "response", {})
        logger.debug("Head_bucket returned error %s, inspecting headers", response)

    headers = response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
    region = headers.get("x-amz-bucket-region", headers.get("x-amz-region", None))
    logger.debug("Found region %s from head_bucket for %s, headers %s", region, self._name, headers)
    return region

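# For contrast, a hedged sketch of the simpler get_bucket_location route
# (a standard boto3 call): it needs s3:GetBucketLocation permission and
# reports us-east-1 as a None LocationConstraint (and legacy eu-west-1
# buckets as "EU"), which is presumably why the code above reads the
# head_bucket response headers instead. The helper name is hypothetical.
import boto3

def bucket_region_via_location(bucket_name):
    resp = boto3.client("s3").get_bucket_location(Bucket=bucket_name)
    return resp.get("LocationConstraint") or "us-east-1"
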
def debug(signum, frame):
    # SIGUSR1 handler (registered below): dump the stack of every live greenlet.
    import gc
    import traceback
    from greenlet import greenlet
    for ob in gc.get_objects():
        if not isinstance(ob, greenlet):
            continue
        if not ob:
            continue
        logger.debug(''.join(traceback.format_stack(ob.gr_frame)))


if __name__ == "__main__":
    """
    Main entry point for AXmon.
    """
    parser = argparse.ArgumentParser(description='AXMon')
    parser.add_argument('--version', action='version', version="%(prog)s {}".format(__version__))
    parser.add_argument('--port', type=int, default=AXMON_DEFAULT_PORT, help="Run server on the specified port")
    args = parser.parse_args()

    # Basic logging.
    logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s %(lineno)d %(threadName)s: %(message)s")
    logging.getLogger("ax").setLevel(logging.DEBUG)
    logging.getLogger("botocore").setLevel(logging.WARNING)
    logging.getLogger("boto3").setLevel(logging.WARNING)

    Cloud().set_target_cloud(os.getenv("AX_TARGET_CLOUD", Cloud().own_cloud()))
    signal.signal(signal.SIGUSR1, debug)
    axmon_rest_start(port=args.port)
    AXMon().run()

def wait_for_container(jobname, podname, containername, artifact_scratch_path, out_label):
    # Start the waiter, but it is possible that the event has already passed, so
    # poll the status once after the waiter is registered and then go to sleep
    # if the container is still running.
    global dind_container_id

    def get_container_status(s):
        c_status = s.get("containerStatuses", None)
        main_container_status = None
        dind_container_status = None
        docker_ids = {}
        for c in c_status or []:
            name = c.get("name", None)
            if not name:
                continue
            if name == containername:
                main_container_status = c
            elif name == DIND_CONTAINER_NAME:
                dind_container_status = c
            cid = c.get("containerID", None)
            if cid:
                l = len("docker://")
                docker_id = cid[l:]
                logger.debug("Docker ID for {} is {}".format(name, docker_id))
                docker_ids[name] = docker_id
        return main_container_status, dind_container_status, docker_ids

    def check_pod_status(pod_status):
        status = pod_status.status
        assert isinstance(status, swagger_client.V1PodStatus), "Expect to see an object of type V1PodStatus"
        status_dict = swagger_client.ApiClient().sanitize_for_serialization(status)
        logger.debug("status_dict=%s", status_dict)
        main_container_status, dind_container_status, docker_ids = get_container_status(status_dict)
        if main_container_status is None:
            if status_dict.get("phase", None) == "Pending":
                logger.debug("Pod still in pending state")
                return False
            else:
                logger.error("bad input %s", status_dict)
                logger.error("Could not find container %s in containerStatuses array", containername)
                return False
        try:
            x = main_container_status["state"]["terminated"]
            logger.debug("Current terminated state object is %s", x)
            k8s_info = {"container_status": {}}
            try:
                k8s_info["pod_ip"] = status.pod_ip
                k8s_info["host_ip"] = status.host_ip
                k8s_info["start_time"] = status.start_time
            except Exception:
                pass
            if x is not None:
                try:
                    k8s_info["container_status"][containername] = x
                except Exception:
                    pass
                try:
                    k8s_info["container_status"][DIND_CONTAINER_NAME] = dind_container_status["state"]["terminated"]
                except Exception:
                    pass
                assert docker_ids, "docker_id should be valid when container terminates"
                with open("/docker_id.txt", "w") as f:
                    f.write(json.dumps(docker_ids))
                with open("/k8s_info.txt", "w") as f:
                    f.write(json.dumps(k8s_info))
                if DIND_CONTAINER_NAME in docker_ids:
                    global dind_container_id
                    dind_container_id = docker_ids[DIND_CONTAINER_NAME]
                return True
            else:
                return False
        except KeyError as ke:
            logger.debug("Expected terminated state not observed. Got KeyError %s", ke)
            return False

    logger.info("jobname=%s podname=%s containername=%s", jobname, podname, containername)

    node_instance_id = "user-node"
    try:
        node_instance_id = Cloud().meta_data().get_instance_id()
    except Exception:
        pass
    logger.info("Using node instance id %s, namespace %s", node_instance_id, NAMESPACE)

    try:
        kubelet_cli = KubeletClient()
    except Exception:
        host_ip = get_host_ip()
        kubelet_cli = KubeletClient(host_ip)

    # Has to match container_outer_executor.py.
    container_done_flag_postfix = "_ax_container_done_flag"
    poll_container_done_flag_file = "{}/{}/{}".format(artifact_scratch_path, out_label, container_done_flag_postfix)
    service_instance_id = None
    check_file_round = 60 * 2
    count = 0
    posted_event = False
    while True:
        try:
            while True:
                count += 1
                # The kubelet client returns an iterator, so we make it a list.
                # Since pod names are unique within a namespace, it is safe to
                # always take pods[0].
                pods = [p for p in kubelet_cli.list_namespaced_pods(namespace=NAMESPACE, name=podname)]
                pod_status = pods[0]
                assert isinstance(pod_status, swagger_client.V1Pod), "Expect to see an object of type V1Pod"
                assert pod_status.metadata.name == podname

                # Both containers are created, so we can assume we have all the
                # knowledge we need for posting the URL.
                if not posted_event:
                    try:
                        if not jobname.startswith('axworkflowexecutor'):
                            service_instance_id = post_update_to_axevent(
                                jobname, podname, containername, pod_status, node_instance_id)
                            start_log_collectors(pod_name=podname, pod_status=pod_status)
                        posted_event = True
                    except Exception as e:
                        logger.exception("Could not post start event due to %s. Will retry later", e)

                time.sleep(1)
                if count % 10 != 0:
                    continue

                done = check_pod_status(pod_status)
                logger.debug("Container %s in [%s][%s] done=%s", containername, jobname, podname, done)
                if done:
                    if not posted_event:
                        try:
                            service_instance_id = post_update_to_axevent(
                                jobname, podname, containername, pod_status, node_instance_id)
                            start_log_collectors(pod_name=podname, pod_status=pod_status)
                        except Exception as e:
                            logger.exception("Could not post start event due to %s.", e)

                    # Stop the dind container.
                    if dind_container_id:
                        exit_code = subprocess.call([
                            "{}/docker".format(ARTIFACTS_CONTAINER_SCRATCH_PATH),
                            "kill", "-s", "INT", dind_container_id
                        ])
                        # TODO: Do docker inspect in a loop and make sure that the container dies with a clean exit code.
                        # TODO: If the exit code is non-zero, ask WFE to ensure that it kills the job controller
                        # TODO: before this pod is terminated.
                        logger.debug("Exit code of stopping dind container is {}".format(exit_code))

                    # Request axmon to delete the volume. The sidecar still has this
                    # code for backward compatibility with tasks that were started
                    # before docker graph storage used a per-node volume.
                    try:
                        release_volume_for_dind(service_instance_id)
                    except Exception:
                        logger.exception("cannot release_volume_for_dind")

                    return

                for _ in range(1, check_file_round):
                    if os.path.exists(poll_container_done_flag_file):
                        logger.debug("Container %s in [%s][%s] has %s",
                                     containername, jobname, podname, poll_container_done_flag_file)
                        # Sleep 1 second to let the container status propagate.
                        time.sleep(1)
                        break
                    else:
                        time.sleep(2)
                else:
                    # Loop exhausted (about four minutes) without seeing the flag file.
                    logger.debug("No %s yet, check status again", poll_container_done_flag_file)
        except requests.exceptions.HTTPError as he:
            if "NOT FOUND" in str(he):
                logger.exception("Container %s not found, abort", containername)
                return
            else:
                time.sleep(10)
        except urllib3.exceptions.MaxRetryError:
            logger.exception("Sleep 10 seconds and retry")
            time.sleep(10)
        except Exception as e:
            logger.exception("Container %s in [%s][%s]. Exception type: %s",
                             containername, jobname, podname, type(e))
            time.sleep(10)

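# A minimal, standalone sketch of the done-flag polling pattern used above:
# wait for a flag file to appear, sleeping between checks, and give up after a
# fixed number of rounds. The helper name and defaults are illustrative
# assumptions, not this module's API.
import os
import time

def wait_for_flag_file(path, interval_seconds=2, rounds=120):
    for _ in range(rounds):
        if os.path.exists(path):
            return True
        time.sleep(interval_seconds)
    return False
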
def _generate_default_envs(self, is_daemon, resource_updated):
    """
    Add essential variables to all system containers.
    :param is_daemon:
    :return:
    """
    default_envs = [
        # Kubernetes downward APIs
        {"name": "AX_NODE_NAME", "path": "spec.nodeName"},
        {"name": "AX_POD_NAME", "path": "metadata.name"},
        {"name": "AX_POD_NAMESPACE", "path": "metadata.namespace"},
        {"name": "AX_POD_IP", "path": "status.podIP"},

        # Values
        {"name": "DISK_MULT", "value": str(self.disk_mult)},
        {"name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud()},
        {"name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id},
        {"name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id()},
    ]

    aws_region = os.environ.get("AX_AWS_REGION", "")
    if aws_region != "":
        default_envs.append({"name": "AX_AWS_REGION", "value": aws_region})

    if os.getenv("ARGO_S3_ACCESS_KEY_ID", "") != "":
        # Secrets
        default_envs.append({"name": "ARGO_S3_ACCESS_KEY_ID", "secret": "argo-access-key"})
        default_envs.append({"name": "ARGO_S3_ACCESS_KEY_SECRET", "secret": "argo-secret-key"})
        default_envs.append({"name": "ARGO_S3_ENDPOINT", "value": os.getenv("ARGO_S3_ENDPOINT", None)})

    # Special cases for daemons
    if is_daemon:
        if resource_updated:
            default_envs += [
                {"name": "CPU_MULT", "value": str(self.daemon_cpu_mult)},
                {"name": "MEM_MULT", "value": str(self.daemon_mem_mult)},
            ]
        else:
            default_envs += [
                {"name": "CPU_MULT", "value": "1.0"},
                {"name": "MEM_MULT", "value": "1.0"},
            ]
    else:
        default_envs += [
            {"name": "CPU_MULT", "value": str(self.cpu_mult)},
            {"name": "MEM_MULT", "value": str(self.mem_mult)},
        ]

    rst = []
    for d in default_envs:
        var = V1EnvVar()
        var.name = d["name"]
        if d.get("path", None):
            field = V1ObjectFieldSelector()
            field.field_path = d["path"]
            src = V1EnvVarSource()
            src.field_ref = field
            var.value_from = src
        elif d.get("secret", None):
            secret = V1SecretKeySelector()
            secret.key = d["secret"]
            secret.name = d["secret"]
            src = V1EnvVarSource()
            src.secret_key_ref = secret
            var.value_from = src
        else:
            var.value = d["value"]
        rst.append(var)
    return rst

def update(self, iam):
    """Create all buckets in the portal account."""
    logger.info("Creating applatix-support and applatix-upgrade buckets ...")
    support_bucket = Cloud().get_bucket(
        AXSupportConfigPath(name_id=self._name_id).bucket(),
        aws_profile=self._aws_profile,
        region=self._aws_region)
    upgrade_bucket = Cloud().get_bucket(
        AXUpgradeConfigPath(name_id=self._name_id).bucket(),
        aws_profile=self._aws_profile,
        region=self._aws_region)

    # Retrying create when the bucket already exists is fine.
    if not support_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(support_bucket.get_bucket_name()))
    # If the policy is already there, we don't update it.
    if not support_bucket.get_policy():
        logger.info("Argo support bucket policy does not exist, creating a new one ...")
        if not support_bucket.put_policy(
                policy=self._generate_bucket_policy_string(
                    template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                    bucket_name=support_bucket.get_bucket_name(),
                    iam=iam)):
            raise AXPlatformException("Failed to configure policy for S3 bucket {}".format(
                support_bucket.get_bucket_name()))

    if not upgrade_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(upgrade_bucket.get_bucket_name()))
    if not upgrade_bucket.get_policy():
        logger.info("Argo upgrade bucket policy does not exist, creating a new one ...")
        if not upgrade_bucket.put_policy(
                policy=self._generate_bucket_policy_string(
                    template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                    bucket_name=upgrade_bucket.get_bucket_name(),
                    iam=iam)):
            raise AXPlatformException("Failed to configure policy for S3 bucket {}".format(
                upgrade_bucket.get_bucket_name()))

    # Tag them right away to avoid racing deletion.
    upgrade_bucket.put_object(
        key=AXUpgradeConfigPath(name_id=self._name_id).tag(),
        data="tag",
        ACL="bucket-owner-full-control")
    support_bucket.put_object(
        key=AXSupportConfigPath(name_id=self._name_id).tag(),
        data="tag",
        ACL="bucket-owner-full-control")

    logger.info("Created %s and %s buckets ... DONE",
                support_bucket.get_bucket_name(), upgrade_bucket.get_bucket_name())

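# A minimal sketch of the put-policy-only-if-absent pattern above with plain
# boto3: get_bucket_policy raises a ClientError with code "NoSuchBucketPolicy"
# when the bucket has no policy yet. The helper name is hypothetical and
# policy_json is assumed to be a JSON policy string.
import boto3
from botocore.exceptions import ClientError

def ensure_bucket_policy(bucket_name, policy_json):
    s3 = boto3.client("s3")
    try:
        s3.get_bucket_policy(Bucket=bucket_name)
        return  # a policy already exists; leave it untouched
    except ClientError as e:
        if e.response["Error"]["Code"] != "NoSuchBucketPolicy":
            raise
    s3.put_bucket_policy(Bucket=bucket_name, Policy=policy_json)
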
def __init__(self, cluster_name_id=None, aws_profile=None, config=None):
    self._cluster_name_id = AXClusterId(name=cluster_name_id, aws_profile=aws_profile).get_cluster_name_id()
    self._bucket_name = AXClusterConfigPath(self._cluster_name_id).bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=aws_profile)
    self._cluster_config_key = AXClusterConfigPath(self._cluster_name_id).cluster_config()
    self._conf = config

def __new__(cls, *args, **kwargs):
    if Cloud().target_cloud_gcp():
        from .gke_platform import AXGKEPlatform
        return super(AXPlatform, cls).__new__(AXGKEPlatform)
    else:
        return super(AXPlatform, cls).__new__(cls)

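# A minimal, self-contained sketch of the same __new__-based dispatch pattern,
# with hypothetical Base/GkeImpl names standing in for AXPlatform/AXGKEPlatform:
class Base(object):
    def __new__(cls, use_gke=False):
        # Instantiating Base can transparently hand back a subclass instance.
        if use_gke:
            return super(Base, cls).__new__(GkeImpl)
        return super(Base, cls).__new__(cls)

    def __init__(self, use_gke=False):
        self.use_gke = use_gke


class GkeImpl(Base):
    pass


assert type(Base(use_gke=True)) is GkeImpl
assert type(Base()) is Base
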
def _generate_default_envs(self, is_daemon, resource_updated):
    """
    Add essential variables to all system containers.
    :param is_daemon:
    :return:
    """
    default_envs = [
        # Kubernetes downward APIs
        {"name": "AX_NODE_NAME", "path": "spec.nodeName"},
        {"name": "AX_POD_NAME", "path": "metadata.name"},
        {"name": "AX_POD_NAMESPACE", "path": "metadata.namespace"},
        {"name": "AX_POD_IP", "path": "status.podIP"},

        # Values
        {"name": "DISK_MULT", "value": str(self.disk_mult)},
        {"name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud()},
        {"name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id},
        {"name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id()},
    ]

    # Special cases for daemons
    if is_daemon:
        if resource_updated:
            default_envs += [
                {"name": "CPU_MULT", "value": str(self.daemon_cpu_mult)},
                {"name": "MEM_MULT", "value": str(self.daemon_mem_mult)},
            ]
        else:
            default_envs += [
                {"name": "CPU_MULT", "value": "1.0"},
                {"name": "MEM_MULT", "value": "1.0"},
            ]
    else:
        default_envs += [
            {"name": "CPU_MULT", "value": str(self.cpu_mult)},
            {"name": "MEM_MULT", "value": str(self.mem_mult)},
        ]

    rst = []
    for d in default_envs:
        var = V1EnvVar()
        var.name = d["name"]
        if d.get("path", None):
            field = V1ObjectFieldSelector()
            field.field_path = d["path"]
            src = V1EnvVarSource()
            src.field_ref = field
            var.value_from = src
        else:
            var.value = d["value"]
        rst.append(var)
    return rst

def bucket_exists(self):
    # Memoize the existence check so repeated callers don't hit S3 every time.
    if self._bucket_exists is None:
        self._bucket_exists = Cloud().get_bucket(self._bucket_name).exists()
    return self._bucket_exists

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='waiter')
    parser.add_argument('--version', action='version', version="%(prog)s {}".format(__version__))
    _, args = parser.parse_known_args()

    logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s %(lineno)d %(threadName)s: %(message)s")
    logging.getLogger("ax").setLevel(logging.DEBUG)
    logging.getLogger("ax.kubernetes.kubelet").setLevel(logging.INFO)

    target_cloud = os.environ.get("AX_TARGET_CLOUD", Cloud().own_cloud())
    Cloud().set_target_cloud(target_cloud)

    try:
        wait_for_container(jobname=args[0],
                           podname=args[1],
                           containername=args[2],
                           artifact_scratch_path=args[3],
                           out_label=args[4])
        logger.info("wait_for_container done. Waiting for log collectors to finish their jobs ...")
        terminate_log_collectors()
        logger.info("Container waiter quitting ...")
    except Exception:
        logger.exception("caught exception")

def _container_to_pod(self, labels):
    # generate the service environment
    self._gen_service_env()

    pod_spec = PodSpec(self.name)
    pod_spec.restart_policy = "Never"

    main_container = self._container_spec()

    for vol_tag, vol in iteritems(self.service.template.inputs.volumes):
        # sanitize name for kubernetes
        vol_tag = string_to_dns_label(vol_tag)
        cvol = ContainerVolume(vol_tag, vol.mount_path)
        assert "resource_id" in vol.details and "filesystem" in vol.details, \
            "resource_id and filesystem are required fields in volume details"
        cvol.set_type("AWS_EBS", vol_tag, vol.details["resource_id"], vol.details["filesystem"])
        main_container.add_volume(cvol)
        logger.info("Mounting volume {} {} in {}".format(vol_tag, vol.details, vol.mount_path))

    pod_spec.add_main_container(main_container)
    wait_container = self._generate_wait_container_spec()
    target_cloud = os.environ.get("AX_TARGET_CLOUD", Cloud().own_cloud())
    wait_container.add_env("AX_TARGET_CLOUD", value=target_cloud)
    pod_spec.add_wait_container(wait_container)

    (cpu, mem, d_cpu, d_mem) = self._container_resources()
    main_container.add_resource_constraints("cpu_cores", cpu, limit=None)
    main_container.add_resource_constraints("mem_mib", mem, limit=mem)

    # handle artifacts
    self_sid = None
    if self.service.service_context:
        self_sid = self.service.service_context.service_instance_id

    # TODO: This function calls ax_artifact and needs to be rewritten. Ugly code.
    artifacts_container = pod_spec.enable_artifacts(
        self.software_info.image_namespace,
        self.software_info.image_version,
        self_sid,
        self.service.template.to_dict())
    artifacts_container.add_env("AX_JOB_NAME", value=self.name)
    artifacts_container.add_env("AX_TARGET_CLOUD", value=target_cloud)
    artifacts_container.add_env("ARGO_LOG_BUCKET_NAME", value=os.environ.get("ARGO_LOG_BUCKET_NAME"))
    artifacts_container.add_env("ARGO_DATA_BUCKET_NAME", value=self._s3_bucket)
    self._add_optional_envs(artifacts_container)

    secret_resources = artifacts_container.add_configs_as_vols(
        self.service.template.get_all_configs(), self.name, self.namespace)
    self._resources.insert_all(secret_resources)

    if self.service.template.docker_spec:
        dind_c = pod_spec.enable_docker(self.service.template.docker_spec.graph_storage_size_mib)
        dind_c.add_volumes(pod_spec.get_artifact_vols())
        dind_c.add_resource_constraints("cpu_cores", d_cpu, limit=None)
        dind_c.add_resource_constraints("mem_mib", d_mem, limit=d_mem)

    service_id = None
    if self.service.service_context:
        service_id = self.service.service_context.service_instance_id
    pod_spec.add_annotation("ax_serviceid", service_id)
    pod_spec.add_annotation("ax_costid", json.dumps(self.service.costid))
    pod_spec.add_annotation("AX_SERVICE_ENV", self._gen_service_env())

    for k in labels or []:
        pod_spec.add_label(k, labels[k])

    return pod_spec.get_spec()

def update_cluster_sg():
    if Cloud().target_cloud_aws():
        update_cluster_sg_aws()
    elif Cloud().target_cloud_gcp():
        pass

def __init__(self,
             cluster_name_id=None,
             aws_profile=None,
             debug=True,
             manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
             config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
             software_info=None):
    """
    AX Platform bootstrap

    :param cluster_name_id: cluster name id
    :param aws_profile: aws profile to authenticate all aws clients
    :param debug: debug mode
    :param manifest_root: root directory to all ax service objects
    """
    self._software_info = software_info if software_info else SoftwareInfo()
    assert isinstance(self._software_info, SoftwareInfo), \
        "Wrong type ({}) of software info passed in.".format(self._software_info)
    self._aws_profile = aws_profile
    self._manifest_root = manifest_root
    self._config = AXPlatformConfig(config_file)

    logger.info("Using Kubernetes manifest from %s", self._manifest_root)
    logger.info("Using platform configuration \"%s\" from %s", self._config.name, config_file)

    self._cluster_name_id = AXClusterId(cluster_name_id).get_cluster_name_id()
    self._cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id,
                                           aws_profile=self._aws_profile)
    self._cluster_config_path = AXClusterConfigPath(self._cluster_name_id)
    self._cluster_info = AXClusterInfo(self._cluster_name_id, aws_profile=self._aws_profile)

    self._region = self._cluster_config.get_region()
    if Cloud().target_cloud_aws():
        self._account = AWSAccountInfo(aws_profile=self._aws_profile).get_account_id()
    else:
        self._account = ""
    self._bucket_name = self._cluster_config_path.bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name,
                                      aws_profile=self._aws_profile,
                                      region=self._region)

    # In debug mode, when we fail to create an object, we don't delete it but
    # just leave it for debugging.
    self._debug = debug

    # DNS
    self.cluster_dns_name = None

    # Get kube cluster config. Automatic if in pod already.
    self._kube_config = self._cluster_info.get_kube_config_file_path() if self._cluster_name_id else None
    if self._cluster_name_id:
        if not os.path.isfile(self._kube_config):
            logger.info("Can't find config file at %s; downloading from s3", self._kube_config)
            self._kube_config = self._cluster_info.download_kube_config()
        assert os.path.isfile(self._kube_config), "No kube_config file available"

    # Kubernetes related objects and macros
    self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
    self.kube_axsys_namespace = AXNameSpaces.AXSYS
    self.kube_user_namespace = AXNameSpaces.AXUSER
    self.kubectl = KubernetesApiClient(config_file=self._kube_config)
    self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

    self._monitor = AXKubeMonitor(kubectl=self.kubectl)
    self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
    self._monitor.start()

    # Kube Objects
    self._kube_objects = {}
    self._replacing = {}