def _delete_cluster_bucket(self):
    """Delete this cluster's objects from the applatix-cluster bucket, removing the stage0 keys last."""
    logger.info("Deleting applatix-cluster bucket contents for cluster %s ...", self._name_id)
    bucket = Cloud().get_bucket(AXClusterConfigPath(name_id=self._name_id).bucket(),
                                aws_profile=self._aws_profile,
                                region=self._aws_region)
    idobj = AXClusterId(name=self._name_id)
    config_path = AXClusterConfigPath(name_id=self._name_id)
    cluster_name = idobj.get_cluster_name()

    # TODO: Not idempotent here.
    # Consider the following case: if there is exception thrown when deleting S3 objects, install stage 1
    # information has already been deleted but not everything are successfully deleted, the next time user
    # executes "delete", this program will assume install stage 1 has been cleaned up.
    keep_for_last = [
        idobj.get_cluster_id_s3_key(),
        config_path.cluster_install_stage0_key(),
    ]

    logger.info("Deleting objects for cluster %s from bucket %s. This may take some while.",
                cluster_name, bucket.get_bucket_name())
    # Everything under "<cluster_name>/" goes, except the stage0 keys deleted below.
    bucket.delete_all(obj_prefix=cluster_name + "/", exempt=keep_for_last)
    logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                cluster_name, bucket.get_bucket_name())

    logger.info("Deleting stage0 information ...")
    for key in keep_for_last:
        bucket.delete_object(key)
    logger.info("Deleting stage0 information ... DONE")
def __init__(self, cluster_name, cluster_id=None, cloud_profile=None, generate_name_id=False, dry_run=True):
    """Resolve the cluster name-id record and set up the state machine.

    :param cluster_name: cluster name, or full "<name>-<id>" when cluster_id is None
    :param cluster_id: optional cluster id; appended to cluster_name when given
    :param cloud_profile: cloud profile used for all lookups
    :param generate_name_id: pre-generate the name-id record (installation path)
    :param dry_run: when True, state changes are not persisted
    """
    full_name = "{}-{}".format(cluster_name, cluster_id) if cluster_id else cluster_name
    self._idobj = AXClusterId(name=full_name, aws_profile=cloud_profile)

    if generate_name_id:
        # This is used during installation to pre-generate cluster name id record
        try:
            self._idobj.get_cluster_name_id()
        except Exception as e:
            logger.info("Cannot find cluster name id: %s. Cluster is not yet created.", e)
            self._idobj.create_cluster_name_id()

    self._csm = ClusterStateMachine(
        cluster_name_id=self._idobj.get_cluster_name_id(),
        cloud_profile=cloud_profile)
    self._dry_run = dry_run
def cluster(self, args):
    """Handle "axtool cluster" subcommands.

    start/create and stop/delete are rejected (moved to axinstaller);
    'show' prints kubeconfig and cluster/node/pod status;
    'download-config' fetches kube credentials from S3.

    :param args: parsed CLI namespace; reads subcommand, cluster_name,
                 aws_profile, target_cloud
    """
    from ax.platform.ax_cluster_info import AXClusterInfo
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv
    Cloud().set_target_cloud(args.target_cloud)
    # In-pod runs can discover the cluster; outside, the name is mandatory.
    assert AXEnv().is_in_pod(
    ) or args.cluster_name, "Must specify cluster name from outside cluster"
    if args.subcommand in ['start', 'create']:
        logger.error("=" * 80)
        logger.error(
            "axtool cluster start/create has be moved to axinstaller")
        logger.error("=" * 80)
        sys.exit(1)
    elif args.subcommand in ['stop', 'delete']:
        logger.error("=" * 80)
        logger.error(
            "axtool cluster stop/delete has be moved to axinstaller")
        logger.error("=" * 80)
        sys.exit(1)
    elif args.subcommand == 'show':
        import subprocess
        name_id = AXClusterId(args.cluster_name,
                              args.aws_profile).get_cluster_name_id()
        # Kube key must be present locally before kubectl can be used.
        AXClusterInfo(name_id,
                      aws_profile=args.aws_profile).download_kube_key()
        conf_file = AXClusterInfo(
            name_id, aws_profile=args.aws_profile).download_kube_config()
        logger.info("Kubeconfig")
        with open(conf_file, "r") as f:
            conf = f.read()
        logger.info("%s", conf)
        subprocess.call(
            ["kubectl", "--kubeconfig", conf_file, "cluster-info"])
        subprocess.call(
            ["kubectl", "--kubeconfig", conf_file, "get", "no"])
        subprocess.call([
            "kubectl", "--kubeconfig", conf_file, "--namespace", "axsys",
            "get", "po"
        ])
    elif args.subcommand == 'download-config':
        name_id = AXClusterId(args.cluster_name,
                              args.aws_profile).get_cluster_name_id()
        # Kube key only exists on AWS-target clusters.
        if Cloud().target_cloud_aws():
            AXClusterInfo(
                name_id, aws_profile=args.aws_profile).download_kube_key()
        AXClusterInfo(name_id,
                      aws_profile=args.aws_profile).download_kube_config()
def get_log_urls_for_container(pstat, podname, containername, instance_id):
    """Return (live_log_url, archived_log_url) for one container of a pod.

    :param pstat: pod status object (kubernetes API shape)
    :param podname: pod name (used in error messages)
    :param containername: container whose logs are wanted
    :param instance_id: service instance id used in the archived log key
    :raises AXPlatformException: when the container has not started yet
    """
    assert pstat.metadata.self_link, "Pod status does not have self_link"
    url_run = "{}/log?container={}".format(pstat.metadata.self_link, containername)

    docker_id = None
    for status in pstat.status.container_statuses:
        if status.name != containername:
            continue
        if status.container_id is None:
            # Running: The pod has been bound to a node, and all of the containers have been created.
            # At least one container is still running, or is in the process of starting or restarting.
            raise AXPlatformException(
                "log urls can only be obtained after pod {} has started. Current status of container is {}"
                .format(podname, status))
        docker_id = status.container_id[len("docker://"):]
    assert docker_id is not None, "Docker ID of created container {} in pod {} was not found".format(
        containername, podname)

    name_id = AXClusterId().get_cluster_name_id()
    data_path = AXClusterDataPath(name_id)
    url_done = "/{}/{}/{}/{}.{}.log".format(data_path.bucket(), data_path.artifact(),
                                            instance_id, containername, docker_id)
    return url_run, url_done
def __init__(self, cluster_name, cluster_id=None, cloud_profile=None):
    """Build the AXClusterId for "<cluster_name>[-<cluster_id>]" under the given profile.

    :param cluster_name: cluster name, or full "<name>-<id>" when cluster_id is None
    :param cluster_id: optional cluster id
    :param cloud_profile: cloud profile for AXClusterId lookups
    """
    full_name = "{}-{}".format(cluster_name, cluster_id) if cluster_id else cluster_name
    self._idobj = AXClusterId(name=full_name, aws_profile=cloud_profile)
def visibility_to_elb_name(visibility):
    """Map an external-route visibility to this cluster's managed ELB name.

    Organization-wide routes use the private ingress ELB ("ing-pri");
    everything else uses the public one ("ing-pub").
    """
    name_id = AXClusterId().get_cluster_name_id()
    suffix = "ing-pri" if visibility == ExternalRouteVisibility.VISIBILITY_ORGANIZATION else "ing-pub"
    return ManagedElb.get_elb_name(name_id, suffix)
def __init__(self, config_file_path):
    """Load platform objects from a config file and set resource multipliers.

    :param config_file_path: path to the platform config file; must exist
    """
    assert os.path.isfile(config_file_path), "Config file {} is not a file".format(
        config_file_path)
    self._config_file = config_file_path
    self._cluster_name_id = AXClusterId().get_cluster_name_id()
    self._cluster_config = AXClusterConfig(cluster_name_id=self._cluster_name_id)

    if self._cluster_config.get_cluster_provider().is_user_cluster():
        # User-provided clusters are not scaled: all multipliers are identity.
        self.cpu_mult = 1
        self.mem_mult = 1
        self.disk_mult = 1
        self.daemon_cpu_mult = 1
        self.daemon_mem_mult = 1
    else:
        (self.cpu_mult, self.mem_mult, self.disk_mult,
         self.daemon_cpu_mult, self.daemon_mem_mult) = self._get_resource_multipliers()

    self._swagger_components = []
    self._yaml_components = []
    self._updated_raw = ""

    # TODO: when we support config software info using a config file, need to figure out how that
    # file gets passed through, since SoftwareInfo is not a singleton
    self._software_info = SoftwareInfo()

    self._load_objects()
    self._load_raw()
def __init__(self):
    """Cache cluster/customer identity and initialize the hourly/daily schedule state."""
    self._cluster_name_id = AXClusterId().get_cluster_name_id()
    self._cluster_name = AXClusterId().get_cluster_name()
    self._cluster_id = AXClusterId().get_cluster_id()
    self._account = AXCustomerId().get_customer_id()
    self._sleep_interval = SECONDS_PER_MINUTE
    self._hourly = SECONDS_PER_HOUR
    self._daily = SECONDS_PER_DAY
    # Negative "last run" stamps force both the hourly and daily jobs
    # to fire on the very first tick.
    self._last_hourly = -self._hourly
    self._last_daily = -self._daily
    self._elasticsearch_host = "elasticsearch"
    # Fix: the message labeled this value "cluster_id" but the argument is the
    # full cluster name-id; the label now matches what is actually logged.
    logger.debug("AX account: %s cluster_name_id: %s", self._account,
                 self._cluster_name_id)
def __init__(self, namespace="axuser"):
    """Start the volume manager for one namespace.

    Loads existing volume pools from the provider, subscribes to volume
    events via a callback thread, and prepares an EC2 client plus a lock
    for raw EBS volume creation.

    :param namespace: Kubernetes namespace whose volume pools are managed
    """
    logger.debug("Starting volume manager for {}".format(namespace))
    self._namespace = namespace

    # map of volume pools
    self._pools = {}
    existing_pool_list = VolumeManager._get_pools_from_provider(namespace)
    # NOTE(review): iteritems() implies this module targets Python 2.
    for poolname, meta in existing_pool_list.iteritems():
        meta_dict = json.loads(meta)
        size = meta_dict["size"]
        attribs = meta_dict["attributes"]
        self._pools[poolname] = VolumePool(poolname, namespace, size, attribs)

    # Event callback runs on its own thread once start() is called.
    self._cb = Callback()
    self._cb.add_cb(self._handle_volume_event)
    self._cb.start()

    # The following are used for raw EBS volumes.
    def get_region():
        # Outside a pod there is no metadata service; fall back to a fixed region.
        return AWSMetaData().get_region() if AXEnv().is_in_pod(
        ) else "us-west-2"

    self.ec2 = boto3.Session().client('ec2', region_name=get_region())
    self.cluster_id = AXClusterId().get_cluster_name_id()

    # This lock is used for synchronizing raw ebs volume creations
    self.raw_disk_lock = Lock()
def get_log_urls(self, service_instance_id):
    """Return (live_log_url, archived_log_url) for this pod's main container.

    :param service_instance_id: service instance id used in the archived log key
    :raises AXPlatformException: when the main container has not started yet
    """
    cname = self.get_main_container_name()
    url_run = "/api/v1/namespaces/{}/pods/{}/log?container={}&follow=true".format(
        self.namespace, self.name, cname)

    pod = self._get_status_obj()
    docker_id = None
    for status in pod.status.container_statuses:
        if status.name != cname:
            continue
        if status.container_id is None:
            # Running: The pod has been bound to a node, and all of the containers have been created.
            # At least one container is still running, or is in the process of starting or restarting.
            raise AXPlatformException(
                "log urls can only be obtained after pod {} has started. Current status of container is {}"
                .format(self.name, status))
        docker_id = status.container_id[len("docker://"):]
    assert docker_id is not None, "Docker ID of created container {} in pod {} was not found".format(
        self.name, cname)

    name_id = AXClusterId().get_cluster_name_id()
    data_path = AXClusterDataPath(name_id)
    url_done = "/{}/{}/{}/{}.{}.log".format(data_path.bucket(), data_path.artifact(),
                                            service_instance_id, cname, docker_id)
    return url_run, url_done
def _set_s3(self):
    """
    Set bucket, log_s3_prefix, s3_processor
    """
    logger.info("Setting up s3 ...")
    cluster_name_id = AXClusterId().get_cluster_name_id()

    # Data bucket: holds artifacts for this cluster.
    data_path = AXClusterDataPath(cluster_name_id)
    self._bucket_name = data_path.bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name)
    self._log_s3_prefix = data_path.artifact()

    # AX log bucket: may live outside the customer account.
    log_path = AXLogPath(cluster_name_id)
    self._bucket_ax_is_external = log_path.is_external()
    self._bucket_name_ax = log_path.bucket()
    self._bucket_ax = Cloud().get_bucket(self._bucket_name_ax)
    self._log_s3_prefix_ax = log_path.artifact()

    assert self._bucket.exists(), "S3 bucket {} DOES NOT exist".format(
        self._bucket_name)
    assert self._bucket_ax.exists(), "S3 bucket {} DOES NOT exist".format(
        self._bucket_name_ax)

    logger.info("Using S3 bucket %s, with log prefix %s",
                self._bucket.get_bucket_name(), self._log_s3_prefix)
    logger.info("Using S3 bucket %s, with log prefix %s for AX",
                self._bucket_ax.get_bucket_name(), self._log_s3_prefix_ax)
def __init__(self):
    """Resolve cluster identity and load the config values terraform needs."""
    self.name_id = AXClusterId().get_cluster_name_id()
    paths = AXClusterConfigPath(name_id=self.name_id)
    self.bucket = paths.bucket()
    self.terraform_dir = paths.terraform_dir()
    # Build the cluster config once instead of three times; each construction
    # may re-read the persisted config.
    cluster_config = AXClusterConfig()
    self.region = cluster_config.get_region()
    self.placement = cluster_config.get_zone()
    self.trusted_cidrs = cluster_config.get_trusted_cidr()
    self.s3 = AXS3Bucket(bucket_name=self.bucket)
def __init__(self, cluster_name_id=None, aws_profile=None, config=None):
    """Bind to this cluster's config object in its S3 config bucket.

    :param cluster_name_id: cluster name id; resolved automatically when None
    :param aws_profile: AWS profile used for S3 access
    :param config: optional pre-loaded config dict
    """
    self._cluster_name_id = AXClusterId(
        name=cluster_name_id, aws_profile=aws_profile).get_cluster_name_id()
    config_path = AXClusterConfigPath(self._cluster_name_id)
    self._bucket_name = config_path.bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=aws_profile)
    self._cluster_config_key = config_path.cluster_config()
    self._conf = config
def __init__(self, input_name, cloud_profile):
    """Resolve the full cluster name-id and attach config/info accessors.

    :param input_name: cluster name or <cluster_name>-<cluster_id> format
    :param cloud_profile: cloud profile used for all lookups
    """
    resolved_id = AXClusterId(name=input_name,
                              aws_profile=cloud_profile).get_cluster_name_id()
    self.cluster_config = AXClusterConfig(cluster_name_id=resolved_id,
                                          aws_profile=cloud_profile)
    self.cluster_info = AXClusterInfo(cluster_name_id=resolved_id,
                                      aws_profile=cloud_profile)
def __init__(self, customer_id, cluster_name_id, aws_profile):
    """Open the cluster-config and support buckets for one customer/cluster.

    :param customer_id: customer identifier
    :param cluster_name_id: full cluster name-id
    :param aws_profile: AWS profile used for bucket access
    """
    self._customer_id = customer_id
    self._cluster_name_id = cluster_name_id
    self._cluster_name = AXClusterId(cluster_name_id).get_cluster_name()
    self._aws_profile = aws_profile

    cloud = Cloud()
    self._cluster_bucket = cloud.get_bucket(
        AXClusterConfigPath(cluster_name_id).bucket(),
        aws_profile=self._aws_profile)
    self._support_bucket = cloud.get_bucket(
        AXSupportConfigPath(cluster_name_id).bucket(),
        aws_profile=self._aws_profile)
def _delete_data_bucket(self):
    """Delete every object under this cluster's prefix in the applatix-data bucket."""
    logger.info("Deleting applatix-data bucket contents for cluster %s ...", self._name_id)
    bucket = Cloud().get_bucket(AXClusterDataPath(name_id=self._name_id).bucket(),
                                aws_profile=self._aws_profile,
                                region=self._aws_region)
    cluster_name = AXClusterId(name=self._name_id).get_cluster_name()
    logger.info("Deleting objects for cluster %s from bucket %s. This may take some while.",
                cluster_name, bucket.get_bucket_name())
    bucket.delete_all(obj_prefix=cluster_name + "/")
    logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                cluster_name, bucket.get_bucket_name())
def kubernetes(self, args):
    """Handle "axtool kubernetes" create/delete of a single platform object.

    :param args: parsed CLI namespace; reads subcommand, cluster_name,
                 aws_profile, object_name
    """
    from ax.platform.platform import AXPlatform
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv

    # In-pod runs can discover the cluster; outside, the name is mandatory.
    assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"
    name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()
    plat = AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile)

    # Dispatch table mirrors the original if/elif: unknown subcommands are no-ops.
    actions = {'create': plat.start_one, 'delete': plat.stop_one}
    action = actions.get(args.subcommand)
    if action is not None:
        action(args.object_name)
def __init__(self, containername, customer_image, namespace, version):
    """Build the artifacts sidecar container: image, volumes and env vars.

    :param containername: name for this container
    :param customer_image: user image name, exported as AX_CUSTOMER_IMAGE_NAME
    :param namespace: registry namespace the artifacts image lives under
    :param version: artifacts image tag
    """
    s = SoftwareInfo()
    super(ArtifactsContainer, self).__init__(
        containername, "{}/{}/artifacts:{}".format(s.registry, namespace,
                                                   version))

    # artifacts scratch space
    self._artifacts_scratch = ContainerVolume(
        "artifacts-scratch", ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    self._artifacts_scratch.set_type("EMPTYDIR")
    self.add_volume(self._artifacts_scratch)

    # create a hostpath for docker-socket-dir. This is used to for running docker inspect
    socket_hostpath = ContainerVolume("docker-socket-file",
                                      "/var/run/docker.sock")
    socket_hostpath.set_type("HOSTPATH", "/var/run/docker.sock")
    self.add_volume(socket_hostpath)

    # emptydir for sharing for copying static binaries from init container
    # so that they are available in the main container
    self._static_bins = ContainerVolume("static-bins", "/copyto")
    self._static_bins.set_type("EMPTYDIR")
    self.add_volume(self._static_bins)

    # add environment vars needed for artifacts
    self.add_env("AX_TARGET_CLOUD", value=Cloud().target_cloud())
    self.add_env("AX_CLUSTER_NAME_ID",
                 value=AXClusterId().get_cluster_name_id())
    self.add_env("AX_CUSTOMER_ID", value=AXCustomerId().get_customer_id())
    self.add_env("AX_CUSTOMER_IMAGE_NAME", value=customer_image)
    self.add_env("AX_ARTIFACTS_SCRATCH",
                 value=ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
    # value_from fields are resolved by Kubernetes (downward API) at pod start.
    self.add_env("AX_POD_NAME", value_from="metadata.name")
    self.add_env("AX_POD_IP", value_from="status.podIP")
    self.add_env("AX_POD_NAMESPACE", value_from="metadata.namespace")
    self.add_env("AX_NODE_NAME", value_from="spec.nodeName")
    self.add_env("ARGO_LOG_BUCKET_NAME",
                 os.getenv("ARGO_LOG_BUCKET_NAME", ""))
    self.add_env("ARGO_DATA_BUCKET_NAME",
                 os.getenv("ARGO_DATA_BUCKET_NAME", ""))

    # pod annotations exposed as files under /etc/axspec
    annotation_vol = ContainerVolume("annotations", "/etc/axspec")
    annotation_vol.set_type("DOWNWARDAPI", "metadata.annotations")
    self.add_volume(annotation_vol)

    # AA-3175: CPU and memory are set to lowest possible so that pod requests are kept at a minimum
    self.add_resource_constraints("cpu_cores", 0.001)
    self.add_resource_constraints("mem_mib", 4)
def __init__(self, name, namespace="axuser"):
    """Set up the kube client and S3 locations for pod `name` in `namespace`.

    :param name: pod name
    :param namespace: Kubernetes namespace the pod lives in
    """
    self.name = name
    self.namespace = namespace
    self.client = KubernetesApiClient(use_proxy=True)

    self.service = None  # this is the argo.services.service.Service object
    self._host_vols = []

    self._name_id = AXClusterId().get_cluster_name_id()
    log_path = AXLogPath(self._name_id)
    data_path = AXClusterDataPath(self._name_id)
    self._s3_bucket_ax_is_external = log_path.is_external()
    self._s3_bucket_ax = log_path.bucket()
    self._s3_key_prefix_ax = log_path.artifact()
    self._s3_bucket = data_path.bucket()
    self._s3_key_prefix = data_path.artifact()

    self.software_info = SoftwareInfo()
    self._resources = AXResources()
def platform(self, args):
    """Handle "axtool platform" start/stop of the AX platform.

    :param args: parsed CLI namespace; reads subcommand, cluster_name,
                 aws_profile, target_cloud, debug
    """
    from ax.platform.platform import AXPlatform
    from ax.meta import AXClusterId
    from ax.platform_client.env import AXEnv

    Cloud().set_target_cloud(args.target_cloud)
    # In-pod runs can discover the cluster; outside, the name is mandatory.
    assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"
    name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()
    if args.subcommand == 'start':
        AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile,
                   debug=args.debug).start()
    elif args.subcommand == 'stop':
        AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile).stop()
    else:
        # Fix: the format string has three placeholders but only two arguments
        # were supplied, so logging itself failed; pass the offending
        # subcommand as the middle argument.
        logger.error("%sInvalid command '%s'%s", COLOR_RED, args.subcommand, COLOR_NORM)
        sys.exit(1)
def __init__(self):
    """Set up the kube batch client and S3 locations for job management."""
    self.client = KubernetesApiClient(use_proxy=True)
    self.batchapi = self.client.batchv
    self.kube_namespace = "axuser"
    self.jobname = None

    self.service = None  # this is the argo.services.service.Service object
    self._host_vols = []

    self._name_id = AXClusterId().get_cluster_name_id()
    log_path = AXLogPath(self._name_id)
    data_path = AXClusterDataPath(self._name_id)
    self._s3_bucket_ax_is_external = log_path.is_external()
    self._s3_bucket_ax = log_path.bucket()
    self._s3_key_prefix_ax = log_path.artifact()
    self._s3_bucket = data_path.bucket()
    self._s3_key_prefix = data_path.artifact()

    self._attribute_map = {"uuid": "metadata.uid"}
    self.software_info = SoftwareInfo()
    self._ax_resources = {}
def __init__(self, name, application):
    """Create a deployment handle.

    Each deployment has a name and belongs to an application; the
    application maps to a kubernetes namespace, and the deployment is
    created in that namespace.

    Args:
        name: deployment name
        application: the application that this deployment runs under
    """
    self.name = name
    self.application = application
    self.spec = None

    self.client = KubernetesApiClient(use_proxy=True)
    self._nameid = AXClusterId().get_cluster_name_id()
    self._software_info = SoftwareInfo()
    self._app_obj = Application(application)
def __init__(self,
             cluster_name_id=None,
             aws_profile=None,
             debug=True,
             manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
             config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
             software_info=None):
    """
    AX Platform bootstrap

    :param cluster_name_id: cluster name id
    :param aws_profile: aws profile to authenticate all aws clients
    :param debug: debug mode
    :param manifest_root: root directory to all ax service objects
    :param config_file: path to the platform configuration file
    :param software_info: optional pre-built SoftwareInfo; created when omitted
    """
    self._software_info = software_info if software_info else SoftwareInfo()
    assert isinstance(
        self._software_info, SoftwareInfo
    ), "Wrong type ({}) of software info passed in.".format(
        self._software_info)
    self._aws_profile = aws_profile
    self._manifest_root = manifest_root
    self._config = AXPlatformConfig(config_file)

    logger.info("Using Kubernetes manifest from %s", self._manifest_root)
    logger.info("Using platform configuration \"%s\" from %s",
                self._config.name, config_file)

    self._cluster_name_id = AXClusterId(
        cluster_name_id).get_cluster_name_id()
    self._cluster_config = AXClusterConfig(
        cluster_name_id=self._cluster_name_id, aws_profile=self._aws_profile)
    # Fix: use the RESOLVED name id here. The raw `cluster_name_id` argument
    # may be None (in-pod callers rely on auto-discovery), while every other
    # consumer in this constructor uses the resolved self._cluster_name_id.
    self._cluster_config_path = AXClusterConfigPath(self._cluster_name_id)
    self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                       aws_profile=self._aws_profile)

    self._region = self._cluster_config.get_region()
    if Cloud().target_cloud_aws():
        self._account = AWSAccountInfo(
            aws_profile=self._aws_profile).get_account_id()
    else:
        self._account = ""
    self._bucket_name = self._cluster_config_path.bucket()
    self._bucket = Cloud().get_bucket(self._bucket_name,
                                      aws_profile=self._aws_profile,
                                      region=self._region)

    # In debug mode, when we failed to create an object, we don't delete it but just
    # leave it for debug.
    self._debug = debug

    # DNS
    self.cluster_dns_name = None

    # Get kube cluster config. Automatic if in pod already.
    self._kube_config = self._cluster_info.get_kube_config_file_path(
    ) if self._cluster_name_id else None
    if self._cluster_name_id:
        if not os.path.isfile(self._kube_config):
            logger.info(
                "Can't find config file at %s; downloading from s3",
                self._kube_config)
            self._kube_config = self._cluster_info.download_kube_config()
        assert os.path.isfile(
            self._kube_config), "No kube_config file available"

    # Kubernetes related objects and macros
    self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
    self.kube_axsys_namespace = AXNameSpaces.AXSYS
    self.kube_user_namespace = AXNameSpaces.AXUSER
    self.kubectl = KubernetesApiClient(config_file=self._kube_config)
    self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

    self._monitor = AXKubeMonitor(kubectl=self.kubectl)
    self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
    self._monitor.start()

    # Kube Objects
    self._kube_objects = {}
    self._replacing = {}
class ClusterOperationBase(with_metaclass(abc.ABCMeta, object)):
    """Template base class for cluster operations.

    Subclasses implement pre_run / run / post_run; start() drives the three
    phases in order. Construction resolves the cluster name-id record and
    attaches a ClusterStateMachine for state persistence.
    """

    def __init__(self, cluster_name, cluster_id=None, cloud_profile=None, generate_name_id=False, dry_run=True):
        # Full input name is "<cluster_name>-<cluster_id>" when an id is given.
        if cluster_id:
            input_name = "{}-{}".format(cluster_name, cluster_id)
        else:
            input_name = cluster_name
        self._idobj = AXClusterId(name=input_name, aws_profile=cloud_profile)
        if generate_name_id:
            # This is used during installation to pre-generate cluster name id record
            try:
                self._idobj.get_cluster_name_id()
            except Exception as e:
                logger.info(
                    "Cannot find cluster name id: %s. Cluster is not yet created.", e)
                self._idobj.create_cluster_name_id()
        self._csm = ClusterStateMachine(
            cluster_name_id=self._idobj.get_cluster_name_id(),
            cloud_profile=cloud_profile)
        self._dry_run = dry_run

    def start(self):
        """Run the full operation: pre_run, then run, then post_run."""
        self.pre_run()
        self.run()
        self.post_run()

    @abc.abstractmethod
    def run(self):
        """
        Main operation logics
        :return:
        """
        pass

    @abc.abstractmethod
    def pre_run(self):
        """
        Pre run actions, mainly setup / validations
        :return:
        """
        pass

    @abc.abstractmethod
    def post_run(self):
        """
        Post run actions, i.e. cleanups
        :return:
        """
        pass

    def _persist_cluster_state_if_needed(self):
        # Dry runs must never mutate persisted cluster state.
        if self._dry_run:
            logger.info("DRY RUN: not persisting cluster state")
        else:
            self._csm.persist_state()
def axmon_artifacts_base():
    """Return the S3 base path ("/<bucket>/<artifact-prefix>") for cluster artifacts as JSON."""
    name_id = AXClusterId().get_cluster_name_id()
    config_path = AXClusterConfigPath(name_id)
    # NOTE(review): the original local was named "account", but the value is
    # the config bucket name.
    bucket = config_path.bucket()
    artifact_prefix = config_path.artifact()
    return jsonify(result="/{}/{}".format(bucket, artifact_prefix))