コード例 #1
0
 def _delete_data_bucket(self):
     """
     Delete every object stored under this cluster's prefix in the data bucket.

     Objects are selected by the key prefix "<cluster name>/"; only
     ``delete_all`` is called, the bucket itself is not removed here.
     """
     logger.info(
         "Deleting applatix-data bucket contents for cluster %s ...",
         self._name_id)

     cloud = Cloud()
     bucket_path = AXClusterDataPath(name_id=self._name_id)
     bucket = cloud.get_bucket(bucket_path.bucket(),
                               aws_profile=self._aws_profile,
                               region=self._aws_region)

     name = AXClusterId(name=self._name_id).get_cluster_name()
     key_prefix = name + "/"

     logger.info(
         "Deleting objects for cluster %s from bucket %s. This may take some while.",
         name, bucket.get_bucket_name())
     bucket.delete_all(obj_prefix=key_prefix)
     logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                 name, bucket.get_bucket_name())
コード例 #2
0
    def __init__(self, containername, customer_image, namespace, version):
        """
        Build the artifacts container: image, volumes, environment variables
        and resource requests.

        :param containername: name given to the container
        :param customer_image: exported as AX_CUSTOMER_IMAGE_NAME
        :param namespace: registry namespace used in the image path
        :param version: tag of the artifacts image
        """
        software = SoftwareInfo()
        image = "{}/{}/artifacts:{}".format(software.registry, namespace,
                                            version)
        super(ArtifactsContainer, self).__init__(containername, image)

        # Scratch space for artifacts.
        scratch = ContainerVolume(
            "artifacts-scratch",
            ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)
        self._artifacts_scratch = scratch
        scratch.set_type("EMPTYDIR")
        self.add_volume(scratch)

        # Host path to the docker socket; it is used for running docker inspect.
        docker_sock = ContainerVolume("docker-socket-file",
                                      "/var/run/docker.sock")
        docker_sock.set_type("HOSTPATH", "/var/run/docker.sock")
        self.add_volume(docker_sock)

        # Empty dir into which the init container copies static binaries so
        # that they are available in the main container.
        bins = ContainerVolume("static-bins", "/copyto")
        self._static_bins = bins
        bins.set_type("EMPTYDIR")
        self.add_volume(bins)

        # Environment variables needed for artifacts: plain values first ...
        self.add_env("AX_TARGET_CLOUD", value=Cloud().target_cloud())
        self.add_env("AX_CLUSTER_NAME_ID",
                     value=AXClusterId().get_cluster_name_id())
        self.add_env("AX_CUSTOMER_ID", value=AXCustomerId().get_customer_id())
        self.add_env("AX_CUSTOMER_IMAGE_NAME", value=customer_image)
        self.add_env("AX_ARTIFACTS_SCRATCH",
                     value=ArtifactsContainer.ARTIFACTS_CONTAINER_SCRATCH)

        # ... then the ones resolved from pod fields.
        pod_field_envs = (
            ("AX_POD_NAME", "metadata.name"),
            ("AX_POD_IP", "status.podIP"),
            ("AX_POD_NAMESPACE", "metadata.namespace"),
            ("AX_NODE_NAME", "spec.nodeName"),
        )
        for env_name, field_path in pod_field_envs:
            self.add_env(env_name, value_from=field_path)

        self.add_env("ARGO_LOG_BUCKET_NAME",
                     os.getenv("ARGO_LOG_BUCKET_NAME", ""))

        # Pod annotations are exposed as a volume under /etc/axspec.
        annotations = ContainerVolume("annotations", "/etc/axspec")
        annotations.set_type("DOWNWARDAPI", "metadata.annotations")
        self.add_volume(annotations)

        # AA-3175: CPU and memory are set to lowest possible so that pod requests are kept at a minimum
        for resource, amount in (("cpu_cores", 0.001), ("mem_mib", 4)):
            self.add_resource_constraints(resource, amount)
コード例 #3
0
    def validate(self):
        """
        Check this configuration and collect every problem found.

        :return: list of error messages; empty when nothing was flagged
        """
        errors = []
        errors.extend(self._validate_critical_directories())

        if not self.cluster_name:
            errors.append("Please provide cluster name to pause the cluster")

        if self.cloud_provider in Cloud.VALID_TARGET_CLOUD_INPUT:
            # Instantiate the Cloud singleton already at validation time so
            # we can ensure customer ID.
            Cloud(target_cloud=self.cloud_provider)
        else:
            message = "Cloud provider {} not supported. Please choose from {}".format(
                self.cloud_provider, Cloud.VALID_TARGET_CLOUD_INPUT)
            errors.append(message)

        return errors
コード例 #4
0
ファイル: ax_cluster_info.py プロジェクト: teddybearz/argo
 def download_kube_key(self):
     """
     Get the kube ssh key from the cluster bucket and save it to the local
     key file with owner-only permissions.

     Nothing is downloaded when ``Cloud().target_cloud_gcp()`` is true.

     :return: path of the written key file, or None when skipped
     :raises AssertionError: if the bucket returns no object for the key
     """
     if Cloud().target_cloud_gcp():
         return
     logger.info("Downloading cluster ssh key from s3 ...")
     data = self._bucket.get_object(self._s3_cluster_ssh_key)
     assert data is not None, "No kube ssh key at {}/{}".format(self._bucket_name, self._s3_cluster_ssh_key)

     key_dir = os.path.dirname(self._key_file)
     # dirname is "" for a bare file name, and os.makedirs("") raises, so
     # only create the directory when there is one to create.
     if key_dir and not os.path.exists(key_dir):
         os.makedirs(key_dir)

     # Create the file as 0600 from the start so the private key is never
     # readable by other users between the write and the chmod.
     fd = os.open(self._key_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
     with os.fdopen(fd, "w") as f:
         f.write(data)
     # The mode given to os.open only applies to newly created files, so
     # tighten permissions explicitly in case the file already existed.
     os.chmod(self._key_file, 0o0600)
     logger.info("Downloaded kube ssh key from %s/%s to %s", self._bucket_name, self._s3_cluster_ssh_key, self._key_file)
     return self._key_file
コード例 #5
0
ファイル: axtool.py プロジェクト: zhan849/argo
    def platform(self, args):
        """
        Start or stop the platform on a cluster.

        :param args: parsed command line arguments; reads target_cloud,
                     cluster_name, aws_profile, subcommand and, for 'start',
                     debug
        :raises AssertionError: when no cluster name is given outside a pod

        Exits the process with status 1 on an unknown subcommand.
        """
        from ax.platform.platform import AXPlatform
        from ax.meta import AXClusterId
        from ax.platform_client.env import AXEnv

        Cloud().set_target_cloud(args.target_cloud)

        assert AXEnv().is_in_pod() or args.cluster_name, "Must specify cluster name from outside cluster"
        name_id = AXClusterId(args.cluster_name, args.aws_profile).get_cluster_name_id()
        if args.subcommand == 'start':
            AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile, debug=args.debug).start()
        elif args.subcommand == 'stop':
            AXPlatform(cluster_name_id=name_id, aws_profile=args.aws_profile).stop()

        else:
            # The format string has three placeholders (color, command, color
            # reset); the offending subcommand must be passed as the middle one,
            # otherwise logging fails to format the record.
            logger.error("%sInvalid command '%s'%s", COLOR_RED, args.subcommand, COLOR_NORM)
            sys.exit(1)
コード例 #6
0
    def _update_data_bucket(self):
        """
        Create the cluster data bucket and, unless the cluster provider is
        ClusterProvider.USER, put the data CORS configuration on it.

        :raises AXPlatformException: when ``create()`` reports failure
        """
        cloud = Cloud()
        bucket_path = AXClusterDataPath(name_id=self._name_id)
        bucket = cloud.get_bucket(bucket_path.bucket(),
                                  aws_profile=self._aws_profile,
                                  region=self._aws_region)

        created = bucket.create()
        if not created:
            message = "Failed to create S3 bucket {}".format(
                bucket.get_bucket_name())
            raise AXPlatformException(message)

        needs_cors = self.cluster_config.get_cluster_provider() != ClusterProvider.USER
        if needs_cors:
            # The data bucket gets the CORS config as well.
            logger.info("Checking CORS config for %s.",
                        bucket.get_bucket_name())
            bucket.put_cors(DATA_CORS_CONFIG)

        logger.info("Created %s bucket ... DONE", bucket.get_bucket_name())
コード例 #7
0
ファイル: install_options.py プロジェクト: gnadaraj/argo
    def __init__(self, cfg):
        """
        Prepare a platform-only install configuration.

        Cluster-level options on ``cfg`` are overwritten with fixed values
        before ``cfg`` is handed to the parent class and wrapped in a
        ClusterInstallConfig, which is validated immediately.

        :param cfg: option object; cloud_provider, cluster_bucket and
                    kubeconfig are read, endpoint / access_key / secret_key
                    are optional
        """
        # Options that are forced regardless of what the caller supplied.
        forced_options = (
            ("cluster_size", AXClusterSize.CLUSTER_USER_PROVIDED),
            ("cloud_profile", "default"),
            ("cluster_type", "standard"),
            ("vpc_id", None),
            ("vpc_cidr_base", None),
            ("subnet_mask_size", None),
            ("trusted_cidrs", ClusterInstallDefaults.TRUSTED_CIDR),
            ("user_on_demand_nodes", None),
            ("spot_instances_option", "none"),
            ("cluster_autoscaling_scan_interval", None),
            ("support_object_store_name", ""),
            ("enable_sandbox", None),
            ("software_version_info", None),
        )
        for option, forced_value in forced_options:
            setattr(cfg, option, forced_value)

        self.cluster_size = cfg.cluster_size

        on_minikube = cfg.cloud_provider == "minikube"
        if not on_minikube:
            self.service_manifest_root = "/ax/config/service/argo-all"
            self.platform_bootstrap_config = "/ax/config/service/config/argo-all-platform-bootstrap.cfg"
        else:
            self.service_manifest_root = "/ax/config/service/argo-wfe"
            self.platform_bootstrap_config = "/ax/config/service/config/argo-wfe-platform-bootstrap.cfg"
            Cloud(target_cloud="aws")

        super(PlatformOnlyInstallConfig, self).__init__(cfg)
        self.install_config = ClusterInstallConfig(cfg=cfg)
        self.install_config.validate()

        self.cluster_bucket = cfg.cluster_bucket
        self.kube_config = cfg.kubeconfig

        # The object-store settings are all-or-nothing: if reading any of them
        # fails, every one of them is reset to None.
        try:
            endpoint, access_key, secret_key = (cfg.endpoint, cfg.access_key,
                                                cfg.secret_key)
        except Exception:
            endpoint = access_key = secret_key = None
        self.bucket_endpoint = endpoint
        self.access_key = access_key
        self.secret_key = secret_key

        # Overwrite the manifest_root and bootstrap_config.
        self.install_config.manifest_root = self.service_manifest_root
        self.install_config.bootstrap_config = self.platform_bootstrap_config
コード例 #8
0
ファイル: ax_cluster_info.py プロジェクト: teddybearz/argo
    def __init__(self, cluster_name_id, kube_config=None, key_file=None, metadata=None, aws_profile=None):
        """
        Set up local file locations and bucket keys for one cluster.

        :param cluster_name_id: cluster name_id in the form name-uuid, e.g.
                                lcj-cluster-515d9828-7515-11e6-9b3e-a0999b1b4e15
        :param kube_config: path of the saved kubernetes config file; falls
                            back to ``default_config_path`` when empty
        :param key_file: path of the cluster ssh key; falls back to
                         ``default_key_path`` when empty
        :param metadata: path to cluster metadata; falls back to
                         ``default_cluster_meta_path`` when empty
        :param aws_profile: AWS profile used to access S3
        :raises AssertionError: when no name_id is given outside a pod
        """
        assert AXEnv().is_in_pod() or cluster_name_id, "Must specify cluster name from outside cluster"
        self._aws_profile = aws_profile
        self._cluster_name_id = cluster_name_id

        self._config = AXClusterConfig(cluster_name_id=cluster_name_id, aws_profile=aws_profile)

        # Local files: explicit arguments win, otherwise use the class defaults.
        self._kube_config = kube_config or self.default_config_path.format(cluster_name_id)
        self._key_file = key_file or self.default_key_path.format(cluster_name_id)
        self._metadata_file = metadata or self.default_cluster_meta_path

        # Bucket and object keys, all derived from the cluster config path.
        paths = AXClusterConfigPath(name_id=cluster_name_id)
        self._bucket_name = paths.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=aws_profile)
        self._s3_kube_config_key = paths.kube_config()
        self._s3_cluster_ssh_key = paths.kube_ssh()
        self._s3_cluster_state_before_pause = paths.state_before_pause()
        self._s3_cluster_meta = paths.cluster_metadata()
        self._s3_cluster_software_info = paths.versions()
        self._s3_platform_manifest_dir = paths.platform_manifest_dir()
        self._s3_platform_config = paths.platform_config()

        self._s3_master_config_prefix = paths.master_config_dir()
        self._s3_master_attributes_path = paths.master_attributes_path()
        self._s3_master_user_data_path = paths.master_user_data_path()

        # Cluster staging info: stage1 and stage2 can be uploaded, downloaded
        # and deleted with AXClusterInfo. stage0 can only be downloaded with
        # AXClusterInfo; it is uploaded during cluster information
        # initialization (i.e. upload of cluster id and cluster config) and
        # deleted during cluster information clean up (i.e. during
        # axinstaller uninstall).
        staging_keys = {}
        staging_keys["stage0"] = paths.cluster_install_stage0_key()
        staging_keys["stage1"] = paths.cluster_install_stage1_key()
        staging_keys["stage2"] = paths.cluster_install_stage2_key()
        self._staging_info = staging_keys
コード例 #9
0
ファイル: container_specs.py プロジェクト: gnadaraj/argo
    def __init__(self, size_in_mb):
        """
        Configure the docker-in-docker sidecar container.

        :param size_in_mb: passed to the docker graph storage volume type when
                           ``Cloud().own_cloud()`` is AWS; unused otherwise
        """
        super(SidecarDockerDaemon, self).__init__(DIND_CONTAINER_NAME,
                                                  "argoproj/dind:1.12.6")

        # dind needs the host's kernel modules to load the aufs module.
        modules_vol = ContainerVolume("kernel-lib-module", "/lib/modules")
        modules_vol.set_type("HOSTPATH", "/lib/modules")
        self.add_volume(modules_vol)

        # Per node docker graph storage for the sidecar; plain empty dir when
        # not running on AWS.
        graph_vol = ContainerVolume("docker-graph-storage", "/var/lib/docker")
        on_aws = Cloud().own_cloud() == Cloud.CLOUD_AWS
        type_args = ("DOCKERGRAPHSTORAGE", size_in_mb) if on_aws else ("EMPTYDIR",)
        graph_vol.set_type(*type_args)
        self.add_volume(graph_vol)

        # dind daemon needs to be privileged!
        self.privileged = True
コード例 #10
0
    def validate(self):
        """
        Check this configuration, including the target software info, and
        collect every problem found.

        :return: list of error messages; empty when nothing was flagged
        """
        problems = list(self._validate_critical_directories())

        # Installation validates strictly, so the cluster is assumed to have a
        # valid name and cluster config; only presence is checked here.
        if not self.cluster_name:
            problems.append("Please provide cluster name to pause the cluster")

        provider_ok = self.cloud_provider in Cloud.VALID_TARGET_CLOUD_INPUT
        if provider_ok:
            # Instantiate the Cloud singleton already at validation time so
            # we can ensure customer ID.
            Cloud(target_cloud=self.cloud_provider)
        else:
            problems.append(
                "Cloud provider {} not supported. Please choose from {}".
                format(self.cloud_provider, Cloud.VALID_TARGET_CLOUD_INPUT))

        problems.extend(validate_software_info(self.target_software_info))
        return problems
コード例 #11
0
    def install_argo_only(self, args):
        """
        Install only the Argo platform using the given command line options.

        For minikube without a bucket endpoint, an s3 proxy is installed, a
        bucket named "argo" is created on it and placeholder credentials are
        used. Several environment variables are exported before the installer
        runs.

        :param args: parsed command line options (cluster_name, cloud_provider,
                     bucket_endpoint, cluster_bucket, cloud_region, kubeconfig,
                     ...); some fields are filled in here for minikube
        :raises AssertionError: when an option required for aws/gke is missing

        Exits the process with status 1 when no cluster name is given.
        """
        logger.info("Installing Argo platform ...")

        # Validate explicitly rather than with `assert`: assert statements are
        # removed under `python -O`, which would silently skip these checks.
        if not getattr(args, "cluster_name", None):
            print("--cluster-name needs to be specified")
            sys.exit(1)

        if args.cloud_provider == "minikube" and not args.bucket_endpoint:
            Cloud(target_cloud="aws")
            args.cluster_bucket = "argo"
            # TODO:revisit
            # access key and secret is required by code in aws_s3
            # use dummy access key and secret for s3proxy
            args.access_key = "fake-access-key"
            args.secret_key = "fake-secret-key"
            self._install_s3_proxy(args.kubeconfig)
            args.bucket_endpoint = self._get_s3_proxy_endpoint(args.kubeconfig)
            # Create bucket
            self._create_s3_proxy_bucket(args.bucket_endpoint, args.cluster_bucket)
        elif args.cloud_provider == "aws":
            # AssertionError is kept as the exception type so existing callers
            # see the same failure as before.
            if not args.cluster_bucket:
                raise AssertionError("--cluster-bucket is required")
            if not args.cloud_region:
                raise AssertionError("--cloud-region is required")
        elif args.cloud_provider == "gke":
            if not args.cluster_bucket:
                raise AssertionError("--cluster-bucket is required")

        logger.info("s3 bucket endpoint: %s", args.bucket_endpoint)

        # Settings are handed to the rest of the installer via the environment.
        os.environ["AX_CUSTOMER_ID"] = "user-customer-id"
        os.environ["ARGO_LOG_BUCKET_NAME"] = args.cluster_bucket
        os.environ["ARGO_DATA_BUCKET_NAME"] = args.cluster_bucket
        os.environ["ARGO_KUBE_CONFIG_PATH"] = args.kubeconfig
        os.environ["AX_TARGET_CLOUD"] = Cloud.CLOUD_AWS

        self._set_env_if_present(args)
        platform_install_config = PlatformOnlyInstallConfig(cfg=args)
        PlatformOnlyInstaller(platform_install_config).run()
        return
コード例 #12
0
ファイル: cluster_upgrader.py プロジェクト: zhan849/argo
    def _upgrade_kube(self):
        """
        Run the external ``upgrade-kubernetes`` command, passing cluster
        identity and old/new versions through environment variables.

        :return: None
        :raises subprocess.CalledProcessError: if the command exits non-zero
        """
        upgrade_env = {}
        upgrade_env["CLUSTER_NAME_ID"] = self._name_id
        upgrade_env["AX_CUSTOMER_ID"] = AXCustomerId().get_customer_id()
        upgrade_env["OLD_KUBE_VERSION"] = self._current_software_info.kube_version
        upgrade_env["NEW_KUBE_VERSION"] = self._cfg.target_software_info.kube_version
        upgrade_env["NEW_CLUSTER_INSTALL_VERSION"] = \
            self._cfg.target_software_info.kube_installer_version
        upgrade_env["ARGO_AWS_REGION"] = self._cluster_config.get_region()
        upgrade_env["AX_TARGET_CLOUD"] = Cloud().target_cloud()

        profile = self._cfg.cloud_profile
        if profile:
            upgrade_env["ARGO_AWS_PROFILE"] = profile

        logger.info("Upgrading Kubernetes with environments %s", pformat(upgrade_env))

        # The process environment is merged in last, so a variable that is
        # already set in os.environ replaces the value computed above.
        upgrade_env.update(os.environ)
        subprocess.check_call(["upgrade-kubernetes"], env=upgrade_env)
コード例 #13
0
    def _get_bucket_region_from_aws(self):
        """
        Determine the bucket's region from the headers of a head_bucket call.

        The call is issued from the region reported by the cloud metadata;
        the region is read from the response headers of either a successful
        call or a ClientError.

        :return: region string, or None when the error text contains
                 "Not Found" or no region header is present
        """
        # We assume cluster is not access any resource outside partition, e.g.
        # clusters in partition "aws" will not access resource in partition "aws-us-gov"
        start_region = Cloud().meta_data().get_region()
        session = boto3.Session(profile_name=self._aws_profile,
                                region_name=start_region)
        client = session.client("s3", config=Config(signature_version='s3v4'))

        logger.debug("Finding region for bucket %s from with initial region %s", self._name, start_region)
        try:
            result = client.head_bucket(Bucket=self._name)
        except ClientError as err:
            if "Not Found" in str(err):
                return None
            # Other errors may still carry the region in their response headers.
            result = getattr(err, "response", {})
            logger.debug("Head_bucket returned error %s, inspecting headers", result)
        else:
            logger.debug("Head_bucket returned OK %s", result)

        http_headers = result.get("ResponseMetadata", {}).get("HTTPHeaders", {})
        fallback_region = http_headers.get("x-amz-region", None)
        bucket_region = http_headers.get("x-amz-bucket-region", fallback_region)
        logger.debug("Found region %s from head_bucket for %s, headers %s", bucket_region, self._name, http_headers)
        return bucket_region
コード例 #14
0
    import traceback
    from greenlet import greenlet

    for ob in gc.get_objects():
        if not isinstance(ob, greenlet):
            continue
        if not ob:
            continue
        logger.debug(''.join(traceback.format_stack(ob.gr_frame)))


if __name__ == "__main__":
    # Main entry point for AXmon: parse the command line, set up logging,
    # then start the REST server and run AXMon.
    parser = argparse.ArgumentParser(description='AXMon')
    parser.add_argument('--version', action='version', version="%(prog)s {}".format(__version__))
    parser.add_argument('--port', type=int, default=AXMON_DEFAULT_PORT, help="Run server on the specified port")
    args = parser.parse_args()

    # Basic logging: DEBUG for the "ax" logger, WARNING for botocore/boto3.
    logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s %(lineno)d %(threadName)s: %(message)s")
    for _logger_name, _level in (("ax", logging.DEBUG),
                                 ("botocore", logging.WARNING),
                                 ("boto3", logging.WARNING)):
        logging.getLogger(_logger_name).setLevel(_level)

    # Target the cloud this process reports as its own.
    Cloud().set_target_cloud(Cloud().own_cloud())
    # SIGUSR1 invokes the `debug` handler.
    signal.signal(signal.SIGUSR1, debug)
    axmon_rest_start(port=args.port)
    AXMon().run()
コード例 #15
0
ファイル: container_waiter.py プロジェクト: nuaays/argo
def wait_for_container(jobname, podname, containername, artifact_scratch_path,
                       out_label):
    """
    Block until container `containername` of pod `podname` has terminated.

    The pod is polled through the kubelet client. On the first successful
    poll a start event is posted (skipped for job names starting with
    'axworkflowexecutor') and log collectors are started. Once the container
    reports a terminated state, container ids and pod info are written to
    /docker_id.txt and /k8s_info.txt, the dind container (if its id is known)
    is sent SIGINT through `docker kill`, and the function returns.

    :param jobname: job name; used for logging and when posting the event
    :param podname: name of the pod looked up in NAMESPACE
    :param containername: name of the container whose termination is awaited
    :param artifact_scratch_path: directory under which the done-flag file
                                  is looked for
    :param out_label: sub-directory of artifact_scratch_path that holds the
                      done-flag file
    :return: None. Also returns early when an HTTPError whose text contains
             "NOT FOUND" is raised while polling.
    """
    # start the waiter but it is possible that the event has passed so
    # poll the status once after the waiter is registered and then go
    # to sleep if the container is still running.

    global dind_container_id

    def get_container_status(s):
        """
        Pick the main and dind entries out of a pod status dict.

        :param s: pod status as a dict; "containerStatuses" may be missing
        :return: (main container status or None, dind container status or
                 None, dict mapping container name to docker id)
        """
        c_status = s.get("containerStatuses", None)
        main_container_status = None
        dind_container_status = None
        docker_ids = {}
        for c in c_status or []:
            name = c.get("name", None)
            if not name:
                continue
            if name == containername:
                main_container_status = c
            elif name == DIND_CONTAINER_NAME:
                dind_container_status = c
            cid = c.get("containerID", None)
            if cid:
                # Drop the first len("docker://") characters of the container ID.
                l = len("docker://")
                docker_id = cid[l:]
                logger.debug("Docker ID for {} is {}".format(name, docker_id))
                docker_ids[name] = docker_id

        return main_container_status, dind_container_status, docker_ids

    def check_pod_status(pod_status):
        """
        Decide whether the main container has terminated.

        When it has, docker ids and pod/container info are dumped as JSON to
        /docker_id.txt and /k8s_info.txt and the global dind_container_id is
        set if the dind container's id is known.

        :param pod_status: pod object whose `.status` is a V1PodStatus
        :return: True if the main container's "terminated" state is present
                 and not None, False otherwise
        """
        status = pod_status.status
        assert isinstance(status, swagger_client.V1PodStatus
                          ), "Expect to see an object of type V1PodStatus"
        status_dict = swagger_client.ApiClient().sanitize_for_serialization(
            status)
        logger.debug("status_dict=%s", status_dict)

        main_container_status, dind_container_status, docker_ids = get_container_status(
            status_dict)
        if main_container_status is None:
            # Main container not listed: not done, whether the pod is still
            # pending or the status is unexpected.
            if status_dict.get("phase", None) == "Pending":
                logger.debug("Pod still in pending state")
                return False
            else:
                logger.error("bad input %s", status_dict)
                logger.error(
                    "Could not find container %s in containerStatuses array",
                    containername)
                return False

        try:
            # A missing "state" or "terminated" key raises KeyError, which is
            # handled at the bottom and treated as "not terminated yet".
            x = main_container_status["state"]["terminated"]
            logger.debug("Current terminated state object is %s", x)
            k8s_info = {"container_status": {}}
            # Best effort: pod-level details are optional in the dump.
            try:
                k8s_info["pod_ip"] = status.pod_ip
                k8s_info["host_ip"] = status.host_ip
                k8s_info["start_time"] = status.start_time
            except Exception:
                pass
            if x is not None:
                try:
                    k8s_info["container_status"][containername] = x
                except Exception:
                    pass
                # Best effort: dind_container_status may be None or lack a
                # terminated state.
                try:
                    k8s_info["container_status"][
                        DIND_CONTAINER_NAME] = dind_container_status["state"][
                            "terminated"]
                except Exception:
                    pass

                # An AssertionError here is not caught by the KeyError handler
                # below; it propagates to the caller.
                assert docker_ids, "docker_id should be valid when container terminates"
                with open("/docker_id.txt", "w") as f:
                    f.write(json.dumps(docker_ids))
                with open("/k8s_info.txt", "w") as f:
                    f.write(json.dumps(k8s_info))
                if DIND_CONTAINER_NAME in docker_ids:
                    global dind_container_id
                    dind_container_id = docker_ids[DIND_CONTAINER_NAME]
                return True
            else:
                return False
        except KeyError as ke:
            logger.debug(
                "Expected state of terminated state not observed. Got KerError %s",
                ke)

        return False

    logger.info("jobname=%s podname=%s containername=%s", jobname, podname,
                containername)
    # Fall back to a placeholder id when the cloud metadata lookup fails.
    node_instance_id = "user-node"
    try:
        node_instance_id = Cloud().meta_data().get_instance_id()
    except Exception:
        pass
    logger.info("Using node instance id %s, namespace %s", node_instance_id,
                NAMESPACE)

    # If the default kubelet client cannot be created, retry with the host IP.
    try:
        kubelet_cli = KubeletClient()
    except Exception as e:
        host_ip = get_host_ip()
        kubelet_cli = KubeletClient(host_ip)

    # have to match with conainer_outer_executor.py
    container_done_flag_postfix = "_ax_container_done_flag"
    poll_container_done_flag_file = "{}/{}/{}".format(
        artifact_scratch_path, out_label, container_done_flag_postfix)
    service_instance_id = None
    # Upper bound for the range() of the done-flag polling loop below.
    check_file_round = 60 * 2

    count = 0
    posted_event = False
    # Outer loop: restart polling after an error. Inner loop: poll until done.
    while True:
        try:
            while True:
                count += 1

                # Kubelet client returns an iterator so we make it a list. As in a certain namespace, pod name
                # is unique, it's safe to always get pods[0]
                pods = [
                    p for p in kubelet_cli.list_namespaced_pods(
                        namespace=NAMESPACE, name=podname)
                ]
                pod_status = pods[0]
                assert isinstance(pod_status, swagger_client.V1Pod
                                  ), "Expect to see an object of type V1Pod"
                assert pod_status.metadata.name == podname

                # both containers are created so we can assume we have all the knowledge we need for posting URL
                if not posted_event:
                    try:
                        if not jobname.startswith('axworkflowexecutor'):
                            service_instance_id = post_update_to_axevent(
                                jobname, podname, containername, pod_status,
                                node_instance_id)
                        start_log_collectors(pod_name=podname,
                                             pod_status=pod_status)
                        posted_event = True
                    except Exception as e:
                        logger.exception(
                            "Could not post start event due to %s. Will retry later",
                            e)
                        time.sleep(1)
                        # Retry posting right away, except on every 10th
                        # iteration, where the pod status is checked anyway.
                        if count % 10 != 0:
                            continue

                done = check_pod_status(pod_status)
                logger.debug("Container %s in [%s][%s] done=%s", containername,
                             jobname, podname, done)
                if done:
                    # Last attempt to post the event if it never succeeded.
                    if not posted_event:
                        try:
                            service_instance_id = post_update_to_axevent(
                                jobname, podname, containername, pod_status,
                                node_instance_id)
                            start_log_collectors(pod_name=podname,
                                                 pod_status=pod_status)
                        except Exception as e:
                            logger.exception(
                                "Could not post start event due to %s.", e)

                    # stop the dind container
                    if dind_container_id:
                        exit_code = subprocess.call([
                            "{}/docker".format(
                                ARTIFACTS_CONTAINER_SCRATCH_PATH), "kill",
                            "-s", "INT", dind_container_id
                        ])
                        # TODO: Do docker inspect in a loop and make sure that container dies with clean exit code.
                        # TODO: If exit code is non-zero then ask WFE to ensure that it needs to kill the job controller
                        # TODO: before this pod is terminated
                        logger.debug(
                            "Exit code of stopping dind container is {}".
                            format(exit_code))

                        # request axmon to delete volume
                        # sidecar still has this code for backward compatibility for tasks that were started
                        # before docker graph storage used per node vol
                        try:
                            release_volume_for_dind(service_instance_id)
                        except Exception:
                            logger.exception("cannot release_volume_for_dind")
                    return

                # Not done yet: wait for the done-flag file, checking every
                # 2 seconds, before polling the pod status again.
                for _ in range(1, check_file_round):
                    if os.path.exists(poll_container_done_flag_file):
                        logger.debug("Container %s in [%s][%s] has %s",
                                     containername, jobname, podname,
                                     poll_container_done_flag_file)
                        # sleep 1 second to let container status propagate
                        time.sleep(1)
                        break
                    else:
                        time.sleep(2)
                else:
                    # Loop finished without seeing the flag file.
                    logger.debug("No %s yet, check status again",
                                 poll_container_done_flag_file)

        except requests.exceptions.HTTPError as he:
            # "NOT FOUND" is treated as final; other HTTP errors are retried.
            if "NOT FOUND" in str(he):
                logger.exception("Container %s not found, abort",
                                 containername)
                return
            else:
                time.sleep(10)
        except urllib3.exceptions.MaxRetryError:
            logger.exception("Sleep 10 seconds and retry")
            time.sleep(10)
        except Exception as e:
            # Anything else (including failed assertions above) is logged and
            # retried after a pause.
            logger.exception("Container %s in [%s][%s]. Exception type: %s",
                             containername, jobname, podname, type(e))
            time.sleep(10)
コード例 #16
0
ファイル: ax_kube_yaml_update.py プロジェクト: nuaays/argo
    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Build the essential environment variables added to all system
        containers.

        :param is_daemon: when true, the daemon CPU/MEM multipliers (or "1.0")
                          are used instead of the regular ones
        :param resource_updated: for daemons only; when true the daemon
                                 multipliers are used, otherwise "1.0"
        :return: list of V1EnvVar objects
        """
        specs = [
            # Kubernetes downward APIs
            {"name": "AX_NODE_NAME", "path": "spec.nodeName"},
            {"name": "AX_POD_NAME", "path": "metadata.name"},
            {"name": "AX_POD_NAMESPACE", "path": "metadata.namespace"},
            {"name": "AX_POD_IP", "path": "status.podIP"},
            # Values
            {"name": "DISK_MULT", "value": str(self.disk_mult)},
            {"name": "AX_TARGET_CLOUD", "value": Cloud().target_cloud()},
            {"name": "AX_CLUSTER_NAME_ID", "value": self._cluster_name_id},
            {"name": "AX_CUSTOMER_ID", "value": AXCustomerId().get_customer_id()},
        ]

        # Forward the AWS region only when it is set to something non-empty.
        region = os.environ.get("AX_AWS_REGION", "")
        if region:
            specs.append({"name": "AX_AWS_REGION", "value": region})

        # S3 credentials come from secrets; the endpoint is a plain value.
        if os.getenv("ARGO_S3_ACCESS_KEY_ID", ""):
            specs.extend([
                {"name": "ARGO_S3_ACCESS_KEY_ID", "secret": "argo-access-key"},
                {"name": "ARGO_S3_ACCESS_KEY_SECRET", "secret": "argo-secret-key"},
                {"name": "ARGO_S3_ENDPOINT", "value": os.getenv("ARGO_S3_ENDPOINT", None)},
            ])

        # Daemons are a special case for the CPU/MEM multipliers.
        if not is_daemon:
            cpu_mult, mem_mult = str(self.cpu_mult), str(self.mem_mult)
        elif resource_updated:
            cpu_mult, mem_mult = str(self.daemon_cpu_mult), str(self.daemon_mem_mult)
        else:
            cpu_mult, mem_mult = "1.0", "1.0"
        specs.append({"name": "CPU_MULT", "value": cpu_mult})
        specs.append({"name": "MEM_MULT", "value": mem_mult})

        # Turn each spec into a V1EnvVar: "path" -> field reference,
        # "secret" -> secret key reference, otherwise a literal value.
        env_vars = []
        for spec in specs:
            env_var = V1EnvVar()
            env_var.name = spec["name"]

            if spec.get("path", None):
                selector = V1ObjectFieldSelector()
                selector.field_path = spec["path"]
                source = V1EnvVarSource()
                source.field_ref = selector
                env_var.value_from = source
            elif spec.get("secret", None):
                # The same string serves as both the secret's name and its key.
                selector = V1SecretKeySelector()
                selector.key = spec["secret"]
                selector.name = spec["secret"]
                source = V1EnvVarSource()
                source.secret_key_ref = selector
                env_var.value_from = source
            else:
                env_var.value = spec["value"]
            env_vars.append(env_var)
        return env_vars
コード例 #17
0
    def update(self, iam):
        """
        Create the support and upgrade buckets in the portal account.

        For each bucket, in order (support first, then upgrade): create it,
        install a bucket policy only when the bucket has none yet, and once
        both are ready write a tag object into each of them.

        :param iam: forwarded unchanged to self._generate_bucket_policy_string
        :raises AXPlatformException: when a bucket cannot be created or its
            policy cannot be configured; the message names the bucket that
            actually failed
        """
        logger.info(
            "Creating applatix-support and applatix-upgrade buckets ...")
        support_bucket = Cloud().get_bucket(
            AXSupportConfigPath(name_id=self._name_id).bucket(),
            aws_profile=self._aws_profile,
            region=self._aws_region)
        upgrade_bucket = Cloud().get_bucket(
            AXUpgradeConfigPath(name_id=self._name_id).bucket(),
            aws_profile=self._aws_profile,
            region=self._aws_region)

        # Both buckets go through the same steps; handling them in one loop
        # guarantees every error message refers to the bucket being processed.
        for label, bucket in (("support", support_bucket),
                              ("upgrade", upgrade_bucket)):
            # Retry create while bucket is created is fine
            if not bucket.create():
                raise AXPlatformException(
                    "Failed to create S3 bucket {}".format(
                        bucket.get_bucket_name()))

            # If policy is already there, we don't update
            if bucket.get_policy():
                continue

            logger.info(
                "Argo %s bucket policy does not exist, creating new one...",
                label)
            # NOTE(review): the upgrade bucket is configured with the support
            # policy template as well - confirm this is intended.
            policy = self._generate_bucket_policy_string(
                template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                bucket_name=bucket.get_bucket_name(),
                iam=iam)
            if not bucket.put_policy(policy=policy):
                raise AXPlatformException(
                    "Failed to configure policy for S3 bucket {}".format(
                        bucket.get_bucket_name()))

        # Tag them right away to avoid race deletion.
        upgrade_bucket.put_object(
            key=AXUpgradeConfigPath(name_id=self._name_id).tag(),
            data="tag",
            ACL="bucket-owner-full-control")
        support_bucket.put_object(
            key=AXSupportConfigPath(name_id=self._name_id).tag(),
            data="tag",
            ACL="bucket-owner-full-control")
        logger.info("Created %s and %s buckets ... DONE",
                    support_bucket.get_bucket_name(),
                    upgrade_bucket.get_bucket_name())
コード例 #18
0
 def __init__(self, cluster_name_id=None, aws_profile=None, config=None):
     """
     Resolve the cluster name id and prepare the bucket handle and config key.

     :param cluster_name_id: passed to AXClusterId as ``name``
     :param aws_profile: passed to AXClusterId and to Cloud().get_bucket
     :param config: stored unchanged in self._conf
     """
     cluster_id = AXClusterId(name=cluster_name_id, aws_profile=aws_profile)
     self._cluster_name_id = cluster_id.get_cluster_name_id()

     # Bucket name and bucket handle come from the resolved cluster name id.
     bucket_path = AXClusterConfigPath(self._cluster_name_id)
     self._bucket_name = bucket_path.bucket()
     self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=aws_profile)

     # Key of the cluster config object inside that bucket.
     key_path = AXClusterConfigPath(self._cluster_name_id)
     self._cluster_config_key = key_path.cluster_config()

     self._conf = config
コード例 #19
0
 def __new__(cls, *args, **kwargs):
     """
     Allocate the platform object, substituting AXGKEPlatform when the
     target cloud is GCP; otherwise allocate ``cls`` itself.
     """
     target_cls = cls
     if Cloud().target_cloud_gcp():
         # Imported lazily so the GKE module is only loaded when needed.
         from .gke_platform import AXGKEPlatform
         target_cls = AXGKEPlatform
     return super(AXPlatform, cls).__new__(target_cls)
コード例 #20
0
    def _generate_default_envs(self, is_daemon, resource_updated):
        """
        Build the environment variables added to all system containers.

        :param is_daemon: when truthy, CPU_MULT / MEM_MULT use the daemon
            multipliers (or "1.0" when resources were not updated)
        :param resource_updated: only consulted for daemons; selects between
            the daemon multipliers and the fixed "1.0" values
        :return: list of V1EnvVar, downward-API variables first, then the
            literal values, with CPU_MULT and MEM_MULT last
        """
        # Kubernetes downward APIs: (variable name, pod field path)
        downward_fields = [
            ("AX_NODE_NAME", "spec.nodeName"),
            ("AX_POD_NAME", "metadata.name"),
            ("AX_POD_NAMESPACE", "metadata.namespace"),
            ("AX_POD_IP", "status.podIP"),
        ]

        # Values: (variable name, literal value)
        literals = [
            ("DISK_MULT", str(self.disk_mult)),
            ("AX_TARGET_CLOUD", Cloud().target_cloud()),
            ("AX_CLUSTER_NAME_ID", self._cluster_name_id),
            ("AX_CUSTOMER_ID", AXCustomerId().get_customer_id()),
        ]

        # Special cases for daemons
        if not is_daemon:
            cpu_value, mem_value = str(self.cpu_mult), str(self.mem_mult)
        elif resource_updated:
            cpu_value = str(self.daemon_cpu_mult)
            mem_value = str(self.daemon_mem_mult)
        else:
            cpu_value, mem_value = "1.0", "1.0"
        literals.append(("CPU_MULT", cpu_value))
        literals.append(("MEM_MULT", mem_value))

        env_vars = []
        for env_name, field_path in downward_fields:
            env = V1EnvVar()
            env.name = env_name
            selector = V1ObjectFieldSelector()
            selector.field_path = field_path
            source = V1EnvVarSource()
            source.field_ref = selector
            env.value_from = source
            env_vars.append(env)

        for env_name, env_value in literals:
            env = V1EnvVar()
            env.name = env_name
            env.value = env_value
            env_vars.append(env)

        return env_vars
コード例 #21
0
ファイル: config_s3_path.py プロジェクト: nuaays/argo
 def bucket_exists(self):
     """
     Return whether the bucket exists, asking the cloud only while no
     answer has been cached in self._bucket_exists yet.
     """
     cached = self._bucket_exists
     if cached is not None:
         return cached
     self._bucket_exists = Cloud().get_bucket(self._bucket_name).exists()
     return self._bucket_exists
コード例 #22
0
ファイル: container_waiter.py プロジェクト: nuaays/argo
if __name__ == "__main__":
    # Script entry point: configure logging and the target cloud, wait for the
    # container, then let the log collectors finish.

    parser = argparse.ArgumentParser(description='waiter')
    parser.add_argument('--version',
                        action='version',
                        version="%(prog)s {}".format(__version__))
    # Only the arguments argparse does not recognise are used; they are read
    # positionally below.
    args = parser.parse_known_args()[1]

    log_format = "%(asctime)s %(levelname)s %(name)s %(lineno)d %(threadName)s: %(message)s"
    logging.basicConfig(format=log_format)
    for logger_name, level in (("ax", logging.DEBUG),
                               ("ax.kubernetes.kubelet", logging.INFO)):
        logging.getLogger(logger_name).setLevel(level)

    # The environment variable wins; otherwise fall back to the cloud we run on.
    target_cloud = os.environ.get("AX_TARGET_CLOUD", Cloud().own_cloud())
    Cloud().set_target_cloud(target_cloud)

    try:
        # Positional layout: jobname, podname, containername,
        # artifact_scratch_path, out_label.
        wait_for_container(jobname=args[0],
                           podname=args[1],
                           containername=args[2],
                           artifact_scratch_path=args[3],
                           out_label=args[4])
        logger.info(
            "wait_for_container done. Waiting for log collectors to finish their jobs ..."
        )
        terminate_log_collectors()
        logger.info("Container waiter quitting ...")
    except Exception:
        # Any failure (including missing positional arguments) is logged with
        # its traceback instead of propagating.
        logger.exception("caught exception")
コード例 #23
0
    def _container_to_pod(self, labels):
        """
        Assemble the pod spec for this service: main container with its
        volumes, wait container, artifacts container, optional docker
        container, annotations and labels.

        :param labels: mapping of label name to value to add to the pod;
            None or empty means no labels
        :return: the result of pod_spec.get_spec()
        """
        # generate the service environment
        self._gen_service_env()

        pod_spec = PodSpec(self.name)
        pod_spec.restart_policy = "Never"

        main_container = self._container_spec()

        required_details = ("resource_id", "filesystem")
        for raw_tag, volume in iteritems(self.service.template.inputs.volumes):
            # sanitize name for kubernetes
            vol_name = string_to_dns_label(raw_tag)
            container_vol = ContainerVolume(vol_name, volume.mount_path)
            assert all(
                key in volume.details for key in required_details
            ), "resource_id and filesystem are required fields in volume details"
            container_vol.set_type("AWS_EBS", vol_name,
                                   volume.details["resource_id"],
                                   volume.details["filesystem"])
            main_container.add_volume(container_vol)
            logger.info("Mounting volume {} {} in {}".format(
                vol_name, volume.details, volume.mount_path))

        pod_spec.add_main_container(main_container)

        # The wait container is told which cloud it runs against.
        wait_container = self._generate_wait_container_spec()
        target_cloud = os.environ.get("AX_TARGET_CLOUD", Cloud().own_cloud())
        wait_container.add_env("AX_TARGET_CLOUD", target_cloud)
        pod_spec.add_wait_container(wait_container)

        # CPU is requested without a limit; memory limit equals the request.
        (cpu, mem, d_cpu, d_mem) = self._container_resources()
        main_container.add_resource_constraints("cpu_cores", cpu, limit=None)
        main_container.add_resource_constraints("mem_mib", mem, limit=mem)

        # handle artifacts
        context = self.service.service_context
        own_instance_id = context.service_instance_id if context else None

        # TODO: This function calls ax_artifact and needs to be rewritten. Ugly code.
        artifacts_container = pod_spec.enable_artifacts(
            self.software_info.image_namespace,
            self.software_info.image_version,
            own_instance_id,
            self.service.template.to_dict())
        for env_name, env_value in (
                ("AX_JOB_NAME", self.name),
                ("AX_TARGET_CLOUD", target_cloud)):
            if env_name == "AX_JOB_NAME":
                artifacts_container.add_env(env_name, value=env_value)
            else:
                artifacts_container.add_env(env_name, env_value)
        artifacts_container.add_env("ARGO_LOG_BUCKET_NAME",
                                    os.environ.get("ARGO_LOG_BUCKET_NAME"))
        artifacts_container.add_env("ARGO_DATA_BUCKET_NAME", self._s3_bucket)
        self._add_optional_envs(artifacts_container)

        all_configs = self.service.template.get_all_configs()
        secret_resources = artifacts_container.add_configs_as_vols(
            all_configs, self.name, self.namespace)
        self._resources.insert_all(secret_resources)

        docker_spec = self.service.template.docker_spec
        if docker_spec:
            dind_container = pod_spec.enable_docker(
                self.service.template.docker_spec.graph_storage_size_mib)
            dind_container.add_volumes(pod_spec.get_artifact_vols())
            dind_container.add_resource_constraints("cpu_cores", d_cpu, limit=None)
            dind_container.add_resource_constraints("mem_mib", d_mem, limit=d_mem)

        # Annotations: service id (None without a service context), cost id
        # as JSON, and the service environment.
        annotation_context = self.service.service_context
        service_id = (annotation_context.service_instance_id
                      if annotation_context else None)
        pod_spec.add_annotation("ax_serviceid", service_id)
        pod_spec.add_annotation("ax_costid", json.dumps(self.service.costid))
        pod_spec.add_annotation("AX_SERVICE_ENV", self._gen_service_env())

        if labels:
            for label_name in labels:
                pod_spec.add_label(label_name, labels[label_name])

        return pod_spec.get_spec()
コード例 #24
0
ファイル: rest.py プロジェクト: nuaays/argo
def update_cluster_sg():
    """
    Update the cluster security group when the target cloud is AWS.

    Nothing is done for GCP or for any other target cloud.
    """
    if not Cloud().target_cloud_aws():
        # GCP is checked explicitly but needs no security-group update.
        if Cloud().target_cloud_gcp():
            pass
        return
    update_cluster_sg_aws()
コード例 #25
0
    def __init__(
            self,
            cluster_name_id=None,
            aws_profile=None,
            debug=True,
            manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
            config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
            software_info=None):
        """
        AX Platform bootstrap

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        :param config_file: platform configuration file handed to AXPlatformConfig
        :param software_info: SoftwareInfo instance; a new one is created
            when a falsy value is passed
        """
        self._software_info = software_info or SoftwareInfo()
        assert isinstance(self._software_info, SoftwareInfo), \
            "Wrong type ({}) of software info passed in.".format(
                self._software_info)

        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s",
                    self._config.name, config_file)

        # Cluster identity and the helpers derived from it.
        resolved_id = AXClusterId(cluster_name_id).get_cluster_name_id()
        self._cluster_name_id = resolved_id
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id,
            aws_profile=self._aws_profile)
        # NOTE(review): this receives the raw argument rather than the
        # resolved self._cluster_name_id - confirm that is intended.
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                           aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        # The account id is only looked up on AWS; elsewhere it stays empty.
        self._account = (
            AWSAccountInfo(aws_profile=self._aws_profile).get_account_id()
            if Cloud().target_cloud_aws() else "")
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we failed to create an object, we don't delete it but just
        # leave it for debug.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
        if self._cluster_name_id:
            self._kube_config = self._cluster_info.get_kube_config_file_path()
            if not os.path.isfile(self._kube_config):
                logger.info(
                    "Can't find config file at %s; downloading from s3",
                    self._kube_config)
                self._kube_config = self._cluster_info.download_kube_config()
            assert os.path.isfile(self._kube_config), \
                "No kube_config file available"
        else:
            self._kube_config = None

        # Kubernetes related objects and macros
        self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
        self.kube_axsys_namespace = AXNameSpaces.AXSYS
        self.kube_user_namespace = AXNameSpaces.AXUSER
        self.kubectl = KubernetesApiClient(config_file=self._kube_config)
        self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

        # The monitor is loaded for the axsys namespace and started right away.
        monitor = AXKubeMonitor(kubectl=self.kubectl)
        self._monitor = monitor
        monitor.reload_monitors(namespace=self.kube_axsys_namespace)
        monitor.start()

        # Kube Objects
        self._kube_objects = {}
        self._replacing = {}
コード例 #26
0
ファイル: axmon.py プロジェクト: nuaays/argo
        logger.debug(''.join(traceback.format_stack(ob.gr_frame)))


if __name__ == "__main__":
    # Main entry point for AXmon.
    parser = argparse.ArgumentParser(description='AXMon')
    parser.add_argument('--version',
                        action='version',
                        version="%(prog)s {}".format(__version__))
    parser.add_argument('--port',
                        type=int,
                        default=AXMON_DEFAULT_PORT,
                        help="Run server on the specified port")
    args = parser.parse_args()

    # Basic logging: verbose for "ax", warnings only for the boto libraries.
    log_format = "%(asctime)s %(levelname)s %(name)s %(lineno)d %(threadName)s: %(message)s"
    logging.basicConfig(format=log_format)
    for logger_name, level in (("ax", logging.DEBUG),
                               ("botocore", logging.WARNING),
                               ("boto3", logging.WARNING)):
        logging.getLogger(logger_name).setLevel(level)

    # The environment variable wins; otherwise fall back to the cloud we run on.
    Cloud().set_target_cloud(os.getenv("AX_TARGET_CLOUD", Cloud().own_cloud()))
    # SIGUSR1 invokes the debug handler.
    signal.signal(signal.SIGUSR1, debug)
    axmon_rest_start(port=args.port)
    AXMon().run()