예제 #1
0
class AXClusterId(with_metaclass(Singleton, object)):
    """
    Singleton that creates, loads and uploads the "<cluster-name>-<cluster-id>"
    record of a cluster.

    The name/id pair is resolved lazily: nothing is looked up in __init__; the
    getters load the record on first use (see _load_cluster_name_id).
    """
    def __init__(self, name=None, aws_profile=None):
        """
        :param name: user supplied cluster name, in format of "<name>" or "<name>-<id>"
        :param aws_profile: forwarded to Cloud().get_bucket() when the bucket is instantiated
        """
        self._input_name = name
        self._aws_profile = aws_profile

        # Cluster id related bucket and path info should be self-contained rather than
        # using config_s3_path object. Because config_s3_path needs both cluster name
        # and id to initialize. In case we haven't got the cluster id yet, singletons in
        # config_s3_path cannot be properly initialized.
        self._bucket_template = "applatix-cluster-{account}-{seq}"
        self._cluster_id_bucket_path_template = "{name}/id"

        # Set bucket name; the bucket object itself is created lazily by
        # _instantiate_bucket_if_needed()
        self._customer_id = AXCustomerId().get_customer_id()
        self._bucket_name = self._bucket_template.format(
            account=self._customer_id, seq=0)
        self._bucket = None

        # These values will be set when user calls get/create cluster name id
        self._cluster_name = None
        self._cluster_id = None
        self._cluster_name_id = None

    def create_cluster_name_id(self):
        """
        User input cluster name in format of "<name>" or "<name>-<id>". If the name the
        caller passed in does not include an ID, we generate one.

        This only fills in the in-memory name/id; call upload_cluster_name_id() to
        persist the record to the bucket.

        If we already have a cluster name/id record in s3, this function should not be called to avoid
        existing clusters' records getting overridden
        :return: <cluster-name>-<cluster-id>
        :raises AssertionError: if a name-id was already created/loaded, if no input name
                                was given, or if the target cloud is neither aws nor gcp
        """
        assert not self._cluster_name_id, "Cluster {} has it's name id already created".format(
            self._cluster_name_id)
        assert self._input_name, "Must provide input name to create cluster name id"
        name, cid = self._format_name_id(self._input_name)
        if cid is None:
            logger.info("Cluster id not provided, generate one.")
            if Cloud().target_cloud_gcp():
                # GCP ids are the first 8 characters of a random uuid
                cid = str(uuid.uuid4())[:8]
            elif Cloud().target_cloud_aws():
                # AWS ids are a full uuid1 string
                cid = str(uuid.uuid1())
            else:
                # Raised explicitly (instead of `assert False`) so the check is not
                # stripped when running under `python -O`
                raise AssertionError(
                    "Must provide valid target cloud to create cluster name id. Currently target cloud is set to {}".format(
                        Cloud().target_cloud()))
        logger.info("Created new name-id %s", name + "-" + cid)

        # fill in cluster name id info
        self._cluster_name = name
        self._cluster_id = cid
        self._cluster_name_id = self._cluster_name + "-" + self._cluster_id
        return self._cluster_name_id

    def upload_cluster_name_id(self):
        """
        Write the cluster id to the bucket under "<cluster-name>/id".

        This function assumes cluster_name_id has been created already; if it is not
        set on this object yet, it is loaded first (env / bucket lookup).
        """
        logger.info("Uploading cluster name-id record to S3 ...")
        self._load_cluster_name_id_if_needed()
        self._instantiate_bucket_if_needed()
        id_key = self._cluster_id_bucket_path_template.format(
            name=self._cluster_name)
        self._bucket.put_object(id_key, self._cluster_id)
        logger.info("Uploaded cluster name (%s) and cluster id (%s) to S3",
                    self._cluster_name, self._cluster_id)

    def get_cluster_name_id(self):
        """
        This function assumes cluster name/id record is created. It first looks for
        the environment variable named by CLUSTER_NAME_ID_ENV_NAME, if not set, it
        looks up cluster id from the bucket.
        :return: cluster_name_id
        """
        self._load_cluster_name_id_if_needed()
        return self._cluster_name_id

    def get_cluster_name(self):
        """
        :return: cluster name, loading the name/id record first if needed
        """
        self._load_cluster_name_id_if_needed()
        return self._cluster_name

    def get_cluster_id(self):
        """
        :return: cluster id, loading the name/id record first if needed
        """
        self._load_cluster_name_id_if_needed()
        return self._cluster_id

    def get_cluster_id_s3_key(self):
        """
        :return: bucket key ("<cluster-name>/id") under which the cluster id is stored
        """
        self._load_cluster_name_id_if_needed()
        return self._cluster_id_bucket_path_template.format(
            name=self._cluster_name)

    def _load_cluster_name_id_if_needed(self):
        """
        Load the name/id record only when it has not been created or loaded yet
        """
        if not self._cluster_name_id:
            self._load_cluster_name_id()

    def _instantiate_bucket_if_needed(self):
        """
        Create the bucket object on first use.
        :raises AssertionError: if the bucket does not exist
        """
        if not self._bucket:
            logger.info("Instantiating cluster bucket ...")
            self._bucket = Cloud().get_bucket(self._bucket_name,
                                              aws_profile=self._aws_profile)
            assert self._bucket.exists(), "Bucket {} not created yet".format(
                self._bucket.get_bucket_name())

    def _load_cluster_name_id(self):
        """
        This function assumes cluster name/id record is created. It first looks for
        the environment variable named by CLUSTER_NAME_ID_ENV_NAME, if not set, it
        looks up cluster id from the bucket.

        This function sets cluster_name_id, cluster_name, and cluster_id
        :raises AssertionError: if name and id cannot both be resolved
        """
        # Try to get from env first
        name_id = os.getenv(CLUSTER_NAME_ID_ENV_NAME, None)
        if name_id:
            logger.info("Found cluster name id in env: %s", name_id)
            self._cluster_name_id = name_id
            self._cluster_name, self._cluster_id = self._format_name_id(
                self._cluster_name_id)

            # NOTE: if we find some cluster name id we cannot even parse from env, we still fail
            # directly even though it is possible that we might find something valid from s3 bucket,
            # as the program that brings up program (i.e. axinstaller) is already having trouble in
            # such case, which is already alerting
            assert self._cluster_name and self._cluster_id, "Failed to load cluster name and cluster id from env"
        else:
            self._lookup_id_from_bucket()
            assert self._cluster_name and self._cluster_id, "Failed to load cluster name and cluster id from bucket"
            self._cluster_name_id = "{}-{}".format(self._cluster_name,
                                                   self._cluster_id)

    def _lookup_id_from_bucket(self):
        """
        Resolve cluster name and id from the "<name>/id" object in the bucket.

        An id found in the bucket always wins over an id embedded in the input name.
        If the bucket has no record, the id from the input name (if any) is used;
        otherwise cluster name and id are left unset.
        """
        name, requested_cid = self._format_name_id(self._input_name)

        # Look up assumes bucket already exists, so there is no need to pass region.
        # _instantiate_bucket_if_needed() asserts that the bucket exists.
        self._instantiate_bucket_if_needed()
        id_s3_key = self._cluster_id_bucket_path_template.format(name=name)
        # A missing record shows up as the string "None" after str()
        # (presumably get_object returns None in that case - TODO confirm)
        cid = str(self._bucket.get_object(id_s3_key)).strip()
        if cid != "None":
            logger.info("Found existing cluster name %s-%s", name, cid)
            # Only report an ignored id when the user actually requested one
            if requested_cid and cid != requested_cid:
                logger.info(
                    "Ignore requested cluster ID (%s). Real cluster id: %s",
                    requested_cid, cid)
            self._cluster_name = name
            self._cluster_id = cid
        else:
            logger.info("Cannot find cluster name/id mapping from bucket")
            if requested_cid:
                logger.info(
                    "Using user defined cluster name: %s, cluster id: %s",
                    name, requested_cid)
                self._cluster_name = name
                self._cluster_id = requested_cid

    @staticmethod
    def _format_name_id(input_name):
        """
        Split an input name into (cluster name, cluster id) using the parser of the
        current target cloud.
        :param input_name: "<name>" or "<name>-<id>"
        :return: whatever the cloud specific AXClusterNameIdParser method returns;
                 callers unpack it as (name, id)
        :raises AssertionError: if target cloud is neither aws nor gcp
        """
        if Cloud().target_cloud_aws():
            return AXClusterNameIdParser.parse_cluster_name_id_aws(input_name)
        elif Cloud().target_cloud_gcp():
            return AXClusterNameIdParser.parse_cluster_name_id_gcp(input_name)
        else:
            # Raised explicitly (instead of `assert False`) so that under `python -O`
            # we do not silently return None and break tuple unpacking in callers
            raise AssertionError(
                "Invalid cloud provider: {}. Only aws and gcp are supported".format(
                    Cloud().target_cloud()))
예제 #2
0
class PodLogManager(object):
    """
    This manager spins up threads that run as daemon along with `wait_for_container()`. It uses
    inotify to monitor changes inside log directory and uploads rotated logs
    to S3 bucket.

    It does NOT handle logs that are not rotated - it's container_outer_executor's job
    Each collector manages logs for 1 container

    Kubernetes has docker-container configuration for logrotate as follows in their salt

        /var/lib/docker/containers/*/*-json.log {
            rotate 5
            copytruncate
            missingok
            notifempty
            compress
            maxsize 10M
            daily
            dateext
            dateformat -%Y%m%d-%s
            create 0644 root root
        }

    """
    def __init__(self,
                 pod_name,
                 service_id,
                 root_id,
                 leaf_full_path,
                 namespace="axuser",
                 app_mode=False):
        """
        Initialize information.
        :param pod_name: We collect log for this pod
        :param service_id: ServiceID (job) / DeploymentID (application)
        :param root_id: WorkflowID (job) / ApplicationID (application)
        :param leaf_full_path: WorkflowPath (job) / DeploymentName (application)
        :param namespace: namespace of the pod, forwarded to every collector
        :param app_mode: upload xxx-json.log upon termination
        """
        self._pod_name = pod_name
        self._namespace = namespace
        self._kubectl = KubernetesApiClient()

        self._service_id = service_id
        self._root_id = root_id
        self._leaf_full_path = leaf_full_path
        # Root directory under which each container has a "<cid>" log directory
        self._log_root = os.getenv("LOGMOUNT_PATH")
        # key:val = cid:cname
        self._container_info = {}
        # key:val = cid:watched log directory
        self._local_log_dirs = {}
        self._bucket = None
        self._log_s3_prefix = None
        self._bucket_ax = None
        self._log_s3_prefix_ax = None

        # key:val = cid:running ContainerLogCollector
        self._collectors = {}
        self._app_mode = app_mode

        self._set_s3()

    def _set_s3(self):
        """
        Set bucket / bucket name / log prefix, both for the cluster data bucket
        and for the AX log bucket.
        :raises AssertionError: if either bucket does not exist
        """
        logger.info("Setting up s3 ...")

        cluster_name_id = AXClusterId().get_cluster_name_id()

        self._bucket_name = AXClusterDataPath(cluster_name_id).bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name)
        artifact_prefix = AXClusterDataPath(cluster_name_id).artifact()
        self._log_s3_prefix = artifact_prefix

        self._bucket_ax_is_external = AXLogPath(cluster_name_id).is_external()
        self._bucket_name_ax = AXLogPath(cluster_name_id).bucket()
        self._bucket_ax = Cloud().get_bucket(self._bucket_name_ax)
        artifact_prefix_ax = AXLogPath(cluster_name_id).artifact()

        self._log_s3_prefix_ax = artifact_prefix_ax

        assert self._bucket.exists(), "S3 bucket {} DOES NOT exist".format(
            self._bucket_name)
        assert self._bucket_ax.exists(), "S3 bucket {} DOES NOT exist".format(
            self._bucket_name_ax)
        logger.info("Using S3 bucket %s, with log prefix %s",
                    self._bucket.get_bucket_name(), self._log_s3_prefix)
        logger.info("Using S3 bucket %s, with log prefix %s for AX",
                    self._bucket_ax.get_bucket_name(), self._log_s3_prefix_ax)

    def start_log_watcher(self, cname, cid):
        """
        Start a log collector for one container. No-op if a collector for this
        container id is already registered. Failures while creating or starting
        the collector are logged, not raised.
        :param cname: container name
        :param cid: container id; its logs are expected under "<LOGMOUNT_PATH>/<cid>"
        :raises AssertionError: if the container's log path is not a directory
        """
        logger.info("Starting log collector for container %s (%s)", cname, cid)
        path = os.path.join(self._log_root, cid)
        if cid in self._collectors:
            logger.info(
                "Log collector for container %s (%s) has already started",
                cname, cid)
            return
        assert os.path.isdir(
            path), "Log path {} is not a valid directory".format(path)
        self._container_info[cid] = cname
        try:
            collector = ContainerLogCollector(
                pod_name=self._pod_name,
                namespace=self._namespace,
                watch_dir=path,
                cid=cid,
                cname=self._container_info[cid],
                service_id=self._service_id,
                root_id=self._root_id,
                full_path=self._leaf_full_path,
                bucket=self._bucket,
                bucket_name=self._bucket_name,
                s3_prefix=self._log_s3_prefix,
                bucket_ax_is_external=self._bucket_ax_is_external,
                bucket_ax=self._bucket_ax,
                bucket_name_ax=self._bucket_name_ax,
                s3_prefix_ax=self._log_s3_prefix_ax,
                app_mode=self._app_mode)
            collector.start()
        except Exception as e:
            # Nothing was registered yet, so a failed start leaves no half-initialized
            # entry behind: stop_log_watcher()/terminate() will not trip over it and
            # the caller can retry start_log_watcher() for the same cid.
            logger.exception("%s", e)
            return

        # Register only collectors that actually started; both maps are filled
        # together because stop_log_watcher() needs both for a registered cid.
        self._collectors[cid] = collector
        self._local_log_dirs[cid] = path
        logger.info("Watching logs on %s", path)

    def stop_log_watcher(self, cid):
        """
        Stop a single log watcher and wait for it to finish. No-op if there is no
        collector registered for this container id.
        :param cid: container id
        :return: None
        """
        if not self._collectors.get(cid, None):
            return
        self._collectors[cid].terminate()
        log_dir = self._local_log_dirs[cid]
        # Touch a file so the collectors can check its "terminate" flag
        sig_file_name = os.path.join(log_dir, ".ax_go_ipo")
        try:
            subprocess.check_call(["touch", sig_file_name])
            subprocess.check_call(["rm", sig_file_name])
        except subprocess.CalledProcessError as cpe:
            logger.error("Cannot create sigfile with error %s", cpe)
        self._collectors[cid].join()
        self._collectors.pop(cid, None)
        # Drop the directory mapping together with the collector so stale
        # entries do not accumulate
        self._local_log_dirs.pop(cid, None)

    def terminate(self):
        """
        Stop all registered log collectors
        """
        # Iterate over a snapshot: stop_log_watcher() mutates self._collectors
        for cid in list(self._collectors.keys()):
            self.stop_log_watcher(cid)
        logger.info("All log collectors terminated")

    def is_active(self):
        """
        :return: True if at least one collector is registered
        """
        return len(self._collectors) > 0

    def get_containers(self):
        """
        :return: list of container ids that currently have a collector. This is a
                 snapshot, so it stays valid while collectors are started/stopped.
        """
        return list(self._collectors)