def _delete_data_bucket(self):
    """Delete every object this cluster stored in the applatix-data bucket.

    Only objects under the "<cluster-name>/" prefix are removed; the bucket
    itself is left in place (it may be shared by other clusters).
    """
    logger.info(
        "Deleting applatix-data bucket contents for cluster %s ...", self._name_id)
    cluster_name = AXClusterId(name=self._name_id).get_cluster_name()
    data_bucket = Cloud().get_bucket(
        AXClusterDataPath(name_id=self._name_id).bucket(),
        aws_profile=self._aws_profile,
        region=self._aws_region)
    logger.info(
        "Deleting objects for cluster %s from bucket %s. This may take some while.",
        cluster_name, data_bucket.get_bucket_name())
    # All of this cluster's objects live under the "<cluster-name>/" prefix.
    data_bucket.delete_all(obj_prefix=cluster_name + "/")
    logger.info("Deleting objects for cluster %s from bucket %s ... DONE",
                cluster_name, data_bucket.get_bucket_name())
def _update_data_bucket(self):
    """Ensure the cluster's applatix-data bucket exists, configuring CORS when needed.

    :raises AXPlatformException: if the bucket cannot be created.
    """
    bucket = Cloud().get_bucket(
        AXClusterDataPath(name_id=self._name_id).bucket(),
        aws_profile=self._aws_profile,
        region=self._aws_region)

    created = bucket.create()
    if not created:
        raise AXPlatformException(
            "Failed to create S3 bucket {}".format(bucket.get_bucket_name()))

    # Update CORS config for data bucket too.
    if self.cluster_config.get_cluster_provider() != ClusterProvider.USER:
        logger.info("Checking CORS config for %s.", bucket.get_bucket_name())
        bucket.put_cors(DATA_CORS_CONFIG)

    logger.info("Created %s bucket ... DONE", bucket.get_bucket_name())
def update(self, iam):
    """
    Create all buckets in portal account.

    Creates the applatix-support and applatix-upgrade buckets, attaches a
    bucket policy to each one (only when no policy exists yet, so existing
    policies are never overwritten), and immediately writes a tag object to
    each bucket so a concurrent cleanup cannot race-delete them.

    :param iam: IAM identifier interpolated into the bucket policy template.
    :raises AXPlatformException: if bucket creation or policy configuration fails.
    """
    logger.info("Creating applatix-support and applatix-upgrade buckets ...")
    support_bucket = Cloud().get_bucket(AXSupportConfigPath(name_id=self._name_id).bucket(),
                                        aws_profile=self._aws_profile,
                                        region=self._aws_region)
    upgrade_bucket = Cloud().get_bucket(AXUpgradeConfigPath(name_id=self._name_id).bucket(),
                                        aws_profile=self._aws_profile,
                                        region=self._aws_region)

    # Retry create while bucket is created is fine
    if not support_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(support_bucket.get_bucket_name()))
    # If policy is already there, we don't update
    if not support_bucket.get_policy():
        logger.info("Argo support bucket policy does not exist, creating new one...")
        if not support_bucket.put_policy(
                policy=self._generate_bucket_policy_string(template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                                                           bucket_name=support_bucket.get_bucket_name(),
                                                           iam=iam)
        ):
            raise AXPlatformException(
                "Failed to configure policy for S3 bucket {}".format(support_bucket.get_bucket_name()))

    if not upgrade_bucket.create():
        # Bug fix: this error previously reported the support bucket's name.
        raise AXPlatformException("Failed to create S3 bucket {}".format(upgrade_bucket.get_bucket_name()))
    if not upgrade_bucket.get_policy():
        logger.info("Argo upgrade bucket policy does not exist, creating new one...")
        if not upgrade_bucket.put_policy(
                policy=self._generate_bucket_policy_string(template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                                                           bucket_name=upgrade_bucket.get_bucket_name(),
                                                           iam=iam)
        ):
            # Bug fix: this error previously reported the support bucket's name.
            raise AXPlatformException(
                "Failed to configure policy for S3 bucket {}".format(upgrade_bucket.get_bucket_name()))

    # Tag them right away to avoid race deletion.
    upgrade_bucket.put_object(key=AXUpgradeConfigPath(name_id=self._name_id).tag(),
                              data="tag",
                              ACL="bucket-owner-full-control")
    support_bucket.put_object(key=AXSupportConfigPath(name_id=self._name_id).tag(),
                              data="tag",
                              ACL="bucket-owner-full-control")

    logger.info("Created %s and %s buckets ... DONE",
                support_bucket.get_bucket_name(), upgrade_bucket.get_bucket_name())
class AXClusterId(with_metaclass(Singleton, object)):
    """Singleton that resolves and persists a cluster's "<name>-<id>" identity.

    The authoritative name/id record lives in S3 at "<cluster-name>/id" inside
    the per-account applatix-cluster bucket. When set, the AX_CLUSTER_NAME_ID
    environment variable takes precedence over the bucket lookup.
    """

    def __init__(self, name=None, aws_profile=None):
        # Raw caller-supplied name; may be "<name>" or "<name>-<id>".
        self._input_name = name
        self._aws_profile = aws_profile

        # Cluster id related bucket and path info should be self-contained rather than
        # using config_s3_path object. Because config_s3_path needs both cluster name
        # and id to initialize. In case we haven't get cluster id yet, singletons in
        # config_s3_path cannot be properly initialized.
        self._bucket_template = "applatix-cluster-{account}-{seq}"
        self._cluster_id_bucket_path_template = "{name}/id"

        # Set bucket
        self._customer_id = AXCustomerId().get_customer_id()
        self._bucket_name = self._bucket_template.format(account=self._customer_id, seq=0)
        # Lazily instantiated by _instantiate_bucket_if_needed().
        self._bucket = None

        # These values will be set when user calls get/create cluster name id
        self._cluster_name = None
        self._cluster_id = None
        self._cluster_name_id = None

    def create_cluster_name_id(self):
        """
        User input cluster name in format of "<name>" or "<name>-<id>", and this
        function creates a record in S3. If the name caller passed in does not
        include an ID, we generate one.

        If we already have a cluster name/id record in s3, this function should
        not be called to avoid existing clusters's records to get overridden.

        :return: <cluster-name>-<cluster-id>
        """
        assert not self._cluster_name_id, "Cluster {} has it's name id already created".format(self._cluster_name_id)
        assert self._input_name, "Must provide input name to create cluster name id"

        name, cid = self._format_name_id(self._input_name)
        if cid is None:
            logger.info("Cluster id not provided, generate one.")
            # GCP ids are short random strings; AWS ids are time-based UUIDs.
            if Cloud().target_cloud_gcp():
                cid = str(uuid.uuid4())[:8]
            elif Cloud().target_cloud_aws():
                cid = str(uuid.uuid1())
            else:
                assert False, "Must provide valid target cloud to create cluster name id. Currently target cloud is set to {}".format(Cloud().target_cloud())
            logger.info("Created new name-id %s", name + "-" + cid)

        # fill in cluster name id info
        self._cluster_name = name
        self._cluster_id = cid
        self._cluster_name_id = self._cluster_name + "-" + self._cluster_id
        return self._cluster_name_id

    def upload_cluster_name_id(self):
        """ This function assumes cluster_name_id has been created already """
        logger.info("Uploading cluster name-id record to S3 ...")
        self._load_cluster_name_id_if_needed()
        self._instantiate_bucket_if_needed()
        # Record is stored at "<cluster-name>/id" with the id as its content.
        id_key = self._cluster_id_bucket_path_template.format(name=self._cluster_name)
        self._bucket.put_object(id_key, self._cluster_id)
        logger.info("Uploaded cluster name (%s) and cluster id (%s) to S3", self._cluster_name, self._cluster_id)

    def get_cluster_name_id(self):
        """
        This function assumes cluster name/id record is created. It first looks for
        AX_CLUSTER_NAME_ID env, if not set, it looks up cluster id from s3.

        :return: cluster_name_id
        """
        self._load_cluster_name_id_if_needed()
        return self._cluster_name_id

    def get_cluster_name(self):
        # Name portion of "<name>-<id>"; loads the record on first use.
        self._load_cluster_name_id_if_needed()
        return self._cluster_name

    def get_cluster_id(self):
        # Id portion of "<name>-<id>"; loads the record on first use.
        self._load_cluster_name_id_if_needed()
        return self._cluster_id

    def get_cluster_id_s3_key(self):
        # S3 key ("<cluster-name>/id") where the id record is stored.
        self._load_cluster_name_id_if_needed()
        return self._cluster_id_bucket_path_template.format(name=self._cluster_name)

    def _load_cluster_name_id_if_needed(self):
        # Load only once; subsequent calls are no-ops.
        if not self._cluster_name_id:
            self._load_cluster_name_id()

    def _instantiate_bucket_if_needed(self):
        # Lazily construct the bucket object; the bucket must already exist.
        if not self._bucket:
            logger.info("Instantiating cluster bucket ...")
            self._bucket = Cloud().get_bucket(self._bucket_name, aws_profile=self._aws_profile)
            assert self._bucket.exists(), "Bucket {} not created yet".format(self._bucket.get_bucket_name())

    def _load_cluster_name_id(self):
        """
        This function assumes cluster name/id record is created. It first looks for
        AX_CLUSTER_NAME_ID env, if not set, it looks up cluster id from s3.

        This function sets cluster_name_id, cluster_name, and cluster_id.
        """
        # Try to get from env first
        name_id = os.getenv(CLUSTER_NAME_ID_ENV_NAME, None)
        if name_id:
            logger.info("Found cluster name id in env: %s", name_id)
            self._cluster_name_id = name_id
            self._cluster_name, self._cluster_id = self._format_name_id(self._cluster_name_id)

            # NOTE: if we find some cluster name id we cannot even parse from env, we still fail
            # directly even though it is possible that we might find something valid from s3 bucket,
            # as the program that brings up program (i.e. axinstaller) is already having trouble in
            # such case, which is already alerting
            assert self._cluster_name and self._cluster_id, "Failed to load cluster name and cluster id from env"
        else:
            self._lookup_id_from_bucket()
            assert self._cluster_name and self._cluster_id, "Failed to load cluster name and cluster id from bucket"
            self._cluster_name_id = "{}-{}".format(self._cluster_name, self._cluster_id)

    def _lookup_id_from_bucket(self):
        # Resolve cluster name/id from the S3 record; the S3 record wins over
        # any id the caller requested via input name.
        name, requested_cid = self._format_name_id(self._input_name)

        # Look up assumes bucket already exists, so there is no need to pass region
        # If bucket does not exist, AXS3Bucket will throw exception
        self._instantiate_bucket_if_needed()
        id_s3_key = self._cluster_id_bucket_path_template.format(name=name)
        # get_object returns None when the key is missing; str() turns that
        # into the literal "None", which is what the check below relies on.
        cid = str(self._bucket.get_object(id_s3_key)).strip()

        if cid != "None":
            logger.info("Found existing cluster name %s-%s", name, cid)
            if cid != requested_cid:
                logger.info("Ignore requested cluster ID (%s). Real cluster id: %s", requested_cid, cid)
            self._cluster_name = name
            self._cluster_id = cid
        else:
            logger.info("Cannot find cluster name/id mapping from bucket")
            if requested_cid:
                logger.info("Using user defined cluster name: %s, cluster id: %s", name, requested_cid)
                self._cluster_name = name
                self._cluster_id = requested_cid

    @staticmethod
    def _format_name_id(input_name):
        # Split "<name>" / "<name>-<id>" using the cloud-specific parser.
        if Cloud().target_cloud_aws():
            return AXClusterNameIdParser.parse_cluster_name_id_aws(input_name)
        elif Cloud().target_cloud_gcp():
            return AXClusterNameIdParser.parse_cluster_name_id_gcp(input_name)
        else:
            assert False, "Invalid cloud provider: {}. Only aws and gcp are supported".format(Cloud().target_cloud())
class PodLogManager(object):
    """
    This manager spins up threads that run as daemon along with `wait_for_container()`.
    It uses inotify to monitor changes inside log directory and uploads rotated logs
    to S3 bucket.

    It does NOT handle logs that are not rotated - it's container_outer_executor's job

    This thread manages logs for 1 container

    Kubernetes has docker-container configuration for logrotate as follows in their salt

    /var/lib/docker/containers/*/*-json.log {
        rotate 5
        copytruncate
        missingok
        notifempty
        compress
        maxsize 10M
        daily
        dateext
        dateformat -%Y%m%d-%s
        create 0644 root root
    }
    """

    def __init__(self, pod_name, service_id, root_id, leaf_full_path, namespace="axuser", app_mode=False):
        """
        Initialize information.
        :param pod_name: We collect log for this pod
        :param service_id: ServiceID (job) / DeploymentID (application)
        :param root_id: WorkflowID (job) / ApplicationID (application)
        :param leaf_full_path: WorkflowPath (job) / DeploymentName (application)
        :param app_mode: upload xxx-json.log upon termination
        :param apprecord ApplicationRecord singleton
        """
        self._pod_name = pod_name
        self._namespace = namespace
        self._kubectl = KubernetesApiClient()
        self._service_id = service_id
        self._root_id = root_id
        self._leaf_full_path = leaf_full_path
        # Host directory that holds per-container log subdirectories.
        self._log_root = os.getenv("LOGMOUNT_PATH")

        # key:val = cid:cname
        self._container_info = {}
        # key:val = cid : local log directory being watched
        self._local_log_dirs = {}
        # Cluster data bucket / prefix and AX log bucket / prefix; filled by _set_s3().
        self._bucket = None
        self._log_s3_prefix = None
        self._bucket_ax = None
        self._log_s3_prefix_ax = None
        # key:val = cid : running ContainerLogCollector
        self._collectors = {}
        self._app_mode = app_mode

        self._set_s3()

    def _set_s3(self):
        """ Set bucket, log_s3_prefix, s3_processor """
        logger.info("Setting up s3 ...")
        cluster_name_id = AXClusterId().get_cluster_name_id()

        # Cluster data bucket: destination for artifact logs.
        self._bucket_name = AXClusterDataPath(cluster_name_id).bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name)
        artifact_prefix = AXClusterDataPath(cluster_name_id).artifact()
        self._log_s3_prefix = artifact_prefix

        # AX log bucket: may live outside the cluster account (external).
        self._bucket_ax_is_external = AXLogPath(cluster_name_id).is_external()
        self._bucket_name_ax = AXLogPath(cluster_name_id).bucket()
        self._bucket_ax = Cloud().get_bucket(self._bucket_name_ax)
        artifact_prefix_ax = AXLogPath(cluster_name_id).artifact()
        self._log_s3_prefix_ax = artifact_prefix_ax

        # Both buckets must pre-exist; this manager never creates them.
        assert self._bucket.exists(), "S3 bucket {} DOES NOT exist".format(self._bucket_name)
        assert self._bucket_ax.exists(), "S3 bucket {} DOES NOT exist".format(self._bucket_name_ax)
        logger.info("Using S3 bucket %s, with log prefix %s", self._bucket.get_bucket_name(), self._log_s3_prefix)
        logger.info("Using S3 bucket %s, with log prefix %s for AX", self._bucket_ax.get_bucket_name(), self._log_s3_prefix_ax)

    def start_log_watcher(self, cname, cid):
        """Start a ContainerLogCollector thread watching this container's log dir.

        :param cname: container name
        :param cid: container id (also the log subdirectory name)
        """
        logger.info("Starting log collector for container %s (%s)", cname, cid)
        path = os.path.join(self._log_root, cid)
        # Idempotent: starting a watcher twice for the same container is a no-op.
        if cid in self._collectors:
            logger.info("Log collector for container %s (%s) has already started", cname, cid)
            return
        assert os.path.isdir(path), "Log path {} is not a valid directory".format(path)
        self._container_info[cid] = cname
        try:
            collector = ContainerLogCollector(
                pod_name=self._pod_name,
                namespace=self._namespace,
                watch_dir=path,
                cid=cid,
                cname=self._container_info[cid],
                service_id=self._service_id,
                root_id=self._root_id,
                full_path=self._leaf_full_path,
                bucket=self._bucket,
                bucket_name=self._bucket_name,
                s3_prefix=self._log_s3_prefix,
                bucket_ax_is_external=self._bucket_ax_is_external,
                bucket_ax=self._bucket_ax,
                bucket_name_ax=self._bucket_name_ax,
                s3_prefix_ax=self._log_s3_prefix_ax,
                app_mode=self._app_mode
            )
            self._collectors[cid] = collector
            collector.start()
            self._local_log_dirs[cid] = path
            logger.info("Watching logs on %s", path)
        except Exception as e:
            # Best-effort: a failed collector start is logged, not raised.
            logger.exception("%s", e)

    def stop_log_watcher(self, cid):
        """
        Stop a single log watcher
        :param cid:
        :return:
        """
        if not self._collectors.get(cid, None):
            return
        self._collectors[cid].terminate()
        log_dir = self._local_log_dirs[cid]

        # Touch a file so the collectors can check its "terminate" flag
        sig_file_name = os.path.join(log_dir, ".ax_go_ipo")
        try:
            subprocess.check_call(["touch", sig_file_name])
            subprocess.check_call(["rm", sig_file_name])
        except subprocess.CalledProcessError as cpe:
            logger.error("Cannot create sigfile with error %s", cpe)
        # Wait for the collector thread to exit before dropping it.
        self._collectors[cid].join()
        self._collectors.pop(cid, None)

    def terminate(self):
        # Copy keys first: stop_log_watcher mutates self._collectors.
        for cid in list(self._collectors.keys()):
            self.stop_log_watcher(cid)
        logger.info("All log collectors terminated")

    def is_active(self):
        # True while at least one collector thread is registered.
        return len(self._collectors) > 0

    def get_containers(self):
        # View of container ids that currently have collectors.
        return self._collectors.keys()