def _set_autoscaling(self):
    # Prepare autoscaler
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile)
    asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg() or asg_manager.get_on_demand_asg()
    if not asg:
        raise AXPlatformException("Failed to get autoscaling group for cluster {}".format(self._cluster_name_id))
    asg_name = asg["AutoScalingGroupName"]
    if asg_name is not None:
        self._replacing["ASG_NAME"] = asg_name
    else:
        logger.error("Autoscaling group name not found for %s", self._cluster_name_id)
        raise AXPlatformException("Cannot find cluster autoscaling group")
def _wait_for_pv_in_provider(self):
    status = self._get_from_provider()
    logger.debug("Waiting for volume {} to be provisioned in provider: status = {}".format(
        self.name, status))
    if status is None:
        raise AXPlatformException("Cannot find volume {} in provider".format(self.name))
    if status.spec.volume_name is None or status.spec.volume_name == '':
        raise AXPlatformException("Cloud provider has not provisioned a volume for {}".format(self.name))
def _check_volume_in_cloud_provider(self, status):

    def get_pv(pv):
        try:
            response = self.client.read_persistent_volume_status(pv)
            return response
        except swagger_client.rest.ApiException as e:
            if e.status != 404:
                raise e
        return None

    # The multiplier is an order of magnitude slower as we are doing boto
    # calls and amazon has rate limits and api call limits that we do not
    # want to exceed.
    def is_volume_in_cloud_provider(volume):
        try:
            ec2 = boto3.resource('ec2', region_name=AWSMetaData().get_region())
            vol = ec2.Volume(volume)
            state = vol.state
            logger.debug("The current state of volume {} aws vol {} is {}".format(
                self.name, volume, state))
            return True
        except botocore.exceptions.ClientError as e:
            code = e.response['ResponseMetadata']['HTTPStatusCode']
            # 400 and 404 are for invalid volume id and volume not found
            if code != 404 and code != 400:
                raise e
        return False

    if status is None or not status.spec.volume_name:
        raise ValueError("Volume {} is not ready yet in kubernetes. Need to wait a while".format(self.name))

    pv_name = status.spec.volume_name
    pv_obj = get_pv(pv_name)
    if pv_obj is None:
        raise AXPlatformException("Could not get persistent volume info for {} ({})".format(
            self.name, pv_name))

    vol_id = pv_obj.spec.aws_elastic_block_store.volume_id.split('/')[-1]
    if not is_volume_in_cloud_provider(vol_id):
        raise AXPlatformException("Volume {} does not have underlying volume {} in cloud".format(
            self.name, vol_id))
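# The "multiplier" comment above suggests _check_volume_in_cloud_provider is driven by an
# external retry/backoff loop that lives outside this excerpt. A minimal sketch of such a
# driver, with hypothetical names and tuning (not the actual caller):
import time

def poll_with_backoff(check, base_delay=1.0, multiplier=10.0, max_attempts=5):
    """Call check() until it stops raising, backing off geometrically between attempts."""
    delay = base_delay
    for attempt in range(1, max_attempts + 1):
        try:
            return check()
        except (ValueError, AXPlatformException):
            if attempt == max_attempts:
                raise
            time.sleep(delay)
            delay *= multiplier

# Usage sketch: poll_with_backoff(lambda: volume._check_volume_in_cloud_provider(status))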
def enable_portal_support(self):
    logger.info("Setting portal support flag ...")
    if not self._bucket.put_object(key=self._s3_portal_support_flag, data="True"):
        raise AXPlatformException("Failed to set portal support flag")
    logger.info("Setting portal support flag ... DONE")
def restart_master(self):
    started_master_id = self.discover_master(state=[EC2InstanceState.Running])
    if started_master_id:
        logger.info("Master %s is already running", started_master_id)
        return

    stopped_master_id = self.discover_master(state=[EC2InstanceState.Stopped])
    if not stopped_master_id:
        raise AXPlatformException("Cannot find a previously stopped master instance")

    # A "stopped" instance can always be started; any other exception is propagated to the caller
    self.client.start_instances(InstanceIds=[stopped_master_id])

    logger.info("Waiting for master %s to get into state \"running\"", stopped_master_id)
    while True:
        running_master_id = self.discover_master(state=[EC2InstanceState.Running])
        if running_master_id:
            logger.info("Master %s successfully started", running_master_id)
            return
        time.sleep(5)
def _process_log_gz_create(self, event, fname):
    if self._file_records.get(fname, None):
        raise AXPlatformException("Log {} rotated while previous log is not uploaded.".format(fname))
    self._file_records[fname] = event
    self._persist_log_artifact(fname)
def is_marked_for_deletion(self):
    state = self._get_from_provider()
    if state is None:
        raise AXPlatformException("Missing volume for {}".format(self.name))
    return state.metadata.annotations.get("ax_deletion", "False") == "True"
def get_log_urls_for_container(pstat, podname, containername, instance_id):
    assert pstat.metadata.self_link, "Pod status does not have self_link"
    url_run = "{}/log?container={}".format(pstat.metadata.self_link, containername)

    cstats = pstat.status.container_statuses
    docker_id = None
    for cstat in cstats:
        if cstat.name != containername:
            continue
        if cstat.container_id is None:
            # container_id is only populated once the pod is "Running", i.e. bound to a
            # node with all containers created and at least one running or (re)starting
            raise AXPlatformException(
                "log urls can only be obtained after pod {} has started. Current status of container is {}".format(
                    podname, cstat))
        docker_id = cstat.container_id[len("docker://"):]
    assert docker_id is not None, "Docker ID of created container {} in pod {} was not found".format(
        containername, podname)

    name_id = AXClusterId().get_cluster_name_id()
    bucket = AXClusterDataPath(name_id).bucket()
    prefix = AXClusterDataPath(name_id).artifact()
    url_done = "/{}/{}/{}/{}.{}.log".format(bucket, prefix, instance_id, containername, docker_id)
    return url_run, url_done
def get_log_urls(self, service_instance_id):
    cname = self.get_main_container_name()
    url_run = "/api/v1/namespaces/{}/pods/{}/log?container={}&follow=true".format(
        self.namespace, self.name, cname)

    docker_id = None
    pod = self._get_status_obj()
    cstats = pod.status.container_statuses
    for cstat in cstats:
        if cstat.name != cname:
            continue
        if cstat.container_id is None:
            # container_id is only populated once the pod is "Running", i.e. bound to a
            # node with all containers created and at least one running or (re)starting
            raise AXPlatformException(
                "log urls can only be obtained after pod {} has started. Current status of container is {}".format(
                    self.name, cstat))
        docker_id = cstat.container_id[len("docker://"):]
    assert docker_id is not None, "Docker ID of created container {} in pod {} was not found".format(
        self.name, cname)

    name_id = AXClusterId().get_cluster_name_id()
    bucket = AXClusterDataPath(name_id).bucket()
    prefix = AXClusterDataPath(name_id).artifact()
    url_done = "/{}/{}/{}/{}.{}.log".format(bucket, prefix, service_instance_id, cname, docker_id)
    return url_run, url_done
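# Illustration only: hypothetical values showing the two URL shapes the functions above
# produce -- a live log endpoint streamed through the API server while the pod runs, and
# the S3 object path where the archived log lands once the container exits.
namespace, pod, container = "axuser", "demo-pod", "main"
docker_id = "0123abcd"
bucket, prefix, service_instance_id = "my-cluster-data", "artifacts", "svc-instance-1"
url_run = "/api/v1/namespaces/{}/pods/{}/log?container={}&follow=true".format(namespace, pod, container)
url_done = "/{}/{}/{}/{}.{}.log".format(bucket, prefix, service_instance_id, container, docker_id)
# url_run  -> /api/v1/namespaces/axuser/pods/demo-pod/log?container=main&follow=true
# url_done -> /my-cluster-data/artifacts/svc-instance-1/main.0123abcd.log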
def stop_master(self):
    stop_master_requested = False
    master_instance_id = self.discover_master(state=[EC2InstanceState.Stopping, EC2InstanceState.Stopped])
    if master_instance_id:
        stop_master_requested = True

    if not stop_master_requested:
        master_instance_id = self.discover_master(state=["*"])
        if not master_instance_id:
            raise AXPlatformException("Cannot find master instance")
        try:
            self.client.stop_instances(InstanceIds=[master_instance_id])
        except ClientError as ce:
            if "UnsupportedOperation" in str(ce) and "StopInstances" in str(ce):
                logger.warning("Master instance %s is a spot instance, which cannot be stopped.",
                               master_instance_id)
                return
            elif "IncorrectInstanceState" in str(ce):
                # Master could be in "terminating", "terminated", or "stopped" state. The first
                # two should not happen without human intervention; if they do, the loop below
                # will be stuck waiting for the "stopped" state, which is a good signal to
                # check manually
                pass
            else:
                raise ce

    logger.info("Waiting for master %s to get into state \"stopped\"", master_instance_id)
    while True:
        stopped_master = self.discover_master(state=[EC2InstanceState.Stopped])
        if stopped_master:
            logger.info("Master %s successfully stopped", master_instance_id)
            return
        time.sleep(5)
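# stop_master and restart_master share the same shape: poll discover_master every 5
# seconds until the instance reaches the target state. The pattern, extracted as a
# hypothetical helper (the source inlines the loop and never times out):
import time

def wait_for_state(discover, target_states, interval=5, timeout=None):
    """Poll discover(state=target_states) until it returns an instance id."""
    deadline = None if timeout is None else time.time() + timeout
    while True:
        instance_id = discover(state=target_states)
        if instance_id:
            return instance_id
        if deadline is not None and time.time() >= deadline:
            raise AXPlatformException("Master did not reach {} in time".format(target_states))
        time.sleep(interval)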
def _update_cluster_bucket(self):
    bucket_name = AXClusterConfigPath(name_id=self._name_id).bucket()
    cluster_bucket = Cloud().get_bucket(bucket_name, aws_profile=self._aws_profile,
                                        region=self._aws_region)
    if not cluster_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(cluster_bucket.get_bucket_name()))
    logger.info("Created %s bucket ... DONE", cluster_bucket.get_bucket_name())
def delete_cluster_status_before_pause(self):
    logger.info("Deleting Argo cluster status before last pause ...")
    if not self._bucket.delete_object(key=self._s3_cluster_state_before_pause):
        raise AXPlatformException("Failed to delete {} information".format(
            self._s3_cluster_state_before_pause))
    logger.info("Deleted Argo cluster status before last pause")
def delete_objects(self, objects):
    """
    Stop kubernetes objects based on records. Wait for all of them.
    :param objects: AXPlatformObjectGroup
    """
    assert isinstance(objects, AXPlatformObjectGroup)
    if not self._should_delete_group(policy=objects.policy,
                                     policy_predicate=objects.policy_predicate):
        logger.debug("Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)",
                     objects.name, objects.policy, objects.policy_predicate)
        return
    logger.info("Delete step: %s", objects.name)
    logger.info("Deleting platform objects\n\n%s.", self._generate_object_summary(objects.object_set))

    pool = ThreadPool(len(objects.object_set))
    async_results = {}
    for obj in objects.object_set:
        assert isinstance(obj, AXPlatformObject)
        name = obj.name
        namespace = obj.namespace
        async_results[name] = pool.apply_async(self.stop_one, args=(name,),
                                               kwds={"namespace": namespace})
    pool.close()
    pool.join()

    report, failed = self._generate_report(async_results, "Delete")
    logger.info(report)
    if failed:
        raise AXPlatformException("Failed to delete platform objects.")
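# delete_objects uses a fan-out/fan-in shape: one apply_async per object, close/join,
# then one consolidated report. Reduced to its essence as a sketch with placeholder
# work items (hypothetical helper, not in the source):
from multiprocessing.pool import ThreadPool

def fan_out(items, work):
    """Run work(item) concurrently for every item; return {item: result}."""
    pool = ThreadPool(max(len(items), 1))
    async_results = {item: pool.apply_async(work, args=(item,)) for item in items}
    pool.close()
    pool.join()
    # AsyncResult.get() re-raises any exception raised inside the worker thread.
    return {item: r.get() for item, r in async_results.items()}

# Usage sketch: fan_out(["svc-a", "svc-b"], lambda name: "deleted " + name)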
def update(self, iam):
    """
    Create all buckets in portal account.
    """
    logger.info("Creating applatix-support and applatix-upgrade buckets ...")
    support_bucket = Cloud().get_bucket(AXSupportConfigPath(name_id=self._name_id).bucket(),
                                        aws_profile=self._aws_profile, region=self._aws_region)
    upgrade_bucket = Cloud().get_bucket(AXUpgradeConfigPath(name_id=self._name_id).bucket(),
                                        aws_profile=self._aws_profile, region=self._aws_region)

    # Retrying create on an already-created bucket is fine
    if not support_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(support_bucket.get_bucket_name()))
    # If policy is already there, we don't update
    if not support_bucket.get_policy():
        logger.info("Argo support bucket policy does not exist, creating new one...")
        if not support_bucket.put_policy(
                policy=self._generate_bucket_policy_string(template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                                                           bucket_name=support_bucket.get_bucket_name(),
                                                           iam=iam)):
            raise AXPlatformException(
                "Failed to configure policy for S3 bucket {}".format(support_bucket.get_bucket_name()))

    if not upgrade_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(upgrade_bucket.get_bucket_name()))
    if not upgrade_bucket.get_policy():
        logger.info("Argo upgrade bucket policy does not exist, creating new one...")
        if not upgrade_bucket.put_policy(
                policy=self._generate_bucket_policy_string(template=SUPPORT_BUCKET_POLICY_TEMPLATE,
                                                           bucket_name=upgrade_bucket.get_bucket_name(),
                                                           iam=iam)):
            raise AXPlatformException(
                "Failed to configure policy for S3 bucket {}".format(upgrade_bucket.get_bucket_name()))

    # Tag them right away to avoid racing with deletion.
    upgrade_bucket.put_object(key=AXUpgradeConfigPath(name_id=self._name_id).tag(),
                              data="tag", ACL="bucket-owner-full-control")
    support_bucket.put_object(key=AXSupportConfigPath(name_id=self._name_id).tag(),
                              data="tag", ACL="bucket-owner-full-control")

    logger.info("Created %s and %s buckets ... DONE",
                support_bucket.get_bucket_name(), upgrade_bucket.get_bucket_name())
def get_main_container_name(self):
    if not hasattr(self, "containers"):
        self.build_attributes()
    for c in self.containers:
        if c["name"] not in [SIDEKICK_WAIT_CONTAINER_NAME, DIND_CONTAINER_NAME]:
            return c["name"]
    raise AXPlatformException("Pod for a task needs to have a non-wait container")
def delete_webhook():
    webhook_svc_name = "axops-webhook"
    try:
        kubectl.api.delete_namespaced_service(namespace="axsys", name=webhook_svc_name)
    except ApiException as ae:
        if ae.status != 404:
            raise AXPlatformException("Unable to delete webhook", detail=str(ae))
    return jsonify(result="ok")
def upload_cluster_metadata(self):
    logger.info("Uploading Argo cluster metadata ...")
    with open(self._metadata_file, "r") as f:
        data = f.read()
    # User pods should be able to curl it so we have to set ACL to public-read
    if not self._bucket.put_object(self._s3_cluster_meta, data, ACL="public-read"):
        raise AXPlatformException("Failed to upload cluster metadata for {}".format(self._cluster_name_id))
    logger.info("Uploading Argo cluster metadata ... DONE")
def upload_cluster_current_state(self, state):
    logger.info("Uploading cluster current state ...")
    if not self._bucket.put_object(key=self._s3_cluster_current_state, data=state):
        raise AXPlatformException("Failed to upload cluster current state info for {}".format(
            self._cluster_name_id))
    logger.info("Uploading cluster current state ... DONE")
def mark_for_deletion(self):
    state = self._get_from_provider()
    if state is None:
        raise AXPlatformException("Missing volume for {}".format(self.name))
    s = {"metadata": {"annotations": {"ax_deletion": "True"}}}
    self._update_in_provider(s)
def delete_staging_info(self, stage):
    assert stage in ["stage1", "stage2"], "Only stage1 and stage2 information is available"
    logger.info("Deleting Argo install %s info from s3 ...", stage)
    if not self._bucket.delete_object(key=self._staging_info[stage]):
        raise AXPlatformException("Failed to delete {} information".format(stage))
    logger.info("Deleting Argo install %s info from s3 ... DONE", stage)
def enable_artifacts(self, namespace, version, sid, in_artifacts_spec):
    if "main" not in self.cmap:
        raise AXPlatformException("Pod needs to have main and wait container before enabling artifacts")

    # Add init containers that set up the scratch space, pull the customer image,
    # and create artifact mappings for the main container
    c_setup = InitContainerSetup()
    customer_image = self.cmap["main"].image
    c_pullimage = InitContainerPullImage(customer_image)
    c_artifacts = InitContainerTask(customer_image, namespace, version)
    self.add_init_container(c_setup)
    self.add_init_container(c_pullimage)
    self.add_init_container(c_artifacts)

    # Set the command in the main container and add volume mounts
    self.cmap["main"].command = ["{}/executor.sh".format(ARTIFACTS_CONTAINER_SCRATCH_PATH)]
    artifacts_vol = c_artifacts.get_artifacts_volume()
    artifacts_vol.set_mount_path(ARTIFACTS_CONTAINER_SCRATCH_PATH)
    self.cmap["main"].add_volume(artifacts_vol)

    static_bins_vol = c_artifacts.get_static_bins_volume()
    static_bins_vol.set_mount_path("/ax-execu-host")
    self.cmap["main"].add_volume(static_bins_vol)

    def generate_volumes_for_artifacts():
        test_mode = AXArtifacts.is_test_service_instance(sid)

        art_volumes = AXArtifacts.get_extra_artifact_in_volume_mapping(
            in_artifacts_spec, ARTIFACTS_CONTAINER_SCRATCH_PATH, "in",
            test_mode=test_mode, self_sid=sid)

        ret_vols = []
        initc_vols = []
        i = 0
        already_mapped = {}
        for initc_path, mount_path in art_volumes or []:
            name = "ax-art-{}".format(i)
            c = ContainerVolume(name, mount_path)
            c.set_type("EMPTYDIR")
            c_init = ContainerVolume(name, initc_path)
            c_init.set_type("EMPTYDIR")
            i += 1
            if mount_path not in already_mapped:
                ret_vols.append(c)
                initc_vols.append(c_init)
                already_mapped[mount_path] = True
        return ret_vols, initc_vols

    # Add artifact volumes to both the main container and the init container
    (self._artifact_vols, initc_vols) = generate_volumes_for_artifacts()
    self.cmap["main"].add_volumes(self._artifact_vols)
    c_artifacts.add_volumes(initc_vols)
    return c_artifacts
def stop(self, force=False):
    if not self.started:
        return
    if not force and not self.record.empty():
        raise AXPlatformException("Waiter pending, need to force stop")
    self.record.clear_all()
    for m in self.monitors:
        m.request_stop()
    self.started = False
def single_executor(self, image_name, func, *args):
    # Lock protecting the shared image-fetch state dict
    self._lock.acquire()
    if image_name not in self.image_fetching_dict:
        logger.info("DockerImageFetcher: %s not being pulled by another thread. Preparing to pull",
                    image_name)
        cv = Condition()
        state = {"cv": cv, "status": False, "detail": "Unknown failure"}
        self.image_fetching_dict[image_name] = state
        self._lock.release()

        logger.info("DockerImageFetcher: %s is being fetched now", image_name)
        try:
            status = func(*args)
            state["status"] = status
            if status:
                state["detail"] = "Success"
        except Exception as e:
            logger.error("DockerImageFetcher: Got exception %s", e)
            state["detail"] = e

        self._lock.acquire()
        self.image_fetching_dict.pop(image_name)
        with cv:
            logger.debug("DockerImageFetcher: Notifying waiters of %s", image_name)
            self._lock.release()
            cv.notify_all()
    else:
        logger.info("DockerImageFetcher: %s already being fetched by another thread", image_name)
        state = self.image_fetching_dict[image_name]
        cv = state["cv"]
        with cv:
            self._lock.release()
            logger.debug("DockerImageFetcher: Waiting to be notified by another thread for %s",
                         image_name)
            cv.wait()
            # At this point we still hold a reference to 'state', from which the
            # status can be read
            logger.debug("DockerImageFetcher: Got notified by the other thread for %s. State %s",
                         image_name, state)

    if not state["status"]:
        raise AXPlatformException("Error: {}, while fetching image {}".format(
            state["detail"], image_name))
    return True
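# single_executor implements a "single flight" pattern: the first caller performs the
# pull while concurrent callers block on the condition variable and reuse its result.
# A minimal, self-contained sketch of the same pattern (hypothetical class, hardened
# with a "done" flag so a waiter that arrives late cannot block after notify_all):
import threading

class SingleFlight:
    def __init__(self):
        self._lock = threading.Lock()
        self._inflight = {}  # key -> state dict shared by owner and waiters

    def do(self, key, func):
        with self._lock:
            state = self._inflight.get(key)
            owner = state is None
            if owner:
                state = {"cv": threading.Condition(), "done": False, "result": None}
                self._inflight[key] = state
        if owner:
            try:
                state["result"] = func()
            finally:
                with self._lock:
                    self._inflight.pop(key, None)
                with state["cv"]:
                    state["done"] = True
                    state["cv"].notify_all()
        else:
            with state["cv"]:
                # wait_for re-checks the predicate, so waking before or after
                # notify_all both resolve correctly.
                state["cv"].wait_for(lambda: state["done"])
        return state["result"]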
def run(self):
    """
    The main method for the MasterManager.
    """
    logger.info("Running the MasterManager!")
    attr_str = self.cluster_info.get_master_config(USER_DATA_FILE_S3)
    if attr_str is not None:
        self.attributes = json.loads(attr_str)
        self.attributes['user_data_file'] = USER_DATA_FILE_S3

    # Check if the master is running and update the self.master_instance object.
    try:
        instance_id = self.discover_master()
        if instance_id is not None:
            self.master_instance = self.ec2.Instance(instance_id)
            logger.info("Master instance discovered: %s", self.master_instance.instance_id)
            # This will retry for a while and then throw an exception if the master
            # API server is unreachable
            self.check_master_api_server()

        if not self.attributes:
            # This is needed only for the first startup, when the cluster is created.
            logger.debug("Populating attributes")
            self.populate_attributes()
            logger.debug("Saving master's config into S3")
            self.save_master_config(USER_DATA_FILE_NEW)
            logger.info("Master config uploaded to s3")
    except Exception as e:
        raise AXPlatformException("Failed to discover master: " + str(e))

    while True:
        if self.master_instance is not None:
            self.wait_for_termination()
            message = ("Master instance with id {} terminated. A new master instance "
                       "will be created. This should take a few minutes.".format(
                           self.master_instance.instance_id))
        else:
            logger.info("Master not running")
            message = ("Master instance not found. A new master instance will be "
                       "created. This should take a few minutes.")
        self.send_notification(CODE_PLATFORM_ERROR, message)

        new_master = self.launch_new_master()
        self.master_instance = self.ec2.Instance(new_master.instance_id)
        logger.info("New master instance %s running", self.master_instance.instance_id)
        self.send_notification(CODE_PLATFORM_CRITICAL,
                               "New master instance with id {} started".format(
                                   self.master_instance.instance_id))

        logger.info("Wait for %s minutes before running checks...", WAIT_TIME_POST_RESTART_MIN)
        time.sleep(WAIT_TIME_POST_RESTART_MIN * const.SECONDS_PER_MINUTE)
        logger.info("Done waiting. Now back to checks")
def __init__(self, name, client=None):
    self.name = name
    if client is None:
        self._client = KubernetesApiClient(use_proxy=True)
    else:
        self._client = client

    self._registry_spec = None
    self._software_info = SoftwareInfo()
    if self._software_info.registry_is_private():
        secret = KubeObjectConfigFile(DEFAULT_SECRET_YAML_PATH,
                                      {"REGISTRY_SECRETS": self._software_info.registry_secrets})
        for obj in secret.get_swagger_objects():
            if isinstance(obj, swagger_client.V1Secret):
                self._registry_spec = obj
        assert self._registry_spec, "Argo registry specification is missing"

    self._am_service_spec = None
    self._am_deployment_spec = None

    # AA-2471: Hack to add AXOPS_EXT_DNS to Application Manager
    elb = InternalRoute("axops", "axsys", client=self._client)
    elb_status = elb.status(with_loadbalancer_info=True)["loadbalancer"][0]
    if not elb_status:
        raise AXPlatformException("Could not get axops elb address {}".format(elb_status))

    replacements = {"NAMESPACE": self._software_info.image_namespace,
                    "VERSION": self._software_info.image_version,
                    "REGISTRY": self._software_info.registry,
                    "APPLICATION_NAME": self.name,
                    "AXOPS_EXT_DNS": elb_status}
    cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
    assert cluster_name_id, "Cluster name id is None!"
    cluster_config = AXClusterConfig(cluster_name_id=cluster_name_id)

    if not cluster_config.get_cluster_provider().is_user_cluster():
        axam_path = DEFAULT_AM_YAML_PATH
    else:
        axam_path = "/ax/config/service/argo-all/axam-svc.yml.in"
        replacements["ARGO_DATA_BUCKET_NAME"] = os.getenv("ARGO_DATA_BUCKET_NAME")
    logger.info("Using replacements: %s", replacements)

    k = KubeObjectConfigFile(axam_path, replacements)
    for obj in k.get_swagger_objects():
        if isinstance(obj, swagger_client.V1Service):
            self._am_service_spec = obj
        elif isinstance(obj, swagger_client.V1beta1Deployment):
            self._am_deployment_spec = obj
            self._add_pod_metadata("deployment", self._am_deployment_spec.metadata.name, is_label=True)
            self._add_pod_metadata("ax_costid", json.dumps({"app": self.name,
                                                            "service": "axam-deployment",
                                                            "user": "******"}))
        else:
            logger.debug("Ignoring specification of type {}".format(type(obj)))
    assert self._am_service_spec and self._am_deployment_spec, "Application monitor specification is missing"
def search(self, searchstr=None):
    if searchstr is None or searchstr == "":
        raise AXPlatformException("Docker hub search string needs to be a non-empty string")
    response = self._conn.search(searchstr)
    return [{"ctime": "", "repo": x['name'], "tag": "latest"} for x in response or []]
def _update_data_bucket(self):
    data_bucket = Cloud().get_bucket(AXClusterDataPath(name_id=self._name_id).bucket(),
                                     aws_profile=self._aws_profile, region=self._aws_region)
    if not data_bucket.create():
        raise AXPlatformException("Failed to create S3 bucket {}".format(data_bucket.get_bucket_name()))

    # Update CORS config for the data bucket too.
    logger.info("Checking CORS config for %s.", data_bucket.get_bucket_name())
    data_bucket.put_cors(DATA_CORS_CONFIG)

    logger.info("Created %s bucket ... DONE", data_bucket.get_bucket_name())
def modify_asg(self, min, max):
    logger.info("Modifying autoscaling group ...")
    asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile)
    asg = asg_manager.get_variable_asg()
    if not asg:
        raise AXPlatformException("Failed to get variable autoscaling group for cluster {}".format(
            self._cluster_name_id))
    asg_name = asg["AutoScalingGroupName"]
    try:
        asg_manager.set_asg_spec(name=asg_name, minsize=min, maxsize=max)
    except ClientError as ce:
        raise AXPlatformException("Failed to set cluster's variable autoscaling group min/max. Error: {}".format(ce))
    logger.info("Modifying cluster autoscaling group ... DONE")
def upload_staging_info(self, stage, msg):
    assert stage in ["stage1", "stage2"], "Only stage1 and stage2 information is available"
    logger.info("Uploading Argo install %s info to s3 ...", stage)
    if not self._bucket.put_object(key=self._staging_info[stage], data=msg):
        raise AXPlatformException("Failed to upload Argo install {} info for {}".format(
            stage, self._cluster_name_id))
    logger.info("Uploading Argo install %s info %s to s3 ... DONE", stage, msg)
def add_ref(self, ref, exclusive=False):
    """
    Add a reference to the EBS volume

    Args:
        ref: string
        exclusive: Boolean for exclusive access

    Returns:
        the array of refs after addition of ref
    """
    state = self._get_from_provider()
    if state is None:
        raise AXPlatformException("Missing volume for {}".format(self.name))
    if state.metadata.annotations.get("ax_deletion", "False") == "True":
        raise AXVolumeException("Cannot add ref to a volume that is marked for deletion")

    # refs is an array of ref
    refs_str = state.metadata.annotations['ax_refs']
    refs = ast.literal_eval(refs_str)
    curr_excl = state.metadata.annotations['ax_exclusive'] == 'True'
    single_ref = len(refs) == 1
    ref_exists = ref in refs

    if len(refs) == 0:
        # Trivially add the ref
        refs.append(ref)
        self._add_state_and_update(exclusive, refs)
        return refs

    if exclusive:
        if single_ref and ref_exists:
            if not curr_excl:
                # Upgrade to exclusive
                refs.append(ref)
                self._add_state_and_update(exclusive, refs)
            return refs
        else:
            raise AXVolumeException(
                "Cannot lock volume {} for ref {} in state [Exclusive {} Refs {}]".format(
                    self.name, ref, curr_excl, refs))
    else:
        if not curr_excl:
            if not ref_exists:
                refs.append(ref)
                self._add_state_and_update(exclusive, refs)
            return refs
        else:
            raise AXVolumeException(
                "Cannot add ref {} to volume {} as it is in state [Exclusive {} Refs {}]".format(
                    ref, self.name, curr_excl, refs))
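# The admission rules in add_ref form a small state machine over (exclusive, refs).
# Separated from the provider plumbing as a pure function (hypothetical helper, for
# illustration only):
def can_add_ref(ref, refs, curr_excl, want_excl):
    """Return True if `ref` may attach given the current refs and exclusivity."""
    if not refs:
        return True  # empty volume: any ref may attach, shared or exclusive
    if want_excl:
        # Exclusive attach is only allowed as an upgrade by the sole, non-exclusive holder
        return len(refs) == 1 and ref in refs and not curr_excl
    # Shared attach is allowed only while nobody holds the volume exclusively
    return not curr_excl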