def ListDeployments(self, request, context=None):
    try:
        namespace = request.namespace or self.default_namespace
        deployment_pb_list = self.deployment_store.list(
            namespace=namespace,
            labels_query=request.labels_query,
            offset=request.offset,
            limit=request.limit,
            operator=request.operator,
            order_by=request.order_by,
            ascending_order=request.ascending_order,
        )
        return ListDeploymentsResponse(
            status=Status.OK(), deployments=deployment_pb_list
        )
    except BentoMLException as e:
        logger.error("RPC ERROR ListDeployments: %s", e)
        return ListDeploymentsResponse(status=e.status_proto)
    except Exception as e:  # pylint: disable=broad-except
        logger.error("RPC ERROR ListDeployments: %s", e)
        return ListDeploymentsResponse(status=Status.INTERNAL())
def ListBento(self, request, context=None):
    try:
        # TODO: validate request
        bento_metadata_pb_list = self.bento_metadata_store.list(
            bento_name=request.bento_name,
            offset=request.offset,
            limit=request.limit,
            order_by=request.order_by,
            label_selectors=request.label_selectors,
            ascending_order=request.ascending_order,
        )
        return ListBentoResponse(status=Status.OK(), bentos=bento_metadata_pb_list)
    except BentoMLException as e:
        logger.error("RPC ERROR ListBento: %s", e)
        return ListBentoResponse(status=e.status_proto)
    except Exception as e:  # pylint: disable=broad-except
        logger.error("RPC ERROR ListBento: %s", e)
        return ListBentoResponse(status=Status.INTERNAL())
def GetDeployment(self, request, context=None):
    try:
        request.namespace = request.namespace or self.default_namespace
        deployment_pb = self.deployment_store.get(
            request.deployment_name, request.namespace
        )
        if deployment_pb:
            return GetDeploymentResponse(status=Status.OK(), deployment=deployment_pb)
        else:
            return GetDeploymentResponse(
                status=Status.NOT_FOUND(
                    'Deployment "{}" in namespace "{}" not found'.format(
                        request.deployment_name, request.namespace
                    )
                )
            )
    except BentoMLException as e:
        logger.error("INTERNAL ERROR: %s", e)
        return GetDeploymentResponse(status=Status.INTERNAL(str(e)))
def GetBento(self, request, context=None):
    with self.db.create_session() as sess:
        try:
            # TODO: validate request
            bento_pb = self.db.metadata_store.get(
                sess, request.bento_name, request.bento_version
            )
            if bento_pb:
                if request.bento_version.lower() == 'latest':
                    logger.info(
                        'Getting latest version %s:%s',
                        request.bento_name,
                        bento_pb.version,
                    )
                if bento_pb.uri.type == BentoUri.S3:
                    bento_pb.uri.s3_presigned_url = self.repo.get(
                        bento_pb.name, bento_pb.version
                    )
                elif bento_pb.uri.type == BentoUri.GCS:
                    bento_pb.uri.gcs_presigned_url = self.repo.get(
                        bento_pb.name, bento_pb.version
                    )
                return GetBentoResponse(status=Status.OK(), bento=bento_pb)
            else:
                return GetBentoResponse(
                    status=Status.NOT_FOUND(
                        "BentoService `{}:{}` is not found".format(
                            request.bento_name, request.bento_version
                        )
                    )
                )
        except BentoMLException as e:
            logger.error("RPC ERROR GetBento: %s", e)
            return GetBentoResponse(status=e.status_proto)
        except Exception as e:  # pylint: disable=broad-except
            logger.error("RPC ERROR GetBento: %s", e)
            return GetBentoResponse(status=Status.INTERNAL())
def DescribeDeployment(self, request, context=None):
    try:
        request.namespace = request.namespace or self.default_namespace
        deployment_pb = self.deployment_store.get(
            request.deployment_name, request.namespace
        )
        if deployment_pb:
            operator = get_deployment_operator(deployment_pb)
            response = operator.describe(deployment_pb, self.repo)
            if response.status.status_code == status_pb2.Status.OK:
                with self.deployment_store.update_deployment(
                    request.deployment_name, request.namespace
                ) as deployment:
                    deployment.state = ProtoMessageToDict(response.state)
            return response
        else:
            return DescribeDeploymentResponse(
                status=Status.NOT_FOUND(
                    'Deployment "{}" in namespace "{}" not found'.format(
                        request.deployment_name, request.namespace
                    )
                )
            )
    except BentoMLException as e:
        logger.error("INTERNAL ERROR: %s", e)
        return DescribeDeploymentResponse(status=Status.INTERNAL(str(e)))
def DescribeDeployment(self, request, context=None):
    deployment_id = f"{request.deployment_name}_{request.namespace}"
    with lock(self.db, [(deployment_id, LockType.READ)]) as (sess, _):
        try:
            request.namespace = request.namespace or self.default_namespace
            deployment_pb = self.db.deployment_store.get(
                sess, request.deployment_name, request.namespace
            )
            if deployment_pb:
                operator = get_deployment_operator(self, deployment_pb)
                response = operator.describe(deployment_pb)
                if response.status.status_code == status_pb2.Status.OK:
                    with self.db.deployment_store.update_deployment(
                        sess, request.deployment_name, request.namespace
                    ) as deployment:
                        deployment.state = ProtoMessageToDict(response.state)
                return response
            else:
                return DescribeDeploymentResponse(
                    status=Status.NOT_FOUND(
                        'Deployment "{}" in namespace "{}" not found'.format(
                            request.deployment_name, request.namespace
                        )
                    )
                )
        except BentoMLException as e:
            logger.error("RPC ERROR DescribeDeployment: %s", e)
            return DescribeDeploymentResponse(status=e.status_proto)
        except Exception as e:  # pylint: disable=broad-except
            logger.error("RPC ERROR DescribeDeployment: %s", e)
            return DescribeDeploymentResponse(status=Status.INTERNAL())
def GetDeployment(self, request, context=None):
    deployment_id = f"{request.deployment_name}_{request.namespace}"
    with lock(self.db, [(deployment_id, LockType.READ)]) as (sess, _):
        try:
            request.namespace = request.namespace or self.default_namespace
            deployment_pb = self.db.deployment_store.get(
                sess, request.deployment_name, request.namespace
            )
            if deployment_pb:
                return GetDeploymentResponse(
                    status=Status.OK(), deployment=deployment_pb
                )
            else:
                return GetDeploymentResponse(
                    status=Status.NOT_FOUND(
                        'Deployment "{}" in namespace "{}" not found'.format(
                            request.deployment_name, request.namespace
                        )
                    )
                )
        except BentoMLException as e:
            logger.error("RPC ERROR GetDeployment: %s", e)
            return GetDeploymentResponse(status=e.status_proto)
        except Exception as e:  # pylint: disable=broad-except
            logger.error("RPC ERROR GetDeployment: %s", e)
            return GetDeploymentResponse(status=Status.INTERNAL())
def describe(self, deployment_pb):
    try:
        deployment_spec = deployment_pb.spec
        lambda_deployment_config = deployment_spec.aws_lambda_operator_config
        lambda_deployment_config.region = (
            lambda_deployment_config.region or get_default_aws_region()
        )
        if not lambda_deployment_config.region:
            raise InvalidArgument('AWS region is missing')

        bento_pb = self.yatai_service.GetBento(
            GetBentoRequest(
                bento_name=deployment_spec.bento_name,
                bento_version=deployment_spec.bento_version,
            )
        )
        bento_service_metadata = bento_pb.bento.bento_service_metadata
        api_names = (
            [lambda_deployment_config.api_name]
            if lambda_deployment_config.api_name
            else [api.name for api in bento_service_metadata.apis]
        )

        try:
            cf_client = boto3.client('cloudformation', lambda_deployment_config.region)
            stack_name = generate_aws_compatible_string(
                '{ns}-{name}'.format(ns=deployment_pb.namespace, name=deployment_pb.name)
            )
            cloud_formation_stack_result = cf_client.describe_stacks(
                StackName=stack_name
            )
            stack_result = cloud_formation_stack_result.get('Stacks')[0]
            # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/
            # using-cfn-describing-stacks.html
            success_status = ['CREATE_COMPLETE', 'UPDATE_COMPLETE']
            if stack_result['StackStatus'] in success_status:
                if stack_result.get('Outputs'):
                    outputs = stack_result['Outputs']
                else:
                    return DescribeDeploymentResponse(
                        status=Status.ABORTED('"Outputs" field is not present'),
                        state=DeploymentState(
                            state=DeploymentState.ERROR,
                            error_message='"Outputs" field is not present',
                        ),
                    )
            elif stack_result['StackStatus'] in FAILED_CLOUDFORMATION_STACK_STATUS:
                state = DeploymentState(state=DeploymentState.FAILED)
                state.timestamp.GetCurrentTime()
                return DescribeDeploymentResponse(status=Status.OK(), state=state)
            else:
                state = DeploymentState(state=DeploymentState.PENDING)
                state.timestamp.GetCurrentTime()
                return DescribeDeploymentResponse(status=Status.OK(), state=state)
        except Exception as error:  # pylint: disable=broad-except
            state = DeploymentState(
                state=DeploymentState.ERROR, error_message=str(error)
            )
            state.timestamp.GetCurrentTime()
            return DescribeDeploymentResponse(
                status=Status.INTERNAL(str(error)), state=state
            )

        outputs = {o['OutputKey']: o['OutputValue'] for o in outputs}
        info_json = {}
        if 'EndpointUrl' in outputs:
            info_json['endpoints'] = [
                outputs['EndpointUrl'] + '/' + api_name for api_name in api_names
            ]
        if 'S3Bucket' in outputs:
            info_json['s3_bucket'] = outputs['S3Bucket']

        state = DeploymentState(
            state=DeploymentState.RUNNING, info_json=json.dumps(info_json)
        )
        state.timestamp.GetCurrentTime()
        return DescribeDeploymentResponse(status=Status.OK(), state=state)
    except BentoMLException as error:
        return DescribeDeploymentResponse(status=error.status_proto)
def create_deployment(
    deployment_name,
    namespace,
    bento_name,
    bento_version,
    platform,
    operator_spec,
    labels=None,
    annotations=None,
    yatai_service=None,
):
    if yatai_service is None:
        from bentoml.yatai import get_yatai_service

        yatai_service = get_yatai_service()

    try:
        # Make sure there is no active deployment with the same deployment name
        get_deployment_pb = yatai_service.GetDeployment(
            GetDeploymentRequest(deployment_name=deployment_name, namespace=namespace)
        )
        if get_deployment_pb.status.status_code == status_pb2.Status.OK:
            raise BentoMLDeploymentException(
                'Deployment "{name}" already exists. Use Update or Apply to update '
                'the existing deployment, or create the deployment with a different '
                'name or under a different deployment namespace'.format(
                    name=deployment_name
                )
            )
        if get_deployment_pb.status.status_code != status_pb2.Status.NOT_FOUND:
            raise BentoMLDeploymentException(
                'Failed accessing YataiService deployment store. {error_code}: '
                '{error_message}'.format(
                    error_code=Status.Name(get_deployment_pb.status.status_code),
                    error_message=get_deployment_pb.status.error_message,
                )
            )

        deployment_dict = {
            "name": deployment_name,
            "namespace": namespace or config().get('deployment', 'default_namespace'),
            "labels": labels,
            "annotations": annotations,
            "spec": {
                "bento_name": bento_name,
                "bento_version": bento_version,
                "operator": platform,
            },
        }

        operator = platform.replace('-', '_').upper()
        try:
            operator_value = DeploymentSpec.DeploymentOperator.Value(operator)
        except ValueError:
            return ApplyDeploymentResponse(
                status=Status.INVALID_ARGUMENT('Invalid platform "{}"'.format(platform))
            )
        if operator_value == DeploymentSpec.AWS_SAGEMAKER:
            deployment_dict['spec']['sagemaker_operator_config'] = {
                'region': operator_spec.get('region')
                or config().get('aws', 'default_region'),
                'instance_count': operator_spec.get('instance_count')
                or config().getint('sagemaker', 'default_instance_count'),
                'instance_type': operator_spec.get('instance_type')
                or config().get('sagemaker', 'default_instance_type'),
                'api_name': operator_spec.get('api_name', ''),
            }
        elif operator_value == DeploymentSpec.AWS_LAMBDA:
            deployment_dict['spec']['aws_lambda_operator_config'] = {
                'region': operator_spec.get('region')
                or config().get('aws', 'default_region')
            }
            if operator_spec.get('api_name'):
                deployment_dict['spec']['aws_lambda_operator_config'][
                    'api_name'
                ] = operator_spec['api_name']
        elif operator_value == DeploymentSpec.GCP_FUNCTION:
            deployment_dict['spec']['gcp_function_operator_config'] = {
                'region': operator_spec.get('region')
                or config().get('google-cloud', 'default_region')
            }
            if operator_spec.get('api_name'):
                deployment_dict['spec']['gcp_function_operator_config'][
                    'api_name'
                ] = operator_spec['api_name']
        elif operator_value == DeploymentSpec.KUBERNETES:
            deployment_dict['spec']['kubernetes_operator_config'] = {
                'kube_namespace': operator_spec.get('kube_namespace', ''),
                'replicas': operator_spec.get('replicas', 0),
                'service_name': operator_spec.get('service_name', ''),
                'service_type': operator_spec.get('service_type', ''),
            }
        else:
            raise BentoMLDeploymentException(
                'Platform "{}" is not supported in the current version of '
                'BentoML'.format(platform)
            )

        return apply_deployment(deployment_dict, yatai_service)
    except BentoMLException as error:
        return ApplyDeploymentResponse(status=Status.INTERNAL(str(error)))
def ApplyDeployment(self, request, context=None):
    try:
        # apply default namespace if not set
        request.deployment.namespace = (
            request.deployment.namespace or self.default_namespace
        )

        validation_errors = validate_deployment_pb(request.deployment)
        if validation_errors:
            raise InvalidArgument(
                'Failed to validate deployment. {errors}'.format(
                    errors=validation_errors
                )
            )

        previous_deployment = self.deployment_store.get(
            request.deployment.name, request.deployment.namespace
        )
        if not previous_deployment:
            request.deployment.created_at.GetCurrentTime()
        request.deployment.last_updated_at.GetCurrentTime()

        self.deployment_store.insert_or_update(request.deployment)
        # find deployment operator based on deployment spec
        operator = get_deployment_operator(self, request.deployment)

        # deploying to target platform
        if previous_deployment:
            response = operator.update(request.deployment, previous_deployment)
        else:
            response = operator.add(request.deployment)

        if response.status.status_code == status_pb2.Status.OK:
            # update deployment state
            if response and response.deployment:
                self.deployment_store.insert_or_update(response.deployment)
            else:
                raise BentoMLException(
                    "DeploymentOperator Internal Error: failed to add or update "
                    "deployment metadata to database"
                )
            logger.info(
                "ApplyDeployment (%s, namespace %s) succeeded",
                request.deployment.name,
                request.deployment.namespace,
            )
        else:
            if not previous_deployment:
                # When failed to create the deployment, delete it from active
                # deployments records
                self.deployment_store.delete(
                    request.deployment.name, request.deployment.namespace
                )
            logger.debug(
                "ApplyDeployment (%s, namespace %s) failed: %s",
                request.deployment.name,
                request.deployment.namespace,
                response.status.error_message,
            )

        return response
    except BentoMLException as e:
        logger.error("RPC ERROR ApplyDeployment: %s", e)
        return ApplyDeploymentResponse(status=e.status_proto)
    except Exception as e:  # pylint: disable=broad-except
        logger.error("RPC ERROR ApplyDeployment: %s", e)
        return ApplyDeploymentResponse(status=Status.INTERNAL(str(e)))
def DeleteDeployment(self, request, context=None):
    try:
        request.namespace = request.namespace or self.default_namespace
        deployment_pb = self.deployment_store.get(
            request.deployment_name, request.namespace
        )
        if deployment_pb:
            # find deployment operator based on deployment spec
            operator = get_deployment_operator(self, deployment_pb)

            # executing deployment deletion
            response = operator.delete(deployment_pb)

            # if delete successful, remove it from active deployment records table
            if response.status.status_code == status_pb2.Status.OK:
                track_deployment_delete(
                    deployment_pb.spec.operator, deployment_pb.created_at
                )
                self.deployment_store.delete(
                    request.deployment_name, request.namespace
                )
                return response

            # If force delete flag is True, we will remove the record
            # from yatai database.
            if request.force_delete:
                # Track deployment delete before it vanishes from deployment store
                track_deployment_delete(
                    deployment_pb.spec.operator, deployment_pb.created_at, True
                )
                self.deployment_store.delete(
                    request.deployment_name, request.namespace
                )
                return DeleteDeploymentResponse(status=Status.OK())

            if response.status.status_code == status_pb2.Status.NOT_FOUND:
                modified_message = (
                    'Cloud resources not found, error: {} - it may have been '
                    'deleted manually. Try deleting the deployment with the '
                    '"--force" option to ignore this error and force-delete '
                    'the deployment record'.format(response.status.error_message)
                )
                response.status.error_message = modified_message

            return response
        else:
            return DeleteDeploymentResponse(
                status=Status.NOT_FOUND(
                    'Deployment "{}" in namespace "{}" not found'.format(
                        request.deployment_name, request.namespace
                    )
                )
            )
    except BentoMLException as e:
        logger.error("RPC ERROR DeleteDeployment: %s", e)
        return DeleteDeploymentResponse(status=e.status_proto)
    except Exception as e:  # pylint: disable=broad-except
        logger.error("RPC ERROR DeleteDeployment: %s", e)
        return DeleteDeploymentResponse(status=Status.INTERNAL(str(e)))
def describe(self, deployment_pb):
    try:
        deployment_spec = deployment_pb.spec
        ec2_deployment_config = deployment_spec.aws_ec2_operator_config
        ec2_deployment_config.region = (
            ec2_deployment_config.region or get_default_aws_region()
        )
        if not ec2_deployment_config.region:
            raise InvalidArgument("AWS region is missing")

        bento_pb = self.yatai_service.GetBento(
            GetBentoRequest(
                bento_name=deployment_spec.bento_name,
                bento_version=deployment_spec.bento_version,
            )
        )
        bento_service_metadata = bento_pb.bento.bento_service_metadata
        api_names = [api.name for api in bento_service_metadata.apis]

        deployment_stack_name = generate_aws_compatible_string(
            "btml-stack-{namespace}-{name}".format(
                namespace=deployment_pb.namespace, name=deployment_pb.name
            )
        )
        try:
            cf_client = boto3.client("cloudformation", ec2_deployment_config.region)
            cloudformation_stack_result = cf_client.describe_stacks(
                StackName=deployment_stack_name
            )
            stack_result = cloudformation_stack_result.get("Stacks")[0]

            if stack_result.get("Outputs"):
                outputs = stack_result.get("Outputs")
            else:
                return DescribeDeploymentResponse(
                    status=Status.ABORTED('"Outputs" field is not present'),
                    state=DeploymentState(
                        state=DeploymentState.ERROR,
                        error_message='"Outputs" field is not present',
                    ),
                )

            if stack_result["StackStatus"] in FAILED_CLOUDFORMATION_STACK_STATUS:
                state = DeploymentState(state=DeploymentState.FAILED)
                return DescribeDeploymentResponse(status=Status.OK(), state=state)
        except Exception as error:  # pylint: disable=broad-except
            state = DeploymentState(
                state=DeploymentState.ERROR, error_message=str(error)
            )
            return DescribeDeploymentResponse(
                status=Status.INTERNAL(str(error)), state=state
            )

        info_json = {}
        outputs = {o["OutputKey"]: o["OutputValue"] for o in outputs}
        if "AutoScalingGroup" in outputs:
            info_json["InstanceDetails"] = get_instance_ip_from_scaling_group(
                [outputs["AutoScalingGroup"]], ec2_deployment_config.region
            )
            info_json["Endpoints"] = get_endpoints_from_instance_address(
                info_json["InstanceDetails"], api_names
            )
        if "S3Bucket" in outputs:
            info_json["S3Bucket"] = outputs["S3Bucket"]
        if "TargetGroup" in outputs:
            info_json["TargetGroup"] = outputs["TargetGroup"]
        if "Url" in outputs:
            info_json["Url"] = outputs["Url"]

        healthy_target = get_healthy_target(
            outputs["TargetGroup"], ec2_deployment_config.region
        )
        if healthy_target:
            deployment_state = DeploymentState.RUNNING
        else:
            deployment_state = DeploymentState.PENDING
        state = DeploymentState(
            state=deployment_state, info_json=json.dumps(info_json)
        )

        return DescribeDeploymentResponse(status=Status.OK(), state=state)
    except BentoMLException as error:
        return DescribeDeploymentResponse(status=error.status_proto)
def ContainerizeBento(self, request, context=None):
    bento_id = f"{request.bento_name}_{request.bento_version}"
    with lock(self.db, [(bento_id, LockType.READ)]) as (sess, _):
        try:
            ensure_docker_available_or_raise()
            tag = request.tag
            if tag is None or len(tag) == 0:
                name = to_valid_docker_image_name(request.bento_name)
                version = to_valid_docker_image_version(request.bento_version)
                tag = f"{name}:{version}"
            if ":" not in tag:
                version = to_valid_docker_image_version(request.bento_version)
                tag = f"{tag}:{version}"

            import docker

            docker_client = docker.from_env()
            bento_pb = self.db.metadata_store.get(
                sess, request.bento_name, request.bento_version
            )
            if not bento_pb:
                raise YataiRepositoryException(
                    f'BentoService '
                    f'{request.bento_name}:{request.bento_version} '
                    f'does not exist'
                )

            with TempDirectory() as temp_dir:
                temp_bundle_path = f'{temp_dir}/{bento_pb.name}'
                bento_service_bundle_path = bento_pb.uri.uri
                if bento_pb.uri.type == BentoUri.S3:
                    bento_service_bundle_path = self.repo.get(
                        bento_pb.name, bento_pb.version
                    )
                elif bento_pb.uri.type == BentoUri.GCS:
                    bento_service_bundle_path = self.repo.get(
                        bento_pb.name, bento_pb.version
                    )
                safe_retrieve(bento_service_bundle_path, temp_bundle_path)

                try:
                    docker_client.images.build(
                        path=temp_bundle_path,
                        tag=tag,
                        buildargs=dict(request.build_args),
                    )
                except (
                    docker.errors.APIError,
                    docker.errors.BuildError,
                ) as error:
                    logger.error(f'Encountered container building issue: {error}')
                    raise YataiRepositoryException(error)

                if request.push is True:
                    try:
                        docker_client.images.push(
                            repository=request.repository, tag=tag
                        )
                    except docker.errors.APIError as error:
                        raise YataiRepositoryException(error)

                return ContainerizeBentoResponse(status=Status.OK(), tag=tag)
        except BentoMLException as e:
            logger.error(f"RPC ERROR ContainerizeBento: {e}")
            return ContainerizeBentoResponse(status=e.status_proto)
        except Exception as e:  # pylint: disable=broad-except
            logger.error(f"RPC ERROR ContainerizeBento: {e}")
            return ContainerizeBentoResponse(status=Status.INTERNAL(str(e)))
def UploadBento(self, request_iterator, context=None):
    if not is_file_system_repo(self.repo):
        logger.error(
            "UploadBento RPC only works with File System based repository, "
            "for other types of repositories (s3, gcs, minio), "
            "use pre-signed URL for upload"
        )
        return UploadBentoResponse(status=Status.INTERNAL(''))
    file = None
    try:
        with self.db.create_session() as sess:
            lock_obj = None
            bento_pb = None
            with TempDirectory() as temp_dir:
                temp_tar_path = os.path.join(
                    temp_dir, f'{uuid.uuid4().hex[:12]}.tar'
                )
                file = open(temp_tar_path, 'wb+')
                for request in request_iterator:
                    # Initial request is without bundle
                    if not request.bento_bundle:
                        bento_name = request.bento_name
                        bento_version = request.bento_version
                        bento_pb = self.db.metadata_store.get(
                            sess, bento_name, bento_version
                        )
                        if not bento_pb:
                            result_status = Status.NOT_FOUND(
                                "BentoService `{}:{}` is not found".format(
                                    bento_name, bento_version
                                )
                            )
                            return UploadBentoResponse(status=result_status)
                        if bento_pb.status:
                            if bento_pb.status.status == UploadStatus.DONE:
                                return UploadBentoResponse(
                                    status=Status.CANCELLED(
                                        f"Bento bundle `{bento_name}:"
                                        f"{bento_version}` is uploaded"
                                    )
                                )
                            if bento_pb.status.status == UploadStatus.UPLOADING:
                                return UploadBentoResponse(
                                    status=Status.CANCELLED(
                                        f"Bento bundle `{bento_name}:"
                                        f"{bento_version}` is currently "
                                        f"uploading"
                                    )
                                )
                        if lock_obj is None:
                            lock_obj = LockStore.acquire(
                                sess=sess,
                                lock_type=LockType.WRITE,
                                resource_id=f'{bento_name}_{bento_version}',
                                ttl_min=DEFAULT_TTL_MIN,
                            )
                    else:
                        if (
                            bento_name == request.bento_name
                            and bento_version == request.bento_version
                        ):
                            file.write(request.bento_bundle)
                        else:
                            lock_obj.release(sess)
                            raise BadInput(
                                f"Incoming stream request doesn't match "
                                f"with initial request info "
                                f"{bento_name}:{bento_version} - "
                                f"{request.bento_name}:{request.bento_version}"
                            )
                file.seek(0)
                with tarfile.open(fileobj=file, mode='r') as tar:
                    tar.extractall(path=bento_pb.uri.uri)
                upload_status = UploadStatus(status=UploadStatus.DONE)
                upload_status.updated_at.GetCurrentTime()
                self.db.metadata_store.update_upload_status(
                    sess, bento_name, bento_version, upload_status
                )
                lock_obj.release(sess)
                return UploadBentoResponse(status=Status.OK())
    except BentoMLException as e:
        logger.error("RPC ERROR UploadBento: %s", e)
        return UploadBentoResponse(status=e.status_proto)
    except Exception as e:  # pylint: disable=broad-except
        logger.error("RPC ERROR UploadBento: %s", e)
        return UploadBentoResponse(status=Status.INTERNAL())
    finally:
        if file is not None:
            file.close()