Beispiel #1
0
def delete_model_from_aip_if_exists(
    api: discovery.Resource,
    ai_platform_serving_args: Dict[Text, Any],
) -> None:
    """Deletes a model from Google Cloud AI Platform if it exists.

    A 404 response (the model does not exist) is logged as a warning and
    ignored. Any other HTTP error is re-raised as a RuntimeError chained to
    the original HttpError.

    Args:
      api: Google API client resource.
      ai_platform_serving_args: Dictionary containing arguments for pushing
        to AI Platform. The keys 'model_name' and 'project_id' are required.
        For the full set of parameters supported, refer to
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

    Raises:
      RuntimeError: if an error is encountered when trying to delete.
    """
    logging.info('Deleting model with from AI Platform: %s',
                 ai_platform_serving_args)
    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    name = 'projects/{}/models/{}'.format(project_id, model_name)
    try:
        operation = api.projects().models().delete(name=name).execute()
        _wait_for_operation(api, operation, 'projects.models.delete')
    except errors.HttpError as e:
        # Deleting a model that does not exist is not an error for callers.
        if e.resp.status == 404:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning('Model %s does not exist', model_name)
        else:
            # Chain the HttpError so the original traceback is preserved.
            raise RuntimeError(
                'Deleting model from AI Platform failed: {}'.format(e)) from e
Beispiel #2
0
def _wait_for_operation(api: discovery.Resource, operation: Dict[Text, Any],
                        method_name: Text) -> Dict[Text, Any]:
    """Waits for a long running operation to finish.

    The operation is polled every `_POLLING_INTERVAL_IN_SECONDS` seconds until
    its status reports 'done'. The response that reported completion is the
    one returned, so no extra request is issued after the operation finishes.

    Args:
      api: Google API client resource.
      operation: The operation to wait for. Its 'name' entry is used to query
        the operation status.
      method_name: Operation method name for logging.

    Returns:
      Operation completion status.

    Raises:
      RuntimeError: If the operation completed with an error.
    """
    status_request = api.projects().operations().get(name=operation['name'])
    result = status_request.execute()
    while not result.get('done'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        logging.info('Method %s still being executed...', method_name)
        result = status_request.execute()
    if result.get('error'):
        # The operation completed with an error.
        raise RuntimeError('Failed to execute {}: {}'.format(
            method_name, result['error']))
    return result
Beispiel #3
0
def get_zone_tpu_types(tpu_api: discovery.Resource, project_id: str,
                       zone: str) -> Optional[List[TPUSpec]]:
    """Gets the list of TPU types available in the given zone.

    Accelerator types that `gke_tpu_to_tpuspec` cannot convert (it returns
    None for them) are skipped. A response without an 'acceleratorTypes'
    entry yields an empty list instead of raising KeyError.

    Args:
      tpu_api: tpu api instance
      project_id: project id
      zone: zone string

    Returns:
      List of supported tpu specs (possibly empty). This function itself
      never returns None; API errors propagate as exceptions.
    """
    location = 'projects/{}/locations/{}'.format(project_id, zone)
    rsp = tpu_api.projects().locations().acceleratorTypes().list(
        parent=location).execute()

    # The key may be absent when the zone offers no accelerator types.
    specs = (gke_tpu_to_tpuspec(t['type'])
             for t in rsp.get('acceleratorTypes', []))
    return [spec for spec in specs if spec is not None]
Beispiel #4
0
def get_gke_clusters(container: Resource, project_id: str) -> Dict:
    """
    Lists the GKE clusters of a project across all zones (zone='-').

    :type container: The GCP Container resource object
    :param container: The Container resource object created by googleapiclient.discovery.build()

    :type project_id: str
    :param project_id: The Google Project Id that you are retrieving clusters from

    :rtype: dict
    :return: The raw API response, or an empty dict when the request is
        rejected with status PERMISSION_DENIED. Any other HttpError is re-raised.
    """
    try:
        clusters_api = container.projects().zones().clusters()
        return clusters_api.list(projectId=project_id, zone='-').execute()
    except HttpError as e:
        error_info = json.loads(e.content.decode('utf-8'))['error']
        # Only a permissions failure is tolerated; everything else propagates.
        if error_info['status'] != 'PERMISSION_DENIED':
            raise
        logger.warning(
            "Could not retrieve GKE clusters on project %s due to permissions issue. Code: %s, Message: %s",
            project_id,
            error_info['code'],
            error_info['message'],
        )
        return {}
Beispiel #5
0
def _get_study(
    service_client: discovery.Resource,
    study_parent: Text,
    study_id: Text,
    study_should_exist: bool = False,
) -> None:
    """Method for loading a study.

    Given the study_parent and the study_id, this method will try to load the
    specified study, up to constants.MAX_NUM_TRIES_FOR_STUDIES tries, sleeping
    one second between failed attempts. The loaded study is not returned; the
    function only returns once a get request has succeeded.

    Args:
        service_client: An API client of Vizier service.
        study_parent: Prefix of the study name. The full study name will be
            {study_parent}/studies/{study_id}.
        study_id: An identifier of the study.
        study_should_exist: Indicates whether it should be assumed that the
            study with the given study_id exists.

    Raises:
        ValueError: If the last allowed attempt fails with HTTP 404 while
            study_should_exist is True.
        RuntimeError: If the last allowed attempt fails for any other reason.
    """
    study_name = "{}/studies/{}".format(study_parent, study_id)
    tf.get_logger().info(
        "Study already exists: {}.\nLoad existing study...".format(study_name))
    num_tries = 0
    while True:
        try:
            service_client.projects().locations().studies().get(
                name=study_name
            ).execute()
        except errors.HttpError as err:
            num_tries += 1
            if num_tries >= constants.MAX_NUM_TRIES_FOR_STUDIES:
                # Out of retries: surface the failure, chained to the
                # underlying HttpError so its details are not lost.
                if (
                    study_should_exist
                    and err.resp.status == http.HTTPStatus.NOT_FOUND.value
                ):
                    raise ValueError(
                        "GetStudy failed. Study not found: {}.".format(study_id)
                    ) from err
                raise RuntimeError(
                    "GetStudy failed. Max retries reached: {0!s}".format(err)
                ) from err
            time.sleep(1)  # wait 1 second before trying to get the study again
        else:
            return
0
 def __read_papi_v2beta_operation_metadata(
         operation_id: str,
         genomics_v2beta_client: Resource) -> Mapping[str, Any]:
     """Fetches the metadata of a pipelines API v2beta operation.

     Args:
         operation_id: Name of the operation, passed as `name` to the
             operations `get` call.
         genomics_v2beta_client: API client used to issue the request.

     Returns:
         The executed response of the operations `get` request.
     """
     logger.info(
         f'Reading PAPI v2beta operation metadata for {operation_id}...')
     operations_api = genomics_v2beta_client.projects().locations().operations()
     return operations_api.get(name=operation_id).execute()
Beispiel #7
0
def get_service_account(project_id: str, service_account_email: str,
                        iam_service: discovery.Resource) -> Dict:
    """Fetches a service account through the IAM API.

    Args:
        project_id (str): Project that owns the service account.
        service_account_email (str): Email address identifying the account.
        iam_service (discovery.Resource): IAM API client.

    Returns:
        Dict: The executed response of the serviceAccounts `get` request.
    """
    accounts_api = iam_service.projects().serviceAccounts()
    resource_name = (
        f"projects/{project_id}/serviceAccounts/{service_account_email}")
    return accounts_api.get(name=resource_name).execute()
Beispiel #8
0
def create_model_for_aip_prediction_if_not_exist(
    api: discovery.Resource,
    job_labels: Dict[Text, Text],
    ai_platform_serving_args: Dict[Text, Any],
) -> bool:
    """Creates a new model for serving with AI Platform if it does not exist.

    A 409 response (the model already exists) is logged as a warning and
    reported through the return value. Any other HTTP error is re-raised as
    a RuntimeError chained to the original HttpError.

    Args:
      api: Google API client resource.
      job_labels: The dict of labels that will be attached to this job.
      ai_platform_serving_args: Dictionary containing arguments for pushing
        to AI Platform. 'model_name' and 'project_id' are required;
        'regions' is optional and defaults to an empty list.

    Returns:
      Whether a new model is created.

    Raises:
      RuntimeError: if model creation failed.
    """
    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    regions = ai_platform_serving_args.get('regions', [])
    body = {'name': model_name, 'regions': regions, 'labels': job_labels}
    parent = 'projects/{}'.format(project_id)
    try:
        api.projects().models().create(body=body, parent=parent).execute()
    except errors.HttpError as e:
        # Creating a model that already exists is not an error for callers.
        if e.resp.status == 409:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning('Model %s already exists', model_name)
            return False
        # Chain the HttpError so the original traceback is preserved.
        raise RuntimeError(
            'Creating model to AI Platform failed: {}'.format(e)) from e
    return True
Beispiel #9
0
def create_service_account_key(service_account: Dict,
                               iam_service: discovery.Resource) -> Dict:
    """Creates a new key for a service account through the IAM API.

    The key is requested as a Google credentials file using a 2048-bit RSA
    key algorithm.

    Args:
        service_account (Dict): Service account description; its "name"
            entry is used as the resource name of the request.
        iam_service (discovery.Resource): IAM API client.

    Returns:
        Dict: The executed response of the keys `create` request.
    """
    keys_api = iam_service.projects().serviceAccounts().keys()
    key_request_body = {
        "privateKeyType": "TYPE_GOOGLE_CREDENTIALS_FILE",
        "keyAlgorithm": "KEY_ALG_RSA_2048",
    }
    create_request = keys_api.create(
        name=service_account["name"], body=key_request_body)
    return create_request.execute()
Beispiel #10
0
def get_project_policies(project_id: str,
                         resource_manager_service: discovery.Resource) -> Dict:
    """Fetches the IAM policy of a project.

    The request asks for policy version 3 via `requestedPolicyVersion`.

    Args:
        project_id (str): Project whose policy is requested; passed as the
            `resource` of the request.
        resource_manager_service (discovery.Resource): Resource Manager API
            client.

    Returns:
        Dict: The executed response of the `getIamPolicy` request.
    """
    projects_api = resource_manager_service.projects()
    policy_options = {"options": {"requestedPolicyVersion": 3}}
    policy_request = projects_api.getIamPolicy(
        resource=project_id, body=policy_options)
    return policy_request.execute()
Beispiel #11
0
def create_service_account(project_id: str, name: str, display_name: str,
                           iam_service: discovery.Resource) -> Dict:
    """Creates a service account in a project through the IAM API.

    Args:
        project_id (str): Project in which the account is created.
        name (str): Value sent as the `accountId` of the new account.
        display_name (str): Value sent as the account's `displayName`.
        iam_service (discovery.Resource): IAM API client.

    Returns:
        Dict: The executed response of the serviceAccounts `create` request.
    """
    accounts_api = iam_service.projects().serviceAccounts()
    parent_resource = 'projects/' + project_id
    account_body = {
        'accountId': name,
        'serviceAccount': {'displayName': display_name},
    }
    return accounts_api.create(
        name=parent_resource, body=account_body).execute()
Beispiel #12
0
def delete_model_version_from_aip_if_exists(
    api: discovery.Resource,
    model_version: Text,
    ai_platform_serving_args: Dict[Text, Any],
) -> None:
    """Deletes a model version from Google Cloud AI Platform if it exists.

    Two HTTP errors are tolerated and only logged as warnings:
      - 404: the model version does not exist.
      - 400: the version is the default version and not the only version in
        the model, so it is not deleted.
    Any other HTTP error is re-raised as a RuntimeError chained to the
    original HttpError.

    Args:
      api: Google API client resource.
      model_version: Version of the model being deleted.
      ai_platform_serving_args: Dictionary containing arguments for pushing
        to AI Platform. The keys 'model_name' and 'project_id' are required.
        For the full set of parameters supported, refer to
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

    Raises:
      RuntimeError: if an error is encountered when trying to delete.
    """
    logging.info('Deleting model version %s from AI Platform: %s',
                 model_version, ai_platform_serving_args)
    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    version_name = 'projects/{}/models/{}/versions/{}'.format(
        project_id, model_name, model_version)
    try:
        operation = api.projects().models().versions().delete(
            name=version_name).execute()
        _wait_for_operation(api, operation, 'projects.models.versions.delete')
    except errors.HttpError as e:
        # The branches must be mutually exclusive: with two independent `if`
        # statements a 404 would be logged as ignorable and then still fall
        # through to the final `else` and raise.
        if e.resp.status == 404:
            logging.warning('Model version %s does not exist', version_name)
        elif e.resp.status == 400:
            logging.warning(
                "Model version %s won't be deleted because it is the "
                'default version and not the only version in the model',
                version_name)
        else:
            raise RuntimeError(
                'Deleting model version {} from AI Platform failed: {}'.format(
                    version_name, e)) from e
Beispiel #13
0
def add_service_account_policy(
        project_id: str, service_account: Dict,
        resource_manager_service: discovery.Resource) -> Dict:
    """Grants a service account the storage object admin role on a project.

    The project's current IAM policy is fetched with `get_project_policies`,
    a binding of "roles/storage.objectAdmin" for the service account is
    appended to it, and the modified policy is written back.

    Args:
        project_id (str): Project whose IAM policy is updated.
        service_account (Dict): Service account description; its "email"
            entry identifies the member being granted the role.
        resource_manager_service (discovery.Resource): Resource Manager API
            client.

    Returns:
        Dict: The executed response of the `setIamPolicy` request.
    """
    policy = get_project_policies(project_id, resource_manager_service)
    # A fetched policy may carry no "bindings" entry at all; create the list
    # on demand instead of failing with KeyError.
    policy.setdefault("bindings", []).append({
        "role": "roles/storage.objectAdmin",
        "members": [f'serviceAccount:{service_account["email"]}'],
    })
    return resource_manager_service.projects().setIamPolicy(
        resource=f'{project_id}', body={
            "policy": policy
        }).execute()
Beispiel #14
0
def get_tpu_drivers(tpu_api: discovery.Resource, project_id: str,
                    zone: str) -> Optional[List[str]]:
    """Gets the supported tpu drivers for the given project and zone.

    Args:
      tpu_api: discovery tpu api resource
      project_id: project id
      zone: zone identifier

    Returns:
      List of supported driver versions (possibly empty) on success, None if
      the API call yields no response object.
    """
    location = 'projects/{}/locations/{}'.format(project_id, zone)

    rsp = tpu_api.projects().locations().tensorflowVersions().list(
        parent=location).execute()

    if rsp is None:
        logging.error('error getting tpu drivers')
        return None

    # The key may be absent when no versions are listed for the zone; treat
    # that as an empty result rather than raising KeyError.
    return [d['version'] for d in rsp.get('tensorflowVersions', [])]
Beispiel #15
0
def deploy_model_for_aip_prediction(api: discovery.Resource,
                                    serving_path: Text,
                                    model_version: Text,
                                    ai_platform_serving_args: Dict[Text, Any],
                                    job_labels: Dict[Text, Text],
                                    skip_model_creation: bool = False,
                                    set_default_version: bool = True) -> None:
    """Deploys a model for serving with AI Platform.

    Unless skipped, the model is created first if it does not exist. A new
    model version is then created and waited for. A 409 response (the
    version already exists) is logged as a warning and ignored. Finally the
    version is optionally made the default version of the model.

    Args:
      api: Google API client resource.
      serving_path: The path to the model. Must be a GCS URI.
      model_version: Version of the model being deployed. Must be different
        from what is currently being served.
      ai_platform_serving_args: Dictionary containing arguments for pushing
        to AI Platform. The full set of parameters supported can be found at
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version.
        'model_name' and 'project_id' are required. 'model_name',
        'project_id' and 'regions' are model-level keys and are not sent in
        the version request. Most other keys are forwarded as-is, but the
        following keys are handled specially:
          - name: always overwritten with `model_version`.
          - deployment_uri: always overwritten with `serving_path`.
          - runtime_version: when absent, filled with the value derived from
              the TensorFlow version of the environment.
          - python_version: when absent, filled with the value that
              `_get_caip_python_version` returns for the runtime version.
          - labels: merged with `job_labels`; on key conflicts the value
              from `job_labels` wins.
      job_labels: The dict of labels that will be attached to this job. They
        are merged with optional labels from `ai_platform_serving_args`.
      skip_model_creation: If true, the method assumes the model already
        exists in AI Platform, therefore skipping model creation.
      set_default_version: Whether to set the newly deployed model version
        as the default version.

    Raises:
      RuntimeError: if an error is encountered when trying to push.
    """
    logging.info(
        'Deploying to model with version %s to AI Platform for serving: %s',
        model_version, ai_platform_serving_args)

    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    default_runtime_version = _get_tf_runtime_version(tf.__version__)
    runtime_version = ai_platform_serving_args.get('runtime_version',
                                                   default_runtime_version)
    python_version = _get_caip_python_version(runtime_version)

    if not skip_model_creation:
        create_model_for_aip_prediction_if_not_exist(api, job_labels,
                                                     ai_platform_serving_args)

    # Work on a copy so the caller's dictionary is never modified.
    version_body = dict(ai_platform_serving_args)
    for model_only_key in ('model_name', 'project_id', 'regions'):
        version_body.pop(model_only_key, None)
    version_body['name'] = model_version
    version_body['deployment_uri'] = serving_path
    version_body.setdefault('runtime_version', runtime_version)
    version_body.setdefault('python_version', python_version)
    version_body['labels'] = {**version_body.get('labels', {}), **job_labels}
    logging.info(
        'Creating new version of model_name %s in project %s, request body: %s',
        model_name, project_id, version_body)

    # Push to AIP, and record the operation name so we can poll for its state.
    model_path = 'projects/{}/models/{}'.format(project_id, model_name)
    try:
        operation = api.projects().models().versions().create(
            body=version_body, parent=model_path).execute()
        _wait_for_operation(api, operation, 'projects.models.versions.create')
    except errors.HttpError as e:
        # If the error is to create an already existing model version, it's
        # ok to ignore.
        if e.resp.status == 409:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning('Model version %s already exists', model_version)
        else:
            # Chain the HttpError so the original traceback is preserved.
            raise RuntimeError(
                'Creating model verseion to AI Platform failed: {}'.format(e)
            ) from e

    if set_default_version:
        # Set the new version as default.
        api.projects().models().versions().setDefault(
            name='{}/versions/{}'.format(model_path, model_version)).execute()

    logging.info(
        'Successfully deployed model %s with version %s, serving from %s',
        model_path, model_version, serving_path)
0
    def create_request(cluster_api: discovery.Resource, creds: Credentials,
                       cluster_name: str, project_id: str, zone: str,
                       release_channel: ReleaseChannel,
                       single_zone: bool) -> Optional[HttpRequest]:
        """Builds (without executing) a cluster create request.

        Args:
          cluster_api: cluster api client
          creds: credentials, used to build a compute API client for looking
            up resource limits and the zones of the region
          cluster_name: name of cluster to create
          project_id: project id
          zone: zone in which to create cluster.
            For a single-zone cluster (see below), this zone will contain
            the cluster control plane and all worker nodes. For a multi-zone
            cluster this zone will contain the control plane, but worker
            nodes can be created in any zone in the same region as the
            control plane.
          release_channel: release channel for cluster
          single_zone: create a single-zone cluster if true, multi-zone
            otherwise. A single-zone cluster only creates worker nodes in
            the same zone as the cluster control-plane (specified in the
            'zone' argument above), whereas a multi-zone cluster can create
            worker nodes in every zone in the same region as the cluster
            control plane. A multi-zone cluster can help job response time
            when a given zone becomes overburdened.

        Returns:
          HttpRequest on success. None (after logging an error) if the zone
          cannot be parsed, resource limits cannot be generated, or the
          zones of the region cannot be determined.
        """
        parsed_zone = _parse_zone(zone)
        if parsed_zone is None:
            logging.error('invalid zone specified: {}'.format(zone))
            return None

        region, _ = parsed_zone

        compute_api = discovery.build(
            'compute', 'v1', credentials=creds, cache_discovery=False)

        limits = utils.generate_resource_limits(compute_api, project_id,
                                                region)
        if limits is None:
            logging.error('error generating resource limits')
            return None

        # Single-zone clusters keep workers in the control-plane zone;
        # otherwise every zone of the region is eligible.
        node_zones = ([zone] if single_zone else utils.get_zones_in_region(
            compute_api, project_id, region))
        if node_zones is None:
            logging.error('error getting zones for region {}'.format(region))
            return None

        cluster_spec = _create_cluster_spec(cluster_name, zone, node_zones,
                                            limits, release_channel)
        request_body = _cluster_create_request_body(project_id, zone,
                                                    cluster_spec)

        # see https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/projects.zones.clusters/create
        clusters_api = cluster_api.projects().zones().clusters()
        return clusters_api.create(
            projectId=project_id, zone=zone, body=request_body)
Beispiel #17
0
def deploy_model_for_aip_prediction(api: discovery.Resource,
                                    serving_path: Text,
                                    model_version: Text,
                                    ai_platform_serving_args: Dict[Text, Any],
                                    job_labels: Dict[Text, Text],
                                    skip_model_creation: bool = False,
                                    set_default_version: bool = True) -> None:
    """Deploys a model for serving with AI Platform.

    Unless skipped, the model is created first if it does not exist. A new
    model version is then created and waited for. A 409 response (the
    version already exists) is logged as a warning and ignored. Finally the
    version is optionally made the default version of the model.

    Args:
      api: Google API client resource.
      serving_path: The path to the model. Must be a GCS URI.
      model_version: Version of the model being deployed. Must be different
        from what is currently being served.
      ai_platform_serving_args: Dictionary containing arguments for pushing
        to AI Platform. 'model_name' and 'project_id' are required;
        'runtime_version' is optional and defaults to the value derived from
        the TensorFlow version of the environment. For the full set of
        parameters supported, refer to
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version
      job_labels: The dict of labels that will be attached to this job.
      skip_model_creation: If true, the method assumes the model already
        exists in AI Platform, therefore skipping model creation.
      set_default_version: Whether to set the newly deployed model version
        as the default version.

    Raises:
      RuntimeError: if an error is encountered when trying to push.
    """
    logging.info(
        'Deploying to model with version %s to AI Platform for serving: %s',
        model_version, ai_platform_serving_args)

    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    default_runtime_version = _get_tf_runtime_version(tf.__version__)
    runtime_version = ai_platform_serving_args.get('runtime_version',
                                                   default_runtime_version)
    python_version = _get_caip_python_version(runtime_version)

    if not skip_model_creation:
        create_model_for_aip_prediction_if_not_exist(api, job_labels,
                                                     ai_platform_serving_args)
    body = {
        'name': model_version,
        'deployment_uri': serving_path,
        'runtime_version': runtime_version,
        'python_version': python_version,
        'labels': job_labels,
    }

    # Push to AIP, and record the operation name so we can poll for its state.
    model_path = 'projects/{}/models/{}'.format(project_id, model_name)
    try:
        operation = api.projects().models().versions().create(
            body=body, parent=model_path).execute()
        _wait_for_operation(api, operation, 'projects.models.versions.create')
    except errors.HttpError as e:
        # If the error is to create an already existing model version, it's
        # ok to ignore.
        if e.resp.status == 409:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning('Model version %s already exists', model_version)
        else:
            # Chain the HttpError so the original traceback is preserved.
            raise RuntimeError(
                'Creating model verseion to AI Platform failed: {}'.format(e)
            ) from e

    if set_default_version:
        # Set the new version as default.
        api.projects().models().versions().setDefault(
            name='{}/versions/{}'.format(model_path, model_version)).execute()

    logging.info(
        'Successfully deployed model %s with version %s, serving from %s',
        model_path, model_version, serving_path)