Example 1
 def test_init_succeed_no_argo_node_name(self, mock_k8s_client,
                                         mock_load_config):
     mock_pod = mock_k8s_client().read_namespaced_pod.return_value
     mock_pod.metadata.annotations = {}
     with KfpExecutionContext() as ctx:
         self.assertFalse(ctx.under_kfp_environment())
         pass
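The mock arguments in the tests above and below come from patch decorators that are not shown in these snippets. As a minimal sketch of that setup (the patch targets and the import path of KfpExecutionContext are assumptions, not the project's actual paths), the test class might be wired up like this:

import unittest
from unittest import mock

from kfp_component.core import KfpExecutionContext  # assumed import path

# The bottom-most patch is passed first after self, which explains the
# (mock_k8s_client, mock_load_config) argument order seen in the tests.
@mock.patch('kubernetes.config.load_incluster_config')  # -> mock_load_config
@mock.patch('kubernetes.client.CoreV1Api')              # -> mock_k8s_client
class KfpExecutionContextTest(unittest.TestCase):

    def test_init_succeed_no_argo_node_name(self, mock_k8s_client,
                                             mock_load_config):
        mock_pod = mock_k8s_client().read_namespaced_pod.return_value
        mock_pod.metadata.annotations = {}
        with KfpExecutionContext() as ctx:
            self.assertFalse(ctx.under_kfp_environment())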
Example 2
def wait_job(
    project_id,
    job_id,
    wait_interval=30,
    show_tensorboard=True,
    job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
    job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
    job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
):
    """Waits a MLEngine job.

    Args:
        project_id (str): Required. The ID of the parent project of the job.
        job_id (str): Required. The ID of the job to wait for.
        wait_interval (int): Optional. The wait interval in seconds between
            calls to get the job status. Defaults to 30.
        show_tensorboard (bool): Optional. If True, dump TensorBoard metadata.
        job_object_output_path: Path to output the JSON payload of the job.
        job_id_output_path: Path to output the ID of the job.
        job_dir_output_path: Path to output the `jobDir` of the job.
    """
    ml_client = MLEngineClient()
    with KfpExecutionContext(on_cancel=lambda: cancel_job(ml_client, project_id, job_id)):
        return wait_for_job_done(
            ml_client=ml_client,
            project_id=project_id,
            job_id=job_id,
            wait_interval=wait_interval,
            show_tensorboard=show_tensorboard,
            job_object_output_path=job_object_output_path,
            job_id_output_path=job_id_output_path,
            job_dir_output_path=job_dir_output_path,
        )
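As a usage sketch, the component reduces to a single blocking call; the project and job IDs below are placeholder values, not identifiers from the source:

# Placeholder IDs for illustration only.
job = wait_job(
    project_id='my-project',
    job_id='my-training-job',
    wait_interval=60,  # poll every 60 seconds instead of the default 30
)
# Besides being returned, the job payload, job ID and `jobDir` are written to
# the default paths under /tmp/kfp/output/ml_engine/.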
Example 3
    def test_init_succeed_when_load_pod_fail(self, 
        mock_k8s_client, mock_load_config):
        mock_k8s_client().read_namespaced_pod.side_effect = Exception()

        with KfpExecutionContext() as ctx:
            self.assertFalse(ctx.under_kfp_environment())
            pass
Example 4
 def test_init_succeed(self, mock_k8s_client, mock_load_config):
     mock_pod = mock_k8s_client().read_namespaced_pod.return_value
     mock_pod.metadata.annotations = {
         'workflows.argoproj.io/node-name': 'node-1'
     }
     with KfpExecutionContext() as ctx:
         self.assertTrue(ctx.under_kfp_environment())
         pass
Example 5
def launch_python(python_file_path,
                  project_id,
                  requirements_file_path=None,
                  location=None,
                  job_name_prefix=None,
                  args=[],
                  wait_interval=30):
    """Launch a self-executing beam python file.

    Args:
        python_file_path (str): The GCS or local path to the Python file to run.
        project_id (str): The ID of the parent project.
        requirements_file_path (str): Optional. The GCS or local path to the pip
            requirements file.
        location (str): The regional endpoint to which to direct the 
            request.
        job_name_prefix (str): Optional. The prefix of the generated job
            name. If not provided, the method generates a random name.
        args (list): The list of args to pass to the python file.
        wait_interval (int): The wait seconds between polling.
    Returns:
        The completed job.
    """
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_name = generate_job_name(job_name_prefix, ctx.context_id())
        # We always generate a unique name for the job. A job with the same
        # name may have been created by a previous attempt of the same
        # pipeline run.
        job = get_job_by_name(df_client, project_id, job_name, location)
        if job:
            return wait_and_dump_job(df_client, project_id, location, job,
                                     wait_interval)

        _install_requirements(requirements_file_path)
        python_file_path = stage_file(python_file_path)
        cmd = _prepare_cmd(project_id, location, job_name, python_file_path,
                           args)
        sub_process = Process(cmd)
        for line in sub_process.read_lines():
            job_id = _extract_job_id(line)
            if job_id:
                logging.info('Found job id {}'.format(job_id))
                break
        sub_process.wait_and_check()
        if not job_id:
            logging.warning('No dataflow job was found when '
                            'running the python file.')
            return None
        job = df_client.get_job(project_id, job_id, location=location)
        return wait_and_dump_job(df_client, project_id, location, job,
                                 wait_interval)
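_extract_job_id is not shown in this snippet. A plausible shape for it, assuming the Beam runner prints the Dataflow console URL of the submitted job to its output and that read_lines() yields decoded strings (the exact regex is an assumption, not the component's real pattern):

import re

# Hypothetical helper: pull the job id out of the console URL that the
# DataflowRunner prints while submitting the job.
_JOB_ID_PATTERN = re.compile(
    r'console\.cloud\.google\.com/dataflow.*/jobs/([a-zA-Z0-9_-]+)')

def _extract_job_id(line):
    match = _JOB_ID_PATTERN.search(line)
    return match.group(1) if match else None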
Example 6
def query(query, project_id, dataset_id=None, table_id=None, 
    output_gcs_path=None, dataset_location='US', job_config=None):
    """Submit a query to Bigquery service and dump outputs to a GCS blob.
    
    Args:
        query (str): The query used by Bigquery service to fetch the results.
        project_id (str): The project to execute the query job.
        dataset_id (str): The ID of the persistent dataset to keep the results
            of the query. If the dataset does not exist, the operation will 
            create a new one.
        table_id (str): The ID of the table to keep the results of the query. If
            absent, the operation will generate a random id for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location to create the dataset. Defaults to `US`.
        job_config (dict): The full config spec for the query job.
    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
    job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    job_id = None
    def cancel():
        if job_id:
            client.cancel_job(job_id)
    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path, 
                dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)
        query_result = query_job.result()
        if output_gcs_path:
            job_id = 'extract_' + ctx.context_id()
            extract_job = _get_job(client, job_id)
            logging.info('Extracting data from table {} to {}.'.format(str(table_ref), output_gcs_path))
            if not extract_job:
                extract_job = client.extract_table(table_ref, output_gcs_path)
            extract_job.result()  # Wait for export to finish
        else:
            result_path = KFP_OUTPUT_PATH + 'bigquery/query_output.csv'
            logging.info('Dumping results to {}.'.format(result_path))
            # Download results to local disk if no gcs output path.
            gcp_common.dump_file(result_path, query_result.to_dataframe().to_csv())
        _dump_outputs(query_job, output_gcs_path)
        return query_job.to_api_repr()
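_get_job is not defined in this snippet. Assuming it is a thin wrapper over the BigQuery client, it could look like the sketch below; because the job id is derived from ctx.context_id(), a retried pipeline step finds the job started by a previous attempt instead of re-running the query.

from google.api_core import exceptions


def _get_job(client, job_id):
    """Returns the BigQuery job with the given id, or None if it does not exist."""
    try:
        return client.get_job(job_id)
    except exceptions.NotFound:
        return None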
Example 7
 def execute_and_wait(self):
     with KfpExecutionContext(on_cancel=self._cancel) as ctx:
         self._set_job_id(ctx.context_id())
         self._dump_metadata()
         self._create_job()
         finished_job = self._wait_for_done()
         self._dump_job(finished_job)
         if finished_job['state'] != 'SUCCEEDED':
             raise RuntimeError('Job failed with state {}. Error: {}'.format(
                 finished_job['state'], finished_job.get('errorMessage', '')))
         return finished_job
Example 8
def launch_template(project_id,
                    gcs_path,
                    launch_parameters,
                    location=None,
                    job_name_prefix=None,
                    validate_only=None,
                    wait_interval=30):
    """Launchs a dataflow job from template.

    Args:
        project_id (str): Required. The ID of the Cloud Platform project 
            that the job belongs to.
        gcs_path (str): Required. A Cloud Storage path to the template 
            from which to create the job. Must be valid Cloud 
            Storage URL, beginning with 'gs://'.
        launch_parameters (dict): Parameters to provide to the template 
            being launched. Schema defined in 
            https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters.
            `jobName` will be replaced by generated name.
        location (str): The regional endpoint to which to direct the 
            request.
        job_name_prefix (str): Optional. The prefix of the generated job
            name. If not provided, the method generates a random name.
        validate_only (boolean): If true, the request is validated but 
            not actually executed. Defaults to false.
        wait_interval (int): The wait seconds between polling.
    
    Returns:
        The completed job.
    """
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_name = generate_job_name(job_name_prefix, ctx.context_id())
        logging.info('Generated job name: %s', job_name)
        job = get_job_by_name(df_client, project_id, job_name, location)
        if not job:
            launch_parameters['jobName'] = job_name
            response = df_client.launch_template(project_id, gcs_path,
                                                 location, validate_only,
                                                 launch_parameters)
            job = response.get('job', None)
        if not job:
            # Validate only mode
            return job
        return wait_and_dump_job(df_client, project_id, location, job,
                                 wait_interval)
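generate_job_name is referenced here and in the launch_python example above but not shown. A plausible implementation, given how it is called (the naming rules noted in the comment are an assumption):

def generate_job_name(job_name_prefix, context_id):
    # Combine the optional prefix with the stable context id so retries of the
    # same pipeline step produce the same job name. Dataflow job names are
    # limited to lowercase letters, digits and '-', hence the normalization.
    prefix = job_name_prefix or 'job'
    return '{}-{}'.format(prefix, context_id).lower().replace('_', '-')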
Example 9
 def execute_and_wait(self):
     with KfpExecutionContext(on_cancel=lambda: cancel_job(
             self._ml, self._project_id, self._job_id)) as ctx:
         self._set_job_id(ctx.context_id())
         self._create_job()
         return wait_for_job_done(
             self._ml,
             self._project_id,
             self._job_id,
             self._wait_interval,
             job_object_output_path=self._job_object_output_path,
             job_id_output_path=self._job_id_output_path,
             job_dir_output_path=self._job_dir_output_path,
         )
Example 10
def _create_cluster_internal(project_id, region, cluster, name_prefix, 
    wait_interval):
    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
        on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        _set_cluster_name(cluster, ctx.context_id(), name_prefix)
        _dump_metadata(cluster, region)
        operation = client.create_cluster(project_id, region, cluster, 
            request_id=ctx.context_id())
        operation_name = operation.get('name')
        operation = client.wait_for_operation_done(operation_name, 
            wait_interval)
        return _dump_cluster(operation.get('response'))
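_set_cluster_name is not shown. Given that create_cluster (Example 23 below) assigns clusterName explicitly when a name is provided, the helper presumably only fills in a default derived from the context id; a hedged sketch:

def _set_cluster_name(cluster, context_id, name_prefix):
    if cluster.get('clusterName'):
        return  # the caller already chose a name
    prefix = name_prefix or 'cluster'
    # Deriving the name from the stable context id keeps the create request
    # idempotent across retries of the same pipeline step.
    cluster['clusterName'] = '{}-{}'.format(prefix, context_id)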
Example 11
def submit_job(
    project_id,
    region,
    cluster_name,
    job,
    wait_interval=30,
    job_id_output_path='/tmp/kfp/output/dataproc/job_id.txt',
    job_object_output_path='/tmp/kfp/output/dataproc/job.json',
):
    """Submits a Cloud Dataproc job.
    
    Args:
        project_id (str): Required. The ID of the Google Cloud Platform project 
            that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle the 
            request.
        cluster_name (str): Required. The cluster to run the job.
        job (dict): Optional. The full payload of a [Dataproc job](
            https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
        wait_interval (int): The wait seconds between polling the operation. 
            Defaults to 30s.
        job_id_output_path (str): Path for the ID of the created job
        job_object_output_path (str): Path for the created job object

    Returns:
        The created job payload.
    """
    if 'reference' not in job:
        job['reference'] = {}
    job['reference']['projectId'] = project_id
    if 'placement' not in job:
        job['placement'] = {}
    job['placement']['clusterName'] = cluster_name
    client = DataprocClient()
    job_id = None
    with KfpExecutionContext(on_cancel=lambda: client.cancel_job(
            project_id, region, job_id)) as ctx:
        submitted_job = client.submit_job(project_id,
                                          region,
                                          job,
                                          request_id=ctx.context_id())
        job_id = submitted_job['reference']['jobId']
        _dump_metadata(submitted_job, region)
        submitted_job = _wait_for_job_done(client, project_id, region, job_id,
                                           wait_interval)
        gcp_common.dump_file(job_object_output_path, json.dumps(submitted_job))
        gcp_common.dump_file(job_id_output_path,
                             submitted_job.get('reference').get('jobId'))
        return submitted_job
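_wait_for_job_done is not shown. Below is a sketch of the polling loop it implies; DataprocClient.get_job and the exact terminal state strings are assumptions based on the Dataproc jobs API:

import logging
import time


def _wait_for_job_done(client, project_id, region, job_id, wait_interval):
    while True:
        job = client.get_job(project_id, region, job_id)  # assumed method
        state = job.get('status', {}).get('state')
        if state == 'DONE':
            return job
        if state in ('ERROR', 'CANCELLED'):
            raise RuntimeError('Job {} ended in state {}.'.format(job_id, state))
        logging.info('Job %s is in state %s; waiting %ss.', job_id, state,
                     wait_interval)
        time.sleep(wait_interval)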
Example 12
    def test_context_id_stable_across_retries(self, 
        mock_k8s_client, mock_load_config):
        mock_pod = mock_k8s_client().read_namespaced_pod.return_value
        mock_pod.metadata.annotations = {
            'workflows.argoproj.io/node-name': 'node-1'
        }
        ctx1 = KfpExecutionContext()
        ctx2 = KfpExecutionContext()

        self.assertEqual(ctx1.context_id(), ctx2.context_id())
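The assertion above implies that the context id is derived deterministically from the Argo node-name annotation rather than generated at random. The snippet below only illustrates that idea; the actual derivation inside KfpExecutionContext may differ.

import uuid

# Illustration only: one way to map an Argo node name to a stable id.
def stable_context_id(argo_node_name):
    return uuid.uuid5(uuid.NAMESPACE_OID, argo_node_name).hex

assert stable_context_id('node-1') == stable_context_id('node-1')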
Example 13
def wait_job(project_id, job_id, wait_interval=30):
    """Waits a MLEngine job.

    Args:
        project_id (str): Required. The ID of the parent project of the job.
        job_id (str): Required. The ID of the job to wait for.
        wait_interval (int): Optional. The wait interval in seconds between
            calls to get the job status. Defaults to 30.

    Outputs:
        /tmp/kfp/output/ml_engine/job.json: The JSON payload of the job.
        /tmp/kfp/output/ml_engine/job_id.txt: The ID of the job.
        /tmp/kfp/output/ml_engine/job_dir.txt: The `jobDir` of the job.
    """
    ml_client = MLEngineClient()
    with KfpExecutionContext(
            on_cancel=lambda: cancel_job(ml_client, project_id, job_id)):
        return wait_for_job_done(ml_client, project_id, job_id, wait_interval)
Example 14
    def execute_and_wait(self):
        with KfpExecutionContext(on_cancel=self._cancel) as ctx:
            self._set_version_name(ctx.context_id())
            self._dump_metadata()
            existing_version = wait_existing_version(self._ml, 
                self._version_name, 
                self._wait_interval)
            if existing_version and self._is_dup_version(existing_version):
                return self._handle_completed_version(existing_version)

            if existing_version and self._replace_existing:
                logging.info('Deleting existing version...')
                self._delete_version_and_wait()
            elif existing_version:
                raise RuntimeError(
                    'Existing version conflicts with the name of the new version.')
            
            created_version = self._create_version_and_wait()
            return self._handle_completed_version(created_version)
Example 15
    def execute_and_wait(self):
        with KfpExecutionContext(on_cancel=self._cancel):
            existing_version = wait_existing_version(self._ml,
                                                     self._version_name,
                                                     self._wait_interval)
            if not existing_version:
                logging.info('The version has already been deleted.')
                return None

            logging.info('Deleting existing version...')
            operation = self._ml.delete_version(self._version_name)
            # Cache operation name for cancellation.
            self._delete_operation_name = operation.get('name')
            try:
                wait_for_operation_done(self._ml, self._delete_operation_name,
                                        'delete version', self._wait_interval)
            finally:
                self._delete_operation_name = None
            return None
Example 16
 def execute(self):
     with KfpExecutionContext() as ctx:
         self._set_model_name(ctx.context_id())
         self._dump_metadata()
         try:
             created_model = self._ml.create_model(
                 project_id=self._project_id, model=self._model)
         except errors.HttpError as e:
             if e.resp.status == 409:
                 existing_model = self._ml.get_model(self._model_name)
                 if not self._is_dup_model(existing_model):
                     raise
                 logging.info('The same model {} has been submitted'
                              ' before. Continue the operation.'.format(
                                  self._model_name))
                 created_model = existing_model
             else:
                 raise
         self._dump_model(created_model)
         return created_model
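_is_dup_model is not shown. The 409 branch only makes sense if the existing model matches what this step tried to create, so a hedged sketch of the check (the field-by-field comparison is an assumption):

    def _is_dup_model(self, existing_model):
        # Treat the conflict as benign only if every field we requested matches
        # the existing model; server-populated fields are ignored.
        return all(existing_model.get(key) == value
                   for key, value in self._model.items())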
Example 17
def submit_job(project_id, region, cluster_name, job, wait_interval=30):
    """Submits a Cloud Dataproc job.
    
    Args:
        project_id (str): Required. The ID of the Google Cloud Platform project 
            that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle the 
            request.
        cluster_name (str): Required. The cluster to run the job.
        job (dict): Optional. The full payload of a [Dataproc job](
            https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
        wait_interval (int): The wait seconds between polling the operation. 
            Defaults to 30s.

    Returns:
        The created job payload.

    Output Files:
        $KFP_OUTPUT_PATH/dataproc/job_id.txt: The ID of the created job.
    """
    if 'reference' not in job:
        job['reference'] = {}
    job['reference']['projectId'] = project_id
    if 'placement' not in job:
        job['placement'] = {}
    job['placement']['clusterName'] = cluster_name
    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        submitted_job = client.submit_job(project_id,
                                          region,
                                          job,
                                          request_id=ctx.context_id())
        _dump_metadata(submitted_job, region)
        submitted_job = _wait_for_job_done(client, project_id, region,
                                           submitted_job['reference']['jobId'],
                                           wait_interval)
        return _dump_job(submitted_job)
Example 18
def delete_cluster(project_id, region, name, wait_interval=30):
    """Deletes a DataProc cluster.
    
    Args:
        project_id (str): Required. The ID of the Google Cloud Platform project 
            that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle the 
            request.
        name (str): Required. The cluster name to delete.
        wait_interval (int): The wait seconds between polling the operation. 
            Defaults to 30s.

    """
    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        operation = client.delete_cluster(project_id,
                                          region,
                                          name,
                                          request_id=ctx.context_id())
        operation_name = operation.get('name')
        return client.wait_for_operation_done(operation_name, wait_interval)
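wait_for_operation_done and cancel_operation belong to the DataprocClient wrapper, which is not shown. Below is a sketch of the waiting half, following the long-running-operations convention ('done', 'error', 'response'); the get_operation method name is hypothetical:

import time


class DataprocClient(object):
    # ... other methods omitted ...

    def wait_for_operation_done(self, operation_name, wait_interval):
        while True:
            operation = self.get_operation(operation_name)  # hypothetical method
            if operation.get('done'):
                if 'error' in operation:
                    raise RuntimeError(operation['error'].get(
                        'message', 'Operation failed.'))
                return operation
            time.sleep(wait_interval)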
Example 19
def launch_flex_template(
    project_id,
    location,
    launch_parameters,
    validate_only=False,
    staging_dir=None,
    wait_interval=30,
    job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
    job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
    """Launches a dataflow job from a flex template.

    Args:
        project_id (str): Required. The ID of the Cloud Platform project that the job belongs to.
        location (str): The regional endpoint to which to direct the request.
        launch_parameters (dict): Parameters to provide to the template
            being launched. Schema defined in
            https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#LaunchFlexTemplateParameter.
            `jobName` will be replaced by generated name.
        validate_only (boolean): If true, the request is validated but
            not actually executed. Defaults to false.
        staging_dir (str): Optional. The GCS directory for keeping staging files.
            A random subdirectory will be created under the directory to keep job info
            for resuming the job in case of failure.
        wait_interval (int): The wait seconds between polling.
        job_id_output_path (str): Optional. Output file to save job_id of execution
        job_object_output_path (str): Optional. Output file to save job details of execution

    Returns:
        The completed job.
    """
    storage_client = storage.Client()
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        staging_location = get_staging_location(staging_dir, ctx.context_id())
        job_id, _ = read_job_id_and_location(storage_client, staging_location)
        # Continue waiting for the job if its info has been uploaded to the staging location.
        if job_id:
            job = df_client.get_job(project_id, job_id, location)
            job = wait_and_dump_job(
                df_client,
                project_id,
                location,
                job,
                wait_interval,
                job_id_output_path=job_id_output_path,
                job_object_output_path=job_object_output_path,
            )
            logging.info(f'Skipping, existing job: {job}')
            return job

        if launch_parameters is None:
            launch_parameters = {}

        request_body = {
            'launchParameter': launch_parameters,
            'validateOnly': validate_only
        }

        request_body['launchParameter']['jobName'] = 'job-' + ctx.context_id()

        response = df_client.launch_flex_template(
            project_id, request_body, location
        )

        job = response.get('job', None)
        if not job:
            # Validate only mode
            return job

        job_id = job.get('id')
        upload_job_id_and_location(
            storage_client, staging_location, job_id, location
        )
        job = wait_and_dump_job(
            df_client,
            project_id,
            location,
            job,
            wait_interval,
            job_id_output_path=job_id_output_path,
            job_object_output_path=job_object_output_path,
        )
        logging.info(f'Completed job: {job}')
        return job
Example 20
def launch_template(project_id,
                    gcs_path,
                    launch_parameters,
                    location=None,
                    validate_only=None,
                    staging_dir=None,
                    wait_interval=30):
    """Launchs a dataflow job from template.

    Args:
        project_id (str): Required. The ID of the Cloud Platform project 
            that the job belongs to.
        gcs_path (str): Required. A Cloud Storage path to the template 
            from which to create the job. Must be valid Cloud 
            Storage URL, beginning with 'gs://'.
        launch_parameters (dict): Parameters to provide to the template 
            being launched. Schema defined in 
            https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters.
            `jobName` will be replaced by generated name.
        location (str): The regional endpoint to which to direct the 
            request.
        validate_only (boolean): If true, the request is validated but 
            not actually executed. Defaults to false.
        staging_dir (str): Optional. The GCS directory for keeping staging files. 
            A random subdirectory will be created under the directory to keep job info
            for resuming the job in case of failure.
        wait_interval (int): The wait seconds between polling.
    
    Returns:
        The completed job.
    """
    storage_client = storage.Client()
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        staging_location = get_staging_location(staging_dir, ctx.context_id())
        job_id, _ = read_job_id_and_location(storage_client, staging_location)
        # Continue waiting for the job if its info has been uploaded to the staging location.
        if job_id:
            job = df_client.get_job(project_id, job_id, location)
            return wait_and_dump_job(df_client, project_id, location, job,
                                     wait_interval)

        if not launch_parameters:
            launch_parameters = {}
        launch_parameters['jobName'] = 'job-' + ctx.context_id()
        response = df_client.launch_template(project_id, gcs_path, location,
                                             validate_only, launch_parameters)
        job = response.get('job', None)
        if not job:
            # Validate only mode
            return job
        job_id = job.get('id')
        upload_job_id_and_location(storage_client, staging_location, job_id,
                                   location)
        return wait_and_dump_job(df_client, project_id, location, job,
                                 wait_interval)
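read_job_id_and_location and upload_job_id_and_location persist the submitted job's identity under the staging location so a retried step can resume waiting instead of launching a duplicate. The sketch below assumes the staging location is a gs:// URI, invents both the blob name and the 'job_id,location' encoding, and relies on Blob.from_string and download_as_text from recent google-cloud-storage releases; the component's real format may differ.

from google.cloud import storage

_STAGING_BLOB_NAME = 'kfp/dataflow/launch_job.txt'  # hypothetical blob name


def _staging_blob(storage_client, staging_location):
    uri = staging_location.rstrip('/') + '/' + _STAGING_BLOB_NAME
    return storage.Blob.from_string(uri, client=storage_client)


def upload_job_id_and_location(storage_client, staging_location, job_id, location):
    blob = _staging_blob(storage_client, staging_location)
    blob.upload_from_string('{},{}'.format(job_id, location or ''))


def read_job_id_and_location(storage_client, staging_location):
    blob = _staging_blob(storage_client, staging_location)
    if not blob.exists():
        return None, None
    job_id, _, location = blob.download_as_text().partition(',')
    return job_id or None, location or None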
Example 21
def query(
    query,
    project_id,
    dataset_id=None,
    table_id=None,
    output_gcs_path=None,
    dataset_location='US',
    job_config=None,
    output_path=None,
    output_filename=None,
    output_destination_format="CSV",
    job_object_output_path='/tmp/kfp/output/bigquery/query-job.json',
    output_gcs_path_output_path='/tmp/kfp/output/bigquery/query-output-path.txt',
    output_dataset_id_output_path='/tmp/kfp/output/bigquery/query-dataset-id.txt',
    output_table_id_output_path='/tmp/kfp/output/bigquery/query-table-id.txt',
):
    """Submit a query to Bigquery service and dump outputs to Bigquery table or 
    a GCS blob.
    
    Args:
        query (str): The query used by Bigquery service to fetch the results.
        project_id (str): The project to execute the query job.
        dataset_id (str): The ID of the persistent dataset to keep the results
            of the query. If the dataset does not exist, the operation will 
            create a new one.
        table_id (str): The ID of the table to keep the results of the query. If
            absent, the operation will generate a random id for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location to create the dataset. Defaults to `US`.
        job_config (dict): The full config spec for the query job.
        output_path (str): The local directory where the query result will be stored.
        output_filename (str): The name of the file in which the results will be stored.
        output_destination_format (str): The output destination format.
            Defaults to CSV; NEWLINE_DELIMITED_JSON and AVRO are also supported.
    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    else:
        job_config = bigquery.QueryJobConfig.from_api_repr(job_config)
    job_id = None

    def cancel():
        if job_id:
            client.cancel_job(job_id)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id,
                                               output_gcs_path,
                                               dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
                gcp_common.dump_file(output_dataset_id_output_path,
                                     table_ref.dataset_id)
                gcp_common.dump_file(output_table_id_output_path,
                                     table_ref.table_id)
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)
        if output_path is not None:  # Write results to a local file
            result = query_job.result()
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            df = result.to_dataframe()
            df.to_csv(os.path.join(output_path, output_filename))
        else:
            query_job.result()
            if output_gcs_path:
                job_id = 'extract_' + ctx.context_id()
                extract_job = _get_job(client, job_id)
                logging.info('Extracting data from table {} to {}.'.format(
                    str(table_ref), output_gcs_path))
                if not extract_job:
                    job_config = ExtractJobConfig(
                        destination_format=output_destination_format)
                    extract_job = client.extract_table(table_ref,
                                                       output_gcs_path,
                                                       job_config=job_config)
                extract_job.result()  # Wait for export to finish
            # TODO: Replace '-' with empty string when most users upgrade to Argo version which has the fix: https://github.com/argoproj/argo/pull/1653
            gcp_common.dump_file(output_gcs_path_output_path, output_gcs_path
                                 or '-')

        gcp_common.dump_file(job_object_output_path,
                             json.dumps(query_job.to_api_repr()))
        return query_job.to_api_repr()
Example 22
 def test_init_succeed_without_pod_name(self, 
     mock_k8s_client, mock_load_config):
     with KfpExecutionContext() as ctx:
         self.assertFalse(ctx.under_kfp_environment())
         pass
Example 23
def create_cluster(
    project_id,
    region,
    name=None,
    name_prefix=None,
    initialization_actions=None,
    config_bucket=None,
    image_version=None,
    cluster=None,
    wait_interval=30,
    cluster_name_output_path='/tmp/kfp/output/dataproc/cluster_name.txt',
    cluster_object_output_path='/tmp/kfp/output/dataproc/cluster.json',
):
    """Creates a DataProc cluster under a project.

    Args:
        project_id (str): Required. The ID of the Google Cloud Platform project 
            that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle the 
            request.
        name (str): Optional. The cluster name. Cluster names within a project
            must be unique. Names of deleted clusters can be reused.
        name_prefix (str): Optional. The prefix of the cluster name.
        initialization_actions (list): Optional. List of GCS URIs of executables 
            to execute on each node after config is completed. By default,
            executables are run on master and all worker nodes. 
        config_bucket (str): Optional. A Google Cloud Storage bucket used to 
            stage job dependencies, config files, and job driver console output.
        image_version (str): Optional. The version of software inside the cluster.
        cluster (dict): Optional. The full cluster config. See [full details](
            https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster)
        wait_interval (int): The wait seconds between polling the operation. 
            Defaults to 30s.

    Returns:
        The created cluster object.

    Output Files:
        $KFP_OUTPUT_PATH/dataproc/cluster_name.txt: The cluster name of the 
            created cluster.
    """
    if not cluster:
        cluster = {}
    cluster['projectId'] = project_id
    if 'config' not in cluster:
        cluster['config'] = {}
    if name:
        cluster['clusterName'] = name
    if initialization_actions:
        cluster['config']['initializationActions'] = list(
            map(lambda file: {'executableFile': file}, initialization_actions))
    if config_bucket:
        cluster['config']['configBucket'] = config_bucket
    if image_version:
        if 'softwareConfig' not in cluster['config']:
            cluster['config']['softwareConfig'] = {}
        cluster['config']['softwareConfig']['imageVersion'] = image_version

    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        _set_cluster_name(cluster, ctx.context_id(), name_prefix)
        _dump_metadata(cluster, region)
        operation = client.create_cluster(project_id,
                                          region,
                                          cluster,
                                          request_id=ctx.context_id())
        operation_name = operation.get('name')
        operation = client.wait_for_operation_done(operation_name,
                                                   wait_interval)
        cluster = operation.get('response')
        gcp_common.dump_file(cluster_object_output_path, json.dumps(cluster))
        gcp_common.dump_file(cluster_name_output_path,
                             cluster.get('clusterName'))
        return cluster
Example 24
def launch_python(
    python_file_path,
    project_id,
    staging_dir=None,
    requirements_file_path=None,
    args=[],
    wait_interval=30,
    job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
    job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
    """Launch a self-executing beam python file.

    Args:
        python_file_path (str): The GCS or local path to the Python file to run.
        project_id (str): The ID of the parent project.
        staging_dir (str): Optional. The GCS directory for keeping staging files. 
            A random subdirectory will be created under the directory to keep job info
            for resuming the job in case of failure and it will be passed as 
            `staging_location` and `temp_location` command line args of the beam code.
        requirements_file_path (str): Optional. The GCS or local path to the pip
            requirements file.
        args (list): The list of args to pass to the python file.
        wait_interval (int): The wait seconds between polling.
    Returns:
        The completed job.
    """
    storage_client = storage.Client()
    df_client = DataflowClient()
    job_id = None
    location = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        staging_location = get_staging_location(staging_dir, ctx.context_id())
        job_id, location = read_job_id_and_location(storage_client,
                                                    staging_location)
        # Continue waiting for the job if its info has been uploaded to the staging location.
        if job_id:
            job = df_client.get_job(project_id, job_id, location)
            return wait_and_dump_job(
                df_client,
                project_id,
                location,
                job,
                wait_interval,
                job_id_output_path=job_id_output_path,
                job_object_output_path=job_object_output_path,
            )

        _install_requirements(requirements_file_path)
        python_file_path = stage_file(python_file_path)
        cmd = _prepare_cmd(project_id, python_file_path, args,
                           staging_location)
        sub_process = Process(cmd)
        for line in sub_process.read_lines():
            job_id, location = _extract_job_id_and_location(line)
            if job_id:
                logging.info('Found job id {} and location {}.'.format(
                    job_id, location))
                upload_job_id_and_location(storage_client, staging_location,
                                           job_id, location)
                break
        sub_process.wait_and_check()
        if not job_id:
            logging.warning('No dataflow job was found when '
                            'running the python file.')
            return None
        job = df_client.get_job(project_id, job_id, location=location)
        return wait_and_dump_job(
            df_client,
            project_id,
            location,
            job,
            wait_interval,
            job_id_output_path=job_id_output_path,
            job_object_output_path=job_object_output_path,
        )
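_prepare_cmd builds the command line that runs the Beam file. A plausible sketch using standard Apache Beam pipeline options; the exact flags the component passes (and the unbuffered -u switch that lets the job id be read from the output stream) are assumptions:

def _prepare_cmd(project_id, python_file_path, args, staging_location):
    dataflow_args = [
        '--runner', 'DataflowRunner',
        '--project', project_id,
    ]
    if staging_location:
        dataflow_args += [
            '--staging_location', staging_location,
            '--temp_location', staging_location,
        ]
    # -u keeps Python's stdout unbuffered so the console URL with the job id
    # shows up in read_lines() as soon as the runner prints it.
    return ['python', '-u', python_file_path] + dataflow_args + list(args)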