def test_init_succeed_no_argo_node_name(self, mock_k8s_client,
                                        mock_load_config):
    mock_pod = mock_k8s_client().read_namespaced_pod.return_value
    mock_pod.metadata.annotations = {}

    with KfpExecutionContext() as ctx:
        self.assertFalse(ctx.under_kfp_environment())
def wait_job(
        project_id,
        job_id,
        wait_interval=30,
        show_tensorboard=True,
        job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
        job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
        job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
):
    """Waits for an MLEngine job to finish.

    Args:
        project_id (str): Required. The ID of the parent project of the job.
        job_id (str): Required. The ID of the job to wait for.
        wait_interval (int): Optional. The wait interval in seconds between
            calls to get the job status. Defaults to 30.
        show_tensorboard (bool): Optional. True to dump TensorBoard metadata.
        job_object_output_path: Path for the JSON payload of the waited job.
        job_id_output_path: Path for the ID of the waited job.
        job_dir_output_path: Path for the `jobDir` of the waited job.
    """
    ml_client = MLEngineClient()
    with KfpExecutionContext(
            on_cancel=lambda: cancel_job(ml_client, project_id, job_id)):
        return wait_for_job_done(
            ml_client=ml_client,
            project_id=project_id,
            job_id=job_id,
            wait_interval=wait_interval,
            show_tensorboard=show_tensorboard,
            job_object_output_path=job_object_output_path,
            job_id_output_path=job_id_output_path,
            job_dir_output_path=job_dir_output_path,
        )
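# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as wait_job above; the project ID and
# job ID below are placeholder values. Waits for an existing MLEngine job and
# writes its payload to the default output paths under /tmp/kfp/output/ml_engine/.
def _example_wait_job():
    return wait_job(
        project_id='my-gcp-project',   # hypothetical project
        job_id='my_mlengine_job_001',  # hypothetical job ID
        wait_interval=60,              # poll once per minute
        show_tensorboard=False,
    )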
def test_init_succeed_when_load_pod_fail(self, mock_k8s_client,
                                         mock_load_config):
    mock_k8s_client().read_namespaced_pod.side_effect = Exception()

    with KfpExecutionContext() as ctx:
        self.assertFalse(ctx.under_kfp_environment())
def test_init_succeed(self, mock_k8s_client, mock_load_config):
    mock_pod = mock_k8s_client().read_namespaced_pod.return_value
    mock_pod.metadata.annotations = {
        'workflows.argoproj.io/node-name': 'node-1'
    }

    with KfpExecutionContext() as ctx:
        self.assertTrue(ctx.under_kfp_environment())
def launch_python(python_file_path, project_id, requirements_file_path=None,
                  location=None, job_name_prefix=None, args=[],
                  wait_interval=30):
    """Launches a self-executing Beam Python file.

    Args:
        python_file_path (str): The GCS or local path to the Python file to
            run.
        project_id (str): The ID of the parent project.
        requirements_file_path (str): Optional. The GCS or local path to the
            pip requirements file.
        location (str): The regional endpoint to which to direct the request.
        job_name_prefix (str): Optional. The prefix of the generated job name.
            If not provided, the method generates a random name.
        args (list): The list of args to pass to the Python file.
        wait_interval (int): The wait seconds between polling.

    Returns:
        The completed job.
    """
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_name = generate_job_name(job_name_prefix, ctx.context_id())
        # We always generate a unique name for the job. A job with the same
        # name can only have been created by a previous attempt of the same
        # pipeline run.
        job = get_job_by_name(df_client, project_id, job_name, location)
        if job:
            return wait_and_dump_job(df_client, project_id, location, job,
                                     wait_interval)

        _install_requirements(requirements_file_path)
        python_file_path = stage_file(python_file_path)
        cmd = _prepare_cmd(project_id, location, job_name, python_file_path,
                           args)
        sub_process = Process(cmd)
        for line in sub_process.read_lines():
            job_id = _extract_job_id(line)
            if job_id:
                logging.info('Found job id {}'.format(job_id))
                break
        sub_process.wait_and_check()
        if not job_id:
            logging.warning('No dataflow job was found when '
                            'running the python file.')
            return None
        job = df_client.get_job(project_id, job_id, location=location)
        return wait_and_dump_job(df_client, project_id, location, job,
                                 wait_interval)
def query(query, project_id, dataset_id=None, table_id=None,
          output_gcs_path=None, dataset_location='US', job_config=None):
    """Submits a query to the BigQuery service and dumps the results to a GCS
    blob.

    Args:
        query (str): The query used by the BigQuery service to fetch the
            results.
        project_id (str): The project to execute the query job.
        dataset_id (str): The ID of the persistent dataset to keep the results
            of the query. If the dataset does not exist, the operation will
            create a new one.
        table_id (str): The ID of the table to keep the results of the query.
            If absent, the operation will generate a random ID for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location to create the dataset. Defaults
            to `US`.
        job_config (dict): The full config spec for the query job.

    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    job_id = None

    def cancel():
        if job_id:
            client.cancel_job(job_id)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id,
                                               output_gcs_path,
                                               dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)
        query_result = query_job.result()

        if output_gcs_path:
            job_id = 'extract_' + ctx.context_id()
            extract_job = _get_job(client, job_id)
            logging.info('Extracting data from table {} to {}.'.format(
                str(table_ref), output_gcs_path))
            if not extract_job:
                extract_job = client.extract_table(table_ref, output_gcs_path)
            extract_job.result()  # Wait for export to finish
        else:
            result_path = KFP_OUTPUT_PATH + 'bigquery/query_output.csv'
            logging.info('Dumping results to {}.'.format(result_path))
            # Download results to local disk if no GCS output path is given.
            gcp_common.dump_file(result_path,
                                 query_result.to_dataframe().to_csv())
        _dump_outputs(query_job, output_gcs_path)
        return query_job.to_api_repr()
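# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as query above; the project, dataset and
# GCS path are placeholder values. The results land in a persistent BigQuery
# table and are also extracted to the given GCS blob.
def _example_query():
    return query(
        query='SELECT name, SUM(number) AS total FROM '
              '`bigquery-public-data.usa_names.usa_1910_2013` '
              'GROUP BY name ORDER BY total DESC LIMIT 10',
        project_id='my-gcp-project',                      # hypothetical project
        dataset_id='my_dataset',                          # hypothetical dataset
        output_gcs_path='gs://my-bucket/bq/results.csv',  # hypothetical blob
    )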
def execute_and_wait(self):
    with KfpExecutionContext(on_cancel=self._cancel) as ctx:
        self._set_job_id(ctx.context_id())
        self._dump_metadata()
        self._create_job()
        finished_job = self._wait_for_done()
        self._dump_job(finished_job)
        if finished_job['state'] != 'SUCCEEDED':
            raise RuntimeError('Job failed with state {}. Error: {}'.format(
                finished_job['state'],
                finished_job.get('errorMessage', '')))
        return finished_job
def launch_template(project_id, gcs_path, launch_parameters, location=None,
                    job_name_prefix=None, validate_only=None,
                    wait_interval=30):
    """Launches a Dataflow job from a template.

    Args:
        project_id (str): Required. The ID of the Cloud Platform project that
            the job belongs to.
        gcs_path (str): Required. A Cloud Storage path to the template from
            which to create the job. Must be a valid Cloud Storage URL
            beginning with 'gs://'.
        launch_parameters (dict): Parameters to provide to the template being
            launched. Schema defined in
            https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters.
            `jobName` will be replaced by the generated name.
        location (str): The regional endpoint to which to direct the request.
        job_name_prefix (str): Optional. The prefix of the generated job name.
            If not provided, the method generates a random name.
        validate_only (boolean): If true, the request is validated but not
            actually executed. Defaults to false.
        wait_interval (int): The wait seconds between polling.

    Returns:
        The completed job.
    """
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_name = generate_job_name(job_name_prefix, ctx.context_id())
        print(job_name)
        job = get_job_by_name(df_client, project_id, job_name, location)
        if not job:
            launch_parameters['jobName'] = job_name
            response = df_client.launch_template(project_id, gcs_path,
                                                 location, validate_only,
                                                 launch_parameters)
            job = response.get('job', None)
        if not job:
            # Validate-only mode
            return job
        return wait_and_dump_job(df_client, project_id, location, job,
                                 wait_interval)
def execute_and_wait(self):
    with KfpExecutionContext(on_cancel=lambda: cancel_job(
            self._ml, self._project_id, self._job_id)) as ctx:
        self._set_job_id(ctx.context_id())
        self._create_job()

        return wait_for_job_done(
            self._ml,
            self._project_id,
            self._job_id,
            self._wait_interval,
            job_object_output_path=self._job_object_output_path,
            job_id_output_path=self._job_id_output_path,
            job_dir_output_path=self._job_dir_output_path,
        )
def _create_cluster_internal(project_id, region, cluster, name_prefix,
                             wait_interval):
    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        _set_cluster_name(cluster, ctx.context_id(), name_prefix)
        _dump_metadata(cluster, region)
        operation = client.create_cluster(project_id, region, cluster,
                                          request_id=ctx.context_id())
        operation_name = operation.get('name')
        operation = client.wait_for_operation_done(operation_name,
                                                   wait_interval)
        return _dump_cluster(operation.get('response'))
def submit_job(
        project_id,
        region,
        cluster_name,
        job,
        wait_interval=30,
        job_id_output_path='/tmp/kfp/output/dataproc/job_id.txt',
        job_object_output_path='/tmp/kfp/output/dataproc/job.json',
):
    """Submits a Cloud Dataproc job.

    Args:
        project_id (str): Required. The ID of the Google Cloud Platform
            project that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle
            the request.
        cluster_name (str): Required. The cluster to run the job on.
        job (dict): Optional. The full payload of a [Dataproc job](
            https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
        wait_interval (int): The wait seconds between polling the operation.
            Defaults to 30.
        job_id_output_path (str): Path for the ID of the created job.
        job_object_output_path (str): Path for the created job object.

    Returns:
        The created job payload.
    """
    if 'reference' not in job:
        job['reference'] = {}
    job['reference']['projectId'] = project_id
    if 'placement' not in job:
        job['placement'] = {}
    job['placement']['clusterName'] = cluster_name
    client = DataprocClient()
    job_id = None
    with KfpExecutionContext(on_cancel=lambda: client.cancel_job(
            project_id, region, job_id)) as ctx:
        submitted_job = client.submit_job(project_id, region, job,
                                          request_id=ctx.context_id())
        job_id = submitted_job['reference']['jobId']
        _dump_metadata(submitted_job, region)
        submitted_job = _wait_for_job_done(client, project_id, region, job_id,
                                           wait_interval)

        gcp_common.dump_file(job_object_output_path,
                             json.dumps(submitted_job))
        gcp_common.dump_file(job_id_output_path,
                             submitted_job.get('reference').get('jobId'))
        return submitted_job
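# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as submit_job above; the project, region,
# cluster and PySpark file are placeholder values. The job payload follows the
# Dataproc job schema referenced in the docstring.
def _example_submit_pyspark_job():
    return submit_job(
        project_id='my-gcp-project',   # hypothetical project
        region='us-central1',          # hypothetical region
        cluster_name='my-cluster',     # hypothetical cluster
        job={
            'pysparkJob': {
                'mainPythonFileUri': 'gs://my-bucket/jobs/analyze.py',
                'args': ['--date', '2020-01-01'],
            }
        },
        wait_interval=30,
    )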
def test_context_id_stable_across_retries(self, mock_k8s_client,
                                          mock_load_config):
    mock_pod = mock_k8s_client().read_namespaced_pod.return_value
    mock_pod.metadata.annotations = {
        'workflows.argoproj.io/node-name': 'node-1'
    }

    ctx1 = KfpExecutionContext()
    ctx2 = KfpExecutionContext()

    self.assertEqual(ctx1.context_id(), ctx2.context_id())
def wait_job(project_id, job_id, wait_interval=30):
    """Waits for an MLEngine job to finish.

    Args:
        project_id (str): Required. The ID of the parent project of the job.
        job_id (str): Required. The ID of the job to wait for.
        wait_interval (int): Optional. The wait interval in seconds between
            calls to get the job status. Defaults to 30.

    Outputs:
        /tmp/kfp/output/ml_engine/job.json: The JSON payload of the waited job.
        /tmp/kfp/output/ml_engine/job_id.txt: The ID of the waited job.
        /tmp/kfp/output/ml_engine/job_dir.txt: The `jobDir` of the waited job.
    """
    ml_client = MLEngineClient()
    with KfpExecutionContext(
            on_cancel=lambda: cancel_job(ml_client, project_id, job_id)):
        return wait_for_job_done(ml_client, project_id, job_id, wait_interval)
def execute_and_wait(self):
    with KfpExecutionContext(on_cancel=self._cancel) as ctx:
        self._set_version_name(ctx.context_id())
        self._dump_metadata()
        existing_version = wait_existing_version(self._ml,
                                                 self._version_name,
                                                 self._wait_interval)
        if existing_version and self._is_dup_version(existing_version):
            return self._handle_completed_version(existing_version)

        if existing_version and self._replace_existing:
            logging.info('Deleting existing version...')
            self._delete_version_and_wait()
        elif existing_version:
            raise RuntimeError(
                'Existing version conflicts with the name of the new version.')

        created_version = self._create_version_and_wait()
        return self._handle_completed_version(created_version)
def execute_and_wait(self):
    with KfpExecutionContext(on_cancel=self._cancel):
        existing_version = wait_existing_version(self._ml,
                                                 self._version_name,
                                                 self._wait_interval)
        if not existing_version:
            logging.info('The version has already been deleted.')
            return None

        logging.info('Deleting existing version...')
        operation = self._ml.delete_version(self._version_name)
        # Cache the operation name for cancellation.
        self._delete_operation_name = operation.get('name')
        try:
            wait_for_operation_done(self._ml, self._delete_operation_name,
                                    'delete version', self._wait_interval)
        finally:
            self._delete_operation_name = None
        return None
def execute(self):
    with KfpExecutionContext() as ctx:
        self._set_model_name(ctx.context_id())
        self._dump_metadata()
        try:
            created_model = self._ml.create_model(project_id=self._project_id,
                                                  model=self._model)
        except errors.HttpError as e:
            if e.resp.status == 409:
                existing_model = self._ml.get_model(self._model_name)
                if not self._is_dup_model(existing_model):
                    raise
                logging.info('The same model {} has been submitted'
                             ' before. Continue the operation.'.format(
                                 self._model_name))
                created_model = existing_model
            else:
                raise
        self._dump_model(created_model)
        return created_model
def submit_job(project_id, region, cluster_name, job, wait_interval=30):
    """Submits a Cloud Dataproc job.

    Args:
        project_id (str): Required. The ID of the Google Cloud Platform
            project that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle
            the request.
        cluster_name (str): Required. The cluster to run the job on.
        job (dict): Optional. The full payload of a [Dataproc job](
            https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
        wait_interval (int): The wait seconds between polling the operation.
            Defaults to 30.

    Returns:
        The created job payload.

    Output Files:
        $KFP_OUTPUT_PATH/dataproc/job_id.txt: The ID of the created job.
    """
    if 'reference' not in job:
        job['reference'] = {}
    job['reference']['projectId'] = project_id
    if 'placement' not in job:
        job['placement'] = {}
    job['placement']['clusterName'] = cluster_name
    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        submitted_job = client.submit_job(project_id, region, job,
                                          request_id=ctx.context_id())
        _dump_metadata(submitted_job, region)
        submitted_job = _wait_for_job_done(client, project_id, region,
                                           submitted_job['reference']['jobId'],
                                           wait_interval)
        return _dump_job(submitted_job)
def delete_cluster(project_id, region, name, wait_interval=30):
    """Deletes a Dataproc cluster.

    Args:
        project_id (str): Required. The ID of the Google Cloud Platform
            project that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle
            the request.
        name (str): Required. The name of the cluster to delete.
        wait_interval (int): The wait seconds between polling the operation.
            Defaults to 30.
    """
    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        operation = client.delete_cluster(project_id, region, name,
                                          request_id=ctx.context_id())
        operation_name = operation.get('name')
        return client.wait_for_operation_done(operation_name, wait_interval)
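# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as delete_cluster above; the project,
# region and cluster name are placeholder values.
def _example_delete_cluster():
    return delete_cluster(
        project_id='my-gcp-project',  # hypothetical project
        region='us-central1',         # hypothetical region
        name='my-cluster',            # hypothetical cluster to delete
        wait_interval=30,
    )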
def launch_flex_template(
        project_id,
        location,
        launch_parameters,
        validate_only=False,
        staging_dir=None,
        wait_interval=30,
        job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
        job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
    """Launches a Dataflow job from a flex template.

    Args:
        project_id (str): Required. The ID of the Cloud Platform project that
            the job belongs to.
        location (str): The regional endpoint to which to direct the request.
        launch_parameters (dict): Parameters to provide to the template being
            launched. Schema defined in
            https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#LaunchFlexTemplateParameter.
            `jobName` will be replaced by the generated name.
        validate_only (boolean): If true, the request is validated but not
            actually executed. Defaults to false.
        staging_dir (str): Optional. The GCS directory for keeping staging
            files. A random subdirectory will be created under the directory
            to keep job info for resuming the job in case of failure.
        wait_interval (int): The wait seconds between polling.
        job_id_output_path (str): Optional. Output file to save the job ID of
            the execution.
        job_object_output_path (str): Optional. Output file to save the job
            details of the execution.

    Returns:
        The completed job.
    """
    storage_client = storage.Client()
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        staging_location = get_staging_location(staging_dir, ctx.context_id())
        job_id, _ = read_job_id_and_location(storage_client, staging_location)
        # Continue waiting for the job if its ID has already been uploaded to
        # the staging location.
        if job_id:
            job = df_client.get_job(project_id, job_id, location)
            job = wait_and_dump_job(
                df_client,
                project_id,
                location,
                job,
                wait_interval,
                job_id_output_path=job_id_output_path,
                job_object_output_path=job_object_output_path,
            )
            logging.info(f'Skipping, existing job: {job}')
            return job

        if launch_parameters is None:
            launch_parameters = {}

        request_body = {
            'launchParameter': launch_parameters,
            'validateOnly': validate_only
        }
        request_body['launchParameter']['jobName'] = 'job-' + ctx.context_id()

        response = df_client.launch_flex_template(project_id, request_body,
                                                  location)
        job = response.get('job', None)
        if not job:
            # Validate-only mode
            return job

        job_id = job.get('id')
        upload_job_id_and_location(storage_client, staging_location, job_id,
                                   location)

        job = wait_and_dump_job(
            df_client,
            project_id,
            location,
            job,
            wait_interval,
            job_id_output_path=job_id_output_path,
            job_object_output_path=job_object_output_path,
        )
        logging.info(f'Completed job: {job}')
        return job
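# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as launch_flex_template above; the
# project, GCS paths and parameters are placeholder values. The
# launch_parameters dict follows the LaunchFlexTemplateParameter schema
# referenced in the docstring; `jobName` is filled in by the component.
def _example_launch_flex_template():
    return launch_flex_template(
        project_id='my-gcp-project',              # hypothetical project
        location='us-central1',                   # hypothetical region
        launch_parameters={
            'containerSpecGcsPath': 'gs://my-bucket/templates/spec.json',
            'parameters': {'inputFile': 'gs://my-bucket/input.txt'},
        },
        staging_dir='gs://my-bucket/staging',     # enables resume on retry
        wait_interval=30,
    )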
def launch_template(project_id, gcs_path, launch_parameters, location=None,
                    validate_only=None, staging_dir=None, wait_interval=30):
    """Launches a Dataflow job from a template.

    Args:
        project_id (str): Required. The ID of the Cloud Platform project that
            the job belongs to.
        gcs_path (str): Required. A Cloud Storage path to the template from
            which to create the job. Must be a valid Cloud Storage URL
            beginning with 'gs://'.
        launch_parameters (dict): Parameters to provide to the template being
            launched. Schema defined in
            https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters.
            `jobName` will be replaced by the generated name.
        location (str): The regional endpoint to which to direct the request.
        validate_only (boolean): If true, the request is validated but not
            actually executed. Defaults to false.
        staging_dir (str): Optional. The GCS directory for keeping staging
            files. A random subdirectory will be created under the directory
            to keep job info for resuming the job in case of failure.
        wait_interval (int): The wait seconds between polling.

    Returns:
        The completed job.
    """
    storage_client = storage.Client()
    df_client = DataflowClient()
    job_id = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        staging_location = get_staging_location(staging_dir, ctx.context_id())
        job_id, _ = read_job_id_and_location(storage_client, staging_location)
        # Continue waiting for the job if its ID has already been uploaded to
        # the staging location.
        if job_id:
            job = df_client.get_job(project_id, job_id, location)
            return wait_and_dump_job(df_client, project_id, location, job,
                                     wait_interval)

        if not launch_parameters:
            launch_parameters = {}
        launch_parameters['jobName'] = 'job-' + ctx.context_id()
        response = df_client.launch_template(project_id, gcs_path, location,
                                             validate_only, launch_parameters)
        job = response.get('job', None)
        if not job:
            # Validate-only mode
            return job
        job_id = job.get('id')
        upload_job_id_and_location(storage_client, staging_location, job_id,
                                   location)
        return wait_and_dump_job(df_client, project_id, location, job,
                                 wait_interval)
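# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as launch_template above; the project,
# bucket and parameter values are placeholders, modeled on the public
# Word_Count template. The launch_parameters dict follows the
# LaunchTemplateParameters schema referenced in the docstring.
def _example_launch_template():
    return launch_template(
        project_id='my-gcp-project',                           # hypothetical
        gcs_path='gs://dataflow-templates/latest/Word_Count',  # public template
        launch_parameters={
            'parameters': {
                'inputFile': 'gs://my-bucket/input.txt',
                'output': 'gs://my-bucket/output/wc',
            },
            'environment': {'zone': 'us-central1-f'},
        },
        location='us-central1',
        staging_dir='gs://my-bucket/staging',
        wait_interval=30,
    )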
def query(
        query,
        project_id,
        dataset_id=None,
        table_id=None,
        output_gcs_path=None,
        dataset_location='US',
        job_config=None,
        output_path=None,
        output_filename=None,
        output_destination_format="CSV",
        job_object_output_path='/tmp/kfp/output/bigquery/query-job.json',
        output_gcs_path_output_path='/tmp/kfp/output/bigquery/query-output-path.txt',
        output_dataset_id_output_path='/tmp/kfp/output/bigquery/query-dataset-id.txt',
        output_table_id_output_path='/tmp/kfp/output/bigquery/query-table-id.txt',
):
    """Submits a query to the BigQuery service and dumps the results to a
    BigQuery table or a GCS blob.

    Args:
        query (str): The query used by the BigQuery service to fetch the
            results.
        project_id (str): The project to execute the query job.
        dataset_id (str): The ID of the persistent dataset to keep the results
            of the query. If the dataset does not exist, the operation will
            create a new one.
        table_id (str): The ID of the table to keep the results of the query.
            If absent, the operation will generate a random ID for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location to create the dataset. Defaults
            to `US`.
        job_config (dict): The full config spec for the query job.
        output_path (str): The path where the query result will be stored.
        output_filename (str): The name of the file where the results will be
            stored.
        output_destination_format (str): The name of the output destination
            format. Defaults to CSV; NEWLINE_DELIMITED_JSON and AVRO are also
            supported.

    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    else:
        job_config = bigquery.QueryJobConfig.from_api_repr(job_config)
    job_id = None

    def cancel():
        if job_id:
            client.cancel_job(job_id)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id,
                                               output_gcs_path,
                                               dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
                gcp_common.dump_file(output_dataset_id_output_path,
                                     table_ref.dataset_id)
                gcp_common.dump_file(output_table_id_output_path,
                                     table_ref.table_id)
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)

        if output_path is not None:
            # Write the results to a local file.
            result = query_job.result()
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            df = result.to_dataframe()
            df.to_csv(os.path.join(output_path, output_filename))
        else:
            query_job.result()
            if output_gcs_path:
                job_id = 'extract_' + ctx.context_id()
                extract_job = _get_job(client, job_id)
                logging.info('Extracting data from table {} to {}.'.format(
                    str(table_ref), output_gcs_path))
                if not extract_job:
                    job_config = ExtractJobConfig(
                        destination_format=output_destination_format)
                    extract_job = client.extract_table(table_ref,
                                                       output_gcs_path,
                                                       job_config=job_config)
                extract_job.result()  # Wait for export to finish

        # TODO: Replace '-' with an empty string when most users have upgraded
        # to the Argo version that includes the fix:
        # https://github.com/argoproj/argo/pull/1653
        gcp_common.dump_file(output_gcs_path_output_path,
                             output_gcs_path or '-')
        gcp_common.dump_file(job_object_output_path,
                             json.dumps(query_job.to_api_repr()))

        return query_job.to_api_repr()
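# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as query above; the project and local
# output directory are placeholder values. Instead of exporting to GCS, this
# variant writes the query result to a local CSV file under output_path.
def _example_query_to_local_csv():
    return query(
        query='SELECT 1 AS one',
        project_id='my-gcp-project',       # hypothetical project
        output_path='/tmp/query_results',  # local directory for the CSV
        output_filename='result.csv',
        output_destination_format='CSV',
    )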
def test_init_succeed_without_pod_name(self, mock_k8s_client,
                                       mock_load_config):
    with KfpExecutionContext() as ctx:
        self.assertFalse(ctx.under_kfp_environment())
def create_cluster(
        project_id,
        region,
        name=None,
        name_prefix=None,
        initialization_actions=None,
        config_bucket=None,
        image_version=None,
        cluster=None,
        wait_interval=30,
        cluster_name_output_path='/tmp/kfp/output/dataproc/cluster_name.txt',
        cluster_object_output_path='/tmp/kfp/output/dataproc/cluster.json',
):
    """Creates a Dataproc cluster under a project.

    Args:
        project_id (str): Required. The ID of the Google Cloud Platform
            project that the cluster belongs to.
        region (str): Required. The Cloud Dataproc region in which to handle
            the request.
        name (str): Optional. The cluster name. Cluster names within a project
            must be unique. Names of deleted clusters can be reused.
        name_prefix (str): Optional. The prefix of the cluster name.
        initialization_actions (list): Optional. List of GCS URIs of
            executables to execute on each node after the config is completed.
            By default, executables are run on the master and all worker
            nodes.
        config_bucket (str): Optional. A Google Cloud Storage bucket used to
            stage job dependencies, config files, and job driver console
            output.
        image_version (str): Optional. The version of the software inside the
            cluster.
        cluster (dict): Optional. The full cluster config. See [full details](
            https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster).
        wait_interval (int): The wait seconds between polling the operation.
            Defaults to 30.
        cluster_name_output_path (str): Path for the name of the created
            cluster.
        cluster_object_output_path (str): Path for the created cluster object.

    Returns:
        The created cluster object.
    """
    if not cluster:
        cluster = {}
    cluster['projectId'] = project_id
    if 'config' not in cluster:
        cluster['config'] = {}
    if name:
        cluster['clusterName'] = name
    if initialization_actions:
        cluster['config']['initializationActions'] = list(
            map(lambda file: {'executableFile': file}, initialization_actions))
    if config_bucket:
        cluster['config']['configBucket'] = config_bucket
    if image_version:
        if 'softwareConfig' not in cluster['config']:
            cluster['config']['softwareConfig'] = {}
        cluster['config']['softwareConfig']['imageVersion'] = image_version

    client = DataprocClient()
    operation_name = None
    with KfpExecutionContext(
            on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
        _set_cluster_name(cluster, ctx.context_id(), name_prefix)
        _dump_metadata(cluster, region)
        operation = client.create_cluster(project_id, region, cluster,
                                          request_id=ctx.context_id())
        operation_name = operation.get('name')
        operation = client.wait_for_operation_done(operation_name,
                                                   wait_interval)

        cluster = operation.get('response')
        gcp_common.dump_file(cluster_object_output_path, json.dumps(cluster))
        gcp_common.dump_file(cluster_name_output_path,
                             cluster.get('clusterName'))
        return cluster
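# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as create_cluster above; the project,
# region, bucket and init action are placeholder values. The final cluster
# name is derived from name_prefix and the execution context ID.
def _example_create_cluster():
    return create_cluster(
        project_id='my-gcp-project',        # hypothetical project
        region='us-central1',               # hypothetical region
        name_prefix='kfp-cluster',          # final name gets the context id
        initialization_actions=[
            'gs://my-bucket/init/install-deps.sh',  # hypothetical init script
        ],
        config_bucket='my-staging-bucket',  # hypothetical staging bucket
        image_version='1.5',
        wait_interval=30,
    )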
def launch_python(
        python_file_path,
        project_id,
        staging_dir=None,
        requirements_file_path=None,
        args=[],
        wait_interval=30,
        job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
        job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
    """Launches a self-executing Beam Python file.

    Args:
        python_file_path (str): The GCS or local path to the Python file to
            run.
        project_id (str): The ID of the parent project.
        staging_dir (str): Optional. The GCS directory for keeping staging
            files. A random subdirectory will be created under the directory
            to keep job info for resuming the job in case of failure, and it
            will be passed as the `staging_location` and `temp_location`
            command line args of the Beam code.
        requirements_file_path (str): Optional. The GCS or local path to the
            pip requirements file.
        args (list): The list of args to pass to the Python file.
        wait_interval (int): The wait seconds between polling.
        job_id_output_path (str): Path for the ID of the created job.
        job_object_output_path (str): Path for the created job object.

    Returns:
        The completed job.
    """
    storage_client = storage.Client()
    df_client = DataflowClient()
    job_id = None
    location = None

    def cancel():
        if job_id:
            df_client.cancel_job(project_id, job_id, location)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        staging_location = get_staging_location(staging_dir, ctx.context_id())
        job_id, location = read_job_id_and_location(storage_client,
                                                    staging_location)
        # Continue waiting for the job if its ID has already been uploaded to
        # the staging location.
        if job_id:
            job = df_client.get_job(project_id, job_id, location)
            return wait_and_dump_job(
                df_client,
                project_id,
                location,
                job,
                wait_interval,
                job_id_output_path=job_id_output_path,
                job_object_output_path=job_object_output_path,
            )

        _install_requirements(requirements_file_path)
        python_file_path = stage_file(python_file_path)
        cmd = _prepare_cmd(project_id, python_file_path, args,
                           staging_location)
        sub_process = Process(cmd)
        for line in sub_process.read_lines():
            job_id, location = _extract_job_id_and_location(line)
            if job_id:
                logging.info('Found job id {} and location {}.'.format(
                    job_id, location))
                upload_job_id_and_location(storage_client, staging_location,
                                           job_id, location)
                break
        sub_process.wait_and_check()
        if not job_id:
            logging.warning('No dataflow job was found when '
                            'running the python file.')
            return None
        job = df_client.get_job(project_id, job_id, location=location)
        return wait_and_dump_job(
            df_client,
            project_id,
            location,
            job,
            wait_interval,
            job_id_output_path=job_id_output_path,
            job_object_output_path=job_object_output_path,
        )
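# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes it runs in the same module as launch_python above; the GCS paths and
# project are placeholder values. staging_dir lets the component resume the
# same Dataflow job if the pipeline step is retried.
def _example_launch_python():
    return launch_python(
        python_file_path='gs://my-bucket/code/wordcount.py',  # hypothetical
        project_id='my-gcp-project',                          # hypothetical
        staging_dir='gs://my-bucket/staging',
        requirements_file_path='gs://my-bucket/code/requirements.txt',
        args=['--runner', 'DataflowRunner',
              '--output', 'gs://my-bucket/output/wc'],
        wait_interval=30,
    )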