def run(self, pipeline: pipeline_py.Pipeline) -> None:
  """Runs given logical pipeline locally.

  Args:
    pipeline: Logical pipeline containing pipeline args and components.
  """
  # For CLI, while creating or updating pipeline, pipeline_args are extracted
  # and hence we avoid executing the pipeline.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  for component in pipeline.components:
    # TODO(b/187122662): Pass through pip dependencies as a first-class
    # component flag.
    if isinstance(component, base_component.BaseComponent):
      component._resolve_pip_dependencies(  # pylint: disable=protected-access
          pipeline.pipeline_info.pipeline_root)

  c = compiler.Compiler()
  pipeline = c.compile(pipeline)

  # Substitute the runtime parameter with a concrete run_id.
  runtime_parameter_utils.substitute_runtime_parameter(
      pipeline, {
          constants.PIPELINE_RUN_ID_PARAMETER_NAME:
              datetime.datetime.now().isoformat(),
      })

  deployment_config = runner_utils.extract_local_deployment_config(pipeline)
  connection_config = deployment_config.metadata_connection_config

  logging.info('Running pipeline:\n %s', pipeline)
  logging.info('Using deployment config:\n %s', deployment_config)
  logging.info('Using connection config:\n %s', connection_config)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
    # Run each node. Note that the pipeline.nodes list is in
    # topological order.
    #
    # TODO(b/171319478): After IR-based execution is used, use multi-threaded
    # execution so that independent components can be run in parallel.
    for node in pipeline.nodes:
      pipeline_node = node.pipeline_node
      node_id = pipeline_node.node_info.id
      executor_spec = runner_utils.extract_executor_spec(
          deployment_config, node_id)
      custom_driver_spec = runner_utils.extract_custom_driver_spec(
          deployment_config, node_id)

      component_launcher = launcher.Launcher(
          pipeline_node=pipeline_node,
          mlmd_connection=metadata.Metadata(connection_config),
          pipeline_info=pipeline.pipeline_info,
          pipeline_runtime_spec=pipeline.runtime_spec,
          executor_spec=executor_spec,
          custom_driver_spec=custom_driver_spec)
      logging.info('Component %s is running.', node_id)
      component_launcher.launch()
      logging.info('Component %s is finished.', node_id)
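A minimal sketch of how a runner like this is typically invoked, assuming a
pipeline built with the standard TFX DSL; `_make_components` is a hypothetical
placeholder for your own component list:

from tfx.orchestration import pipeline as pipeline_py
from tfx.orchestration.local.local_dag_runner import LocalDagRunner

# Hypothetical helper returning the list of TFX components to run.
components = _make_components()

my_pipeline = pipeline_py.Pipeline(
    pipeline_name='my_pipeline',
    pipeline_root='/tmp/pipeline_root',  # placeholder path
    components=components,
)
LocalDagRunner().run(my_pipeline)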
def run(
    self,
    pipeline: tfx_pipeline.Pipeline,
    run_name: Optional[str] = None,
) -> "airflow.DAG":
    """Deploys given logical pipeline on Airflow.

    Args:
        pipeline: Logical pipeline containing pipeline args and components.
        run_name: Optional name for the run.

    Returns:
        An Airflow DAG.
    """
    # Only import these when needed.
    import airflow  # noqa

    from zenml.integrations.airflow.orchestrators import airflow_component

    # Merge airflow-specific configs with pipeline args.
    airflow_dag = airflow.DAG(
        dag_id=pipeline.pipeline_info.pipeline_name,
        **(
            typing.cast(
                AirflowPipelineConfig, self._config
            ).airflow_dag_config
        ),
        is_paused_upon_creation=False,
        catchup=False,  # no backfill
    )
    if "tmp_dir" not in pipeline.additional_pipeline_args:
        tmp_dir = os.path.join(
            pipeline.pipeline_info.pipeline_root, ".temp", ""
        )
        pipeline.additional_pipeline_args["tmp_dir"] = tmp_dir

    for component in pipeline.components:
        if isinstance(component, base_component.BaseComponent):
            component._resolve_pip_dependencies(
                pipeline.pipeline_info.pipeline_root
            )
        self._replace_runtime_params(component)

    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

    run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline,
        {
            "pipeline-run-id": run_name,
        },
    )
    deployment_config = runner_utils.extract_local_deployment_config(
        pipeline
    )
    connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

    component_impl_map = {}

    for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id
        )
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id
        )

        current_airflow_component = airflow_component.AirflowComponent(
            parent_dag=airflow_dag,
            pipeline_node=pipeline_node,
            mlmd_connection=connection_config,
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec,
        )
        component_impl_map[node_id] = current_airflow_component
        for upstream_node in node.pipeline_node.upstream_nodes:
            assert (
                upstream_node in component_impl_map
            ), "Component is not in topological order"
            current_airflow_component.set_upstream(
                component_impl_map[upstream_node]
            )

    return airflow_dag
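Because this method returns the constructed `airflow.DAG`, deployment usually
happens from a DAG definition file in Airflow's `dags/` folder, where the
returned object must be bound to a module-level name so the scheduler can
discover it at import time. A sketch, assuming a hypothetical
`build_pipeline()` helper and an already-configured orchestrator instance:

# dags/zenml_pipeline_dag.py -- hypothetical DAG definition file.
from my_project.pipelines import build_pipeline  # hypothetical helper
from my_project.stack import orchestrator  # hypothetical instance of this class

# Airflow's scheduler picks up module-level DAG objects at import time.
dag = orchestrator.run(build_pipeline(), run_name="nightly")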
def _extract_executor_spec(
    self,
    deployment_config: local_deployment_config_pb2.LocalDeploymentConfig,
    node_id: str,
) -> Optional[message.Message]:
  return runner_utils.extract_executor_spec(deployment_config, node_id)
def main(argv):
  # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
  # the user.
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument('--pipeline_root', type=str, required=True)
  parser.add_argument(
      '--metadata_ui_path',
      type=str,
      required=False,
      default='/mlpipeline-ui-metadata.json')
  parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
  parser.add_argument('--tfx_ir', type=str, required=True)
  parser.add_argument('--node_id', type=str, required=True)
  # There might be multiple runtime parameters.
  # `args.runtime_parameter` should become List[str] by using "append".
  parser.add_argument('--runtime_parameter', type=str, action='append')

  # TODO(b/196892362): Replace hooking with a more straightforward mechanism.
  launcher._register_execution = _register_execution  # pylint: disable=protected-access

  args = parser.parse_args(argv)

  tfx_ir = pipeline_pb2.Pipeline()
  json_format.Parse(args.tfx_ir, tfx_ir)

  _resolve_runtime_parameters(tfx_ir, args.runtime_parameter)

  deployment_config = runner_utils.extract_local_deployment_config(tfx_ir)

  kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
  metadata_connection = metadata.Metadata(
      _get_metadata_connection_config(kubeflow_metadata_config))

  node_id = args.node_id
  # Attach necessary labels to distinguish different runners and DSLs.
  # TODO(zhitaoli): Pass this from KFP runner side when the same container
  # entrypoint can be used by a different runner.
  with telemetry_utils.scoped_labels({
      telemetry_utils.LABEL_TFX_RUNNER: 'kfp',
  }):
    custom_executor_operators = {
        executable_spec_pb2.ContainerExecutableSpec:
            kubernetes_executor_operator.KubernetesExecutorOperator
    }

    executor_spec = runner_utils.extract_executor_spec(
        deployment_config, node_id)
    custom_driver_spec = runner_utils.extract_custom_driver_spec(
        deployment_config, node_id)

    pipeline_node = _get_pipeline_node(tfx_ir, node_id)
    component_launcher = launcher.Launcher(
        pipeline_node=pipeline_node,
        mlmd_connection=metadata_connection,
        pipeline_info=tfx_ir.pipeline_info,
        pipeline_runtime_spec=tfx_ir.runtime_spec,
        executor_spec=executor_spec,
        custom_driver_spec=custom_driver_spec,
        custom_executor_operators=custom_executor_operators)
    logging.info('Component %s is running.', node_id)
    execution_info = component_launcher.launch()
    logging.info('Component %s is finished.', node_id)

  # Dump the UI metadata.
  _dump_ui_metadata(pipeline_node, execution_info, args.metadata_ui_path)
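On Kubeflow Pipelines the orchestrator assembles these flags into the
container command line; an equivalent direct call would look like the sketch
below. Only the flag names come from the argparse definitions above;
`metadata_config_json` and `tfx_ir_json` are assumed to hold the serialized
protos, and the other values are placeholders:

# Hypothetical direct invocation of the entrypoint for illustration.
main([
    '--pipeline_root', 'gs://my-bucket/pipeline_root',  # placeholder path
    '--kubeflow_metadata_config', metadata_config_json,  # serialized KubeflowMetadataConfig
    '--tfx_ir', tfx_ir_json,  # serialized pipeline IR (JSON)
    '--node_id', 'CsvExampleGen',  # placeholder node id
])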
def run(
    self, pipeline: tfx_pipeline.Pipeline, run_name: Optional[str] = None
) -> None:
    """Runs given logical pipeline locally.

    Args:
        pipeline: Logical pipeline containing pipeline args and components.
        run_name: Optional name for the run.
    """
    for component in pipeline.components:
        if isinstance(component, base_component.BaseComponent):
            component._resolve_pip_dependencies(
                pipeline.pipeline_info.pipeline_root
            )

    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

    run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline,
        {
            PIPELINE_RUN_ID_PARAMETER_NAME: run_name,
        },
    )

    deployment_config = runner_utils.extract_local_deployment_config(
        pipeline
    )
    connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

    logger.debug(f"Using deployment config:\n {deployment_config}")
    logger.debug(f"Using connection config:\n {connection_config}")

    # Run each node. Note that the pipeline.nodes list is in
    # topological order.
    for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id

        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id
        )
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id
        )

        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata.Metadata(connection_config),
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec,
        )
        start = time.time()
        logger.info(f"Step `{node_id}` has started.")
        component_launcher.launch()
        end = time.time()
        logger.info(
            f"Step `{node_id}` has finished"
            f" in {format_timedelta_pretty(end - start)}."
        )
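The default run name is derived from the current time via the format string
above. A small worked example (note that `%h` is a platform-dependent alias
for `%b` and may not be available everywhere):

from datetime import datetime

# 2021-07-09 14:30:05.123456 formatted with "%d_%h_%y-%H_%M_%S_%f".
name = datetime(2021, 7, 9, 14, 30, 5, 123456).strftime("%d_%h_%y-%H_%M_%S_%f")
assert name == "09_Jul_21-14_30_05_123456"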
def main():
  # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
  # the user.
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument('--pipeline_root', type=str, required=True)
  parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
  parser.add_argument('--serialized_component', type=str, required=True)
  parser.add_argument('--tfx_ir', type=str, required=True)
  parser.add_argument('--node_id', type=str, required=True)
  launcher._register_execution = _register_execution  # pylint: disable=protected-access

  args = parser.parse_args()

  tfx_ir = pipeline_pb2.Pipeline()
  json_format.Parse(args.tfx_ir, tfx_ir)
  # Substitute the runtime parameter with a concrete run_id.
  runtime_parameter_utils.substitute_runtime_parameter(
      tfx_ir, {
          constants.PIPELINE_RUN_ID_PARAMETER_NAME:
              os.environ['WORKFLOW_ID'],
      })

  deployment_config = runner_utils.extract_local_deployment_config(tfx_ir)

  kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
  metadata_connection = kubeflow_metadata_adapter.KubeflowMetadataAdapter(
      _get_metadata_connection_config(kubeflow_metadata_config))

  node_id = args.node_id
  # Attach necessary labels to distinguish different runners and DSLs.
  # TODO(zhitaoli): Pass this from KFP runner side when the same container
  # entrypoint can be used by a different runner.
  with telemetry_utils.scoped_labels({
      telemetry_utils.LABEL_TFX_RUNNER: 'kfp',
  }):
    custom_executor_operators = {
        executable_spec_pb2.ContainerExecutableSpec:
            kubernetes_executor_operator.KubernetesExecutorOperator
    }

    executor_spec = runner_utils.extract_executor_spec(
        deployment_config, node_id)
    custom_driver_spec = runner_utils.extract_custom_driver_spec(
        deployment_config, node_id)

    pipeline_node = _get_pipeline_node(tfx_ir, node_id)
    component_launcher = launcher.Launcher(
        pipeline_node=pipeline_node,
        mlmd_connection=metadata_connection,
        pipeline_info=tfx_ir.pipeline_info,
        pipeline_runtime_spec=tfx_ir.runtime_spec,
        executor_spec=executor_spec,
        custom_driver_spec=custom_driver_spec,
        custom_executor_operators=custom_executor_operators)
    logging.info('Component %s is running.', node_id)
    execution_info = component_launcher.launch()
    logging.info('Component %s is finished.', node_id)

  # Dump the UI metadata.
  _dump_ui_metadata(pipeline_node, execution_info)
def run_with_ir(
    self,
    pipeline: pipeline_pb2.Pipeline,
    run_options: Optional[pipeline_pb2.RunOptions] = None,
) -> None:
  """Runs given pipeline locally.

  Args:
    pipeline: Pipeline IR containing pipeline args and components.
    run_options: Optional args for the run.

  Raises:
    ValueError: If run_options is provided, and partial_run_options.from_nodes
      and partial_run_options.to_nodes are both empty.
  """
  # Substitute the runtime parameter with a concrete run_id.
  runtime_parameter_utils.substitute_runtime_parameter(
      pipeline, {
          constants.PIPELINE_RUN_ID_PARAMETER_NAME:
              datetime.datetime.now().isoformat(),
      })

  deployment_config = runner_utils.extract_local_deployment_config(pipeline)
  connection_config = getattr(
      deployment_config.metadata_connection_config,
      deployment_config.metadata_connection_config.WhichOneof(
          'connection_config'))

  logging.info('Using deployment config:\n %s', deployment_config)
  logging.info('Using connection config:\n %s', connection_config)

  if run_options:
    logging.info('Using run_options:\n %s', run_options)
    pr_opts = run_options.partial_run
    partial_run_utils.mark_pipeline(
        pipeline,
        from_nodes=pr_opts.from_nodes or None,
        to_nodes=pr_opts.to_nodes or None,
        snapshot_settings=pr_opts.snapshot_settings)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
    # Run each node. Note that the pipeline.nodes list is in
    # topological order.
    #
    # TODO(b/171319478): After IR-based execution is used, use multi-threaded
    # execution so that independent components can be run in parallel.
    for node in pipeline.nodes:
      pipeline_node = node.pipeline_node
      node_id = pipeline_node.node_info.id
      if pipeline_node.execution_options.HasField('skip'):
        logging.info('Skipping component %s.', node_id)
        continue

      executor_spec = runner_utils.extract_executor_spec(
          deployment_config, node_id)
      custom_driver_spec = runner_utils.extract_custom_driver_spec(
          deployment_config, node_id)

      component_launcher = launcher.Launcher(
          pipeline_node=pipeline_node,
          mlmd_connection=metadata.Metadata(connection_config),
          pipeline_info=pipeline.pipeline_info,
          pipeline_runtime_spec=pipeline.runtime_spec,
          executor_spec=executor_spec,
          custom_driver_spec=custom_driver_spec)
      logging.info('Component %s is running.', node_id)
      if pipeline_node.execution_options.run.perform_snapshot:
        with metadata.Metadata(connection_config) as mlmd_handle:
          partial_run_utils.snapshot(mlmd_handle, pipeline)
      component_launcher.launch()
      logging.info('Component %s is finished.', node_id)
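A minimal sketch of a partial run over the IR, assuming `runner` is an
instance of the class defining `run_with_ir` and `pipeline_ir` is an
already-compiled pipeline_pb2.Pipeline; the node ids are placeholders:

from tfx.proto.orchestration import pipeline_pb2

run_options = pipeline_pb2.RunOptions()
run_options.partial_run.from_nodes.append('Transform')  # placeholder node id
run_options.partial_run.to_nodes.append('Trainer')  # placeholder node id

runner.run_with_ir(pipeline_ir, run_options=run_options)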