def _collateral(
    self,
    project_uuid: str,
    task_id: str,
    pipeline: Pipeline,
    run_config: Dict[str, Any],
    env_variables: Dict[str, Any],
    **kwargs,
):
    # Get the docker ids of the images to use and make sure the images
    # are not deleted in case they become outdated by an environment
    # rebuild.
    try:
        env_uuid_docker_id_mappings = lock_environment_images_for_run(
            task_id,
            project_uuid,
            pipeline.get_environments(),
        )
    except errors.ImageNotFound as e:
        msg = (
            "Pipeline references environments that do not exist in the"
            f" project, the following environments do not exist: [{e}].\n\n"
            "Please make sure all pipeline steps are assigned an"
            " environment that exists in the project."
        )
        raise errors.ImageNotFound(msg)

    # Create the Celery object with the Flask context and construct
    # the kwargs for the job.
    celery = make_celery(current_app)
    run_config["env_uuid_docker_id_mappings"] = env_uuid_docker_id_mappings
    run_config["user_env_variables"] = env_variables
    celery_job_kwargs = {
        "pipeline_definition": pipeline.to_dict(),
        "project_uuid": project_uuid,
        "run_config": run_config,
    }

    # Start the run as a background task on Celery. Due to circular
    # imports we send the task by name instead of importing the
    # function directly.
    res = celery.send_task(
        "app.core.tasks.run_pipeline",
        kwargs=celery_job_kwargs,
        task_id=task_id,
    )

    # NOTE: this is only relevant if a result backend is configured.
    # The task does not return anything, so we forget its result to
    # make sure the Celery backend releases the resources (for storing
    # and transmitting results) associated with the task.
    res.forget()
def testio(request):
    full_name = f"tests/input_execution_order/{request.param}"
    with open(full_name, "r") as f:
        description = json.load(f)

    pipeline = Pipeline.from_json(description)
    return IO(pipeline)
def run_partial(
    self,
    pipeline_description: PipelineDescription,
    run_config: Dict[str, Union[str, Dict[str, str]]],
    task_id: Optional[str] = None,
) -> str:
    """Runs a pipeline partially.

    A partial run is described by the pipeline description. The
    call-order of the steps is always preserved, e.g. given a --> b,
    a will always be run before b.

    Args:
        pipeline_description: a json description of the pipeline.
        run_config: configuration of the run for the compute backend.

    Returns:
        Status of the pipeline run. "FAILURE" or "SUCCESS".
    """
    # Get the pipeline to run.
    pipeline = Pipeline.from_json(pipeline_description)

    # TODO: don't think this task_id is needed anymore. It was
    # introduced as part of the scheduled runs which we don't use
    # anymore.
    # Run the subgraph in parallel and pass the id of the AsyncResult
    # object.
    # TODO: the commented line below is for once we can introduce
    # sessions.
    # session = run_partial.session
    task_id = task_id if task_id is not None else self.request.id

    return asyncio.run(pipeline.run(task_id, run_config=run_config))
def testio(request):
    full_name = f'tests/input_execution_order/{request.param}'
    with open(full_name, 'r') as f:
        description = json.load(f)

    pipeline = Pipeline.from_json(description)
    correct_execution_order = description['correct_execution_order']
    return IO(pipeline, correct_execution_order)
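# NOTE (illustrative sketch): the two `testio` fixtures above receive a file
# name through `request.param`, which implies pytest parametrization of the
# fixture. A minimal sketch of how that wiring could look is given below; the
# `@pytest.fixture(params=...)` decorator, the JSON file names, and the
# `test_execution_order` test are assumptions for illustration only, and
# `Pipeline` and `IO` are assumed to come from the project's test helpers.
import json

import pytest


@pytest.fixture(params=["pipeline_a.json", "pipeline_b.json"])  # hypothetical files
def testio(request):
    full_name = f"tests/input_execution_order/{request.param}"
    with open(full_name, "r") as f:
        description = json.load(f)

    pipeline = Pipeline.from_json(description)
    correct_execution_order = description["correct_execution_order"]
    return IO(pipeline, correct_execution_order)


def test_execution_order(testio):
    # pytest runs this test once per file listed in `params`.
    assert testio.pipeline is not None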
def run_pipeline(
    self,
    pipeline_definition: PipelineDefinition,
    project_uuid: str,
    run_config: Dict[str, Union[str, Dict[str, str]]],
    task_id: Optional[str] = None,
) -> str:
    """Runs a pipeline partially.

    A partial run is described by the pipeline definition. The
    call-order of the steps is always preserved, e.g. given a --> b,
    a will always be run before b.

    Args:
        pipeline_definition: a json description of the pipeline.
        run_config: configuration of the run for the compute backend.
            Example:
            {
                'run_endpoint': 'runs',
                'project_dir': '/home/../pipelines/uuid',
                'env_uuid_docker_id_mappings': {
                    'b6527b0b-bfcc-4aff-91d1-37f9dfd5d8e8':
                        'sha256:61f82126945bb25dd85d6a5b122a1815df1c0c5f91621089cde0938be4f698d4'
                }
            }

    Returns:
        Status of the pipeline run. "FAILURE" or "SUCCESS".
    """
    run_config["pipeline_uuid"] = pipeline_definition["uuid"]
    run_config["project_uuid"] = project_uuid

    # Get the pipeline to run.
    pipeline = Pipeline.from_json(pipeline_definition)

    # TODO: don't think this task_id is needed anymore. It was
    # introduced as part of the scheduled runs which we don't use
    # anymore.
    # Run the subgraph in parallel and pass the id of the AsyncResult
    # object.
    # TODO: the commented line below is for once we can introduce
    # sessions.
    # session = run_pipeline.session
    task_id = task_id if task_id is not None else self.request.id

    # TODO: could make the celery task fail in case the pipeline run
    # failed. Although the run did complete successfully from a task
    # scheduler perspective.
    # https://stackoverflow.com/questions/7672327/how-to-make-a-celery-task-fail-from-within-the-task
    return asyncio.run(run_pipeline_async(run_config, pipeline, task_id))
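# NOTE (illustrative sketch): the keys of `celery_job_kwargs` built in the
# `_collateral` methods must line up with the parameters of the
# `run_pipeline` Celery task above (pipeline_definition, project_uuid,
# run_config). A minimal producer-side sketch under that assumption follows;
# `dispatch_run_pipeline` is a hypothetical helper, and the run_config values
# are placeholders taken from the docstring example.
import uuid

from flask import current_app


def dispatch_run_pipeline(pipeline, project_uuid):
    # `make_celery` is the project's factory for a Flask-aware Celery app,
    # as used in the `_collateral` methods.
    celery = make_celery(current_app)
    task_id = str(uuid.uuid4())

    celery_job_kwargs = {
        "pipeline_definition": pipeline.to_dict(),
        "project_uuid": project_uuid,
        "run_config": {
            "run_endpoint": "runs",
            "project_dir": "/home/../pipelines/uuid",
            # Placeholder; normally filled by lock_environment_images_for_run.
            "env_uuid_docker_id_mappings": {},
        },
    }

    # Send by name to avoid a circular import of app.core.tasks.
    res = celery.send_task(
        "app.core.tasks.run_pipeline",
        kwargs=celery_job_kwargs,
        task_id=task_id,
    )
    # The task's return value is not used; free any backend resources.
    res.forget()
    return task_id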
def _collateral(
    self,
    project_uuid: str,
    task_id: str,
    pipeline: Pipeline,
    run_config: Dict[str, Any],
    env_variables: Dict[str, Any],
    env_uuid_to_image: Dict[str, str],
    **kwargs,
):
    # Create the Celery object with the Flask context and construct
    # the kwargs for the job.
    celery = make_celery(current_app)
    run_config["env_uuid_to_image"] = env_uuid_to_image
    run_config["user_env_variables"] = env_variables
    run_config["session_uuid"] = (
        project_uuid[:18] + pipeline.properties["uuid"][:18]
    )
    run_config["session_type"] = "interactive"
    celery_job_kwargs = {
        "pipeline_definition": pipeline.to_dict(),
        "run_config": run_config,
        "session_uuid": run_config["session_uuid"],
    }

    # Start the run as a background task on Celery. Due to circular
    # imports we send the task by name instead of importing the
    # function directly.
    res = celery.send_task(
        "app.core.tasks.run_pipeline",
        kwargs=celery_job_kwargs,
        task_id=task_id,
    )

    # NOTE: this is only relevant if a result backend is configured.
    # The task does not return anything, so we forget its result to
    # make sure the Celery backend releases the resources (for storing
    # and transmitting results) associated with the task.
    res.forget()
def run_pipeline(
    self,
    pipeline_definition: PipelineDefinition,
    run_config: RunConfig,
    session_uuid: str,
    task_id: Optional[str] = None,
) -> str:
    """Runs a pipeline partially.

    A partial run is described by the pipeline definition. The
    call-order of the steps is always preserved, e.g. given a --> b,
    a will always be run before b.

    Args:
        pipeline_definition: a json description of the pipeline.
        run_config: configuration of the run for the compute backend.

    Returns:
        Status of the pipeline run. "FAILURE" or "SUCCESS".
    """
    # Get the pipeline to run.
    pipeline = Pipeline.from_json(pipeline_definition)

    # TODO: don't think this task_id is needed anymore. It was
    # introduced as part of the scheduled runs which we don't use
    # anymore.
    # Run the subgraph in parallel and pass the id of the AsyncResult
    # object.
    # TODO: the commented line below is for once we can introduce
    # sessions.
    # session = run_pipeline.session
    task_id = task_id if task_id is not None else self.request.id

    # TODO: could make the celery task fail in case the pipeline run
    # failed. Although the run did complete successfully from a task
    # scheduler perspective.
    # https://stackoverflow.com/questions/7672327/how-to-make-a-celery-task-fail-from-within-the-task
    return asyncio.run(
        run_pipeline_async(session_uuid, run_config, pipeline, task_id)
    )
def pipeline():
    with open('tests/input_operations/pipeline.json', 'r') as f:
        description = json.load(f)

    pipeline = Pipeline.from_json(description)
    return pipeline
def test_serialization():
    with open("tests/input_operations/pipeline.json", "r") as f:
        description = json.load(f)

    pipeline = Pipeline.from_json(description)

    assert pipeline.to_dict() == description
def _transaction(
    self,
    project_uuid: str,
    run_config: Dict[str, Any],
    pipeline: Pipeline,
):
    # Specify the task_id beforehand to avoid race conditions between
    # the task and its presence in the db.
    task_id = str(uuid.uuid4())

    # NOTE: we are setting the status of the run ourselves without
    # using the option of celery to get the status of tasks. This
    # way we do not have to configure a backend (where the default
    # of "rpc://" does not give the results we would want).
    run = {
        "uuid": task_id,
        "pipeline_uuid": pipeline.properties["uuid"],
        "project_uuid": project_uuid,
        "status": "PENDING",
    }
    db.session.add(models.InteractivePipelineRun(**run))
    # Need to flush because otherwise the bulk insertion of pipeline
    # steps will lead to foreign key errors.
    # https://docs.sqlalchemy.org/en/13/orm/persistence_techniques.html#bulk-operations-caveats
    db.session.flush()

    # Set an initial value for the status of the pipeline steps that
    # will be run.
    step_uuids = [s.properties["uuid"] for s in pipeline.steps]
    pipeline_steps = []
    for step_uuid in step_uuids:
        pipeline_steps.append(
            models.PipelineRunStep(
                **{
                    "run_uuid": task_id,
                    "step_uuid": step_uuid,
                    "status": "PENDING",
                }
            )
        )
    db.session.bulk_save_objects(pipeline_steps)
    run["pipeline_steps"] = pipeline_steps

    try:
        env_uuid_to_image = environments.lock_environment_images_for_run(
            task_id,
            project_uuid,
            pipeline.get_environments(),
        )
    except self_errors.PipelineDefinitionNotValid:
        msg = "Please make sure every pipeline step is assigned an environment."
        raise self_errors.PipelineDefinitionNotValid(msg)

    self.collateral_kwargs["project_uuid"] = project_uuid
    self.collateral_kwargs["task_id"] = task_id
    self.collateral_kwargs["pipeline"] = pipeline
    self.collateral_kwargs["run_config"] = run_config
    self.collateral_kwargs["env_variables"] = get_proj_pip_env_variables(
        project_uuid, pipeline.properties["uuid"]
    )
    self.collateral_kwargs["env_uuid_to_image"] = env_uuid_to_image
    return run
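# NOTE (illustrative sketch): the `_transaction`/`_collateral` split above
# suggests a two-phase pattern: database work happens in `_transaction`, which
# stashes everything the second phase needs in `self.collateral_kwargs`, and
# `_collateral` only dispatches the Celery task once the run and its steps are
# committed. A minimal, hypothetical driver under that assumption follows; the
# class name `TwoPhaseFunction` and the `do` method are illustrative and not
# the project's actual API.
class TwoPhaseFunction:
    def __init__(self, db_session):
        self._session = db_session
        self.collateral_kwargs = {}

    def do(self, *args, **kwargs):
        # Phase 1: perform the database changes and collect the kwargs
        # the second phase needs (see `self.collateral_kwargs` above).
        result = self._transaction(*args, **kwargs)
        self._session.commit()

        # Phase 2: side effects such as sending the Celery task run only
        # after the run records are safely persisted.
        self._collateral(**self.collateral_kwargs)
        return result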