Exemple #1
0
    def _collateral(
        self,
        project_uuid: str,
        task_id: str,
        pipeline: Pipeline,
        run_config: Dict[str, Any],
        env_variables: Dict[str, Any],
        **kwargs,
    ):
        """Lock the environment images and dispatch the run to Celery.

        Args:
            project_uuid: UUID of the project, forwarded to the image
                locking call and to the Celery task.
            task_id: id used both for locking the images and as the id
                of the Celery task.
            pipeline: pipeline to run; serialized with ``to_dict`` for
                the task.
            run_config: configuration of the run. Mutated in place: the
                keys ``env_uuid_docker_id_mappings`` and
                ``user_env_variables`` are added before it is sent.
            env_variables: stored in ``run_config`` under
                ``user_env_variables``.
            **kwargs: accepted but not used.

        Raises:
            errors.ImageNotFound: if locking the images raised
                ``errors.ImageNotFound``. Re-raised with a user facing
                message and chained to the original error.

        """
        # Get docker ids of images to use and make it so that the images
        # will not be deleted in case they become outdated by an
        # environment rebuild.
        try:
            env_uuid_docker_id_mappings = lock_environment_images_for_run(
                task_id,
                project_uuid,
                pipeline.get_environments(),
            )
        except errors.ImageNotFound as e:
            msg = (
                "Pipeline references environments that do not exist in the"
                f" project, the following environments do not exist: [{e}].\n\n"
                "Please make sure all pipeline steps are assigned an"
                " environment that exists in the project."
            )
            # Chain explicitly so the original error stays attached as
            # the cause of the user facing one.
            raise errors.ImageNotFound(msg) from e

        # Create Celery object with the Flask context and construct the
        # kwargs for the job.
        celery = make_celery(current_app)
        run_config["env_uuid_docker_id_mappings"] = env_uuid_docker_id_mappings
        run_config["user_env_variables"] = env_variables
        celery_job_kwargs = {
            "pipeline_definition": pipeline.to_dict(),
            "project_uuid": project_uuid,
            "run_config": run_config,
        }

        # Start the run as a background task on Celery. Due to circular
        # imports we send the task by name instead of importing the
        # function directly.
        res = celery.send_task(
            "app.core.tasks.run_pipeline",
            kwargs=celery_job_kwargs,
            task_id=task_id,
        )

        # NOTE: this only matters if a result backend is configured.
        # The result of the task is not consumed here, so forget it to
        # let the Celery backend release the resources (for storing and
        # transmitting results) associated to the task.
        res.forget()
def testio(request):
    """Build the ``IO`` test input for the parametrized description file.

    The file name comes from ``request.param`` and is resolved inside
    ``tests/input_execution_order/``.
    """
    description_path = f"tests/input_execution_order/{request.param}"
    with open(description_path, "r") as fp:
        spec = json.load(fp)

    return IO(Pipeline.from_json(spec))
Exemple #3
0
def run_partial(self,
                pipeline_description: PipelineDescription,
                run_config: Dict[str, Union[str, Dict[str, str]]],
                task_id: Optional[str] = None) -> str:
    """Run (part of) a pipeline as described by its description.

    The partial run is defined by the pipeline description. The
    call-order of the steps is always preserved, e.g. for a --> b step
    a will always be run before b.

    Args:
        pipeline_description: a json description of the pipeline.
        run_config: configuration of the run for the compute backend.
        task_id: id to run under. When ``None`` the id of the current
            request (``self.request.id``) is used.

    Returns:
        The result of the pipeline run, documented as the status of the
        run: "FAILURE" or "SUCCESS".

    """
    # Build the pipeline object that is to be run.
    graph = Pipeline.from_json(pipeline_description)

    # TODO: the task_id is probably no longer needed. It was added for
    #       the scheduled runs, which are not used anymore.
    # TODO: once sessions can be introduced, use:
    # session = run_partial.session
    if task_id is None:
        task_id = self.request.id

    # Run the subgraph in parallel, passing along the id of the
    # AsyncResult object.
    run_coro = graph.run(task_id, run_config=run_config)
    return asyncio.run(run_coro)
def testio(request):
    """Build the ``IO`` test input including the expected execution order.

    The description file is selected through ``request.param`` and read
    from ``tests/input_execution_order/``. Its
    ``correct_execution_order`` entry is passed to ``IO`` next to the
    pipeline constructed from the same description.
    """
    description_path = f'tests/input_execution_order/{request.param}'
    with open(description_path, 'r') as fp:
        spec = json.load(fp)

    built = Pipeline.from_json(spec)
    return IO(built, spec['correct_execution_order'])
Exemple #5
0
def run_pipeline(
    self,
    pipeline_definition: PipelineDefinition,
    project_uuid: str,
    run_config: Dict[str, Union[str, Dict[str, str]]],
    task_id: Optional[str] = None,
) -> str:
    """Run (part of) a pipeline as described by its definition.

    The partial run is defined by the pipeline definition. The
    call-order of the steps is always preserved, e.g. for a --> b step
    a will always be run before b.

    Args:
        pipeline_definition: a json description of the pipeline. Its
            ``uuid`` entry is copied into ``run_config``.
        project_uuid: UUID of the project, copied into ``run_config``.
        run_config: configuration of the run for the compute backend.
            Mutated in place: ``pipeline_uuid`` and ``project_uuid``
            are added.
            Example: {
                'run_endpoint': 'runs',
                'project_dir': '/home/../pipelines/uuid',
                'env_uuid_docker_id_mappings': {
                    'b6527b0b-bfcc-4aff-91d1-37f9dfd5d8e8':
                        'sha256:61f82126945bb25dd85d6a5b122a1815df1c0c5f91621089cde0938be4f698d4'
                }
            }
        task_id: id to run under. When ``None`` the id of the current
            request (``self.request.id``) is used.

    Returns:
        The result of ``run_pipeline_async``, documented as the status
        of the pipeline run: "FAILURE" or "SUCCESS".

    """
    run_config.update(
        {
            "pipeline_uuid": pipeline_definition["uuid"],
            "project_uuid": project_uuid,
        }
    )

    # Build the pipeline object that is to be run.
    graph = Pipeline.from_json(pipeline_definition)

    # TODO: the task_id is probably no longer needed. It was added for
    #       the scheduled runs, which are not used anymore.
    # TODO: once sessions can be introduced, use:
    # session = run_pipeline.session
    if task_id is None:
        task_id = self.request.id

    # TODO: the celery task could be made to fail when the pipeline run
    # failed, even though from a task scheduler perspective the run did
    # complete successfully.
    # https://stackoverflow.com/questions/7672327/how-to-make-a-celery-task-fail-from-within-the-task
    # Run the subgraph in parallel, passing along the id of the
    # AsyncResult object.
    run_coro = run_pipeline_async(run_config, graph, task_id)
    return asyncio.run(run_coro)
Exemple #6
0
    def _collateral(
        self,
        project_uuid: str,
        task_id: str,
        pipeline: Pipeline,
        run_config: Dict[str, Any],
        env_variables: Dict[str, Any],
        env_uuid_to_image: Dict[str, str],
        **kwargs,
    ):
        """Complete ``run_config`` and dispatch the run to Celery.

        ``run_config`` is mutated in place: ``env_uuid_to_image``,
        ``user_env_variables``, ``session_uuid`` and ``session_type``
        are added. The session uuid is the first 18 characters of the
        project uuid followed by the first 18 characters of the
        pipeline uuid. ``**kwargs`` is accepted but not used.
        """
        # The Celery object needs the Flask context.
        celery_app = make_celery(current_app)

        run_config["env_uuid_to_image"] = env_uuid_to_image
        run_config["user_env_variables"] = env_variables
        project_part = project_uuid[:18]
        pipeline_part = pipeline.properties["uuid"][:18]
        run_config["session_uuid"] = project_part + pipeline_part
        run_config["session_type"] = "interactive"

        job_kwargs = dict(
            pipeline_definition=pipeline.to_dict(),
            run_config=run_config,
            session_uuid=run_config["session_uuid"],
        )

        # The task is sent by name, since importing the function
        # directly would lead to circular imports. It runs as a
        # background task on Celery.
        async_result = celery_app.send_task(
            "app.core.tasks.run_pipeline",
            kwargs=job_kwargs,
            task_id=task_id,
        )

        # NOTE: only relevant if a backend is configured. Forgetting
        # the result lets the Celery backend release the resources (for
        # storing and transmitting results) associated to the task.
        async_result.forget()
Exemple #7
0
def run_pipeline(
    self,
    pipeline_definition: PipelineDefinition,
    run_config: RunConfig,
    session_uuid: str,
    task_id: Optional[str] = None,
) -> str:
    """Run (part of) a pipeline as described by its definition.

    The partial run is defined by the pipeline definition. The
    call-order of the steps is always preserved, e.g. for a --> b step
    a will always be run before b.

    Args:
        pipeline_definition: a json description of the pipeline.
        run_config: configuration of the run for the compute backend.
        session_uuid: forwarded as first argument to
            ``run_pipeline_async``.
        task_id: id to run under. When ``None`` the id of the current
            request (``self.request.id``) is used.

    Returns:
        The result of ``run_pipeline_async``, documented as the status
        of the pipeline run: "FAILURE" or "SUCCESS".

    """
    # Build the pipeline object that is to be run.
    graph = Pipeline.from_json(pipeline_definition)

    # TODO: the task_id is probably no longer needed. It was added for
    #       the scheduled runs, which are not used anymore.
    # TODO: once sessions can be introduced, use:
    # session = run_pipeline.session
    if task_id is None:
        task_id = self.request.id

    # TODO: the celery task could be made to fail when the pipeline run
    # failed, even though from a task scheduler perspective the run did
    # complete successfully.
    # https://stackoverflow.com/questions/7672327/how-to-make-a-celery-task-fail-from-within-the-task
    # Run the subgraph in parallel, passing along the id of the
    # AsyncResult object.
    run_coro = run_pipeline_async(session_uuid, run_config, graph, task_id)
    return asyncio.run(run_coro)
Exemple #8
0
def pipeline():
    """Return the pipeline described by the operations test input.

    The description is read from
    ``tests/input_operations/pipeline.json``.
    """
    with open('tests/input_operations/pipeline.json', 'r') as fp:
        spec = json.load(fp)

    return Pipeline.from_json(spec)
Exemple #9
0
def test_serialization():
    """Check that ``to_dict`` reproduces the description it was built from."""
    with open("tests/input_operations/pipeline.json", "r") as fp:
        expected = json.load(fp)

    roundtripped = Pipeline.from_json(expected).to_dict()
    assert roundtripped == expected
Exemple #10
0
    def _transaction(
        self,
        project_uuid: str,
        run_config: Dict[str, Any],
        pipeline: Pipeline,
    ):
        """Register a pending interactive run and prepare its collateral.

        Adds an ``InteractivePipelineRun`` and one ``PipelineRunStep``
        per pipeline step to the db session, all with status
        ``"PENDING"``, locks the environment images for the run and
        stores everything the collateral phase needs in
        ``self.collateral_kwargs``.

        Args:
            project_uuid: UUID of the project the run belongs to.
            run_config: configuration of the run, passed on unchanged
                through ``self.collateral_kwargs``.
            pipeline: pipeline to run.

        Returns:
            Dict describing the run, with the keys ``uuid``,
            ``pipeline_uuid``, ``project_uuid``, ``status`` and
            ``pipeline_steps``.

        Raises:
            self_errors.PipelineDefinitionNotValid: if locking the
                environment images raised this error. Re-raised with a
                user facing message and chained to the original error.

        """
        # specify the task_id beforehand to avoid race conditions
        # between the task and its presence in the db
        task_id = str(uuid.uuid4())

        # NOTE: we are setting the status of the run ourselves without
        # using the option of celery to get the status of tasks. This
        # way we do not have to configure a backend (where the default
        # of "rpc://" does not give the results we would want).
        run = {
            "uuid": task_id,
            "pipeline_uuid": pipeline.properties["uuid"],
            "project_uuid": project_uuid,
            "status": "PENDING",
        }
        db.session.add(models.InteractivePipelineRun(**run))
        # need to flush because otherwise the bulk insertion of pipeline
        # steps will lead to foreign key errors
        # https://docs.sqlalchemy.org/en/13/orm/persistence_techniques.html#bulk-operations-caveats
        db.session.flush()

        # Set an initial value for the status of the pipeline steps that
        # will be run.
        pipeline_steps = [
            models.PipelineRunStep(
                run_uuid=task_id,
                step_uuid=step.properties["uuid"],
                status="PENDING",
            )
            for step in pipeline.steps
        ]
        db.session.bulk_save_objects(pipeline_steps)
        run["pipeline_steps"] = pipeline_steps

        try:
            env_uuid_to_image = environments.lock_environment_images_for_run(
                task_id,
                project_uuid,
                pipeline.get_environments(),
            )
        except self_errors.PipelineDefinitionNotValid as e:
            msg = "Please make sure every pipeline step is assigned an environment."
            # Chain explicitly so the original error stays attached as
            # the cause of the user facing one.
            raise self_errors.PipelineDefinitionNotValid(msg) from e

        # Hand over everything the collateral phase needs.
        self.collateral_kwargs["project_uuid"] = project_uuid
        self.collateral_kwargs["task_id"] = task_id
        self.collateral_kwargs["pipeline"] = pipeline
        self.collateral_kwargs["run_config"] = run_config
        self.collateral_kwargs["env_variables"] = get_proj_pip_env_variables(
            project_uuid, pipeline.properties["uuid"]
        )
        self.collateral_kwargs["env_uuid_to_image"] = env_uuid_to_image
        return run