Example #1
def test_manager_prepare(basedir, filenames_all, data_a, tmpdir):
    """Test the volume manager prepare method."""
    # -- Setup ----------------------------------------------------------------
    s1_dir = os.path.join(tmpdir, 's1')
    s0 = FileSystemStorage(basedir=basedir, identifier=DEFAULT_STORE)
    s1 = FileSystemStorage(basedir=s1_dir, identifier='s1')
    volumes = VolumeManager(
        stores=[s0.to_dict(), s1.to_dict()],
        files={f: [DEFAULT_STORE] for f in filenames_all}
    )
    # Case 1: Empty arguments.
    volumes.prepare(store=s0, inputs=[], outputs=[])
    # Case 2: No file copy.
    volumes.prepare(store=s0, inputs=['examples/'], outputs=['examples/'])
    assert len(os.listdir(basedir)) == 3
    assert len(os.listdir(s1_dir)) == 0
    for f in filenames_all:
        assert volumes.files[f] == [DEFAULT_STORE]
    # Case 3: Copy file between stores.
    volumes.prepare(store=s1, inputs=['A.json', 'docs/'], outputs=['results/A.json', 'docs/'])
    assert len(os.listdir(basedir)) == 3
    assert len(os.listdir(s1_dir)) == 3
    filename = os.path.join(s1_dir, 'A.json')
    assert os.path.isfile(filename)
    with s1.load('A.json').open() as f:
        assert json.load(f) == data_a
    assert volumes.files == {
        'docs/D.json': [DEFAULT_STORE, 's1'],
        'examples/data/data.json': [DEFAULT_STORE],
        'examples/C.json': [DEFAULT_STORE],
        'A.json': [DEFAULT_STORE, 's1'],
        'examples/B.json': [DEFAULT_STORE]
    }
Example #2
def volume_manager(specs: List[Dict], runstore: StorageVolume,
                   runfiles: List[str]) -> VolumeManager:
    """Create an instance of the storage volume manager for a workflow run.

    Combines the volume store specifications in the workflow run configuration
    with the storage volume for the workflow run files.

    Parameters
    ----------
    specs: list of dict
        List of specifications (dictionary serializations) for storage volumes.
    runstore: flowserv.volume.base.StorageVolume
        Storage volume for run files.
    runfiles: list of string
        List of files that have been copied to the run store.

    Returns
    -------
    flowserv.volume.manager.VolumeManager
    """
    stores = [runstore.to_dict()]
    files = defaultdict(list)
    for f in runfiles:
        files[f].append(DEFAULT_STORE)
    for doc in specs:
        # Ignore stores that match the identifier of the runstore to avoid
        # overriding the run store information.
        if doc['id'] == runstore.identifier:
            continue
        stores.append(doc)
        for f in doc.get('files', []):
            files[f].append(doc['id'])
    return VolumeManager(stores=stores, files=files)
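
A minimal usage sketch for volume_manager, built only from calls that appear in the surrounding examples (FileSystemStorage, to_dict, DEFAULT_STORE); the base directories, the 's1' identifier, and the file names are illustrative assumptions:

runstore = FileSystemStorage(basedir='/tmp/run', identifier=DEFAULT_STORE)
s1 = FileSystemStorage(basedir='/tmp/s1', identifier='s1')
# A volume spec may list the files it already holds; volume_manager reads
# this optional 'files' key via doc.get('files', []).
s1_spec = dict(s1.to_dict(), files=['data/input.json'])
volumes = volume_manager(
    specs=[s1_spec],
    runstore=runstore,
    runfiles=['code/helper.py']
)
# Expected mapping: 'code/helper.py' -> [DEFAULT_STORE], 'data/input.json' -> ['s1'].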
Example #3
def test_manager_update(tmpdir):
    """Test the update method for the volume manager."""
    volumes = VolumeManager(
        stores=[
            FStore(basedir=tmpdir, identifier=DEFAULT_STORE),
            FStore(basedir=tmpdir, identifier='s1')
        ],
        files={'f1': [DEFAULT_STORE]}
    )
    default_store = volumes.get(identifier=DEFAULT_STORE)
    s1 = volumes.get(identifier='s1')
    assert volumes.files == {'f1': [DEFAULT_STORE]}
    volumes.update(files=['f1', 'f2'], store=s1)
    assert volumes.files == {'f1': ['s1'], 'f2': ['s1']}
    volumes.update(files=['f2'], store=default_store)
    assert volumes.files == {'f1': ['s1'], 'f2': [DEFAULT_STORE]}
    volumes.update(files=['f2'], store=s1)
    assert volumes.files == {'f1': ['s1'], 'f2': ['s1']}
Example #4
def exec_workflow(steps: List[WorkflowStep], workers: WorkerPool,
                  volumes: VolumeManager, result: RunResult) -> RunResult:
    """Execute steps in a serial workflow.

    The workflow arguments are part of the execution context that is contained
    in the :class:`flowserv.controller.serial.workflow.result.RunResult`. The
    result object is used to maintain the results for executed workflow steps.

    Executes workflow steps in sequence. Terminates early if the execution
    of a workflow step returns a non-zero value. Uses the given worker
    factory to create workers for steps that are of class
    :class:`flowserv.model.workflow.step.ContainerStep`.

    Parameters
    ----------
    steps: list of flowserv.model.workflow.step.WorkflowStep
        Steps in the serial workflow that are executed in the given context.
    workers: flowserv.controller.worker.manager.WorkerPool
        Factory for :class:`flowserv.model.workflow.step.ContainerStep` steps.
    volumes: flowserv.volume.manager.VolumeManager
        Manager for storage volumes that are used by the different workers.
    result: flowserv.controller.serial.workflow.result.RunResult
        Collector for results from executed workflow steps. Contains the context
        within which the workflow is executed.

    Returns
    -------
    flowserv.controller.serial.workflow.result.RunResult
    """
    for step in steps:
        # Get the worker that is responsible for executing the workflow step.
        worker = workers.get(step)
        # Prepare the volume store that is associated with the worker.
        store = volumes.get(worker.volume)
        volumes.prepare(store=store, inputs=step.inputs, outputs=step.outputs)
        # Execute the workflow step and add the result to the overall workflow
        # result. Terminate if the step execution was not successful.
        r = worker.exec(step=step, context=result.context, store=store)
        result.add(r)
        if r.returncode != 0:
            break
        # Update volume manager with output files for the workflow step.
        volumes.update(store=store, files=step.outputs)
    return result
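
A short, hedged sketch of how a caller might inspect the RunResult returned by exec_workflow; the steps, workers, volumes, and arguments objects are assumed to be set up as in Example #8 below, and only the returncode and log attributes that appear elsewhere in these examples are relied on:

result = exec_workflow(steps=steps, workers=workers, volumes=volumes,
                       result=RunResult(arguments=arguments))
if result.returncode != 0:
    # The log contains the STDOUT/STDERR lines collected from the executed steps.
    for line in result.log:
        print(line)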
Example #5
def test_manager_init(tmpdir):
    """Test edge cases for the volume manager initialization."""
    default_store = FStore(basedir=tmpdir, identifier=DEFAULT_STORE)
    # Ensure we can instantiate the volume manager if a default store is given.
    volumes = VolumeManager(stores=[default_store])
    assert volumes.files == dict()
    volumes = VolumeManager(stores=[default_store], files={'f1': [DEFAULT_STORE]})
    assert volumes.files == {'f1': [DEFAULT_STORE]}
    # Error cases when no default store is given.
    with pytest.raises(ValueError):
        VolumeManager(stores=list())
    with pytest.raises(ValueError):
        VolumeManager(stores=[FStore(basedir=tmpdir, identifier='0000')])
    # Error for unknown storage volume.
    with pytest.raises(err.UnknownObjectError):
        VolumeManager(
            stores=[default_store],
            files={'f1': ['unknown']}
        )
Example #6
def run_workflow(run_id: str, state: WorkflowState, output_files: List[str],
                 steps: List[ContainerStep], arguments: Dict,
                 volumes: VolumeManager,
                 workers: WorkerPool) -> Tuple[str, str, Dict]:
    """Execute a list of workflow steps synchronously.

    This is the worker function for asynchronous workflow executions. Returns a
    tuple containing the run identifier, the folder with the run files, and a
    serialization of the workflow state.

    Parameters
    ----------
    run_id: string
        Unique run identifier.
    state: flowserv.model.workflow.state.WorkflowState
        Current workflow state (to access the timestamps).
    output_files: list of string
        Relative path of output files that are generated by the workflow run.
    steps: list of flowserv.model.workflow.step.WorkflowStep
        Steps in the serial workflow that are executed in the given context.
    arguments: dict
        Dictionary of argument values for parameters in the template.
    volumes: flowserv.volume.manager.VolumeManager
        Manager for storage volumes that are used by the workflow workers.
    workers: flowserv.controller.worker.manager.WorkerPool
        Factory for :class:`flowserv.model.workflow.step.ContainerStep` steps.

    Returns
    -------
    (string, string, dict)
    """
    logging.info('start run {}'.format(run_id))
    runstore = volumes.get(DEFAULT_STORE)
    try:
        run_result = exec_workflow(steps=steps,
                                   workers=workers,
                                   volumes=volumes,
                                   result=RunResult(arguments=arguments))
        if run_result.returncode != 0:
            # Return error state. Include STDERR in result
            messages = run_result.log
            result_state = state.error(messages=messages)
            doc = serialize.serialize_state(result_state)
            return run_id, runstore.to_dict(), doc
        # Workflow executed successfully
        result_state = state.success(files=output_files)
    except Exception as ex:
        logging.error(ex, exc_info=True)
        strace = util.stacktrace(ex)
        logging.debug('\n'.join(strace))
        result_state = state.error(messages=strace)
    logging.info('finished run {}: {}'.format(run_id, result_state.type_id))
    return run_id, runstore.to_dict(), serialize.serialize_state(result_state)
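
A hedged sketch of how the tuple returned by this worker function might be consumed by the calling controller; the callback name is an assumption, while the tuple layout (run identifier, run store serialization, state serialization) is taken from the return statements above:

def on_run_finished(result):
    # result is the (run_id, runstore_dict, state_dict) tuple returned by run_workflow.
    run_id, runstore_doc, state_doc = result
    logging.info('run {} finished with state {}'.format(run_id, state_doc))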
Example #7
def run_workflow(workflow: SerialWorkflow,
                 arguments: Dict,
                 df: pd.DataFrame,
                 worker: Optional[Dict] = None,
                 volume: Optional[Dict] = None,
                 managers: Optional[Dict] = None,
                 verbose: Optional[bool] = True) -> RunResult:
    """Run a given workflow representing a Metanome profiling algorithm on the
    given data frame.

    Returns the run result. If execution of the Metanome algorithm fails a
    RuntimeError will be raised.

    This implementation assumes that all algorithms operate on a single input
    file that contains a serialization of the data frame and that they all
    produce a single output file in JSON format.

    Parameters
    ----------
    workflow: flowserv.controller.serial.workflow.base.SerialWorkflow
        Serial workflow to run a Metanome profiling algorithm on a given data
        frame.
    arguments: dict
        Dictionary of algorithm-specific input arguments.
    df: pd.DataFrame
        Input data frame.
    worker: dict, default=None
        Optional configuration for the main worker.
    volume: dict, default=None
        Optional configuration for the volume that is associated with the main
        worker.
    managers: dict, default=None
        Mapping of workflow step identifiers to the workers that are used to
        execute them.
    verbose: bool, default=True
        Output run logs if True.

    Returns
    -------
    flowserv.controller.serial.workflow.result.RunResult
    """
    # Create a temporary run directory for input and output files.
    rundir = tempfile.mkdtemp()
    # Create a subfolder for input and output files. This is important when
    # running the workflow in a Docker container since these folders will
    # be mounted automatically as volumes into the container to provide
    # access to the files.
    os.makedirs(os.path.join(rundir, 'data'))
    # Create a copy of the workflow-specific arguments and add the data frame
    # and the input and output files.
    args = dict(arguments)
    args['df'] = df
    args['inputfile'] = DATA_FILE
    args['outputfile'] = RESULT_FILE
    # Create factory objects for storage volumes.
    stores = [FStore(basedir=rundir, identifier=DEFAULT_STORE)]
    if volume:
        stores.append(volume)
    volumes = VolumeManager(stores=stores, files=dict())
    # Create factory for workers. Include mapping of workflow steps to
    # the workers that are responsible for their execution.
    workers = WorkerPool(workers=[worker] if worker else [], managers=managers)
    # Run the workflow and return the result. Make sure to clean up the temporary
    # run folder. This assumes that the workflow steps have read any output
    # file into main memory or copied it to a target destination.
    try:
        r = workflow.run(arguments=args, workers=workers, volumes=volumes)
        # Output STDOUT and STDERR before raising a potential error.
        if verbose:
            for line in r.log:
                print(line)
        # Raise error if run execution was not successful.
        r.raise_for_status()
        return r
    finally:
        # Remove the created run directory.
        shutil.rmtree(rundir)
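
A hypothetical usage sketch for the wrapper above; build_workflow stands in for whatever code assembles the SerialWorkflow and is not part of this example, and the data frame contents and the empty argument dictionary are likewise assumptions:

import pandas as pd

df = pd.DataFrame({'A': [1, 1, 2], 'B': ['x', 'y', 'y']})
# build_workflow() is a placeholder for the application-specific code that
# creates the SerialWorkflow for a Metanome algorithm.
workflow = build_workflow()
r = run_workflow(workflow=workflow, arguments=dict(), df=df, verbose=False)
print(r.context)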
Example #8
def test_run_with_two_steps(tmpdir):
    """Test executing a sequence of two code steps that operate on the same
    file in different storage volumes.
    """
    # -- Setup ----------------------------------------------------------------
    # Create two separate storage volumes.
    vol1_dir = os.path.join(tmpdir, 'v1')
    os.makedirs(vol1_dir)
    vol2_dir = os.path.join(tmpdir, 'v2')
    volumes = VolumeManager(
        stores=[
            FStore(basedir=vol1_dir, identifier=DEFAULT_STORE),
            FStore(basedir=vol2_dir, identifier='v2')
        ],
        files={'data.json': [DEFAULT_STORE]}
    )
    # Create data.json file in v1.
    with open(os.path.join(vol1_dir, 'data.json'), 'w') as f:
        json.dump({"value": 5}, f)
    # Use separate workers for each step.
    workers = WorkerPool(
        workers=[
            Code(identifier='w1', volume=DEFAULT_STORE),
            Code(identifier='w2', volume='v2')
        ],
        managers={'s1': 'w1', 's2': 'w2'}
    )
    # Create workflow steps.
    steps = [
        CodeStep(identifier='s1',
                 func=multi_by_x,
                 arg='s1',
                 varnames={'x': 'x1'},
                 inputs=['data.json']),
        CodeStep(identifier='s2',
                 func=multi_by_x,
                 arg='s2',
                 varnames={'x': 'x2'},
                 inputs=['data.json'])
    ]
    # Initialize the workflow context arguments.
    arguments = {'filename': 'data.json', 'x1': 2, 'x2': 3}
    # -- Test workflow run ----------------------------------------------------
    run_result = exec_workflow(steps=steps,
                               workers=workers,
                               volumes=volumes,
                               result=RunResult(arguments=arguments))
    assert len(run_result.steps) == 2
    assert run_result.context == {
        'filename': 'data.json',
        'x1': 2,
        'x2': 3,
        's1': 10,
        's2': 15
    }
    assert os.path.isfile(os.path.join(vol2_dir, 'data.json'))
    # Error case.
    os.unlink(os.path.join(vol1_dir, 'data.json'))
    run_result = exec_workflow(steps=steps,
                               workers=workers,
                               volumes=volumes,
                               result=RunResult(arguments=arguments))
    assert len(run_result.steps) == 1
    assert run_result.context == {'filename': 'data.json', 'x1': 2, 'x2': 3}
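
The helper multi_by_x is defined elsewhere in the test module. The following sketch is a plausible reconstruction, inferred only from the expected context values above (5 * 2 = 10 and 5 * 3 = 15) and from the varnames mapping; the exact signature and file handling in the original helper are assumptions:

import json

def multi_by_x(filename, x):
    # Read the JSON document from the step's storage volume and multiply
    # its 'value' entry by x; the result is captured in the run context.
    with open(filename, 'r') as f:
        doc = json.load(f)
    return doc['value'] * x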