def create_workflow(
    self, run: RunObject, template: WorkflowTemplate, arguments: Dict,
    staticfs: StorageVolume
) -> RemoteWorkflowHandle:
    """Create a handle for a new remote workflow instance.

    The template and the arguments are part of the interface but are not
    read by this implementation. The returned handle uses the run identifier
    as the workflow identifier as well.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for the run that is being executed.
    template: flowserv.model.template.base.WorkflowTemplate
        Workflow template containing the parameterized specification and
        the parameter declarations.
    arguments: dict
        Dictionary of argument values for parameters in the template.
    staticfs: flowserv.volume.base.StorageVolume
        Storage volume that contains the static files from the workflow
        template. The run store is the sub-folder ``runs/<run_id>`` of this
        volume.

    Returns
    -------
    flowserv.controller.remote.client.RemoteWorkflowHandle
    """
    run_id = run.run_id
    state = self.state
    # Files for the run are kept in a run-specific sub-folder.
    runstore = staticfs.get_store_for_folder(util.join('runs', run_id))
    return RemoteWorkflowHandle(
        run_id=run_id,
        workflow_id=run_id,
        state=state,
        output_files=[],
        runstore=runstore,
        client=self
    )
def walk(self, src: str) -> List[Tuple[str, IOHandle]]:
    """List all files at the given source path on the remote server.

    A source path that references a single file yields a list with exactly
    one entry. A source path that references a folder yields one entry for
    each file that the client reports for that folder.

    Parameters
    ----------
    src: str
        Source path specifying a file or folder.

    Returns
    -------
    list of tuples (str, flowserv.volume.base.IOHandle)
    """
    sep = self.client.sep

    def remote_path(key: str) -> str:
        # Path on the remote server for a key relative to the base folder.
        return sep.join([self.remotedir, util.filepath(key=key, sep=sep)])

    relpath = util.filepath(key=src, sep=sep)
    # An empty source path refers to the remote base directory itself.
    dirpath = sep.join([self.remotedir, relpath]) if relpath else self.remotedir
    found = self.client.walk(dirpath=dirpath)
    if found is None:
        # The source path references a single file.
        handle = SFTPFile(filename=remote_path(src), client=self.client)
        return [(src, handle)]
    # The source path references a directory.
    entries = list()
    for name in found:
        key = util.join(src, name) if src else name
        handle = SFTPFile(filename=remote_path(key), client=self.client)
        entries.append((key, handle))
    return entries
def walk(self, src: str) -> List[Tuple[str, IOHandle]]:
    """List all files at the given source path.

    The source path is turned into a query filter below the volume prefix
    and the query result is returned as is.

    Parameters
    ----------
    src: str
        Source path specifying a file or folder.

    Returns
    -------
    list of tuples (str, flowserv.volume.base.IOHandle)
    """
    # A non-empty source path must end with a path separator.
    if not src:
        folder = ''
    elif src[-1] == '/':
        folder = src
    else:
        folder = '{}/'.format(src)
    query_filter = util.join(self.prefix, folder)
    return self.query(filter=query_filter)
def success_run(database: DB, fs: StorageVolume, basedir: str) -> Tuple[str, str, str, str]:
    """Create a successful run with two result files:

        - A.json
        - results/B.json

    Returns the identifier of the created workflow, group, run, and user.
    """
    # Write the two result files into a temporary run folder.
    tmp_store = FileSystemStorage(basedir=os.path.join(basedir, 'tmprun'))
    documents = [
        ({'A': 1}, 'A.json'),
        ({'B': 1}, util.join('results', 'B.json')),
    ]
    for doc, dst in documents:
        tmp_store.store(file=io_file(doc), dst=dst)
    with database.session() as session:
        user_id = create_user(session, active=True)
        workflow_id = create_workflow(session)
        group_id = create_group(session, workflow_id, users=[user_id])
        group_manager = WorkflowGroupManager(session=session, fs=fs)
        run_manager = RunManager(session=session, fs=fs)
        group = group_manager.get_group(group_id)
        run = run_manager.create_run(group=group)
        run_id = run.run_id
        # Move the run from its initial state to running and then success.
        finished = run.state().start().success(
            files=['A.json', 'results/B.json']
        )
        run_manager.update_run(
            run_id=run_id,
            state=finished,
            runstore=tmp_store
        )
    return workflow_id, group_id, run_id, user_id
def run_tmpdir() -> str:
    """Get the path to a new temporary workflow run directory.

    The path is the folder name ``tmp`` joined with a new unique identifier.

    Returns
    -------
    string
    """
    unique_name = util.get_unique_identifier()
    return util.join('tmp', unique_name)
def copy_files(
    src: Union[str, List[str]], source: StorageVolume, dst: str,
    target: StorageVolume, verbose: Optional[bool] = False
) -> List[str]:
    """Copy files and folders from a source storage volume to a destination
    path on a target storage volume.

    Returns the list of destination paths for the files that were copied.

    Parameters
    ----------
    src: str or list of string
        Path specifying the source file(s) or folder(s).
    source: flowserv.volume.base.StorageValue
        Storage volume for source files.
    dst: string
        Destination path for copied files.
    target: flowserv.volume.base.StorageValue
        Storage volume for destination files.
    verbose: bool, default=False
        Print information about source and target volume and the files that
        are being copied.

    Returns
    -------
    list of string
    """
    if verbose:
        print('Copy files from {} to {}'.format(source.describe(), target.describe()))
    paths = src if isinstance(src, list) else [src]
    copied = list()
    for path in paths:
        listing = source.walk(src=path)
        # A single entry whose key equals the path means that the path
        # references a file. The file is stored at the destination path, or
        # at its own path if no destination is given.
        if len(listing) == 1 and listing[0][0] == path:
            handle = listing[0][1]
            dstpath = path if dst is None else dst
            copied.append(dstpath)
            target.store(file=handle, dst=dstpath)
            if verbose:
                print('copied {} to {}'.format(path, dstpath))
            continue
        # Otherwise the path references a directory. The leading
        # '<path>/' is cut from every key before the destination path is
        # prepended.
        for key, handle in listing:
            relkey = key[len(path) + 1:] if path else key
            dstpath = util.join(dst, relkey) if dst else relkey
            copied.append(dstpath)
            target.store(file=handle, dst=dstpath)
            if verbose:
                print('copied {} to {}'.format(relkey, dstpath))
    return copied
def test_fs_volume_walk(basedir, filenames_all):
    """Test listing files in a directory."""
    store = FileSystemStorage(basedir=basedir)

    def keys_at(src):
        # Set of keys that the store lists for a source path.
        return {key for key, _ in store.walk(src=src)}

    # -- Full directory.
    assert keys_at('') == filenames_all
    # -- Sub-directory.
    expected = {'examples/B.json', 'examples/C.json', 'examples/data/data.json'}
    assert keys_at('examples') == expected
    assert keys_at(util.join('examples', 'data')) == {'examples/data/data.json'}
    # -- Single file.
    assert keys_at(util.join('docs', 'D.json')) == {'docs/D.json'}
    # -- Unknown file or directory.
    assert store.walk(src=util.join('docs', 'E.json')) == []
def delete(self, key: str):
    """Delete the file or folder with the given key.

    All objects that the query returns for the key below the volume prefix
    are passed to ``delete_objects``.

    Parameters
    ----------
    key: str
        Path to a file object in the storage volume.
    """
    query_filter = util.join(self.prefix, key)
    matches = self.query(filter=query_filter)
    self.delete_objects(keys=matches)
def delete(self, key: str):
    """Delete the file or folder with the given key.

    All objects that the query returns for the key below the volume prefix
    are deleted from the bucket. Nothing is sent to the bucket if the query
    matches no object.

    Parameters
    ----------
    key: str
        Path to a file object in the storage volume.
    """
    # The S3 delete operation rejects requests with an empty object list and
    # accepts at most 1000 keys per request. Skip the call for empty results
    # and split large results into several requests.
    max_batch_size = 1000
    keys = list(self.query(filter=util.join(self.prefix, key)))
    for start in range(0, len(keys), max_batch_size):
        batch = keys[start:start + max_batch_size]
        self.bucket.delete_objects(
            Delete={'Objects': [{'Key': k} for k in batch]}
        )
def store(self, file: IOHandle, dst: str):
    """Upload a file object to the destination path of this volume store.

    The object key is the destination path below the volume prefix.

    Parameters
    ----------
    file: flowserv.volume.base.IOHandle
        File-like object that is being stored.
    dst: str
        Destination path for the stored object.
    """
    upload = self.bucket.upload_fileobj
    # NOTE(review): the opened stream is not closed here; the handle may own
    # the underlying buffer - confirm before adding a close().
    stream = file.open()
    object_key = util.join(self.prefix, dst)
    upload(stream, object_key)
def prepare(self, store: StorageVolume, inputs: List[str], outputs: List[str]):
    """Prepare a storage volume for a worker.

    Copies every required input file that is not yet available at the given
    volume store and creates the folders for the expected output files.

    NOTE(review): no explicit check for missing input files is visible in
    this method. An input that matches no known file is skipped.

    Parameters
    ----------
    store: flowserv.volume.base.StorageVolume
        Storage volume that is being prepared.
    inputs: list of string
        Relative path (keys) of required input files for a workflow step.
    outputs: list of string
        Relative path (keys) of created output files by a workflow step.
    """
    # Map each known file that matches one of the inputs to the list of
    # volumes that hold the file. This is a nested scan over the inputs and
    # the known files, which assumes that neither collection is very large.
    required = dict()
    for query in inputs:
        # A trailing '/' references a directory (prefix match). Any other
        # input has to match the file name exactly.
        matches = prefix_match if query.endswith('/') else exact_match
        for name, holders in self.files.items():
            if name not in required and matches(name, query):
                required[name] = holders
    # Upload the required files that the target volume does not have yet.
    target_id = store.identifier
    for name, holders in required.items():
        if target_id in holders:
            continue
        # Copy from the first volume that holds the file and record the
        # target volume for every key that was copied.
        origin = self.get(holders[0])
        for key in origin.copy(src=name, store=store):
            self.files[key].append(store.identifier)
    # Create the parent folders for the output files. An output that ends
    # with '/' is a folder itself.
    folders = {
        out if out.endswith('/') else util.join(*out.split('/')[:-1])
        for out in outputs
    }
    for folder in folders:
        store.mkdir(path=folder)
def upload_file(self, group_id: str, file: IOHandle, name: str):
    """Upload a new file for a workflow group.

    Stores the given file in the file store under the upload folder of the
    group, using a new unique identifier as the file key, and adds a record
    for the file to the uploads of the group. The file name is validated
    before the file is stored.

    Parameters
    ----------
    group_id: string
        Unique group identifier
    file: flowserv.volume.base.IOHandle
        File object (e.g., uploaded via HTTP request)
    name: string
        Name of the file

    Returns
    -------
    flowserv.model.base.UploadFile

    Raises
    ------
    flowserv.error.ConstraintViolationError
    flowserv.error.UnknownWorkflowGroupError
    """
    # Fetching the group first ensures that the group exists.
    group = self.get_group(group_id)
    constraint.validate_name(name)
    # The file is stored under a new unique identifier inside the upload
    # folder of the group.
    file_id = util.get_unique_identifier()
    uploaddir = dirs.group_uploaddir(
        workflow_id=group.workflow_id,
        group_id=group.group_id
    )
    size = file.size()
    # Guess the Mime type from the file name.
    mime_type = mimetypes.guess_type(url=name)[0]
    key = util.join(uploaddir, file_id)
    self.fs.store(file=file, dst=key)
    # Record the upload for the group and return the new record.
    upload = UploadFile(
        file_id=file_id,
        created_at=util.utc_now(),
        key=key,
        name=name,
        mime_type=mime_type,
        size=size
    )
    group.uploads.append(upload)
    return upload
def workflow_staticdir(workflow_id: str) -> str:
    """Get the folder that contains the static files of a workflow template.

    The folder is the sub-folder ``static`` of the workflow base directory.

    Parameters
    ----------
    workflow_id: string
        Unique workflow identifier

    Returns
    -------
    string
    """
    basedir = workflow_basedir(workflow_id)
    return util.join(basedir, 'static')
def load(self, key: str) -> IOHandle:
    """Get a handle for the file object at the given path.

    The object key is the given path below the volume prefix.

    Parameters
    ----------
    key: str
        Path to a file object in the storage volume.

    Returns
    --------
    flowserv.volume.base.IOHandle
    """
    object_key = util.join(self.prefix, key)
    return S3File(key=object_key, bucket=self.bucket)
def group_uploaddir(workflow_id: str, group_id: str) -> str:
    """Get the folder for files that are uploaded to a workflow group.

    The folder is the sub-folder ``files`` of the group directory.

    Parameters
    ----------
    workflow_id: string
        Unique workflow identifier
    group_id: string
        Unique workflow group identifier

    Returns
    -------
    string
    """
    return util.join(workflow_groupdir(workflow_id, group_id), 'files')
def run_basedir(workflow_id: str, run_id: str) -> str:
    """Get the base folder for all files that belong to a workflow run.

    The folder is ``runs/<run_id>`` below the workflow base directory.

    Parameters
    ----------
    workflow_id: string
        Unique workflow identifier
    run_id: string
        Unique run identifier

    Returns
    -------
    string
    """
    return util.join(workflow_basedir(workflow_id), 'runs', run_id)
def workflow_groupdir(workflow_id: str, group_id: str) -> str:
    """Get the base folder for files that belong to a workflow group.

    The folder is ``groups/<group_id>`` below the workflow base directory.

    Parameters
    ----------
    workflow_id: string
        Unique workflow identifier
    group_id: string
        Unique workflow group identifier

    Returns
    -------
    string
    """
    return util.join(workflow_basedir(workflow_id), 'groups', group_id)
def prepare_postproc_data(
    input_files: List[str], ranking: List[RunResult],
    run_manager: RunManager, store: StorageVolume
):
    """Create the input files for a post-processing run.

    For each run in the ranking, the listed result files are copied to the
    storage volume into a folder that is named after the run identifier. A
    listing of all runs with their group name and file list is stored as an
    additional file at the path given by ``RUNS_FILE``.

    Parameters
    ----------
    input_files: list(string)
        List of identifier for benchmark run output files that are copied
        into the input directory for each submission.
    ranking: list(flowserv.model.ranking.RunResult)
        List of runs in the current result ranking
    run_manager: flowserv.model.run.RunManager
        Manager for workflow runs
    store: flowserv.volume.base.StorageVolume
        Target storage volume where the created post-processing files are
        stored.
    """
    listing = list()
    for result in ranking:
        run_id = result.run_id
        group_name = result.group_name
        # The run identifier is the name of the folder for the run files.
        for file_key in input_files:
            run_file = run_manager.get_runfile(run_id=run_id, key=file_key)
            store.store(file=run_file, dst=util.join(run_id, file_key))
        listing.append({
            LABEL_ID: run_id,
            LABEL_NAME: group_name,
            LABEL_FILES: input_files
        })
    store.store(file=io_file(listing), dst=RUNS_FILE)
def get_runfile(self, run_id: str, file_id: str = None, key: str = None) -> FileHandle:
    """Get handle and file object for a given run result file.

    The file is identified either by the unique file identifier or by the
    file key. Exactly one of the two has to be given.

    Parameters
    ----------
    run_id: string
        Unique run identifier.
    file_id: string, default=None
        Unique file identifier.
    key: string, default=None
        Key (relative path) of the file within the run.

    Returns
    -------
    flowserv.model.files.FileHandle

    Raises
    ------
    flowserv.error.UnknownFileError
        If the run has no file with the given identifier or key.
    ValueError
        If both or neither of file_id and key are given.
    """
    # Exactly one of file_id and key has to be given.
    if file_id is None and key is None:
        raise ValueError('no arguments for file_id or key')
    if file_id is not None and key is not None:
        raise ValueError('invalid arguments for file_id and key')
    run = self.get_run(run_id)
    if file_id is not None:
        fh = run.get_file(by_id=file_id)
    else:
        fh = run.get_file(by_key=key)
    if fh is None:
        # Report the value that was used for the lookup. The file_id is
        # None if the file was requested by its key.
        raise err.UnknownFileError(file_id if file_id is not None else key)
    # The file object is loaded from the base folder of the run.
    workflow_id = run.workflow.workflow_id
    rundir = dirs.run_basedir(workflow_id=workflow_id, run_id=run_id)
    return FileHandle(
        name=fh.name,
        mime_type=fh.mime_type,
        fileobj=self.fs.load(util.join(rundir, fh.key))
    )
def get_store_for_folder(self, key: str, identifier: Optional[str] = None) -> StorageVolume:
    """Get a storage volume for a sub-folder of this volume.

    Parameters
    ----------
    key: string
        Relative path to the sub-folder. The prefix of this volume joined
        with the key is the prefix of the returned volume.
    identifier: string, default=None
        Unique volume identifier.

    Returns
    -------
    flowserv.volume.base.StorageVolume
    """
    folder_prefix = util.join(self.prefix, key)
    return S3Volume(
        bucket_id=self.bucket_id,
        prefix=folder_prefix,
        identifier=identifier
    )
def get_runarchive(self, run_id: str) -> FileHandle:
    """Get tar archive containing all result files for a given workflow run.

    The archive is a gzip-compressed tar file that is created in memory.
    Raises UnknownRunError if the run is not in SUCCESS state.

    Parameters
    ----------
    run_id: string
        Unique run identifier.

    Returns
    -------
    flowserv.model.files.FileHandle

    Raises
    ------
    flowserv.error.UnknownRunError
    """
    # Get the run handle and ensure that the run is in SUCCESS state.
    run = self.get_run(run_id)
    if not run.is_success():
        raise err.UnknownRunError(run_id)
    workflow_id = run.workflow.workflow_id
    rundir = dirs.run_basedir(workflow_id=workflow_id, run_id=run_id)
    # Write the archive into a memory buffer. The context manager closes
    # the tar handle even if adding one of the files fails.
    io_buffer = io.BytesIO()
    with tarfile.open(fileobj=io_buffer, mode='w:gz') as tar_handle:
        for f in run.files:
            file = self.fs.load(util.join(rundir, f.key)).open()
            info = tarfile.TarInfo(name=f.key)
            # The size is taken from the buffer of the opened file.
            # NOTE(review): requires open() to return an object with a
            # getbuffer() method (e.g., io.BytesIO) - confirm for all
            # storage volumes.
            info.size = file.getbuffer().nbytes
            tar_handle.addfile(tarinfo=info, fileobj=file)
    io_buffer.seek(0)
    # Create file handle for the archive. The file name includes the run
    # identifier. The mime type is 'application/gzip' based on
    # https://superuser.com/questions/901962.
    return FileHandle(
        name='run.{}.tar.gz'.format(run_id),
        mime_type='application/gzip',
        fileobj=IOBuffer(io_buffer)
    )
def walkdir(dirname: str, prefix: str, files: List[Tuple[str, IOHandle]]) -> List[Tuple[str, IOHandle]]:
    """Recursively add all files in a local folder to a file upload list.

    Each file is appended to the given list as a pair of relative target
    path and file object. The list is modified in place and also returned.

    Parameters
    ----------
    dirname: string
        Path to folder of the local file system.
    prefix: string
        Relative destination path for all files in the folder.
    files: list of (string, flowserv.volume.base.IOHandle)
        Pairs of relative target path and file object for upload to a file
        store.

    Returns
    -------
    list of (string, flowserv.volume.base.IOHandle)
    """
    for name in os.listdir(dirname):
        path = os.path.join(dirname, name)
        key = name if not prefix else util.join(prefix, name)
        if not os.path.isdir(path):
            files.append((key, FSFile(filename=path)))
            continue
        # Descend into the sub-folder with the extended target path.
        walkdir(dirname=path, prefix=key, files=files)
    return files
def copyfiles(self, dst: str) -> List[Tuple[str, str]]:
    """Get the list of template files that need to be copied to the
    template folder of a workflow repository.

    Each entry is a pair of relative source path and target path. The
    target path of an entry defaults to its source path and is prepended
    with the destination base directory if one is given. If the manifest
    does not define a list of files, the result contains the single pair
    (None, dst) to indicate that the full base directory is to be copied.

    Parameters
    ----------
    dst: string
        Destination base directory for the copied files.

    Returns
    ------
    list of (string, string)
    """
    if self.files is None:
        return [(None, dst)]
    pairs = list()
    for entry in self.files:
        src = entry['source']
        relpath = entry.get('target', src)
        if dst is not None:
            relpath = util.join(dst, relpath)
        pairs.append((src, relpath))
    return pairs
def exec_workflow(
    self, run: RunObject, template: WorkflowTemplate, arguments: Dict,
    staticfs: StorageVolume, config: Optional[Dict] = None
) -> Tuple[WorkflowState, StorageVolume]:
    """Initiate the execution of a given workflow template for a set of
    argument values.

    Depending on the engine setting, the workflow steps are either run
    asynchronously in a separate process or synchronously in the calling
    process. The first value in the result tuple is the state of the
    workflow after the execution is started (asynchronous) or finished
    (synchronous). The second value is the storage volume for the run files.

    The set of arguments is not further validated. It is assumed that the
    validation has been performed by the calling code (e.g., the run service
    manager).

    The optional configuration object can be used to override the engine
    configuration for this run only. The configuration that was provided
    at object instantiation is not modified. The elements that are read
    from the merged configuration are `volumes`, `workers`, and `workflow`.

    Errors that occur after the run store is created are logged and
    returned as an error state instead of being raised.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for the run that is being executed.
    template: flowserv.model.template.base.WorkflowTemplate
        Workflow template containing the parameterized specification and
        the parameter declarations.
    arguments: dict
        Dictionary of argument values for parameters in the template.
    staticfs: flowserv.volume.base.StorageVolume
        Storage volume that contains the static files from the workflow
        template.
    config: dict, default=None
        Optional object to overwrite the worker configuration settings.

    Returns
    -------
    flowserv.model.workflow.state.WorkflowState,
    flowserv.volume.base.StorageVolume

    Raises
    ------
    RuntimeError
        If the run is not in pending state.
    """
    # Get the run state. Raise an error if the run is not in pending state.
    if not run.is_pending():
        # run.state is a method; call it to include the actual state in
        # the error message.
        raise RuntimeError("invalid run state '{}'".format(run.state()))
    state = run.state()
    # Merge the engine configuration with the run-specific one. Work on a
    # copy so that the settings for one run do not change the engine
    # configuration for later runs.
    run_config = dict(self.config) if self.config is not None else dict()
    if config:
        run_config.update(config)
    # Get the list of workflow steps, run arguments, and the list of output
    # files that the workflow is expected to generate.
    steps, run_args, outputs = parser.parse_template(
        template=template,
        arguments=arguments
    )
    # Create and prepare storage volume for run files.
    runstore = self.fs.get_store_for_folder(
        key=util.join(self.runsdir, run.run_id),
        identifier=DEFAULT_STORE
    )
    try:
        # Copy template files to the run folder.
        files = staticfs.copy(src=None, store=runstore)
        # Store any given file arguments and additional input files
        # that are required by actor parameters into the run folder.
        for name, para in template.parameters.items():
            if para.is_file() and name in arguments:
                files.extend(arguments[name].copy(target=runstore))
            elif para.is_actor() and name in arguments:
                for input_file in arguments[name].files or []:
                    files.extend(input_file.copy(target=runstore))
        # Create factory objects for storage volumes.
        volumes = volume_manager(
            specs=run_config.get('volumes', []),
            runstore=runstore,
            runfiles=files
        )
        # Create factory for workers. Include mapping of workflow steps to
        # the worker that are responsible for their execution.
        workers = WorkerPool(
            workers=run_config.get('workers', []),
            managers={
                doc['step']: doc['worker']
                for doc in run_config.get('workflow', [])
            }
        )
        # Start the workflow. All exceptions are caught below to set the
        # run state properly.
        state = state.start()
        if self.is_async:
            # Run steps asynchronously in a separate process
            pool = Pool(processes=1)
            task_callback_function = partial(
                callback_function,
                lock=self.lock,
                tasks=self.tasks,
                service=self.service
            )
            with self.lock:
                self.tasks[run.run_id] = (pool, state)
            pool.apply_async(
                run_workflow,
                args=(
                    run.run_id,
                    state,
                    outputs,
                    steps,
                    run_args,
                    volumes,
                    workers
                ),
                callback=task_callback_function
            )
            return state, runstore
        else:
            # Run steps synchronously and block the controller until done
            _, _, state_dict = run_workflow(
                run_id=run.run_id,
                state=state,
                output_files=outputs,
                steps=steps,
                arguments=run_args,
                volumes=volumes,
                workers=workers
            )
            return serialize.deserialize_state(state_dict), runstore
    except Exception as ex:
        # Set the workflow run into an ERROR state
        logging.error(ex, exc_info=True)
        return state.error(messages=util.stacktrace(ex)), runstore
import flowserv.util as util """Names for files and folders that contain run result files and run metadata. """ RUNS_DIR = 'runs' RUNS_FILE = 'runs.json' """Labels for metadata objects in the run listing.""" LABEL_ID = 'id' LABEL_NAME = 'name' LABEL_FILES = 'files' """Fixed set of parameter declarations for post-processing workflows. Contains only the declaration for the runs folder. """ PARA_RUNS = 'runs' PARAMETER = File(name=PARA_RUNS, index=0, target=util.join(RUNS_DIR, RUNS_FILE)) PARAMETERS = ParameterIndex() PARAMETERS[PARAMETER.name] = PARAMETER # -- Helper functions --------------------------------------------------------- def prepare_postproc_data(input_files: List[str], ranking: List[RunResult], run_manager: RunManager, store: StorageVolume): """Create input files for post-processing steps for a given set of runs. Creates files for a post-processing run in a given base directory on a storage volume. The resulting directory contains files for each run in a given ranking. For each run a sub-folder with the run identifier as the directory name is created. Each folder contains copies of result files for the run for those files that are specified in the input files list. A file
def test_join():
    """Ensure that empty and None strings are ignored by join."""
    joined = util.join(None, 'a', '', 'b')
    assert joined == 'a/b'