def __init__(
    self,
    cromwell=DEFAULT_CROMWELL,
    womtool=DEFAULT_WOMTOOL,
    cromwell_install_dir=DEFAULT_CROMWELL_INSTALL_DIR,
    womtool_install_dir=DEFAULT_WOMTOOL_INSTALL_DIR,
):
    """
    Args:
        cromwell: Cromwell JAR path/URI/URL.
        womtool: Womtool JAR path/URI/URL.
        cromwell_install_dir: Local directory to install the Cromwell JAR in.
        womtool_install_dir: Local directory to install the Womtool JAR in.
    """
    self._cromwell = cromwell
    self._womtool = womtool

    if not AbsPath(cromwell_install_dir).is_valid:
        raise ValueError(
            'cromwell_install_dir is not a valid absolute '
            'path. {path}'.format(path=cromwell_install_dir)
        )
    self._cromwell_install_dir = cromwell_install_dir

    if not AbsPath(womtool_install_dir).is_valid:
        raise ValueError(
            'womtool_install_dir is not a valid absolute '
            'path. {path}'.format(path=womtool_install_dir)
        )
    self._womtool_install_dir = womtool_install_dir
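
# Hedged usage sketch (not part of the original module): constructing this
# wrapper with explicit JAR locations. The class name `Cromwell` and the
# URLs/paths below are assumptions for illustration only.
#
#   cromwell = Cromwell(
#       cromwell='https://example.com/cromwell.jar',          # hypothetical URL
#       womtool='https://example.com/womtool.jar',            # hypothetical URL
#       cromwell_install_dir='/home/user/.caper/cromwell_jar',
#       womtool_install_dir='/home/user/.caper/womtool_jar',
#   )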
def cleanup(
    self, dry_run=False, num_threads=URIBase.DEFAULT_NUM_THREADS, no_lock=False
):
    """Cleans up the workflow's root output directory.

    Args:
        dry_run: Dry-run mode.
        num_threads: For outputs on cloud buckets only. Number of threads
            for deleting individual outputs on cloud buckets in parallel.
            Generates one client per thread. This works like `gsutil -m rm -rf`.
        no_lock: No file locking.
    """
    root = self.workflow_root
    if not root:
        logger.error(
            'Workflow\'s root directory cannot be found in metadata JSON. '
            'Cannot proceed to clean up outputs.'
        )
        return

    if AbsPath(root).is_valid:
        # num_threads is not available for AbsPath().rmdir()
        AbsPath(root).rmdir(dry_run=dry_run, no_lock=no_lock)
    else:
        AutoURI(root).rmdir(
            dry_run=dry_run, no_lock=no_lock, num_threads=num_threads
        )
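
# Hedged usage sketch (illustrative only; the metadata object name `cm` is an
# assumption): preview what would be deleted, then delete with multiple
# threads for outputs on cloud buckets.
#
#   cm.cleanup(dry_run=True)     # log what would be removed, delete nothing
#   cm.cleanup(num_threads=8)    # delete for real, 8 cloud clients in parallel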
def install_file(f, install_dir, label):
    """Install f locally in install_dir.
    If f is already a local file then skip installation and return it as-is.
    """
    if AbsPath(f).is_valid:
        return AbsPath(f).uri
    logger.info('Installing {label}... {f}'.format(label=label, f=f))
    path = os.path.join(os.path.expanduser(install_dir), AutoURI(f).basename)
    return AutoURI(f).cp(path)
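
# Hedged example (not in the original source): installing a remote JAR into a
# local directory. The URL and directory below are hypothetical placeholders.
#
#   jar_path = install_file(
#       'https://example.com/releases/cromwell.jar',   # remote URI -> copied
#       '~/.caper/cromwell_jar',                        # expanded via expanduser()
#       'Cromwell JAR',
#   )
#   # A local absolute path is returned unchanged:
#   #   install_file('/opt/jars/womtool.jar', '~/.caper', 'Womtool JAR')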
def find_dirname(s):
    u = AbsPath(s)
    if u.is_valid:
        # recurse into supported file types (e.g. JSON/TSV/CSV) to find
        # more file paths referenced inside
        for ext, recurse_fnc_for_ext in URIBase.LOC_RECURSE_EXT_AND_FNC.items():
            if u.ext == ext:
                _, _ = recurse_fnc_for_ext(u.read(), find_dirname)
        # The file can be a soft-link. Singularity will want access to both the
        # soft-link and the real file, so add dirnames of both the soft-link
        # and its realpath. `all_dirnames` is a list defined in the enclosing
        # scope (this function is used as a closure).
        all_dirnames.append(u.dirname)
        all_dirnames.append(os.path.dirname(os.path.realpath(u.uri)))
    return None, False
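
# Hedged illustration (not from the original module) of why both dirnames are
# collected: for a symlinked input, Singularity must be able to bind both the
# directory of the link and the directory of the real file it points to.
# Paths below are hypothetical.
#
#   import os
#   link = '/data/inputs/sample.fastq.gz'            # symlink
#   # -> real file at /storage/archive/sample.fastq.gz
#   bind_dirs = {
#       os.path.dirname(link),                       # /data/inputs
#       os.path.dirname(os.path.realpath(link)),     # /storage/archive
#   }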
def __init__(
    self,
    local_loc_dir=None,
    gcp_loc_dir=None,
    aws_loc_dir=None,
    gcp_service_account_key_json=None,
):
    """Manages work/cache/temp directories for localization on the
    following storages:
        - Local*: Local path -> local_loc_dir**
        - gcp: GCS bucket path -> gcp_loc_dir
        - aws: S3 bucket path -> aws_loc_dir

    * Note that the backend name starts with a capital L; it is the default
    backend in Cromwell's default configuration file (application.conf).
    ** /tmp is not recommended. This directory is very important for storing
    intermediate files used by Cromwell/AutoURI (file transfer/localization).

    Also manages Google Cloud auth (key JSON file) since both Caper
    client/server require permission to access storage.

    Args:
        local_loc_dir:
            Local cache directory to store files localized for local backends.
            Unlike the other two directories, this directory is also used as
            a working directory to store intermediate files for running
            Cromwell, e.g. backend.conf and workflow_opts.json.
        gcp_loc_dir:
            GCS cache directory to store files localized on GCS
            for the gcp backend.
        aws_loc_dir:
            S3 cache directory to store files localized on S3
            for the aws backend.
        gcp_service_account_key_json:
            Google Cloud service account key JSON file for authentication.
            This service account should have enough permission to access
            the storage.
    """
    if local_loc_dir is None:
        local_loc_dir = os.path.join(os.getcwd(), CaperBase.DEFAULT_LOC_DIR_NAME)

    if not AbsPath(local_loc_dir).is_valid:
        raise ValueError(
            'local_loc_dir should be a valid local abspath. {f}'.format(
                f=local_loc_dir
            )
        )
    if gcp_loc_dir and not GCSURI(gcp_loc_dir).is_valid:
        raise ValueError(
            'gcp_loc_dir should be a valid GCS path. {f}'.format(f=gcp_loc_dir)
        )
    if aws_loc_dir and not S3URI(aws_loc_dir).is_valid:
        raise ValueError(
            'aws_loc_dir should be a valid S3 path. {f}'.format(f=aws_loc_dir)
        )

    self._local_loc_dir = local_loc_dir
    self._gcp_loc_dir = gcp_loc_dir
    self._aws_loc_dir = aws_loc_dir

    self._set_env_gcp_app_credentials(gcp_service_account_key_json)
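
# Hedged usage sketch (directory/bucket/key paths are hypothetical; the class
# name CaperBase is taken from the constant referenced above): setting up
# localization directories for local and gcp backends.
#
#   caper_base = CaperBase(
#       local_loc_dir='/home/user/.caper/local_loc_dir',
#       gcp_loc_dir='gs://my-bucket/caper_loc_dir',
#       gcp_service_account_key_json='/home/user/keys/service_account.json',
#   )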
def __init__(self, qcs, delim='\t'):
    """
    Args:
        qcs: list of QC file URIs (path/URL/S3/GCS).
        delim: delimiter for output ([TAB] by default).
    """
    self._delim = delim
    self._jsons = []

    for qc in qcs:
        qc = AbsPath.get_abspath_if_exists(qc)
        if not AutoURI(qc).exists:
            logger.error('File does not exist. Skipping... {uri}'.format(uri=qc))
            continue
        s = AutoURI(qc).read()
        j = json.loads(s)
        self._jsons.append(j)
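
# Hedged usage sketch (the surrounding class is not shown here, so `QcMerger`
# below is a purely hypothetical stand-in; the QC file URIs are hypothetical
# too): missing files are logged and skipped instead of raising.
#
#   qcs = ['qc_rep1.json', 'gs://my-bucket/qc_rep2.json']
#   merger = QcMerger(qcs, delim=',')   # parsed JSON objects land in self._jsons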
def run(
    self,
    backend,
    wdl,
    inputs=None,
    options=None,
    labels=None,
    imports=None,
    metadata_output=None,
    str_label=None,
    user=None,
    docker=None,
    singularity=None,
    singularity_cachedir=Singularity.DEFAULT_SINGULARITY_CACHEDIR,
    no_build_singularity=False,
    custom_backend_conf=None,
    max_retries=CaperWorkflowOpts.DEFAULT_MAX_RETRIES,
    memory_retry_multiplier=CaperWorkflowOpts.DEFAULT_MEMORY_RETRY_MULTIPLIER,
    gcp_monitoring_script=CaperWorkflowOpts.DEFAULT_GCP_MONITORING_SCRIPT,
    ignore_womtool=False,
    no_deepcopy=False,
    fileobj_stdout=None,
    fileobj_troubleshoot=None,
    work_dir=None,
    java_heap_run=Cromwell.DEFAULT_JAVA_HEAP_CROMWELL_RUN,
    java_heap_womtool=Cromwell.DEFAULT_JAVA_HEAP_WOMTOOL,
    dry_run=False,
):
    """Run a workflow using Cromwell run mode.

    Args:
        backend:
            Choose among Caper's built-in backends
            (aws, gcp, Local, slurm, sge, pbs),
            or use a backend defined in your custom backend config file
            (see custom_backend_conf below).
        wdl:
            WDL file.
        inputs:
            Input JSON file. Cromwell's parameter -i.
        options:
            Workflow options JSON file. Cromwell's parameter -o.
        labels:
            Labels JSON file. Cromwell's parameter -l.
        imports:
            Imports ZIP file. Cromwell's parameter -p.
        metadata_output:
            Output metadata file path. Metadata JSON file will be written to
            this path. Caper also automatically generates it on each
            workflow's root directory. Cromwell's parameter -m.
        str_label:
            Caper's string label, which will be written to the labels JSON object.
        user:
            Username. If not defined, find username from the system.
        docker:
            Docker image to run a workflow on.
            This will be overridden by a "docker" attribute defined in
            a WDL task's "runtime {}" section.
            If this is None, Docker will not be used for this workflow.
            If this is an empty string (working like a flag), Docker will be
            used and Caper will try to find a Docker image in the WDL
            (from a comment "#CAPER docker" or from workflow.meta.caper_docker).
        singularity:
            Singularity image to run a workflow on.
            This will be overridden by a "singularity" attribute defined in
            a WDL task's "runtime {}" section.
            If this is None, Singularity will not be used for this workflow.
            If this is an empty string, Singularity will be used and Caper
            will try to find a Singularity image in the WDL
            (from a comment "#CAPER singularity" or from
            workflow.meta.caper_singularity).
        singularity_cachedir:
            Cache directory for local Singularity images.
            If the shell environment variable SINGULARITY_CACHEDIR is defined
            then this parameter will be ignored.
        no_build_singularity:
            Do not build a local Singularity image.
            However, a local Singularity image will eventually be built under
            the environment variable SINGULARITY_CACHEDIR.
            Therefore, use this flag if you have already built it.
        custom_backend_conf:
            Backend config file (HOCON) to override Caper's auto-generated
            backend config.
        max_retries:
            Number of retries for a failed task in a workflow.
            This applies to every task in a workflow.
            0 means no retry. The "attempts" attribute in a task's metadata
            increments from 1 as the task is retried; attempts==2 means the
            first retry.
        memory_retry_multiplier:
            Multiplier for the memory retry feature.
            See https://cromwell.readthedocs.io/en/develop/cromwell_features/RetryWithMoreMemory/
            for details.
        gcp_monitoring_script:
            Monitoring script for the gcp backend only, passed to Cromwell
            as a workflow option to monitor resources on instances.
        ignore_womtool:
            Disable Womtool validation for WDL/input JSON/imports.
        no_deepcopy:
            Disable recursive localization of files defined in input JSON.
            The input JSON file itself will still be localized.
        fileobj_stdout:
            File-like object to write Cromwell's STDOUT to.
        fileobj_troubleshoot:
            File-like object to write auto-troubleshooting output to after
            the workflow is done.
        work_dir:
            Local temporary directory to store all temporary files.
            Temporary files mean intermediate files used for running Cromwell,
            e.g. the backend config file and the workflow options file.
            Data files defined in input JSON and localized (recursively) will
            NOT be stored here; they will be localized on self._local_loc_dir
            instead. If this is not defined, then the cache directory
            self._local_loc_dir will be used. However, the Cromwell Java
            process itself will run on CWD instead of this directory.
        java_heap_run:
            Java heap (java -Xmx) for Cromwell run mode.
        java_heap_womtool:
            Java heap (java -Xmx) for Womtool.
        dry_run:
            Stop before running the Java command line for Cromwell.

    Returns:
        Thread object running Cromwell, as returned by self._cromwell.run().
        The metadata JSON file will be written to metadata_output.
    """
    if not AutoURI(wdl).exists:
        raise FileNotFoundError('WDL does not exist. {wdl}'.format(wdl=wdl))

    if str_label is None and inputs:
        str_label = AutoURI(inputs).basename_wo_ext

    if work_dir is None:
        work_dir = self.create_timestamped_work_dir(
            prefix=AutoURI(wdl).basename_wo_ext
        )

    logger.info('Localizing files on work_dir. {d}'.format(d=work_dir))

    if inputs:
        maybe_remote_file = self.localize_on_backend_if_modified(
            inputs, backend=backend, recursive=not no_deepcopy, make_md5_file=True
        )
        inputs = AutoURI(maybe_remote_file).localize_on(work_dir)

    if imports:
        imports = AutoURI(imports).localize_on(work_dir)
    elif not AbsPath(wdl).exists:
        # auto-zip sub WDLs only if main WDL is remote
        imports = WDLParser(wdl).create_imports_file(work_dir)

    # localize WDL to be passed to Cromwell Java
    wdl = AutoURI(wdl).localize_on(work_dir)

    if metadata_output:
        if not AbsPath(metadata_output).is_valid:
            raise ValueError(
                'metadata_output is not a valid local abspath. {m}'.format(
                    m=metadata_output
                )
            )
    else:
        metadata_output = os.path.join(
            work_dir, CromwellMetadata.DEFAULT_METADATA_BASENAME
        )

    backend_conf = self._caper_backend_conf.create_file(
        directory=work_dir, backend=backend, custom_backend_conf=custom_backend_conf
    )
    options = self._caper_workflow_opts.create_file(
        directory=work_dir,
        wdl=wdl,
        inputs=inputs,
        custom_options=options,
        docker=docker,
        singularity=singularity,
        singularity_cachedir=singularity_cachedir,
        backend=backend,
        max_retries=max_retries,
        memory_retry_multiplier=memory_retry_multiplier,
        gcp_monitoring_script=gcp_monitoring_script,
    )
    labels = self._caper_labels.create_file(
        directory=work_dir,
        backend=backend,
        custom_labels=labels,
        str_label=str_label,
        user=user,
    )

    if not ignore_womtool:
        self._cromwell.validate(wdl=wdl, inputs=inputs, imports=imports)

    logger.info(
        'launching run: wdl={w}, inputs={i}, backend_conf={b}'.format(
            w=wdl, i=inputs, b=backend_conf
        )
    )
    th = self._cromwell.run(
        wdl=wdl,
        backend_conf=backend_conf,
        inputs=inputs,
        options=options,
        imports=imports,
        labels=labels,
        metadata=metadata_output,
        fileobj_stdout=fileobj_stdout,
        fileobj_troubleshoot=fileobj_troubleshoot,
        dry_run=dry_run,
    )
    return th
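
# Hedged usage sketch (not part of the original source). It assumes a runner
# object `caper_runner` exposing this run() method; file paths are hypothetical.
#
#   import sys
#   thread = caper_runner.run(
#       backend='Local',
#       wdl='/path/to/main.wdl',
#       inputs='/path/to/inputs.json',
#       docker='',                 # empty string: auto-detect image from WDL
#       fileobj_stdout=sys.stdout,
#   )
#   thread.join()                  # if the returned object is a Thread, wait
#                                  # for Cromwell to finish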
def __recurse_zip_subworkflows(
    self, root_zip_dir, root_wdl_dir, imported_as_url=False, depth=0
):
    """Recurse imported sub-WDLs in the main WDL.

    Unlike Cromwell, Womtool does not take imports.zip while validating WDLs.
    All sub-WDLs should be in a correct directory structure relative to the
    root WDL. For Womtool, we should make a temporary directory, unpack
    imports.zip there and make a copy of the root WDL in it, then run Womtool
    to validate them. This function makes such an imports.zip.

    Sub-WDLs imported as a relative path simply inherit the parent's directory.
    Sub-WDLs imported as a URL do not inherit the parent's directory but the
    root WDL's directory.
    Sub-WDLs imported as an absolute path are not allowed. This can work with
    "caper run" but not with "caper submit" (or Cromwell submit).

    Args:
        depth: Recursion depth.

    Returns:
        Total number of sub-WDL files recursively localized on root_zip_dir.
    """
    if depth > WDLParser.RECURSION_DEPTH_LIMIT:
        raise ValueError(
            'Reached recursion depth limit while zipping subworkflows recursively. '
            'Possible cyclic import or self-referencing in WDLs? wdl={wdl}'.format(
                wdl=self._wdl
            )
        )

    if imported_as_url:
        main_wdl_dir = root_wdl_dir
    else:
        main_wdl_dir = AbsPath(self._wdl).dirname

    num_sub_wf_packed = 0
    for sub_rel_to_parent in self.imports:
        sub_wdl_file = AutoURI(sub_rel_to_parent)

        if isinstance(sub_wdl_file, HTTPURL):
            sub_abs = sub_wdl_file.uri
            imported_as_url_sub = True
        elif isinstance(sub_wdl_file, AbsPath):
            raise ValueError(
                'For sub WDL zipping, absolute path is not allowed for sub WDL. '
                'main={main}, sub={sub}'.format(main=self._wdl, sub=sub_rel_to_parent)
            )
        else:
            sub_abs = os.path.realpath(os.path.join(main_wdl_dir, sub_rel_to_parent))
            if not AbsPath(sub_abs).exists:
                raise FileNotFoundError(
                    'Sub WDL does not exist. Did you import main WDL '
                    'as a URL but sub WDL references a local file? '
                    'main={main}, sub={sub}, imported_as_url={i}'.format(
                        main=self._wdl, sub=sub_rel_to_parent, i=imported_as_url
                    )
                )
            if not sub_abs.startswith(root_wdl_dir):
                raise ValueError(
                    'Sub WDL exists but it is outside of the root WDL directory. '
                    'Too many "../" in your sub WDL? '
                    'Or main WDL is imported as a URL but sub WDL has "../"? '
                    'main={main}, sub={sub}, imported_as_url={i}'.format(
                        main=self._wdl, sub=sub_rel_to_parent, i=imported_as_url
                    )
                )
            # make a copy on zip_dir
            rel_path = os.path.relpath(sub_abs, root_wdl_dir)
            cp_dest = os.path.join(root_zip_dir, rel_path)
            AbsPath(sub_abs).cp(cp_dest)
            num_sub_wf_packed += 1
            imported_as_url_sub = False

        num_sub_wf_packed += WDLParser(sub_abs).__recurse_zip_subworkflows(
            root_zip_dir=root_zip_dir,
            root_wdl_dir=root_wdl_dir,
            imported_as_url=imported_as_url_sub,
            depth=depth + 1,
        )
    return num_sub_wf_packed
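
# Hedged usage sketch (grounded on create_imports_file() referenced in run()
# above; the WDL URL and work directory are hypothetical): packing sub-WDLs of
# a remote main WDL into an imports ZIP. Internally this is expected to recurse
# via __recurse_zip_subworkflows.
#
#   imports_zip = WDLParser('https://example.com/main.wdl').create_imports_file(
#       '/tmp/caper_work_dir'
#   )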