def __init__(
    self,
    local_loc_dir=None,
    gcp_loc_dir=None,
    aws_loc_dir=None,
    gcp_service_account_key_json=None,
):
    """Manages work/cache/temp directories for localization on the
    following storages:
        - Local*: Local path -> local_loc_dir**
        - gcp: GCS bucket path -> gcp_loc_dir
        - aws: S3 bucket path -> aws_loc_dir

    * Note that Local starts with a capital L; it is the default backend in
    Cromwell's default configuration file (application.conf).
    ** /tmp is not recommended. This directory is very important to store
    intermediate files used by Cromwell/AutoURI (file transfer/localization).

    Also manages Google Cloud auth (key JSON file) since both Caper
    client/server require permission to access storage.

    Args:
        local_loc_dir:
            Local cache directory to store files localized for local backends.
            Unlike the other two directories, this directory is also used as a
            working directory to store intermediate files for running Cromwell
            (e.g. backend.conf and workflow_opts.json).
        gcp_loc_dir:
            GCS cache directory to store files localized on GCS
            for gcp backend.
        aws_loc_dir:
            S3 cache directory to store files localized on S3
            for aws backend.
        gcp_service_account_key_json:
            Google Cloud service account key JSON file for authentication.
            This service account should have sufficient permission to access storage.
    """
    if local_loc_dir is None:
        local_loc_dir = os.path.join(os.getcwd(), CaperBase.DEFAULT_LOC_DIR_NAME)

    if not AbsPath(local_loc_dir).is_valid:
        raise ValueError(
            'local_loc_dir should be a valid local abspath. {f}'.format(
                f=local_loc_dir))
    if gcp_loc_dir and not GCSURI(gcp_loc_dir).is_valid:
        raise ValueError(
            'gcp_loc_dir should be a valid GCS path. {f}'.format(
                f=gcp_loc_dir))
    if aws_loc_dir and not S3URI(aws_loc_dir).is_valid:
        raise ValueError(
            'aws_loc_dir should be a valid S3 path. {f}'.format(
                f=aws_loc_dir))

    self._local_loc_dir = local_loc_dir
    self._gcp_loc_dir = gcp_loc_dir
    self._aws_loc_dir = aws_loc_dir

    self._set_env_gcp_app_credentials(gcp_service_account_key_json)
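# Usage sketch for the constructor above. This is illustrative only: the
# bucket name and key file path are hypothetical, and it assumes CaperBase
# (referenced in the snippet above) is importable in the current scope.
caper_base = CaperBase(
    local_loc_dir='/data/caper/.caper_loc',           # hypothetical local cache dir (not /tmp)
    gcp_loc_dir='gs://my-bucket/caper_loc',           # hypothetical GCS cache dir
    aws_loc_dir=None,                                 # no S3 localization in this example
    gcp_service_account_key_json='/keys/sa-key.json', # hypothetical service account key
)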
def init_autouri(args): """Initialize Autouri and its logger Args: args: dict of cmd line arguments """ GCSURI.init_gcsuri(use_gsutil_for_s3=args['use_gsutil_for_s3']) # autouri's path to url mapping if args['tsv_mapping_path_to_url'] is not None: mapping_path_to_url = {} f = os.path.expanduser(args['tsv_mapping_path_to_url']) with open(f, newline='') as fp: reader = csv.reader(fp, delimiter='\t') for line in reader: mapping_path_to_url[line[0]] = line[1] args['mapping_path_to_url'] = mapping_path_to_url else: args['mapping_path_to_url'] = None
def gcp_monitor_call(call_name, call, parent_call_names):
    nonlocal excluded_cols
    nonlocal stat_methods
    nonlocal file_size_cache
    nonlocal workflow_id
    nonlocal task_name

    if task_name and task_name != call_name:
        return
    monitoring_log = call.get('monitoringLog')
    if monitoring_log is None:
        return
    if not GCSURI(monitoring_log).is_valid:
        # This feature is for GCSURI only.
        return
    if not GCSURI(monitoring_log).exists:
        # Workaround for Cromwell-52's bug.
        # Call-cached task has `monitoringLog`, but it does not exist.
        return

    dataframe = pd.read_csv(
        io.StringIO(GCSURI(monitoring_log).read()), delimiter='\t'
    )
    rt_attrs = call.get('runtimeAttributes')

    data = {
        'workflow_id': workflow_id,
        'task_name': call_name,
        'shard_idx': call.get('shardIndex'),
        'status': call.get('executionStatus'),
        'attempt': call.get('attempt'),
        'instance': {
            'cpu': int(rt_attrs.get('cpu')),
            'disk': parse_cromwell_disks(rt_attrs.get('disks')),
            'mem': parse_cromwell_memory(rt_attrs.get('memory')),
        },
        'stats': {s: {} for s in stat_methods},
        'input_file_sizes': defaultdict(list),
    }

    for i, col_name in enumerate(dataframe.columns):
        if i in excluded_cols:
            continue
        for stat_method in stat_methods:
            if dataframe.empty:
                val = None
            elif stat_method == 'last':
                last_idx = dataframe.tail(1).index.item()
                val = dataframe[col_name][last_idx]
            else:
                val = getattr(dataframe[col_name], stat_method)()
            data['stats'][stat_method][col_name] = val

    for input_name, input_value in sorted(call['inputs'].items()):
        file_sizes_dict = data['input_file_sizes']

        def add_to_input_files_if_valid(file):
            nonlocal file_size_cache
            nonlocal file_sizes_dict
            nonlocal input_name

            if GCSURI(file).is_valid:
                file_size = file_size_cache.get(file)
                if file_size is None:
                    file_size = GCSURI(file).size
                    file_size_cache[file] = file_size
                file_sizes_dict[input_name].append(file_size)

        recurse_dict_value(input_value, add_to_input_files_if_valid)

    return data
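# Shape of the dict returned by gcp_monitor_call() above, with illustrative
# (made-up) values. 'mem' and 'disk' are in bytes; stats are computed per
# column of the monitoring log for each method in stat_methods (the snippets
# here show at least 'max' and 'last').
# {
#     'workflow_id': '12345678-aaaa-bbbb-cccc-1234567890ab',
#     'task_name': 'main.t1',
#     'shard_idx': -1,
#     'status': 'Done',
#     'attempt': 1,
#     'instance': {'cpu': 2, 'disk': 10737418240, 'mem': 4294967296},
#     'stats': {'max': {'cpu_pct': 37.5, 'mem': 1073741824, 'disk': 524288000}, ...},
#     'input_file_sizes': {'fastq_R1': [123456789]},
# }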
def init_autouri(args):
    if hasattr(args, 'use_gsutil_for_s3'):
        GCSURI.init_gcsuri(use_gsutil_for_s3=args.use_gsutil_for_s3)
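# Usage sketch for the attribute-based init_autouri() above, e.g. with an
# argparse namespace (hypothetical):
import argparse

ns = argparse.Namespace(use_gsutil_for_s3=True)
init_autouri(ns)  # passes use_gsutil_for_s3 through to GCSURI.init_gcsuri()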
def create_file(
    self,
    directory,
    wdl,
    backend=None,
    inputs=None,
    custom_options=None,
    docker=None,
    singularity=None,
    singularity_cachedir=None,
    no_build_singularity=False,
    max_retries=DEFAULT_MAX_RETRIES,
    memory_retry_multiplier=DEFAULT_MEMORY_RETRY_MULTIPLIER,
    gcp_monitoring_script=DEFAULT_GCP_MONITORING_SCRIPT,
    basename=BASENAME_WORKFLOW_OPTS_JSON,
):
    """Creates Cromwell's workflow options JSON file.

    The workflow options JSON file sets default values for attributes defined
    in the runtime {} section of a WDL task. For example, the docker attribute
    can be defined here instead of directly defining it in a task's
    runtime { docker: "" } section.

    Args:
        directory:
            Directory to make the workflow options JSON file in.
        wdl:
            WDL file.
        backend:
            Backend to run a workflow on. If not defined, the server's default
            or the runner's Local backend will be used.
        inputs:
            Input JSON file to define input files/parameters for WDL.
            This will be overridden by the environment variable SINGULARITY_BINDPATH.
            For Singularity, it is required to find SINGULARITY_BINDPATH,
            which is a comma-separated list of common root directories
            for all files defined in the input JSON.
            Unlike Docker, Singularity binds directories instead of mounting them.
            Therefore, Caper will try to find an optimal SINGULARITY_BINDPATH
            by looking at all file paths and finding common parent directories
            for them.
        custom_options:
            User's custom workflow options JSON file.
            This is merged at the end of this function, so users can override
            Caper's auto-generated workflow options.
        docker:
            Docker image to run a workflow on.
        singularity:
            Singularity image to run a workflow on.
        singularity_cachedir:
            Singularity cache directory to build local images in.
            This will be overridden by the environment variable SINGULARITY_CACHEDIR.
        no_build_singularity:
            Caper runs "singularity exec IMAGE" to build a local Singularity
            image before submitting/running a workflow.
            With this flag on, Caper does not pre-build a local Singularity
            container. Therefore, the Singularity container will be built
            inside each task, which results in multiple redundant local image
            builds. Also, trying to build on the same Singularity image file
            can corrupt the image file.
        max_retries:
            Maximum number of retries for each task. 1 means 1 retry.
        memory_retry_multiplier:
            Multiplier for the memory-retry feature.
            See https://cromwell.readthedocs.io/en/develop/cromwell_features/RetryWithMoreMemory/
            for details.
        gcp_monitoring_script:
            Monitoring script for the GCP backend only.
            Useful to monitor resources on an instance.
        basename:
            Basename for a temporary workflow options JSON file.
    """
    if singularity and docker:
        raise ValueError('Cannot use both Singularity and Docker.')

    template = copy.deepcopy(self._template)
    dra = template[CaperWorkflowOpts.DEFAULT_RUNTIME_ATTRIBUTES]

    if backend:
        template['backend'] = backend

    wdl_parser = CaperWDLParser(wdl)
    if docker == '' or backend in (BACKEND_GCP, BACKEND_AWS) and not docker:
        # find "caper-docker" from WDL's workflow.meta
        # or "#CAPER docker" from comments
        docker = wdl_parser.caper_docker
        if docker:
            logger.info(
                'Docker image found in WDL\'s metadata. wdl={wdl}, d={d}'.format(
                    wdl=wdl, d=docker))
        else:
            logger.warning(
                "Docker image not found in WDL's metadata, which means that "
                "docker is not defined either as comment (#CAPER docker) or "
                "in workflow's meta section (under key caper_docker) in WDL. "
                "If your WDL already has docker defined in each task's runtime "
                "then it should be okay. wdl={wdl}".format(wdl=wdl))
    if docker:
        dra['docker'] = docker

    if singularity == '':
        if backend in (BACKEND_GCP, BACKEND_AWS):
            raise ValueError(
                'Singularity cannot be used for cloud backend (e.g. aws, gcp).')
        singularity = wdl_parser.caper_singularity
        if singularity:
            logger.info(
                'Singularity image found in WDL\'s metadata. wdl={wdl}, s={s}'.format(
                    wdl=wdl, s=singularity))
        else:
            raise ValueError(
                'Singularity image not found in WDL. wdl={wdl}'.format(wdl=wdl))
    if singularity:
        dra['singularity'] = singularity
        if singularity_cachedir:
            dra['singularity_cachedir'] = singularity_cachedir

        s = Singularity(singularity, singularity_cachedir)
        if inputs:
            dra['singularity_bindpath'] = s.find_bindpath(inputs)
        if not no_build_singularity:
            s.build_local_image()

    if max_retries is not None:
        dra['maxRetries'] = max_retries
    # Cromwell's bug in memory-retry feature.
    # Disabled until it's fixed on Cromwell's side.
    # if memory_retry_multiplier is not None:
    #     template['memory_retry_multiplier'] = memory_retry_multiplier

    if gcp_monitoring_script and backend == BACKEND_GCP:
        if not GCSURI(gcp_monitoring_script).is_valid:
            raise ValueError(
                'gcp_monitoring_script is not a valid URI. {uri}'.format(
                    uri=gcp_monitoring_script))
        template['monitoring_script'] = gcp_monitoring_script

    if custom_options:
        s = AutoURI(custom_options).read()
        d = json.loads(s)
        merge_dict(template, d)

    final_options_file = os.path.join(directory, basename)
    AutoURI(final_options_file).write(
        json.dumps(template, indent=4) + '\n')

    return final_options_file
def test_run_gcp_with_life_sciences_api(
    tmp_path,
    gcs_root,
    ci_prefix,
    cromwell,
    womtool,
    gcp_prj,
    gcp_service_account_key_json,
    debug_caper,
):
    """Test run with Google Cloud Life Sciences API."""
    out_gcs_bucket = os.path.join(gcs_root, 'caper_out', ci_prefix)
    tmp_gcs_bucket = os.path.join(gcs_root, 'caper_tmp')

    # prepare WDLs and input JSON, imports to be submitted
    make_directory_with_wdls(str(tmp_path))
    wdl = tmp_path / 'main.wdl'
    inputs = tmp_path / 'inputs.json'
    metadata = tmp_path / 'metadata.json'

    cmd = ['run', str(wdl)]
    cmd += ['--inputs', str(inputs)]
    cmd += ['-m', str(metadata)]
    if gcp_service_account_key_json:
        cmd += ['--gcp-service-account-key-json', gcp_service_account_key_json]
    cmd += ['--use-google-cloud-life-sciences']
    cmd += ['--gcp-region', 'us-central1']
    # --gcp-zones should be ignored
    cmd += ['--gcp-zones', 'us-west1-a,us-west1-b']
    cmd += ['--gcp-prj', gcp_prj]
    cmd += ['--gcp-memory-retry-error-keys', 'Killed']
    cmd += ['--gcp-memory-retry-multiplier', '1.5']
    cmd += ['--tmp-dir', str(tmp_path / 'tmp_dir')]
    cmd += ['--backend', 'gcp']
    cmd += ['--gcp-out-dir', out_gcs_bucket]
    cmd += ['--gcp-loc-dir', tmp_gcs_bucket]
    cmd += ['--cromwell-stdout', str(tmp_path / 'cromwell_stdout.o')]
    # test with file type DB
    cmd += ['--db', 'file']
    cmd += ['--db-timeout', '500000']
    cmd += ['--file-db', str(tmp_path / 'file_db_prefix')]
    cmd += ['--max-concurrent-tasks', '2']
    cmd += ['--max-concurrent-workflows', '2']
    cmd += ['--disable-call-caching']
    cmd += ['--cromwell', cromwell]
    cmd += ['--womtool', womtool]
    cmd += ['--java-heap-run', '4G']
    cmd += ['--docker', 'ubuntu:latest']
    if debug_caper:
        cmd += ['--debug']
    print(' '.join(cmd))

    cli_main(cmd)
    m_dict = json.loads(metadata.read_text())

    assert m_dict['status'] == 'Succeeded'

    # test CromwellMetadata.gcp_monitor() here
    # since it's for gcp only and this function is one of the two
    # test functions run on a gcp backend.
    # task main.t1 has sleep 10 so that monitoring_script has time to
    # write monitoring data to the `monitoringLog` file
    cm = CromwellMetadata(m_dict)
    monitor_data = cm.gcp_monitor()
    for data in monitor_data:
        instance_cpu = data['instance']['cpu']
        instance_mem = data['instance']['mem']
        instance_disk = data['instance']['disk']
        assert instance_cpu >= 1
        assert instance_mem >= 1024 * 1024 * 1024
        assert instance_disk >= 10 * 1024 * 1024 * 1024

        max_cpu_percent = data['stats']['max']['cpu_pct']
        max_mem = data['stats']['max']['mem']
        max_disk = data['stats']['max']['disk']
        if max_cpu_percent or data['task_name'] == 'main.t1':
            assert max_cpu_percent <= 100.0
        if max_mem or data['task_name'] == 'main.t1':
            assert max_mem <= instance_mem
        if max_disk or data['task_name'] == 'main.t1':
            assert max_disk <= instance_disk

    # test cleanup on gcp backend (gs://)
    root_out_dir = cm.data['workflowRoot']

    # remote metadata JSON file on workflow's root output dir.
    remote_metadata_json_file = os.path.join(root_out_dir, 'metadata.json')
    assert GCSURI(remote_metadata_json_file).exists

    # dry-run should not delete anything
    cm.cleanup(dry_run=True)
    assert GCSURI(remote_metadata_json_file).exists

    cm.cleanup(dry_run=False)
    assert not GCSURI(remote_metadata_json_file).exists
def create_file(
    self,
    directory,
    wdl,
    backend=None,
    inputs=None,
    custom_options=None,
    docker=None,
    singularity=None,
    conda=None,
    max_retries=DEFAULT_MAX_RETRIES,
    memory_retry_multiplier=DEFAULT_MEMORY_RETRY_MULTIPLIER,
    gcp_monitoring_script=DEFAULT_GCP_MONITORING_SCRIPT,
    basename=BASENAME_WORKFLOW_OPTS_JSON,
):
    """Creates Cromwell's workflow options JSON file.

    The workflow options JSON file sets default values for attributes defined
    in the runtime {} section of a WDL task. For example, the docker attribute
    can be defined here instead of directly defining it in a task's
    runtime { docker: "" } section.

    Args:
        directory:
            Directory to make the workflow options JSON file in.
        wdl:
            WDL file.
        backend:
            Backend to run a workflow on. If not defined, the server's default
            or the runner's Local backend will be used.
        inputs:
            Input JSON file to define input files/parameters for WDL.
            This will be overridden by the environment variable SINGULARITY_BINDPATH.
            For Singularity, it is required to find SINGULARITY_BINDPATH,
            which is a comma-separated list of common root directories
            for all files defined in the input JSON.
            Unlike Docker, Singularity binds directories instead of mounting them.
            Therefore, Caper will try to find an optimal SINGULARITY_BINDPATH
            by looking at all file paths and finding common parent directories
            for them.
        custom_options:
            User's custom workflow options JSON file.
            This is merged at the end of this function, so users can override
            Caper's auto-generated workflow options.
        conda:
            Default Conda environment name to run a workflow in.
        docker:
            Default Docker image to run a workflow on.
        singularity:
            Default Singularity image to run a workflow on.
        max_retries:
            Maximum number of retries for each task. 1 means 1 retry.
        memory_retry_multiplier:
            Multiplier for the memory-retry feature.
            See https://cromwell.readthedocs.io/en/develop/cromwell_features/RetryWithMoreMemory/
            for details.
        gcp_monitoring_script:
            Monitoring script for the GCP backend only.
            Useful to monitor resources on an instance.
        basename:
            Basename for a temporary workflow options JSON file.
    """
    if singularity and docker:
        raise ValueError('Cannot use both Singularity and Docker.')

    template = copy.deepcopy(self._template)
    default_runtime_attributes = template[
        CaperWorkflowOpts.DEFAULT_RUNTIME_ATTRIBUTES]

    if backend:
        template['backend'] = backend

    wdl_parser = CaperWDLParser(wdl)

    # sanity check for environment flags
    defined_env_flags = [env for env in (docker, singularity, conda) if env]
    if len(defined_env_flags) > 1:
        raise ValueError(
            'docker, singularity and conda are mutually exclusive. '
            'Define nothing or only one environment.')

    if docker is not None:
        environment = ENVIRONMENT_DOCKER
    elif singularity is not None:
        environment = ENVIRONMENT_SINGULARITY
    elif conda is not None:
        environment = ENVIRONMENT_CONDA
    else:
        environment = None
    if environment:
        default_runtime_attributes['environment'] = environment

    if docker == '' or backend in (BACKEND_GCP, BACKEND_AWS) and not docker:
        # if used as a flag or a cloud backend is chosen,
        # try to find "default_docker" from WDL's workflow.meta
        # or "#CAPER docker" from comments
        docker = wdl_parser.default_docker
        if docker:
            logger.info(
                'Docker image found in WDL metadata. wdl={wdl}, d={d}'.format(
                    wdl=wdl, d=docker))
        else:
            logger.info(
                'Docker image not found in WDL metadata. wdl={wdl}'.format(
                    wdl=wdl))
    if docker:
        default_runtime_attributes['docker'] = docker

    if singularity == '':
        # if used as a flag
        if backend in (BACKEND_GCP, BACKEND_AWS):
            raise ValueError(
                'Singularity cannot be used for cloud backend (e.g. aws, gcp).')
        singularity = wdl_parser.default_singularity
        if singularity:
            logger.info(
                'Singularity image found in WDL metadata. wdl={wdl}, s={s}'.format(
                    wdl=wdl, s=singularity))
        else:
            logger.info(
                'Singularity image not found in WDL metadata. wdl={wdl}.'.format(
                    wdl=wdl))
    if singularity:
        default_runtime_attributes['singularity'] = singularity
        if inputs:
            default_runtime_attributes[
                'singularity_bindpath'] = find_bindpath(inputs)

    if conda == '':
        # if used as a flag
        if backend in (BACKEND_GCP, BACKEND_AWS):
            raise ValueError(
                'Conda cannot be used for cloud backend (e.g. aws, gcp).')
        conda = wdl_parser.default_conda
        if conda:
            logger.info(
                'Conda environment name found in WDL metadata. wdl={wdl}, s={s}'.format(
                    wdl=wdl, s=conda))
        else:
            logger.info(
                'Conda environment name not found in WDL metadata. wdl={wdl}'.format(
                    wdl=wdl))
    if conda:
        default_runtime_attributes['conda'] = conda

    if max_retries is not None:
        default_runtime_attributes['maxRetries'] = max_retries
    # Cromwell's bug in memory-retry feature.
    # Disabled until it's fixed on Cromwell's side.
    # if memory_retry_multiplier is not None:
    #     template['memory_retry_multiplier'] = memory_retry_multiplier

    if gcp_monitoring_script and backend == BACKEND_GCP:
        if not GCSURI(gcp_monitoring_script).is_valid:
            raise ValueError(
                'gcp_monitoring_script is not a valid URI. {uri}'.format(
                    uri=gcp_monitoring_script))
        template['monitoring_script'] = gcp_monitoring_script

    if custom_options:
        s = AutoURI(custom_options).read()
        d = json.loads(s)
        merge_dict(template, d)

    final_options_file = os.path.join(directory, basename)
    AutoURI(final_options_file).write(
        json.dumps(template, indent=4) + '\n')

    return final_options_file
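# Usage sketch for the conda-aware create_file() variant above. At most one of
# docker/singularity/conda may be set; here a Conda environment name is used,
# which the method rejects for cloud backends (gcp, aws). `workflow_opts` and
# all paths/names below are hypothetical.
options_json = workflow_opts.create_file(
    directory='/tmp/caper_tmp',
    wdl='/path/to/main.wdl',
    inputs='/path/to/inputs.json',
    conda='my-pipeline-env',  # backend left as None: server default / Local
)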