Example #1
                def add_to_input_files_if_valid(file):
                    nonlocal file_size_cache
                    nonlocal file_sizes_dict
                    nonlocal input_name

                    if GCSURI(file).is_valid:
                        file_size = file_size_cache.get(file)
                        if file_size is None:
                            file_size = GCSURI(file).size
                            file_size_cache[file] = file_size
                        file_sizes_dict[input_name].append(file_size)
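
The closure above memoizes remote file sizes in `file_size_cache` so that each GCS object is stat-ed only once. Below is a minimal standalone sketch of the same memoization pattern, using local files via `os.path.getsize` instead of `GCSURI(file).size`; the helper name is illustrative only, not part of Caper.

import os

def cached_size(path, size_cache):
    """Return the size of `path` in bytes, memoizing the result in `size_cache`."""
    size = size_cache.get(path)
    if size is None:
        size = os.path.getsize(path)  # stands in for GCSURI(file).size
        size_cache[path] = size
    return size

cache = {}
print(cached_size(__file__, cache))  # first call stats the file
print(cached_size(__file__, cache))  # second call hits the cache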
Example #2
    def __init__(
        self,
        local_loc_dir=None,
        gcp_loc_dir=None,
        aws_loc_dir=None,
        gcp_service_account_key_json=None,
    ):
        """Manages work/cache/temp directories for localization on the following
        storages:
            - Local*: Local path -> local_loc_dir**
            - gcp: GCS bucket path -> gcp_loc_dir
            - aws: S3 bucket path -> aws_loc_dir

        * Note that it starts with a capital L; Local is the default backend defined
        in Cromwell's default configuration file (application.conf).
        ** /tmp is not recommended. This directory stores important intermediate
        files used by Cromwell/AutoURI (file transfer/localization).

        Also manages Google Cloud auth (key JSON file) since both Caper client/server
        require permission to access storage.

        Args:
            local_loc_dir:
                Local cache directory to store files localized for local backends.
                Unlike the other two directories, this one is also used as a working
                directory to store intermediate files needed to run Cromwell,
                e.g. backend.conf and workflow_opts.json.
            gcp_loc_dir:
                GCS cache directory to store files localized on GCS for gcp backend.
            aws_loc_dir:
                S3 cache directory to store files localized on S3 for aws backend.
            gcp_service_account_key_json:
                Google Cloud service account key JSON file for authentication.
                This service account should have sufficient permission to access storage.
        """
        if local_loc_dir is None:
            local_loc_dir = os.path.join(os.getcwd(),
                                         CaperBase.DEFAULT_LOC_DIR_NAME)

        if not AbsPath(local_loc_dir).is_valid:
            raise ValueError(
                'local_loc_dir should be a valid local abspath. {f}'.format(
                    f=local_loc_dir))
        if gcp_loc_dir and not GCSURI(gcp_loc_dir).is_valid:
            raise ValueError(
                'gcp_loc_dir should be a valid GCS path. {f}'.format(
                    f=gcp_loc_dir))
        if aws_loc_dir and not S3URI(aws_loc_dir).is_valid:
            raise ValueError(
                'aws_loc_dir should be a valid S3 path. {f}'.format(
                    f=aws_loc_dir))

        self._local_loc_dir = local_loc_dir
        self._gcp_loc_dir = gcp_loc_dir
        self._aws_loc_dir = aws_loc_dir

        self._set_env_gcp_app_credentials(gcp_service_account_key_json)
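
A hedged usage sketch for the constructor above. The class appears to be CaperBase (it references CaperBase.DEFAULT_LOC_DIR_NAME), but the import path and the example paths below are assumptions, not taken from the snippet:

from caper.caper_base import CaperBase  # assumed module path

caper_base = CaperBase(
    local_loc_dir='/data/caper_cache',          # must be a valid local absolute path
    gcp_loc_dir='gs://my-bucket/caper_loc',     # optional; validated as a GCS path
    aws_loc_dir='s3://my-bucket/caper_loc',     # optional; validated as an S3 path
    gcp_service_account_key_json='/path/to/service_account_key.json',  # optional
)

If local_loc_dir is omitted, a default directory under the current working directory (CaperBase.DEFAULT_LOC_DIR_NAME) is used.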
Example #3
def init_autouri(args):
    """Initialize Autouri and its logger

    Args:
        args:
            dict of cmd line arguments
    """
    GCSURI.init_gcsuri(use_gsutil_for_s3=args['use_gsutil_for_s3'])

    # autouri's path to url mapping
    if args['tsv_mapping_path_to_url'] is not None:
        mapping_path_to_url = {}
        f = os.path.expanduser(args['tsv_mapping_path_to_url'])
        with open(f, newline='') as fp:
            reader = csv.reader(fp, delimiter='\t')
            for line in reader:
                mapping_path_to_url[line[0]] = line[1]
        args['mapping_path_to_url'] = mapping_path_to_url
    else:
        args['mapping_path_to_url'] = None
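
The path-to-URL mapping is read from a two-column TSV file (local path, public URL). Here is a small standalone sketch of that parsing step, using an in-memory TSV instead of a file on disk; the example paths and URLs are made up:

import csv
import io

tsv_text = (
    '/data/genome\thttps://storage.example.org/genome\n'
    '/data/pipeline\thttps://storage.example.org/pipeline\n'
)

mapping_path_to_url = {}
reader = csv.reader(io.StringIO(tsv_text), delimiter='\t')
for line in reader:
    mapping_path_to_url[line[0]] = line[1]

print(mapping_path_to_url)
# {'/data/genome': 'https://storage.example.org/genome',
#  '/data/pipeline': 'https://storage.example.org/pipeline'}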
Example #4
        def gcp_monitor_call(call_name, call, parent_call_names):
            nonlocal excluded_cols
            nonlocal stat_methods
            nonlocal file_size_cache
            nonlocal workflow_id
            nonlocal task_name

            if task_name and task_name != call_name:
                return

            monitoring_log = call.get('monitoringLog')
            if monitoring_log is None:
                return
            if not GCSURI(monitoring_log).is_valid:
                # This feature is for GCSURI only.
                return
            if not GCSURI(monitoring_log).exists:
                # Workaround for Cromwell-52's bug.
                # Call-cached task has `monitoringLog`, but it does not exist.
                return

            dataframe = pd.read_csv(
                io.StringIO(GCSURI(monitoring_log).read()), delimiter='\t'
            )
            rt_attrs = call.get('runtimeAttributes')

            data = {
                'workflow_id': workflow_id,
                'task_name': call_name,
                'shard_idx': call.get('shardIndex'),
                'status': call.get('executionStatus'),
                'attempt': call.get('attempt'),
                'instance': {
                    'cpu': int(rt_attrs.get('cpu')),
                    'disk': parse_cromwell_disks(rt_attrs.get('disks')),
                    'mem': parse_cromwell_memory(rt_attrs.get('memory')),
                },
                'stats': {s: {} for s in stat_methods},
                'input_file_sizes': defaultdict(list),
            }
            for i, col_name in enumerate(dataframe.columns):
                if i in excluded_cols:
                    continue
                for stat_method in stat_methods:
                    if dataframe.empty:
                        val = None
                    elif stat_method == 'last':
                        last_idx = dataframe.tail(1).index.item()
                        val = dataframe[col_name][last_idx]
                    else:
                        val = getattr(dataframe[col_name], stat_method)()
                    data['stats'][stat_method][col_name] = val

            for input_name, input_value in sorted(call['inputs'].items()):
                file_sizes_dict = data['input_file_sizes']

                def add_to_input_files_if_valid(file):
                    nonlocal file_size_cache
                    nonlocal file_sizes_dict
                    nonlocal input_name

                    if GCSURI(file).is_valid:
                        file_size = file_size_cache.get(file)
                        if file_size is None:
                            file_size = GCSURI(file).size
                            file_size_cache[file] = file_size
                        file_sizes_dict[input_name].append(file_size)

                recurse_dict_value(input_value, add_to_input_files_if_valid)

            return data
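
recurse_dict_value is a Caper utility not shown in this snippet; from the call site it appears to walk a nested inputs value and apply the callback to every leaf. A rough standalone sketch of such a helper, under that assumption (the real implementation may differ, e.g. it may only invoke the callback on string leaves):

def recurse_dict_value(value, fnc):
    """Recursively visit nested dicts/lists/tuples and call fnc on each leaf value."""
    if isinstance(value, dict):
        for v in value.values():
            recurse_dict_value(v, fnc)
    elif isinstance(value, (list, tuple)):
        for v in value:
            recurse_dict_value(v, fnc)
    else:
        fnc(value)

recurse_dict_value({'bam': ['a.bam', 'b.bam'], 'n': 2}, print)
# a.bam
# b.bam
# 2

In the snippet above, the callback itself filters out values that are not valid GCS URIs.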
Example #5
File: cli.py    Project: jonahcullen/caper
def init_autouri(args):
    if hasattr(args, 'use_gsutil_for_s3'):
        GCSURI.init_gcsuri(use_gsutil_for_s3=args.use_gsutil_for_s3)
Example #6
    def create_file(
        self,
        directory,
        wdl,
        backend=None,
        inputs=None,
        custom_options=None,
        docker=None,
        singularity=None,
        singularity_cachedir=None,
        no_build_singularity=False,
        max_retries=DEFAULT_MAX_RETRIES,
        memory_retry_multiplier=DEFAULT_MEMORY_RETRY_MULTIPLIER,
        gcp_monitoring_script=DEFAULT_GCP_MONITORING_SCRIPT,
        basename=BASENAME_WORKFLOW_OPTS_JSON,
    ):
        """Creates Cromwell's workflow options JSON file.
        A workflow options JSON file sets default values for attributes
        defined in the runtime {} section of WDL tasks.
        For example, the docker attribute can be defined here instead of directly
        defining it in each task's runtime { docker: "" } section.

        Args:
            directory:
                Directory to make workflow options JSON file.
            wdl:
                WDL file.
            backend:
                Backend to run a workflow on. If not defined, server's default or
                runner's Local backend will be used.
            inputs:
                Input JSON file to define input files/parameters for WDL.
                This will be overridden by the environment variable SINGULARITY_BINDPATH.
                For Singularity, it is required to find SINGULARITY_BINDPATH,
                which is a comma-separated list of common root directories
                for all files defined in the input JSON.
                Unlike Docker, Singularity binds directories instead of mounting them.
                Therefore, Caper will try to find an optimal SINGULARITY_BINDPATH
                by looking at all file paths and finding their common parent directories.
            custom_options:
                User's custom workflow options JSON file.
                This will be merged at the end of this function,
                so users can override values in Caper's auto-generated
                workflow options JSON file.
            docker:
                Docker image to run a workflow on.
            singularity:
                Singularity image to run a workflow on.
            singularity_cachedir:
                Singularity cache directory to build local images on.
                This will be overridden by the environment variable SINGULARITY_CACHEDIR.
            no_build_singularity:
                Caper runs "singularity exec IMAGE" to build a local Singularity image
                before submitting/running a workflow.
                With this flag on, Caper does not pre-build a local Singularity container.
                Instead, the Singularity container will be built inside each task,
                which results in multiple redundant local image builds.
                Also, multiple tasks building on the same Singularity image file can
                corrupt the image file.
            max_retries:
                Maximum number of retries for each task. 1 means 1 retry.
            memory_retry_multiplier:
                Multiplier for the memory retry feature.
                See https://cromwell.readthedocs.io/en/develop/cromwell_features/RetryWithMoreMemory/
                for details.
            gcp_monitoring_script:
                Monitoring script for GCP backend only.
                Useful to monitor resources on an instance.
            basename:
                Basename for a temporary workflow options JSON file.
        """
        if singularity and docker:
            raise ValueError('Cannot use both Singularity and Docker.')

        template = copy.deepcopy(self._template)
        dra = template[CaperWorkflowOpts.DEFAULT_RUNTIME_ATTRIBUTES]

        if backend:
            template['backend'] = backend

        wdl_parser = CaperWDLParser(wdl)
        if docker == '' or backend in (BACKEND_GCP,
                                       BACKEND_AWS) and not docker:
            # find "caper-docker" from WDL's workflow.meta
            # or "#CAPER docker" from comments
            docker = wdl_parser.caper_docker
            if docker:
                logger.info(
                    'Docker image found in WDL\'s metadata. wdl={wdl}, d={d}'.
                    format(wdl=wdl, d=docker))
            else:
                logger.warning(
                    "Docker image not found in WDL's metadata, which means that "
                    "docker is not defined either as comment (#CAPER docker) or "
                    "in workflow's meta section (under key caper_docker) in WDL. "
                    "If your WDL already has docker defined "
                    "in each task's runtime "
                    "then it should be okay. wdl={wdl}".format(wdl=wdl))
        if docker:
            dra['docker'] = docker

        if singularity == '':
            if backend in (BACKEND_GCP, BACKEND_AWS):
                raise ValueError(
                    'Singularity cannot be used for cloud backend (e.g. aws, gcp).'
                )

            singularity = wdl_parser.caper_singularity
            if singularity:
                logger.info(
                    'Singularity image found in WDL\'s metadata. wdl={wdl}, s={s}'
                    .format(wdl=wdl, s=singularity))
            else:
                raise ValueError(
                    'Singularity image not found in WDL. wdl={wdl}'.format(
                        wdl=wdl))
        if singularity:
            dra['singularity'] = singularity
            if singularity_cachedir:
                dra['singularity_cachedir'] = singularity_cachedir

            s = Singularity(singularity, singularity_cachedir)
            if inputs:
                dra['singularity_bindpath'] = s.find_bindpath(inputs)
            if not no_build_singularity:
                s.build_local_image()

        if max_retries is not None:
            dra['maxRetries'] = max_retries
        # Cromwell's bug in memory-retry feature.
        # Disabled until it's fixed on Cromwell's side.
        # if memory_retry_multiplier is not None:
        #     template['memory_retry_multiplier'] = memory_retry_multiplier

        if gcp_monitoring_script and backend == BACKEND_GCP:
            if not GCSURI(gcp_monitoring_script).is_valid:
                raise ValueError(
                    'gcp_monitoring_script is not a valid URI. {uri}'.format(
                        uri=gcp_monitoring_script))
            template['monitoring_script'] = gcp_monitoring_script

        if custom_options:
            s = AutoURI(custom_options).read()
            d = json.loads(s)
            merge_dict(template, d)

        final_options_file = os.path.join(directory, basename)
        AutoURI(final_options_file).write(
            json.dumps(template, indent=4) + '\n')

        return final_options_file
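
A hedged usage sketch of create_file as defined above. Only the create_file signature comes from the snippet; the import path, the no-argument constructor, and the file paths are assumptions:

from caper.caper_workflow_opts import CaperWorkflowOpts  # assumed module path

opts = CaperWorkflowOpts()  # constructor arguments, if any, are not shown in the snippet
options_json = opts.create_file(
    directory='/tmp/caper_work',
    wdl='/path/to/main.wdl',
    inputs='/path/to/inputs.json',
    backend='gcp',              # assumed to match BACKEND_GCP
    docker='ubuntu:20.04',      # explicit image; '' would trigger a lookup in the WDL metadata
    max_retries=1,
)
print(options_json)  # path of the generated workflow options JSON under `directory`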
Example #7
def test_run_gcp_with_life_sciences_api(
    tmp_path,
    gcs_root,
    ci_prefix,
    cromwell,
    womtool,
    gcp_prj,
    gcp_service_account_key_json,
    debug_caper,
):
    """Test run with Google Cloud Life Sciences API
    """
    out_gcs_bucket = os.path.join(gcs_root, 'caper_out', ci_prefix)
    tmp_gcs_bucket = os.path.join(gcs_root, 'caper_tmp')

    # prepare WDLs and input JSON, imports to be submitted
    make_directory_with_wdls(str(tmp_path))
    wdl = tmp_path / 'main.wdl'
    inputs = tmp_path / 'inputs.json'
    metadata = tmp_path / 'metadata.json'

    cmd = ['run', str(wdl)]
    cmd += ['--inputs', str(inputs)]
    cmd += ['-m', str(metadata)]
    if gcp_service_account_key_json:
        cmd += ['--gcp-service-account-key-json', gcp_service_account_key_json]
    cmd += ['--use-google-cloud-life-sciences']
    cmd += ['--gcp-region', 'us-central1']
    # --gcp-zones should be ignored
    cmd += ['--gcp-zones', 'us-west1-a,us-west1-b']
    cmd += ['--gcp-prj', gcp_prj]
    cmd += ['--gcp-memory-retry-error-keys', 'Killed']
    cmd += ['--gcp-memory-retry-multiplier', '1.5']
    cmd += ['--tmp-dir', str(tmp_path / 'tmp_dir')]
    cmd += ['--backend', 'gcp']
    cmd += ['--gcp-out-dir', out_gcs_bucket]
    cmd += ['--gcp-loc-dir', tmp_gcs_bucket]
    cmd += ['--cromwell-stdout', str(tmp_path / 'cromwell_stdout.o')]
    # test with file type DB
    cmd += ['--db', 'file']
    cmd += ['--db-timeout', '500000']
    cmd += ['--file-db', str(tmp_path / 'file_db_prefix')]
    cmd += ['--max-concurrent-tasks', '2']
    cmd += ['--max-concurrent-workflows', '2']
    cmd += ['--disable-call-caching']
    cmd += ['--cromwell', cromwell]
    cmd += ['--womtool', womtool]
    cmd += ['--java-heap-run', '4G']
    cmd += ['--docker', 'ubuntu:latest']
    if debug_caper:
        cmd += ['--debug']
    print(' '.join(cmd))

    cli_main(cmd)
    m_dict = json.loads(metadata.read_text())

    assert m_dict['status'] == 'Succeeded'

    # test CromwellMetadata.gcp_monitor() here
    # since it's for gcp only and this function is one of the two
    # test functions run on a gcp backend.
    # task main.t1 has a "sleep 10" so that monitoring_script has time to
    # write monitoring data to the `monitoringLog` file.
    cm = CromwellMetadata(m_dict)
    monitor_data = cm.gcp_monitor()
    for data in monitor_data:
        instance_cpu = data['instance']['cpu']
        instance_mem = data['instance']['mem']
        instance_disk = data['instance']['disk']
        assert instance_cpu >= 1
        assert instance_mem >= 1024 * 1024 * 1024
        assert instance_disk >= 10 * 1024 * 1024 * 1024

        max_cpu_percent = data['stats']['max']['cpu_pct']
        max_mem = data['stats']['max']['mem']
        max_disk = data['stats']['max']['disk']
        if max_cpu_percent or data['task_name'] == 'main.t1':
            assert max_cpu_percent <= 100.0
        if max_mem or data['task_name'] == 'main.t1':
            assert max_mem <= instance_mem
        if max_disk or data['task_name'] == 'main.t1':
            assert max_disk <= instance_disk

    # test cleanup on gcp backend (gs://)
    root_out_dir = cm.data['workflowRoot']

    # remote metadata JSON file on workflow's root output dir.
    remote_metadata_json_file = os.path.join(root_out_dir, 'metadata.json')
    assert GCSURI(remote_metadata_json_file).exists

    # dry-run should not delete anything
    cm.cleanup(dry_run=True)
    assert GCSURI(remote_metadata_json_file).exists

    cm.cleanup(dry_run=False)
    assert not GCSURI(remote_metadata_json_file).exists
Example #8
    def create_file(
        self,
        directory,
        wdl,
        backend=None,
        inputs=None,
        custom_options=None,
        docker=None,
        singularity=None,
        conda=None,
        max_retries=DEFAULT_MAX_RETRIES,
        memory_retry_multiplier=DEFAULT_MEMORY_RETRY_MULTIPLIER,
        gcp_monitoring_script=DEFAULT_GCP_MONITORING_SCRIPT,
        basename=BASENAME_WORKFLOW_OPTS_JSON,
    ):
        """Creates Cromwell's workflow options JSON file.
        A workflow options JSON file sets default values for attributes
        defined in the runtime {} section of WDL tasks.
        For example, the docker attribute can be defined here instead of directly
        defining it in each task's runtime { docker: "" } section.

        Args:
            directory:
                Directory to make workflow options JSON file.
            wdl:
                WDL file.
            backend:
                Backend to run a workflow on. If not defined, server's default or
                runner's Local backend will be used.
            inputs:
                Input JSON file to define input files/parameters for WDL.
                This will be overridden by the environment variable SINGULARITY_BINDPATH.
                For Singularity, it is required to find SINGULARITY_BINDPATH,
                which is a comma-separated list of common root directories
                for all files defined in the input JSON.
                Unlike Docker, Singularity binds directories instead of mounting them.
                Therefore, Caper will try to find an optimal SINGULARITY_BINDPATH
                by looking at all file paths and finding their common parent directories.
            custom_options:
                User's custom workflow options JSON file.
                This will be merged at the end of this function,
                so users can override values in Caper's auto-generated
                workflow options JSON file.
            conda:
                Default Conda environment name to run a workflow.
            docker:
                Default Docker image to run a workflow on.
            singularity:
                Default Singularity image to run a workflow on.
            max_retries:
                Maximum number of retries for each task. 1 means 1 retry.
            memory_retry_multiplier:
                Multiplier for the memory retry feature.
                See https://cromwell.readthedocs.io/en/develop/cromwell_features/RetryWithMoreMemory/
                for details.
            gcp_monitoring_script:
                Monitoring script for GCP backend only.
                Useful to monitor resources on an instance.
            basename:
                Basename for a temporary workflow options JSON file.
        """
        if singularity and docker:
            raise ValueError('Cannot use both Singularity and Docker.')

        template = copy.deepcopy(self._template)
        default_runtime_attributes = template[
            CaperWorkflowOpts.DEFAULT_RUNTIME_ATTRIBUTES]

        if backend:
            template['backend'] = backend

        wdl_parser = CaperWDLParser(wdl)

        # sanity check for environment flags
        defined_env_flags = [
            env for env in (docker, singularity, conda) if env
        ]
        if len(defined_env_flags) > 1:
            raise ValueError(
                'docker, singularity and conda are mutually exclusive. '
                'Define nothing or only one environment.')

        if docker is not None:
            environment = ENVIRONMENT_DOCKER
        elif singularity is not None:
            environment = ENVIRONMENT_SINGULARITY
        elif conda is not None:
            environment = ENVIRONMENT_CONDA
        else:
            environment = None

        if environment:
            default_runtime_attributes['environment'] = environment

        if docker == '' or backend in (BACKEND_GCP,
                                       BACKEND_AWS) and not docker:
            # if used as a flag or cloud backend is chosen
            # try to find "default_docker" from WDL's workflow.meta or "#CAPER docker" from comments
            docker = wdl_parser.default_docker
            if docker:
                logger.info(
                    'Docker image found in WDL metadata. wdl={wdl}, d={d}'.
                    format(wdl=wdl, d=docker))
            else:
                logger.info(
                    "Docker image not found in WDL metadata. wdl={wdl}".format(
                        wdl=wdl))

        if docker:
            default_runtime_attributes['docker'] = docker

        if singularity == '':
            # if used as a flag
            if backend in (BACKEND_GCP, BACKEND_AWS):
                raise ValueError(
                    'Singularity cannot be used for cloud backend (e.g. aws, gcp).'
                )

            singularity = wdl_parser.default_singularity
            if singularity:
                logger.info(
                    'Singularity image found in WDL metadata. wdl={wdl}, s={s}'
                    .format(wdl=wdl, s=singularity))
            else:
                logger.info(
                    'Singularity image not found in WDL metadata. wdl={wdl}.'.
                    format(wdl=wdl))

        if singularity:
            default_runtime_attributes['singularity'] = singularity
            if inputs:
                default_runtime_attributes[
                    'singularity_bindpath'] = find_bindpath(inputs)

        if conda == '':
            # if used as a flag
            if backend in (BACKEND_GCP, BACKEND_AWS):
                raise ValueError(
                    'Conda cannot be used for cloud backend (e.g. aws, gcp).')
            conda = wdl_parser.default_conda
            if conda:
                logger.info(
                    'Conda environment name found in WDL metadata. wdl={wdl}, s={s}'
                    .format(wdl=wdl, s=conda))
            else:
                logger.info(
                    'Conda environment name not found in WDL metadata. wdl={wdl}'
                    .format(wdl=wdl))

        if conda:
            default_runtime_attributes['conda'] = conda

        if max_retries is not None:
            default_runtime_attributes['maxRetries'] = max_retries
        # Cromwell's bug in memory-retry feature.
        # Disabled until it's fixed on Cromwell's side.
        # if memory_retry_multiplier is not None:
        #     template['memory_retry_multiplier'] = memory_retry_multiplier

        if gcp_monitoring_script and backend == BACKEND_GCP:
            if not GCSURI(gcp_monitoring_script).is_valid:
                raise ValueError(
                    'gcp_monitoring_script is not a valid URI. {uri}'.format(
                        uri=gcp_monitoring_script))
            template['monitoring_script'] = gcp_monitoring_script

        if custom_options:
            s = AutoURI(custom_options).read()
            d = json.loads(s)
            merge_dict(template, d)

        final_options_file = os.path.join(directory, basename)
        AutoURI(final_options_file).write(
            json.dumps(template, indent=4) + '\n')

        return final_options_file
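
This newer variant adds conda and records a single environment choice; docker, singularity and conda are mutually exclusive. A hedged sketch of calling it with Conda on a local backend (import path and constructor arguments are assumptions, as above):

from caper.caper_workflow_opts import CaperWorkflowOpts  # assumed module path

opts = CaperWorkflowOpts()
options_json = opts.create_file(
    directory='/tmp/caper_work',
    wdl='/path/to/main.wdl',
    inputs='/path/to/inputs.json',
    conda='',  # empty string acts as a flag: look up default_conda in the WDL metadata
)

# Passing more than one of docker/singularity/conda raises
# ValueError('docker, singularity and conda are mutually exclusive. ...').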