Code Example #1
File: digester.py Project: whmzsu/cromwell
def digest(workflow_path: ComparisonPath, operations_path: ComparisonPath) -> JsonObject:
    def call_fn(succeeded_operations: Dict[CallName, JsonObject],
                operation_id: OperationId,
                path: CallNameSequence,
                attempt: JsonObject) -> None:
        backend_status = attempt.get('backendStatus', 'Unknown')
        # This script should only ever be pointed at successful workflow metadata. All jobs that have a backend status
        # other than `Success` must have later been re-run successfully, so any un`Success`ful attempts are ignored.
        # It's possible that a future version of the digester might actually want to look at these jobs since they
        # may have completed some lifecycle events which could be useful in accumulating more performance data.
        if backend_status == 'Success':
            string_path = '.'.join(path)
            cromwell_start = attempt.get('start')
            cromwell_end = attempt.get('end')

            cromwell_total_time_seconds = (dateutil.parser.parse(cromwell_end) -
                                           dateutil.parser.parse(cromwell_start)).total_seconds()

            bare_operation_id = operation_id.split('/')[-1]
            operations_file_path = operations_path / f'{bare_operation_id}.json'
            operations_data = operations_file_path.read_text()
            operations_metadata = json.loads(operations_data)
            operation = OperationDigester.create(operations_metadata)

            papi_total_time_seconds = operation.total_time_seconds()

            cromwell_additional_total_time_seconds = \
                float("%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds))

            succeeded_operations[string_path] = {
                Attempt: attempt.get('attempt'),
                CromwellAdditionalTotalTimeSeconds: cromwell_additional_total_time_seconds,
                CromwellEnd: cromwell_end,
                CromwellStart: cromwell_start,
                CromwellTotalTimeSeconds: cromwell_total_time_seconds,
                DelocalizationTimeSeconds: operation.delocalization_time_seconds(),
                Disks: operation.disks(),
                DockerImagePullTimeSeconds: operation.docker_image_pull_time_seconds(),
                LocalizationTimeSeconds: operation.localization_time_seconds(),
                MachineType: operation.machine_type(),
                OperationIdKey: operation_id,
                OtherTimeSeconds: operation.other_time_seconds(),
                PapiCreate: operation.create_time(),
                PapiEnd: operation.end_time(),
                PapiStart: operation.start_time(),
                PapiTotalTimeSeconds: operation.total_time_seconds(),
                ShardIndex: attempt.get('shardIndex'),
                StartupTimeSeconds: operation.startup_time_seconds(),
                UserCommandTimeSeconds: operation.user_command_time_seconds(),
            }

    data = workflow_path.read_text()
    metadata = json.loads(data)

    shards = operation_ids.visit_papi_operations(metadata, call_fn, initial_accumulator={})
    return {'version': Version, 'calls': shards, 'workflowId': metadata['id']}
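
A minimal usage sketch of digest, mirroring what main does in Code Example #3 below; the sample directory path here is hypothetical:

# Hypothetical invocation; assumes a directory laid out with workflow.json
# and an operations/ subdirectory, as main() in Code Example #3 expects.
parent = ComparisonPath.create('/tmp/sample')
result = digest(parent / 'workflow.json', parent / 'operations')
# The returned JsonObject carries 'version', 'calls' and 'workflowId' keys.
print(json.dumps(result, sort_keys=True, indent=4))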
Code Example #2
def gcs_parent(subdir: AnyStr,
               gcs_comparison_path_by_subdir: dict) -> ComparisonPath:
    """
    GcsComparisonPaths are somewhat expensive to create, so cache them.
    """
    if subdir not in gcs_comparison_path_by_subdir:
        path = ComparisonPath.create(
            f'gs://papi-performance-analysis/{subdir}')
        gcs_comparison_path_by_subdir[subdir] = path
    return gcs_comparison_path_by_subdir[subdir]
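
A quick sketch of the caching behavior, assuming (as the f-string implies) that ComparisonPath.create understands gs:// URLs; the subdir name is hypothetical:

cache = {}
first = gcs_parent('papi_v2', cache)   # hypothetical subdir; creates and caches the path
second = gcs_parent('papi_v2', cache)  # cache hit: no second GcsComparisonPath is built
assert first is second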
Code Example #3
File: digester.py Project: whmzsu/cromwell
def main(args: argparse.Namespace) -> None:
    for path in args.paths:
        parent_path = ComparisonPath.create(path)

        workflow_path = parent_path / 'workflow.json'
        operations_dir_path = parent_path / 'operations'

        digest_parent = parent_path / 'digests' / Version
        digest_path = digest_parent / 'digest.json'

        if not digest_path.exists() or args.force:
            digest_parent.mkdir_p()
            digest_json = digest(workflow_path, operations_dir_path)
            digest_string = json.dumps(digest_json, sort_keys=True, indent=4)
            digest_path.write_text(digest_string)
        else:
            raise ValueError(f'digest file already exists at {digest_path} and --force not specified')
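
main only reads args.paths and args.force, so a hedged sketch of CLI wiring that could drive it looks like this (the flag names are assumptions, not the project's actual parser):

# Hypothetical argument parsing for main(); only paths and --force are consumed.
parser = argparse.ArgumentParser(description='Digest Cromwell workflow metadata.')
parser.add_argument('paths', nargs='+', help='Local or GCS sample directories.')
parser.add_argument('--force', action='store_true',
                    help='Overwrite an existing digest.json.')
main(parser.parse_args())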
Code Example #4
    def test_operations_digestion(self) -> None:
        """
        This uses "real" metadata from the PAPI v2 performance spike to drive operations digester testing.
        The metadata is stored in GCS and copied down to the local machine if not already present from an earlier run.
        Operations digesters can run against either local or GCS paths using `ComparisonPath`s. Since GCS testing is
        slow, it is turned off by default; it can be enabled by setting the DIGESTER_TEST_GCS environment variable.
        """

        credentials, project_id = google.auth.default()
        storage_client = storage.Client(credentials=credentials)

        bucket_name = 'papi-performance-analysis'
        bucket = storage_client.get_bucket(bucket_name)

        # A cache of expensive-to-create GCS comparison paths.
        gcs_comparison_path_by_subdir = {}
        papi_versions = [VERSION_PAPI_V1, VERSION_PAPI_V2]

        for papi_version in papi_versions:
            subdir = subdir_for_papi_version(papi_version)
            local_parent = ComparisonPath.create(subdir)

            for sample_name in EXPECTATIONS.keys():
                download_metadata_from_gcs_if_needed(sample_name, local_parent, bucket)
                parents_to_test = [local_parent]
                # Skip slow GCS testing unless this environment variable is set.
                if os.environ.get('DIGESTER_TEST_GCS'):
                    parents_to_test.append(gcs_parent(subdir, gcs_comparison_path_by_subdir))

                for parent in parents_to_test:
                    description = parent.description()
                    logging.info(
                        f"Running operation digester on {description} sample '{sample_name}' backend {papi_version}")
                    sample_path = parent / sample_name

                    for operation in EXPECTATIONS.get(sample_name).get(papi_version).keys():
                        operations_path = sample_path / 'operations' / f'{operation}.json'
                        json_str = operations_path.read_text()
                        op_digester = OperationDigester.create(json.loads(json_str))
                        for key, value in EXPECTATIONS.get(sample_name).get(papi_version).get(operation).items():
                            method_to_call = getattr(op_digester, key)
                            self.assertEqual(method_to_call(), value, f'{key} was not {value}')
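
The nested lookups in this test imply the shape of its EXPECTATIONS constant (the digester test in Code Example #5 indexes a differently shaped one). A hedged sketch with hypothetical sample and operation names; the inner keys name OperationDigester methods seen in Code Example #1:

# Illustrative only: sample name -> PAPI version -> bare operation id
#   -> digester method name -> expected return value.
EXPECTATIONS = {
    'lots_of_inputs': {                  # hypothetical sample name
        VERSION_PAPI_V2: {
            'operation-1234': {          # hypothetical id; read from operations/operation-1234.json
                'machine_type': 'n1-standard-1',
                'total_time_seconds': 300.0,
            },
        },
    },
}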
Code Example #5
    def test_digestion(self) -> None:
        """
        This uses "real" metadata from the PAPI v2 performance spike to drive digester testing. The metadata is stored
        in GCS and copied down to the local machine if not already present from an earlier run. The digester can run
        against either local or GCS paths using `ComparisonPath`s. Local is nicer to iterate on than GCS since it
        runs so much more quickly. Since GCS testing is slow, it is turned off by default; it can be enabled by
        setting the DIGESTER_TEST_GCS environment variable.
        """

        credentials, project_id = google.auth.default()
        storage_client = storage.Client(credentials=credentials)

        bucket_name = 'papi-performance-analysis'
        bucket = storage_client.get_bucket(bucket_name)

        # A cache of expensive-to-create GCS comparison paths.
        gcs_comparison_path_by_subdir = {}
        papi_versions = [VERSION_PAPI_V1, VERSION_PAPI_V2]

        for papi_version in papi_versions:
            subdir = subdir_for_papi_version(papi_version)
            local_parent = ComparisonPath.create(subdir)

            for sample_name in EXPECTATIONS.keys():
                download_metadata_from_gcs_if_needed(sample_name, local_parent,
                                                     bucket)
                parents_to_test = [local_parent]
                # Skip slow GCS testing unless this environment variable is set.
                if os.environ.get('DIGESTER_TEST_GCS'):
                    parents_to_test.append(
                        gcs_parent(subdir, gcs_comparison_path_by_subdir))

                for parent in parents_to_test:
                    description = parent.description()
                    logging.info(
                        f"Running digester test on {description} for sample '{sample_name}' on backend {papi_version}"
                    )
                    sample_path = parent / sample_name
                    workflow_path = sample_path / 'workflow.json'
                    operations_path = sample_path / 'operations'
                    actual = digest(workflow_path, operations_path)

                    expected = EXPECTATIONS[sample_name][papi_version]
                    calls: JsonObject = actual.get('calls')

                    actual_total = len(calls)
                    self.assertEqual(actual_total, expected['total_jobs'])

                    for num_attempts in [1, 2, 3]:
                        actual_len = len(
                            list(
                                filter(
                                    more_than_x_attempts(calls, num_attempts),
                                    calls)))
                        self.assertEqual(
                            actual_len,
                            expected[f'more_than_{num_attempts}_attempts'])

                    for minutes_longer in range(3, 9):
                        actual_len = len(
                            list(
                                filter(
                                    more_than_x_minutes_longer(
                                        calls, minutes_longer), calls)))
                        expectation = expected[
                            f'cromwell_time_more_than_{minutes_longer}_minutes_longer_total']
                        self.assertEqual(actual_len, expectation)

                    # Currently just a smoke test to assert not-completely-insane results for both v1 and v2 digesters.

                    keys = [
                        StartupTimeSeconds, DockerImagePullTimeSeconds,
                        LocalizationTimeSeconds, UserCommandTimeSeconds,
                        DelocalizationTimeSeconds, PapiTotalTimeSeconds,
                        CromwellTotalTimeSeconds, OtherTimeSeconds
                    ]

                    for key in keys:
                        for name in calls:
                            self.assertTrue(
                                calls[name].get(key) >= 0,
                                f"failed for {papi_version} / {sample_name} / {key}"
                            )
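
more_than_x_attempts and more_than_x_minutes_longer are not part of these excerpts; the filter(...) calls above only tell us they return predicates over call names. A hedged sketch of plausible implementations, reusing the key constants from Code Example #1:

# Sketches only; the real helpers live elsewhere in the test module.
def more_than_x_attempts(calls, num_attempts):
    # Cromwell attempt numbers are 1-based, so a recorded attempt number
    # greater than num_attempts means the call was retried.
    return lambda name: (calls[name].get(Attempt) or 0) > num_attempts

def more_than_x_minutes_longer(calls, minutes):
    # Calls where Cromwell's wall-clock time exceeds PAPI's by more than
    # the given number of minutes.
    return lambda name: (calls[name].get(CromwellAdditionalTotalTimeSeconds) or 0) > minutes * 60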
Code Example #6
def json_from_path(string: AnyStr) -> JsonObject:
    path = ComparisonPath.create(str(RESOURCES) + '/comparer/' + string)
    return json.loads(path.read_text())
Code Example #7
def __read_resource(filename: str) -> AnyStr:
    return (ComparisonPath.create(str(RESOURCES)) / "comparer" / filename).read_text()
Code Example #8
File: comparer.py Project: ylpduxinghua/cromwell
def json_from_path_string(path_string: AnyStr) -> JsonObject:
    path = ComparisonPath.create(path_string)
    return json.loads(path.read_text())
Code Example #9
File: comparer.py Project: ylpduxinghua/cromwell
    # NOTE: this excerpt begins partway through the CLI setup; earlier
    # add_argument calls (--digest1, --digest2, --name1, --name2, --verbose,
    # --force) are not shown. The opening of the --output-path argument below
    # is inferred from the later use of args.output_path.
    parser.add_argument('--output-path',
                        type=str,
                        nargs=1,
                        required=True,
                        help='Path to output CSV file.')
    parser.add_argument('--call-prefix-to-remove',
                        metavar='CALL_PREFIX_TO_REMOVE',
                        type=str,
                        nargs='*',
                        help='Call prefix to remove if present.')

    args = parser.parse_args()
    set_log_verbosity(args.verbose)
    quieten_chatty_imports()
    logger.info("Starting Comparer operation.")

    _json_1, _json_2 = [
        json_from_path_string(p[0]) for p in [args.digest1, args.digest2]
    ]

    prefixes = [] if not args.call_prefix_to_remove else args.call_prefix_to_remove

    comparison_data = compare_jsons(_json_1, _json_2, args.name1[0],
                                    args.name2[0], prefixes, args.force)
    out_path = args.output_path[0]
    out = ComparisonPath.create(out_path)
    if out.exists() and not args.force:
        raise ValueError(
            f"Specified output file '{out_path}' already exists and --force not specified."
        )
    out.write_text(csv_string_from_data(comparison_data))

    logger.info('Comparer operation completed successfully.')
Code Example #10
def validate_path(p: AnyStr) -> AnyStr:
    if ComparisonPath.is_valid_path_string(p):
        return p
    raise ValueError(f'{p} is not a valid path whatsoever')
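
Given the ValueError convention, this reads like an argparse type= validator; a hypothetical wiring, reusing the --digest1 flag seen in Code Example #9:

# Hypothetical: argparse reports the ValueError as a clean usage error.
parser = argparse.ArgumentParser()
parser.add_argument('--digest1', type=validate_path, nargs=1)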