def digest(workflow_path: ComparisonPath, operations_path: ComparisonPath) -> JsonObject:
    def call_fn(succeeded_operations: Dict[CallName, JsonObject],
                operation_id: OperationId,
                path: CallNameSequence,
                attempt: JsonObject) -> None:
        backend_status = attempt.get('backendStatus', 'Unknown')
        # This script should only ever be pointed at successful workflow metadata. All jobs that have a backend
        # status other than `Success` must have later been re-run successfully, so any un`Success`ful attempts are
        # ignored. It's possible that a future version of the digester might actually want to look at these jobs
        # since they may have completed some lifecycle events which could be useful in accumulating more
        # performance data.
        if backend_status == 'Success':
            string_path = '.'.join(path)
            cromwell_start = attempt.get('start')
            cromwell_end = attempt.get('end')

            # Total wall-clock time of this attempt as recorded by Cromwell.
            cromwell_total_time_seconds = (dateutil.parser.parse(cromwell_end) -
                                           dateutil.parser.parse(cromwell_start)).total_seconds()

            # Load the previously downloaded PAPI operation metadata for this job.
            bare_operation_id = operation_id.split('/')[-1]
            operations_file_path = operations_path / f'{bare_operation_id}.json'
            operations_data = operations_file_path.read_text()
            operations_metadata = json.loads(operations_data)
            operation = OperationDigester.create(operations_metadata)

            # Time Cromwell spent on this attempt beyond the PAPI operation itself, rounded to milliseconds.
            papi_total_time_seconds = operation.total_time_seconds()
            cromwell_additional_total_time_seconds = \
                float("%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds))

            # Accumulate a digest entry for this successful attempt, keyed by its dotted call path.
            succeeded_operations[string_path] = {
                Attempt: attempt.get('attempt'),
                CromwellAdditionalTotalTimeSeconds: cromwell_additional_total_time_seconds,
                CromwellEnd: cromwell_end,
                CromwellStart: cromwell_start,
                CromwellTotalTimeSeconds: cromwell_total_time_seconds,
                DelocalizationTimeSeconds: operation.delocalization_time_seconds(),
                Disks: operation.disks(),
                DockerImagePullTimeSeconds: operation.docker_image_pull_time_seconds(),
                LocalizationTimeSeconds: operation.localization_time_seconds(),
                MachineType: operation.machine_type(),
                OperationIdKey: operation_id,
                OtherTimeSeconds: operation.other_time_seconds(),
                PapiCreate: operation.create_time(),
                PapiEnd: operation.end_time(),
                PapiStart: operation.start_time(),
                PapiTotalTimeSeconds: papi_total_time_seconds,
                ShardIndex: attempt.get('shardIndex'),
                StartupTimeSeconds: operation.startup_time_seconds(),
                UserCommandTimeSeconds: operation.user_command_time_seconds(),
            }

    # Visit every PAPI operation in the workflow metadata, accumulating digests of the successful attempts.
    data = workflow_path.read_text()
    metadata = json.loads(data)
    shards = operation_ids.visit_papi_operations(metadata, call_fn, initial_accumulator={})
    return {'version': Version, 'calls': shards, 'workflowId': metadata['id']}
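# A minimal usage sketch (illustrative, not part of the original module). It assumes
# `ComparisonPath` exposes a `create` factory taking a path string, which is an assumption
# here, and that `operations_path` holds one `<operation-id>.json` file per PAPI operation
# referenced by the workflow metadata. The paths are hypothetical.
#
#   workflow_path = ComparisonPath.create('comparison/workflow.json')   # hypothetical path
#   operations_path = ComparisonPath.create('comparison/operations')    # hypothetical path
#   workflow_digest = digest(workflow_path, operations_path)
#   print(json.dumps(workflow_digest, sort_keys=True, indent=2))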
def find_operation_ids_in_metadata(json_metadata: JsonObject) -> Sequence[AnyStr]:
    """Finds all instances of PAPI operation IDs in a workflow's metadata."""
    # e.g. given:
    # {
    #   "calls": {
    #     "workflow_name.task_name": [
    #       {
    #         "jobId": "projects/broad-dsde-cromwell-dev/operations/01234567891011121314",
    #         ...
    #
    # We want to extract "projects/broad-dsde-cromwell-dev/operations/01234567891011121314".
    def call_fn(acc: List[AnyStr],
                operation_id: OperationId,
                call_name_sequence: CallNameSequence,
                attempt: JsonObject) -> None:
        acc.append(operation_id)

    return visit_papi_operations(json_metadata, call_fn, initial_accumulator=[])
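# A minimal sketch of calling find_operation_ids_in_metadata on in-memory metadata,
# mirroring the shape shown in the comment above. It assumes visit_papi_operations
# walks the 'calls' structure and invokes call_fn once per job attempt.
#
#   sample_metadata = {
#       'calls': {
#           'workflow_name.task_name': [
#               {'jobId': 'projects/broad-dsde-cromwell-dev/operations/01234567891011121314'}
#           ]
#       }
#   }
#   find_operation_ids_in_metadata(sample_metadata)
#   # -> ['projects/broad-dsde-cromwell-dev/operations/01234567891011121314']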