Example #1
def parse_workload(pegasus_db_path, filename=NAME, workload_domain="", workload_description=""):
    if not os.path.exists(TARGET_DIR):
        os.makedirs(TARGET_DIR)

    conn = sqlite3.connect(pegasus_db_path)
    c = conn.cursor()

    workflows = parse_workflows(c)

    for w in workflows:
        w.compute_critical_path()

    # Write the workflow objects to parquet
    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True)
    workflow_df = pd.DataFrame([workflow.get_parquet_dict() for workflow in workflows])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"), engine="pyarrow")

    # Write all tasks to parquet
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for wf in workflows for task in wf.tasks])
    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    pyarrow_task_schema = Task.get_pyarrow_schema()
    table = pa.Table.from_pandas(task_df, schema=pyarrow_task_schema, preserve_index=False)

    # Pandas does not know the difference between an empty list and a list of integers,
    # so type mismatches can occur. Therefore we write the task table with pyarrow
    # directly, using an explicit schema.
    pq.write_table(table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"))

    # generate workload description
    authors_list = []

    w = Workload(workflows, workload_domain, authors_list, workload_description)

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(task_df,
                                                                  domain="Scientific",
                                                                  authors=["Pegasus Team"],
                                                                  workload_description=""  # TODO fill in
                                                                  )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)

    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))

    conn.close()
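A minimal invocation sketch for this parser, assuming TARGET_DIR and NAME are module-level constants of the surrounding script; the database path and description below are placeholders.

# Hypothetical usage; the path and description are placeholders.
if __name__ == "__main__":
    parse_workload("/path/to/pegasus/workflow.db",
                   workload_domain="Scientific",
                   workload_description="Pegasus workflow trace")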
Example #2
def parse(source_dir, interarrival_times):
    inter_arrivals = []

    with open(interarrival_times, 'r') as arrival_file:
        for line in arrival_file:
            inter_arrivals.append(int(line))

    dax_files = os.listdir(source_dir)

    files_to_read = []
    arrival_time = 0
    for index, f in enumerate(sorted(dax_files)):
        # Ilyushkin et al. used the first two hundred DAX files. Each folder should
        # contain 200 files, but one folder contains more than 200, of which only
        # the first 200 were used by Ilyushkin et al.
        if index == 200:
            break

        files_to_read.append(read_file(source_dir, f, index, arrival_time, TARGET_DIR))

        arrival_time += inter_arrivals[index]

    meta_dict = Workflow.get_parquet_meta_dict()
    meta_dict["id"] = np.int64  # Add the id because we are not using a grouped dataframe here.
    meta_dict = OrderedDict(sorted(meta_dict.items(), key=lambda t: t[0]))

    wta_tasks = pd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()))

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(wta_tasks, domain="Scientific",
                                                                authors=["Alexey Ilyushkin", "Ahmed Ali-Eldin",
                                                                         "Nikolas Herbst",
                                                                         "Alessandro Vittorio Papadopoulos",
                                                                         "Bogdan Ghit", "Dick H. J. Epema",
                                                                         "Alexandru Iosup"],
                                                                workload_description="This workload was used in the 2017 ICPE paper titles \"An experimental performance evaluation of autoscaling policies for complex workflows\" by Ilyushkin et al. It features a combination of LIGO, SIPHT, and Montage executes on the DAS5 supercomputer."
                                                                )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)

    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
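The interarrival_times argument is expected to be a plain-text file with one integer inter-arrival gap per line. A small sketch of preparing such a file and invoking the parser; both paths are placeholders and TARGET_DIR is assumed to be a module-level constant.

# Hypothetical usage with placeholder paths and made-up inter-arrival gaps.
if __name__ == "__main__":
    with open("/tmp/interarrival_times.txt", "w") as f:
        f.write("\n".join(str(gap) for gap in [0, 10, 25, 5]))  # one integer per line

    parse("/path/to/dax_files", "/tmp/interarrival_times.txt")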
Example #3
def parse(root, workflowhub_file_path_from_root):
    path = os.path.join(root, workflowhub_file_path_from_root)

    # Set the target dir to use the same folder structure as WorkflowHub.
    global TARGET_DIR
    TARGET_DIR = os.path.join(os.path.dirname(os.getcwd()), 'output_parquet', 'workflowhub',
                              os.path.splitext(workflowhub_file_path_from_root)[0])  # drop the ".json" extension

    with open(path) as json_file:
        json_data = json.load(json_file)
        authors = [json_data['author']['name']]

    meta_dict = Workflow.get_parquet_meta_dict()
    meta_dict["id"] = np.int64  # Add the id because we are not using a grouped dataframe here.
    meta_dict = OrderedDict(sorted(meta_dict.items(), key=lambda t: t[0]))

    workflow_df = dd.from_delayed(parse_and_return_task_dataframe(path), meta=meta_dict)
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path()),
                           engine="pyarrow",
                           compute=True)

    wta_tasks = dd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()),
                                engine="pyarrow",
                                index=False)

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_dask_task_dataframe(wta_tasks, domain="Scientific",
                                                                authors=authors,
                                                                workload_description="Workload downloaded from WorkflowHub, see http://workflowhub.isi.edu/."
                                                                )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)

    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
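Each of these parsers repeats the same json.dumps default hook for numpy integers. A standalone sketch of that pattern, shown here only for illustration:

import json
import numpy as np

def np_int64_default(o):
    # json cannot serialize numpy integer scalars directly, so convert them to plain ints.
    if isinstance(o, np.int64):
        return int(o)
    raise TypeError("Object of type {} is not JSON serializable".format(type(o).__name__))

print(json.dumps({"task_count": np.int64(42)}, default=np_int64_default))  # {"task_count": 42}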
Example #4
def parse(gwf_filename):
    os.makedirs(TARGET_DIR, exist_ok=True)

    gwf_tasks = pd.read_csv(gwf_filename,
                            skipinitialspace=True,
                            dtype={
                                "WorkflowID": np.int64,
                                "JobID": np.int64,
                                "SubmitTime": np.int64,
                                "Runtime": np.int64,
                                "NProcs": np.int32,
                                "Dependencies": np.str,
                            })
    del gwf_tasks["ReqNProcs "]
    gwf_tasks.columns = [
        "workflow_id", "id", "ts_submit", "runtime",
        "resource_amount_requested", "dependencies"
    ]
    gwf_tasks['resource_amount_requested'] = gwf_tasks[
        'resource_amount_requested'].astype(np.float64)

    gwf_tasks = gwf_tasks.assign(ts_submit=lambda x: x['ts_submit'] * 1000
                                 )  # Convert the submit time to milliseconds.

    gwf_tasks_with_parents = gwf_tasks.assign(
        parents=gwf_tasks["dependencies"].str.split().apply(
            lambda l: [np.int64(i) for i in l] if type(l) is list else []))

    del gwf_tasks_with_parents[
        "dependencies"]  # We need to recompute these (and the column name is wrong), so delete.

    # Add columns not present in the trace.
    gwf_tasks_with_parents["type"] = np.str("composite")
    gwf_tasks_with_parents["resource_type"] = np.str("thread")
    gwf_tasks_with_parents["submission_site"] = np.int32(0)
    gwf_tasks_with_parents["user_id"] = np.int32(-1)
    gwf_tasks_with_parents["group_id"] = np.int32(-1)
    gwf_tasks_with_parents["nfrs"] = np.str(" ")
    gwf_tasks_with_parents["wait_time"] = np.int64(-1)
    gwf_tasks_with_parents["params"] = np.str("{}")
    gwf_tasks_with_parents["memory_requested"] = np.int64(-1)
    gwf_tasks_with_parents["disk_io_time"] = np.int64(-1)
    gwf_tasks_with_parents["disk_space_requested"] = np.int64(-1)
    gwf_tasks_with_parents["energy_consumption"] = np.int64(-1)
    gwf_tasks_with_parents["network_io_time"] = np.int64(-1)
    gwf_tasks_with_parents["resource_used"] = np.str("[]")

    # We need to make sure that all pandas dataframes follow the same column order.
    # The dask dataframe is built up from different pandas dataframes, so they must match.
    gwf_tasks_with_parents = gwf_tasks_with_parents[sorted(
        gwf_tasks_with_parents.columns)]

    gwf_tasks_with_children = gwf_tasks_with_parents.groupby("workflow_id").apply(compute_children) \
        .reset_index(drop=True)

    # Make sure the first task has ts_submit of zero.
    min_submit_time = gwf_tasks_with_children["ts_submit"].min()
    task_df_final = gwf_tasks_with_children.assign(
        ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df_final.to_parquet(
        os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"))

    # Compute workflow properties specific to this trace
    workflow_df = task_df_final.groupby("workflow_id").apply(
        compute_workflow_features).reset_index(drop=True)

    workflow_df = workflow_df.rename(columns={"workflow_id": "id"})

    workflow_df = workflow_df[sorted(workflow_df.columns)]

    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df.to_parquet(
        os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"))

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df_final,
        domain="Industrial",
        authors=[
            "Shenjun Ma", "Alexey Ilyushkin", "Alexander Stegehuis",
            "Alexandru Iosup"
        ],
        workload_description=
        "Chronos is a trace from Shell's Chronos IoT production system. It contains pipelines where sensor data is obtained, checked if values are within range (e.g. temperature, operational status, etc.), and the outcomes are written to persistent storage."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
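compute_children is a helper from the surrounding module and its code is not shown here. Below is a simplified, self-contained sketch of the parent-to-children inversion it is used for, not the repository's actual implementation:

import pandas as pd

def add_children_column(workflow_tasks: pd.DataFrame) -> pd.DataFrame:
    # Invert the "parents" lists of one workflow into a "children" list per task.
    children = {task_id: [] for task_id in workflow_tasks["id"]}
    for task_id, parents in zip(workflow_tasks["id"], workflow_tasks["parents"]):
        for parent_id in parents:
            if parent_id in children:
                children[parent_id].append(task_id)
    return workflow_tasks.assign(children=workflow_tasks["id"].map(children))

df = pd.DataFrame({"id": [1, 2, 3], "parents": [[], [1], [1, 2]]})
print(add_children_column(df)["children"].tolist())  # [[2, 3], [3], []]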
Example #5
def parse():
    os.makedirs(TARGET_DIR, exist_ok=True)
    task_counter = 0
    workflow_counter = 0
    processed_workflows = []
    final_workflows = []
    final_tasks = []
    task_offset = 0
    workflow_offset = None

    for wi_row in WORKFLOW_INVOCATIONS.itertuples():
        flag = False
        # only use one execution of a workflow
        if wi_row[4] in processed_workflows:
            continue

        # check if workflow contains cycles
        workflow_row = WORKFLOWS.loc[(WORKFLOWS["id"] == getattr(
            wi_row, "workflow_id"))]
        if workflow_row.iloc[0]["has_cycles"] == "t":
            continue

        # Workflows contain a number of workflow steps, but a step id is not the id of its
        # actual execution. This list ties the workflow steps to their actual execution (job) ids.
        step_job_ids = []

        tasks_in_workflow = []
        workflow_index = wi_row[4]
        # check if workflow id is null
        if pd.isnull(workflow_index):
            continue

        df = WORKFLOW_INVOKE_STEPS.loc[(
            WORKFLOW_INVOKE_STEPS["workflow_invocation_id"] == getattr(
                wi_row, "id"))]

        # check if workflow is not empty
        if df.empty:
            processed_workflows.append(workflow_index)
            continue

        for wis_row in df.itertuples():

            # check if entry in WF_INVOKE_STEPS has the same wf_invocation_id
            if getattr(wis_row,
                       "workflow_invocation_id") == getattr(wi_row, "id"):

                # check if required fields are not empty
                if check_if_empty(getattr(wis_row, "workflow_step_id"),
                                  getattr(wis_row, "job_id")):
                    processed_workflows.append(workflow_index)
                    flag = True
                    break

                # get step id and corresponding execution id
                step_job_pair = [
                    getattr(wis_row, "workflow_step_id"),
                    getattr(wis_row, "job_id")
                ]
                step_job_ids.append(step_job_pair)

                job_id = getattr(wis_row, "job_id")
                submit_time = int(((datetime.strptime(
                    getattr(wis_row, "create_time"), DATETIME_FORMAT) -
                                    EPOCH).total_seconds()) * 1000)
                job_metrics = METRICS.loc[(METRICS["job_id"] == job_id)]
                runtime = job_metrics.loc[
                    (job_metrics["metric_name"] == "runtime_seconds"),
                    'metric_value'] * 1000
                memory = job_metrics.loc[(job_metrics["metric_name"] ==
                                          "memory.memsw.max_usage_in_bytes"),
                                         'metric_value']
                cpu_time = job_metrics.loc[(
                    job_metrics["metric_name"] == "cpuacct.usage"),
                                           'metric_value']

                # check if any required fields are empty
                if runtime.empty or memory.empty or cpu_time.empty:
                    processed_workflows.append(workflow_index)
                    flag = True
                    break

                # Used to find the task with the lowest submit time; that time is used as the offset.
                if task_offset == 0:
                    task_offset = submit_time
                elif submit_time < task_offset:
                    task_offset = submit_time

                runtime = runtime.iloc[0]
                memory = memory.iloc[0]
                cpu_time = cpu_time.iloc[0] / 1000000

                if cpu_time > runtime:
                    cpu_time = runtime

                task = Task(np.int64(job_id),
                            "Composite",
                            submit_time,
                            0,
                            runtime,
                            1,
                            None,
                            workflow_index,
                            -1,
                            "cpu-time",
                            resource=cpu_time,
                            memory_requested=memory)
                task_counter += 1
                tasks_in_workflow.append(task)
                flag = False

        # If flag is true, a task in the workflow is not usable, so we skip the workflow.
        if flag:
            processed_workflows.append(workflow_index)
            continue

        # compute children of tasks
        final_tasks.extend(compute_children(step_job_ids, tasks_in_workflow))

        workflow_submit_time = int(
            ((datetime.strptime(getattr(wi_row, "create_time"),
                                DATETIME_FORMAT) - EPOCH).total_seconds()) *
            1000)

        # find smallest workflow submit time as offset
        if workflow_offset is None:
            workflow_offset = workflow_submit_time
        elif workflow_submit_time < workflow_offset:
            workflow_offset = workflow_submit_time

        workflow = Workflow(workflow_index, workflow_submit_time,
                            tasks_in_workflow, "core", "Engineering", "Galaxy",
                            "Biological Engineering")
        workflow.compute_critical_path()
        processed_workflows.append(workflow_index)
        final_workflows.append(workflow)
        workflow_counter += 1

    # apply offset
    for x in final_tasks:
        x.ts_submit = x.ts_submit - task_offset

    # apply offset
    for y in final_workflows:
        y.ts_submit = y.ts_submit - workflow_offset

    # make tasks dataframe
    task_df = pd.DataFrame([t.get_parquet_dict() for t in final_tasks])

    # create parquet file in specified folder
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(),
                                    "part.0.parquet"),
                       engine="pyarrow")

    # make workflows dataframe
    workflow_df = pd.DataFrame([w.get_parquet_dict() for w in final_workflows])

    # create parquet file in specified folder
    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain="Biological Engineering",
        authors=["Jaro Bosch", "Laurens Versluis"],
        workload_description=
        "Traces from different biomedical research workflows, executed on the public Galaxy server in Europe."
    )
    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
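The submit times above are computed with the same strptime-minus-EPOCH pattern. A small sketch of that conversion in isolation; DATETIME_FORMAT here is an assumed example format, not necessarily the constant defined by the module:

from datetime import datetime

EPOCH = datetime(1970, 1, 1)
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"  # assumed format, for illustration only

def to_unix_millis(timestamp_string):
    # Milliseconds since the Unix epoch, as used for ts_submit in the parsers above.
    return int((datetime.strptime(timestamp_string, DATETIME_FORMAT) - EPOCH).total_seconds() * 1000)

print(to_unix_millis("2018-03-01 12:00:00.000000"))  # 1519905600000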
Example #6
def parse_and_return_task_dataframe(file_path):
    global TARGET_DIR
    with open(file_path) as trace:
        json_data = json.load(trace)

        workflow = json_data['workflow']
        tasks = workflow['jobs']
        machines = workflow['machines']
        date = json_data['createdAt']

        # Convert the creation date from a datetime string to milliseconds since the epoch.
        task_date = dateparser.parse(date)
        EPOCH = datetime(1970, 1, 1, tzinfo=task_date.tzinfo)
        ts_submit = int((task_date - EPOCH).total_seconds() * 1000)

        resource_by_id = dict()

        for machine in machines:
            machine_id = mmh3.hash64("machine:{0}".format(machine['machine_code'].strip()))[0]
            machine = machine["machine"]
            num_cpus = machine['cpu']['count']
            details = {
                "cpu_vendor": machine['cpu']['vendor'],
                "architecture": machine['architecture']
            }
            memory_in_gb = int(machine['memory']) / float(1024 * 1024)
            res = Resource(machine_id, "cluster_node", num_cpus, machine['release'], memory_in_gb, -1, -1,
                           machine['system'], details)
            resource_by_id[machine_id] = res

        task_list = []
        task_state_list = []
        inputs_per_taskid = dict()
        outputs_per_taskid = dict()
        outputs_matched = dict()

        task_per_taskid = dict()

        input_file_data_per_task_id = dict()
        output_file_data_per_task_id = dict()
        for task in tasks:
            task_id = mmh3.hash64("task:{}".format(str(task['name']).strip()))[0]
            print(task_id)
            task_files = task['files'] if 'files' in task else []
            task_type = task['type']
            task_cores = task['cores'] if 'cores' in task else 1
            task_memory = task['memory'] if 'memory' in task else -1
            task_runtime = task['runtime'] * 1000 if 'runtime' in task else -1
            task_dependencies = [mmh3.hash64("task:{}".format(str(p).strip()))[0] for p in
                                 task['parents']]
            task_parameters = {"arguments": task['arguments']} if 'arguments' in task else {}
            task_machine = mmh3.hash64("machine:{0}".format(task['machine'].strip()))[0] if 'machine' in task else None
            task_resource = resource_by_id[task_machine].id if 'machine' in task else -1
            # Convert energy to Wh from KWh
            task_total_energy_consumption = float(task['energy']) * 1000 if 'energy' in task else -1

            t = Task(task_id, task_type, ts_submit, -1, task_runtime, task_cores, task_dependencies, 0, -1,
                     params=task_parameters, resource=task_resource, energy_consumption=task_total_energy_consumption,
                     resource_type="core")

            task_per_taskid[task_id] = t
            task_list.append(t)

            # Parse the data transfers
            for file_item in task_files:
                # Not all traces were converted to format version 0.2, despite being in the 0.2
                # folders, so we need a fallback for the file name and size fields.
                file_name = file_item['name'] if 'name' in file_item else file_item['fileId']
                file_size = file_item['size'] if 'size' in file_item else -1

                # Store the incoming and outgoing data to this task in separate dicts
                if file_item['link'] == "input":
                    if task_id not in inputs_per_taskid:
                        inputs_per_taskid[task_id] = set()
                        input_file_data_per_task_id[task_id] = dict()

                    # Record the input outside the initialization check, mirroring the output branch below.
                    inputs_per_taskid[task_id].add(file_name)

                    try:
                        input_file_data_per_task_id[task_id][file_name] = file_size
                    except:
                        print(file_item)
                        exit(-1)

                elif file_item['link'] == "output":
                    if task_id not in outputs_per_taskid:
                        outputs_per_taskid[task_id] = set()
                        outputs_matched[task_id] = dict()
                        output_file_data_per_task_id[task_id] = dict()

                    outputs_per_taskid[task_id].add(file_name)
                    outputs_matched[task_id][file_name] = False
                    output_file_data_per_task_id[task_id][file_name] = file_size

            # Create a single task state spanning the entire duration of the task.
            task_state = TaskState(ts_submit, ts_submit + task_runtime, 0, task_id, -1,
                                   canonical_memory_usage=task_memory)
            task_state_list.append(task_state)

        # Make sure the earliest task starts at 0.
        min_ts_submit = min(task.ts_submit for task in task_list)
        for task in task_list:
            # Update the time
            task.ts_submit -= min_ts_submit
            for parent in task.parents:  # Also since we have all parent info, set them in the same loop
                task_per_taskid[parent].children.add(task.id)

        # Offset task states too
        for taskstate in task_state_list:
            taskstate.ts_start -= min_ts_submit
            taskstate.ts_end -= min_ts_submit

        data_transfer_id = 0
        # Since tasks can output files with the same name as other tasks, we must loop over a task's parents
        # and match the output names against input names.
        for task in task_list:  # For every task we have
            if task.id not in inputs_per_taskid: continue
            inputs = inputs_per_taskid[task.id]
            # We loop over the parents (no need to check children, they will come later)
            for dep in task.parents:
                outputs = outputs_per_taskid[dep] if dep in outputs_per_taskid else set()
                overlap = inputs.intersection(outputs)  # Check for overlap
                if len(overlap) > 0:  # We have input-output pairs, loop to construct datatransfers
                    for file_name in overlap:
                        # Get the size and construct a datatransfer object.
                        data_size = output_file_data_per_task_id[dep][file_name]
                        datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id,
                                                    data_size)
                        # Assign it to the tasks
                        task_per_taskid[dep].datatransfers.append(datatransfer)
                        task.datatransfers.append(datatransfer)
                        outputs_matched[dep][file_name] = True

                        # Remove the file from the input as it's covered. Do NOT remove it from output,
                        # the same output file may be used by another task (fan-out structure).
                        inputs.remove(file_name)
                        data_transfer_id += 1

            # Loop over the remaining input files. Since we do not have a source, we assume they
            # are present on the filesystem beforehand.
            for file_name in inputs:
                data_size = input_file_data_per_task_id[task.id][file_name]
                datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                task.datatransfers.append(datatransfer)
                data_transfer_id += 1

        # Loop over the outputs and create a datatransfer for those that are not matched yet.
        # These are likely files with final results that do not have a destination.
        for task_id in outputs_matched.keys():
            for file_name in outputs_matched[task_id].keys():
                if not outputs_matched[task_id][file_name]:
                    task = task_per_taskid[task_id]
                    data_size = output_file_data_per_task_id[task_id][file_name]
                    datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                    task.datatransfers.append(datatransfer)

        filename_for_this_partition = "part.0.parquet"

        # Write all tasks to parquet
        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
        task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])
        task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), filename_for_this_partition), engine="pyarrow")

        # Write all task states to parquet
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True)
        task_state_df = pd.DataFrame([task_state.get_parquet_dict() for task_state in task_state_list])
        task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(), filename_for_this_partition),
                                 engine="pyarrow")

        # Write all data transfers to parquet
        if any(len(task.datatransfers) for task in task_list):
            os.makedirs(os.path.join(TARGET_DIR, Datatransfer.output_path()), exist_ok=True)
            datatransfer_df = pd.DataFrame(
                [datatransfer.get_parquet_dict() for task_item in task_list for datatransfer in
                 task_item.datatransfers])

            datatransfer_df.to_parquet(
                os.path.join(TARGET_DIR, Datatransfer.output_path(), filename_for_this_partition),
                engine="pyarrow")

        # Write the workflows to parquet
        wf_agnostic_df = compute_characteristics(task_df)
        workflow_ts_submit = task_df["ts_submit"].min()

        # Determine the application name and field
        application_names = {
            "epigenomics": ("Epigenomics", "Bioinformatics"),
            "montage": ("Montage", "Astronomy"),
            "soykb": ("SoyKB", "Bioinformatics"),
        }

        application_name = ""
        application_field = ""
        for key in application_names.keys():
            if key in file_path:
                application_name = application_names[key][0]
                application_field = application_names[key][1]

        workflow = Workflow(0, workflow_ts_submit, task_list, "Pegasus", "Scientific", application_name,
                            application_field)
        workflow.compute_critical_path()

        wf_df = pd.DataFrame([workflow.get_parquet_dict()])

        return wf_df
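Task and machine ids above are derived by hashing names with mmh3. A minimal sketch of that id scheme; the names passed in are placeholders:

import mmh3

def stable_id(kind, name):
    # mmh3.hash64 returns a pair of signed 64-bit integers; the parsers above keep the first.
    return mmh3.hash64("{}:{}".format(kind, str(name).strip()))[0]

print(stable_id("task", "ID0000001"))
print(stable_id("machine", "worker-01"))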
Example #7
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1])

    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 supercomputer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] +
                                                      ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path to spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        print("######\nStart parsing Tasks\n######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            header='true', inferschema='true').load(
                os.path.join(path_to_dir, '*.csv.processed'))

        # Drop the pref column to save memory, and filter out unsuccessful jobs as their information is not reliable.
        task_df = task_df.drop('pref').filter(
            task_df.status == ":instance.status/success").drop(
                'status').cache()

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def sub_two_datetimes(s1, s2):
            arr = []
            for i in s1.keys():
                d1 = datetime.datetime.strptime(s1[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                d2 = datetime.datetime.strptime(s2[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                arr.append(int((d2 - d1).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df \
            .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \
            .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time')))

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def date_time_to_unix(series):
            arr = []
            epoch = datetime.datetime.utcfromtimestamp(0)
            for i in series.keys():
                arr.append(
                    np.int64((datetime.datetime.strptime(
                        series[i], '%a %b %d %H:%M:%S %Z %Y') -
                              epoch).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df.withColumn(
            'submit-time',
            date_time_to_unix(F.col('submit-time'))).withColumnRenamed(
                'submit-time',
                "ts_submit").drop('start-time').drop('end-time').cache()

        min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0]
        task_df = task_df.withColumn('ts_submit',
                                     F.col('ts_submit') - F.lit(min_ts))

        @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR)
        def convert_to_kb(v):
            return v * 1024

        task_df = task_df.withColumn('memory', convert_to_kb(
            task_df.memory)).withColumnRenamed("memory", "memory_consumption")

        @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
        def string_to_int(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash(v[i], signed=True))

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def string_to_long(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash64(v[i], signed=True)[0])

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    arr.append(
                        mmh3.hash64(uuid4().bytes, signed=True)
                        [0])  # Assign a UUID, collision chance is negligible.

            return pd.Series(arr)

        task_df = task_df.withColumn('user', string_to_int(
            task_df.user)).withColumnRenamed("user", "user_id")
        task_df = task_df.withColumn('job-uuid',
                                     string_to_long(
                                         F.col('job-uuid'))).withColumnRenamed(
                                             'job-uuid', 'task_id')

        type_udf = F.udf(lambda x: "Independent" if x is None else "Composite",
                         T.StringType())
        task_df = task_df.withColumn('type', type_udf(task_df.simset))

        task_df = task_df.withColumn('simset',
                                     assign_workflow_ids(
                                         F.col('simset'))).withColumnRenamed(
                                             'simset', "workflow_id")
        task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested')

        task_df = task_df.withColumnRenamed('instance', 'resource_used')

        # Set the static items that are not present in the trace
        task_df = task_df.withColumn('submission_site', F.lit(0))
        task_df = task_df.withColumn('parents',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('children',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('group_id', F.lit(0))
        task_df = task_df.withColumn('nfrs', F.lit("{}"))
        task_df = task_df.withColumn('params', F.lit("{}"))
        task_df = task_df.withColumn('memory_requested', F.lit(-1))
        task_df = task_df.withColumn('network_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_space_requested', F.lit(-1))
        task_df = task_df.withColumn('energy_consumption', F.lit(-1))

        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()),
                    exist_ok=True)
        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()),
                              mode="overwrite",
                              compression="snappy")
        print("######\nDone parsing Tasks\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\nStart parsing TaskState\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        task_state_structtype = T.StructType([
            T.StructField("ts_start", T.LongType(), False),
            T.StructField("ts_end", T.LongType(), False),
            T.StructField("workflow_id", T.LongType(), False),
            T.StructField("task_id", T.LongType(), False),
            T.StructField("resource_id", T.LongType(), False),
            T.StructField("cpu_rate", T.DoubleType(), False),
            T.StructField("canonical_memory_usage", T.DoubleType(), False),
            T.StructField("assigned_memory", T.DoubleType(), False),
            T.StructField("minimum_memory_usage", T.DoubleType(), False),
            T.StructField("maximum_memory_usage", T.DoubleType(), False),
            T.StructField("disk_io_time", T.DoubleType(), False),
            T.StructField("maximum_disk_bandwidth", T.DoubleType(), False),
            T.StructField("local_disk_space_usage", T.DoubleType(), False),
            T.StructField("maximum_cpu_rate", T.DoubleType(), False),
            T.StructField("maximum_disk_io_time", T.DoubleType(), False),
            T.StructField("sample_rate", T.DoubleType(), False),
            T.StructField("sample_portion", T.DoubleType(), False),
            T.StructField("sampled_cpu_usage", T.DoubleType(), False),
            T.StructField("network_io_time", T.DoubleType(), False),
            T.StructField("maximum_network_bandwidth", T.DoubleType(), False),
        ])

        @F.pandas_udf(returnType=task_state_structtype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_task_states(df):
            workflow_id = df['workflow_id'].iloc[0]
            task_id = df['task_id'].iloc[0]
            ts_start = df['ts_submit'].min()
            ts_end = ts_start + df['runtime'].max()
            resource_id = df['resource_used'].iloc[0]
            cpu_rate = -1
            canonical_memory_usage = df['memory_consumption'].mean()
            assigned_memory = -1
            minimum_memory_usage = df['memory_consumption'].min()
            maximum_memory_usage = df['memory_consumption'].max()
            disk_io_time = -1
            maximum_disk_bandwidth = -1
            local_disk_space_usage = -1
            maximum_cpu_rate = -1
            maximum_disk_io_time = -1
            sample_rate = -1
            sample_portion = -1
            sampled_cpu_usage = -1
            network_io_time = -1
            maximum_network_bandwidth = -1

            data_dict = {
                "ts_start": ts_start,
                "ts_end": ts_end,
                "workflow_id": workflow_id,
                "task_id": task_id,
                "resource_id": resource_id,
                "cpu_rate": cpu_rate,
                "canonical_memory_usage": canonical_memory_usage,
                "assigned_memory": assigned_memory,
                "minimum_memory_usage": minimum_memory_usage,
                "maximum_memory_usage": maximum_memory_usage,
                "disk_io_time": disk_io_time,
                "maximum_disk_bandwidth": maximum_disk_bandwidth,
                "local_disk_space_usage": local_disk_space_usage,
                "maximum_cpu_rate": maximum_cpu_rate,
                "maximum_disk_io_time": maximum_disk_io_time,
                "sample_rate": sample_rate,
                "sample_portion": sample_portion,
                "sampled_cpu_usage": sampled_cpu_usage,
                "network_io_time": network_io_time,
                "maximum_network_bandwidth": maximum_network_bandwidth,
            }

            return pd.DataFrame(data_dict, index=[0])

        task_state_df = task_df.groupBy(['workflow_id',
                                         'task_id']).apply(compute_task_states)
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()),
                    exist_ok=True)
        task_state_df.write.parquet(os.path.join(TARGET_DIR,
                                                 TaskState.output_path()),
                                    mode="overwrite",
                                    compression="snappy")
        print("######\nDone parsing TaskState\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\nStart parsing Resources\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        resource_id_column = [
            i.resource_used
            for i in task_df.select('resource_used').distinct().collect()
        ]

        resources = []
        for resource_id in resource_id_column:
            resources.append(
                Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1,
                         '').get_parquet_dict())

        resource_df = pd.DataFrame(resources)
        os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()),
                    exist_ok=True)
        resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(),
                                            'part.0.parquet'),
                               engine="pyarrow")
        print("######\nDone parsing Resources\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\nStart parsing Workflows\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(),
                          False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = "Cook"
            total_resources = df['resource_amount_requested'].sum()
            total_memory_usage = df['memory_consumption'].sum()
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(
            compute_workflow_stats)
        workflow_df.explain(True)
        workflow_df.write.parquet(os.path.join(TARGET_DIR,
                                               Workflow.output_path()),
                                  mode="overwrite",
                                  compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Ẁorkload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR,
                                                  Task.output_path()),
                                     engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        pandas_task_df,
        domain="Industrial",
        start_date=None,
        end_date=None,
        authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
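A hypothetical invocation of this Spark-based parser; the directory below is a placeholder for a folder of *.csv.processed files and TARGET_DIR is assumed to be a module-level constant of the surrounding script.

# Hypothetical usage. On a machine without the DAS5 environment variable, the
# local-mode branch is taken and the findspark spark_home placeholder must point
# at an actual Spark installation.
if __name__ == "__main__":
    parse("/path/to/cook_trace_directory")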
Example #8
def read_file(source_dir, dax_file, count, arrival_time, target_dir):
    tree = ET.parse(os.path.join(source_dir, dax_file))
    adag = tree.getroot()

    dependencies = {}

    def id_to_int(id):
        return int(non_decimal.sub('', id))

    for child in adag.findall('{http://pegasus.isi.edu/schema/DAX}child'):
        task_id = mmh3.hash64("workflow:{}_task:{}".format(count, id_to_int(child.attrib['ref'])))[0]

        if task_id not in dependencies:
            dependencies[task_id] = []

        for parent in child:
            parent_id = id_to_int(parent.attrib['ref'])
            dependencies[task_id].append(parent_id)

    tasks = adag.findall('{http://pegasus.isi.edu/schema/DAX}job')
    task_list = []

    inputs_per_taskid = dict()  # Contains input file names per task id
    outputs_per_taskid = dict()  # Contains output file names per task id
    outputs_matched = dict()  # A dictionary to check whether outputs have been matched with an input

    input_file_data_per_task_id = dict()  # Dict with file sizes per file name for input files
    output_file_data_per_task_id = dict()  # Dict with file sizes per file name for output files

    task_per_taskid = dict()

    for task in tasks:
        # Task ids restart at 0 in every file, so we combine the workflow id with the task id and hash them to get a unique task id.
        task_id = mmh3.hash64("workflow:{}_task:{}".format(count, id_to_int(task.attrib['id'])))[0]
        # Ilyushkin et al. added an attribute field called profile, containing the runtime of that particular task.
        runtime = int(float(task.attrib['runtime']) * 1000)

        if task_id in dependencies:
            task_dependencies = [mmh3.hash64("workflow:{}_task:{}".format(count, dependency))[0] for dependency in
                                 dependencies[task_id]]
        else:
            task_dependencies = []

        # Site ids are unknown; all tasks in the workload are assumed to use 1 core.
        t = Task(task_id, "", arrival_time, 0, runtime, 1, task_dependencies, count, -1, resource_type="core",
                 resource=-1)
        task_list.append(t)
        task_per_taskid[task_id] = t

        # Parse the data transfers
        for data_item in task.findall("{http://pegasus.isi.edu/schema/DAX}uses"):
            # Store the incoming and outgoing data to this task in separate dicts
            if data_item.attrib['link'] == "input":
                if task_id not in inputs_per_taskid:
                    inputs_per_taskid[task_id] = set()
                    input_file_data_per_task_id[task_id] = dict()

                inputs_per_taskid[task_id].add(data_item.attrib['file'])
                input_file_data_per_task_id[task_id][data_item.attrib['file']] = data_item.attrib['size']

            elif data_item.attrib['link'] == "output":
                if task_id not in outputs_per_taskid:
                    outputs_per_taskid[task_id] = set()
                    outputs_matched[task_id] = dict()
                    output_file_data_per_task_id[task_id] = dict()

                outputs_per_taskid[task_id].add(data_item.attrib['file'])
                outputs_matched[task_id][data_item.attrib['file']] = False
                output_file_data_per_task_id[task_id][data_item.attrib['file']] = data_item.attrib['size']

    # Set children
    for task in task_list:
        for parent_id in task.parents:
            task_per_taskid[parent_id].children.add(task.id)

    data_transfer_id = 0
    # Since tasks can output files with the same name as other tasks, we must loop over a task's parents
    # and match the output names against input names.
    for task in task_list:  # For every task we have
        inputs = inputs_per_taskid[task.id]
        for dep in task.parents:  # We loop over the parents (no need to check children, they will come later)
            outputs = outputs_per_taskid[dep]
            overlap = inputs.intersection(outputs)  # Check for overlap
            if len(overlap) > 0:  # We have input - output pairs, loop over them to construct datatransfers
                for file_name in overlap:
                    # Get the size and construct a datatransfer object.
                    data_size = output_file_data_per_task_id[dep][file_name]
                    datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id, data_size)

                    # Assign it to the tasks
                    task_per_taskid[dep].datatransfers.append(datatransfer)
                    task.datatransfers.append(datatransfer)
                    outputs_matched[dep][file_name] = True

                    # Remove the file from the input as it's covered. Do NOT remove it from output, the same output
                    # file may be used by another task (fan-out structure).
                    inputs.remove(file_name)
                    data_transfer_id += 1

        # Now, loop over the remaining input files. Since we do not have a source, we assume they
        # are present on the filesystem beforehand.

        for file_name in inputs:
            data_size = input_file_data_per_task_id[task.id][file_name]
            datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
            task.datatransfers.append(datatransfer)
            data_transfer_id += 1

    # Loop over the outputs and create a datatransfer for those that are not matched yet.
    # These are likely files with final results that do not have a destination.
    for task_id in outputs_matched.keys():
        for file_name in outputs_matched[task_id].keys():
            if not outputs_matched[task_id][file_name]:
                task = task_per_taskid[task_id]
                data_size = output_file_data_per_task_id[task_id][file_name]
                datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                task.datatransfers.append(datatransfer)

    filename_for_this_partition = "part.{}.parquet".format(count)

    os.makedirs(os.path.join(target_dir, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])

    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    # Make sure the columns are in the right order
    task_df = task_df[sorted(task_df.columns)]

    task_df.to_parquet(os.path.join(target_dir, Task.output_path(), filename_for_this_partition), engine="pyarrow")
    os.makedirs(os.path.join(target_dir, Datatransfer.output_path()), exist_ok=True)
    datatransfer_df = pd.DataFrame(
        [datatransfer.get_parquet_dict() for task in task_list for datatransfer in task.datatransfers])
    datatransfer_df.to_parquet(os.path.join(target_dir, Datatransfer.output_path(), filename_for_this_partition),
                               engine="pyarrow")

    wf_agnostic_df = compute_characteristics(task_df)
    workflow_ts_submit = task_df["ts_submit"].min()

    application_names = {
        "_lig": ("LIGO", "Physics"),
        "_sip": ("SIPHT", "Bioinformatics"),
        "_mon": ("Montage", "Astronomy"),
    }
    application_name = ""
    application_field = ""
    for key in application_names.keys():
        if key in dax_file:
            application_name = application_names[key][0]
            application_field = application_names[key][1]
            break

    workflow = Workflow(count, workflow_ts_submit, task_list, "", "Scientific", application_name, application_field)
    workflow.compute_critical_path()

    wf_df = pd.concat([wf_agnostic_df, pd.DataFrame(
        {"id": pd.Series([np.int64(count)], dtype=np.int64),
         "ts_submit": pd.Series([np.int64(workflow_ts_submit)], dtype=np.int64),
         "critical_path_length": pd.Series([np.int32(workflow.critical_path_length)], dtype=np.int32),
         "critical_path_task_count": pd.Series([np.int32(workflow.critical_path_task_count)], dtype=np.int32),
         "approx_max_concurrent_tasks": pd.Series([np.int32(workflow.max_concurrent_tasks)], dtype=np.int32),
         "scheduler": pd.Series([np.str("")], dtype=np.str),
         })], axis=1)

    wf_df["nfrs"] = np.str("{}")

    wf_df = wf_df[sorted(wf_df.columns)]

    os.makedirs(os.path.join(target_dir, Workflow.output_path()), exist_ok=True)
    wf_df.to_parquet(os.path.join(target_dir, Workflow.output_path(), filename_for_this_partition), engine="pyarrow")
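The DAX files read above are plain XML in the Pegasus namespace. A short sketch of iterating jobs and dependencies with ElementTree, using a hypothetical file name:

import xml.etree.ElementTree as ET

DAX_NS = "{http://pegasus.isi.edu/schema/DAX}"
tree = ET.parse("example.dax")  # hypothetical DAX file
adag = tree.getroot()

for job in adag.findall(DAX_NS + "job"):
    print("job", job.attrib["id"], "runtime", job.attrib.get("runtime"))

for child in adag.findall(DAX_NS + "child"):
    parents = [parent.attrib["ref"] for parent in child]
    print("child", child.attrib["ref"], "depends on", parents)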
Example #9
def parse_lanl_file(lanl_file):
    task_list = []
    task_by_id = {}

    df = pd.read_csv(lanl_file,
                     parse_dates=["submission_time", "start_date", "end_date"],
                     infer_datetime_format=True)
    task_df = df[df['object_event'] == "JOBEND"]

    earliest_date = df['submission_time'].min()
    latest_date = df["end_date"].max()

    for index, row in task_df.iterrows():
        id = str(
            mmh3.hash64("task:{}".format(str(row["object_id"]).strip()))[0])

        # Task time fields
        submission_time_task = row["submission_time"]
        start_time_task = row["start_date"]
        end_time_task = row["end_date"]

        # Task cpu consumption fields
        num_nodes = int(row["nodes_requested"])
        num_cpus_per_node = row["dedicated_processors_per_task"]

        # Task dependency fields
        extension_string = str(row["resource_manager_extension_string"])
        # Find dependencies
        match = re.search(r'DEPEND=([\w,:.]+);?', extension_string)
        if not match:
            dependencies = set()
        else:
            dependencies = match.group(1)
            dependencies = set(
                str(mmh3.hash64("task:{}".format(str(dep).strip()))[0])
                for dep in dependencies.split("&"))

        task_wait_time = int(
            (start_time_task - submission_time_task).total_seconds() * 1000)
        task_runtime = int(
            (end_time_task - start_time_task).total_seconds() * 1000)

        task = Task(id,
                    "Atomic",
                    start_time_task,
                    -1,
                    task_runtime,
                    num_nodes * num_cpus_per_node,
                    dependencies,
                    -1,
                    task_wait_time,
                    resource_type="core",
                    resource=-1)

        # Convert ts_submit from a datetime to milliseconds since the epoch.
        EPOCH = datetime(1970, 1, 1, tzinfo=task.ts_submit.tzinfo)
        task.ts_submit = int((task.ts_submit - EPOCH).total_seconds() * 1000)

        # Set the wallclock limit
        task.nfrs["runtime_limit"] = row["wallclock_limit"]

        task_by_id[id] = task
        task_list.append(task)

    min_ts_submit = min(task.ts_submit for task in task_list)

    # For every task, add it to the children of its parents
    for task in task_list:
        task.ts_submit -= min_ts_submit  # Make sure the first task in the trace starts at 0
        invalid_parents = set()
        for parent_id in task.parents:
            # Chop off the prefix up to ':' (e.g. jobsuccess:)
            actual_parent_id = parent_id[str(parent_id).find(":") + 1:]
            if actual_parent_id in task_by_id:  # If the parent is missing, it probably failed; we filter those out.
                parent = task_by_id[actual_parent_id]
                parent.children.add(task.id)
            else:
                invalid_parents.add(parent_id)

        # Remove invalid parents
        if invalid_parents:
            task.parents -= invalid_parents

    # Find start tasks and assign workflow ids
    workflow_id = 0
    for task in task_list:
        if task.workflow_id == -1:
            root_parents = task.get_root_parents(task_by_id)
            if root_parents:  # If there are start tasks, propagate from them
                for root_parent_id in root_parents:
                    actual_root_id = root_parent_id[str(root_parent_id).
                                                    find(":") + 1:]
                    task_by_id[actual_root_id].set_workflow_id_propagating(
                        task_by_id, workflow_id)
            else:  # Else it's a single job so just set the property directly
                task.workflow_id = workflow_id
            workflow_id += 1

    # Now that everything has been computed, we write the tasks to parquet files
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])

    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(
        ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(),
                                    "part.0.parquet"),
                       engine="pyarrow")

    workflows = dict()
    # Based on workflow ids, construct the workflow objects
    for task in task_list:
        if task.workflow_id in workflows:
            workflow = workflows[task.workflow_id]
        else:
            workflow = Workflow(task.workflow_id, None, [], "", "Scientific",
                                "Uncategorized", "Uncategorized")
            workflows[task.workflow_id] = workflow

        if workflow.ts_submit is None:
            workflow.ts_submit = task.ts_submit
        else:
            workflow.ts_submit = min(workflow.ts_submit, task.ts_submit)

        workflow.tasks.append(task)
        workflow.task_count = len(workflow.tasks)

    for w in workflows.values():
        w.compute_critical_path(strip_colon=True)

    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df = pd.DataFrame(
        [workflow.get_parquet_dict() for workflow in workflows.values()])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain="Engineering",
        start_date=str(earliest_date),
        end_date=str(latest_date),
        authors=[
            "George Amvrosiadis", "Jun Woo Park", "Gregory R. Ganger",
            "Garth A. Gibson", "Elisabeth Baseman", "Nathan DeBardeleben"
        ],
        workload_description=
        "This workload was published by Amvrosiadis et al. as part of their ATC 2018 paper titled \"On the diversity of cluster workloads and its impact on research results\". It is the Trinity trace from the Los Almos National Laboratory."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
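A minimal invocation sketch for the LANL parser above. The CSV path is hypothetical; TARGET_DIR is assumed to be configured at module level, as in the rest of this file.

if __name__ == "__main__":
    # Hypothetical location of the LANL Trinity trace CSV; point this at the actual download.
    parse_lanl_file("data/lanl/trinity_trace.csv")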
def parse_askalon_file(askalon_file):
    if not os.path.exists(TARGET_DIR):
        os.makedirs(TARGET_DIR)

    workflows = []
    with open(askalon_file, 'r') as asklon_trace:
        data = json.load(asklon_trace)
        for wf in data:
            workflows.append(parse_workflow(wf, askalon_file))

    for w in workflows:
        w.compute_critical_path()

    # Write the workflow objects to parquet
    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df = pd.DataFrame(
        [workflow.get_parquet_dict() for workflow in workflows])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    # Write all tasks to parquet
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame(
        [task.get_parquet_dict() for wf in workflows for task in wf.tasks])
    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(
        ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    pyarrow_task_schema = Task.get_pyarrow_schema()
    table = pa.Table.from_pandas(task_df,
                                 schema=pyarrow_task_schema,
                                 preserve_index=False)

    # Pandas does not know the difference between an empty list and a list with integers
    # Thus, type mismatches will occur. We are writing the task tables using pyarrow directly
    # using a schema.
    pq.write_table(
        table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"))

    # workload authors
    authors_list = ["Roland Matha", "Radu Prodan"]

    # generate workload description
    workload_description = ""
    if "bwa" in askalon_file.lower():
        workload_description = "BWA (short for Burroughs-Wheeler Alignment tool) is a genomics analysis workflow, courtesy of Scott Emrich and Notre Dame Bioinformatics Laboratory. It maps low-divergent sequences against a large reference genome, such as the human genome."
    elif "wien2k" in askalon_file.lower():
        workload_description = "Wien2k uses a full-potential Linearized Augmented Plane Wave (LAPW) approach for the computation of crystalline solids."

    workload_domain = "Scientific"

    w = Workload(workflows, workload_domain, authors_list,
                 workload_description)

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain=workload_domain,
        authors=authors_list,
        workload_description=workload_description)

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
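The `default` hook passed to json.dumps in the writers above is needed because NumPy scalar types (e.g. np.int64 counts coming out of the task DataFrame) are not JSON serializable on their own. A small standalone illustration of the pattern, independent of the parsers; `np_default` is a hypothetical name.

import json

import numpy as np


def np_default(o):
    # Convert NumPy integer scalars to plain Python ints; reject anything else,
    # which is the contract json.dumps expects from a "default" hook.
    if isinstance(o, np.integer):
        return int(o)
    raise TypeError("Object of type {} is not JSON serializable".format(type(o).__name__))


print(json.dumps({"task_count": np.int64(42)}, default=np_default))  # prints {"task_count": 42}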
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[-1])

    if "DAS5" in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ["HOSTNAME"] +
                                                      ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "256G") \
            .config("spark.driver.maxResultSize", "40G") \
            .config("spark.network.timeout", "100000s") \
            .config("spark.rpc.askTimeout", "100000s") \
            .config("spark.default.parallelism", "2000") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .config("spark.cleaner.periodicGC.interval", "5s") \
            .getOrCreate()
    else:
        import findspark
        findspark.init("<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[4]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "2G") \
            .config("spark.driver.memory", "2G") \
            .getOrCreate()

    machine_meta = spark.read.csv(os.path.join(path_to_dir,
                                               "machine_meta.csv"),
                                  schema=StructType([
                                      StructField("machine_id", StringType(),
                                                  True),
                                      StructField("time_stamp", LongType(),
                                                  True),
                                      StructField("failure_domain_1",
                                                  LongType(), True),
                                      StructField("failure_domain_2",
                                                  StringType(), True),
                                      StructField("cpu_num", LongType(), True),
                                      StructField("mem_size", LongType(),
                                                  True),
                                      StructField("status", StringType(), True)
                                  ]))

    machine_usage = spark.read.csv(os.path.join(path_to_dir,
                                                "machine_usage.csv"),
                                   schema=StructType([
                                       StructField("machine_id", StringType(),
                                                   True),
                                       StructField("time_stamp", DoubleType(),
                                                   True),
                                       StructField("cpu_util_percent",
                                                   LongType(), True),
                                       StructField("mem_util_percent",
                                                   LongType(), True),
                                       StructField("mem_gps", DoubleType(),
                                                   True),
                                       StructField("mkpi", LongType(), True),
                                       StructField("net_in", DoubleType(),
                                                   True),
                                       StructField("net_out", DoubleType(),
                                                   True),
                                       StructField("disk_io_percent",
                                                   DoubleType(), True)
                                   ]))

    container_meta = spark.read.csv(
        os.path.join(path_to_dir, "container_meta.csv"),
        schema=StructType([
            StructField("container_id", StringType(), True),
            StructField("machine_id", StringType(), True),
            StructField("time_stamp", LongType(), True),
            StructField("app_du", StringType(), True),
            StructField("status", StringType(), True),
            StructField("cpu_request", LongType(), True),
            StructField("cpu_limit", LongType(), True),
            StructField("mem_size", DoubleType(), True)
        ]))

    container_usage = spark.read.csv(os.path.join(path_to_dir,
                                                  "container_usage.csv"),
                                     schema=StructType([
                                         StructField("container_id",
                                                     StringType(), True),
                                         StructField("machine_id",
                                                     StringType(), True),
                                         StructField("time_stamp",
                                                     DoubleType(), True),
                                         StructField("cpu_util_percent",
                                                     LongType(), True),
                                         StructField("mem_util_percent",
                                                     LongType(), True),
                                         StructField("cpi", DoubleType(),
                                                     True),
                                         StructField("mem_gps", DoubleType(),
                                                     True),
                                         StructField("mpki", LongType(), True),
                                         StructField("net_in", DoubleType(),
                                                     True),
                                         StructField("net_out", DoubleType(),
                                                     True),
                                         StructField("disk_io_percent",
                                                     DoubleType(), True)
                                     ]))

    batch_task = spark.read.csv(os.path.join(path_to_dir, "batch_task.csv"),
                                schema=StructType([
                                    StructField("task_name", StringType(),
                                                True),
                                    StructField("instance_num", LongType(),
                                                True),
                                    StructField("job_name", StringType(),
                                                True),
                                    StructField("task_type", StringType(),
                                                True),
                                    StructField("status", StringType(), True),
                                    StructField("start_time", LongType(),
                                                True),
                                    StructField("end_time", LongType(), True),
                                    StructField("plan_cpu", DoubleType(),
                                                True),
                                    StructField("plan_mem", DoubleType(), True)
                                ]))

    batch_instance = spark.read.csv(
        os.path.join(path_to_dir, "batch_instance.csv"),
        schema=StructType([
            StructField("instance_name", StringType(), True),
            StructField("task_name", StringType(), True),
            StructField("job_name", StringType(), True),
            StructField("task_type", StringType(), True),
            StructField("status", StringType(), True),
            StructField("start_time", LongType(), True),
            StructField("end_time", LongType(), True),
            StructField("machine_id", StringType(), True),
            StructField("seq_no", LongType(), True),
            StructField("total_seq_no", LongType(), True),
            StructField("cpu_avg", DoubleType(), True),
            StructField("cpu_max", DoubleType(), True),
            StructField("mem_avg", DoubleType(), True),
            StructField("mem_max", DoubleType(), True)
        ]))

    @F.pandas_udf(returnType=Task.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def clean_tasks_of_workflow(df):
        tasks = dict()
        raw_id_to_instances = dict()

        job_name = df.loc[0, "job_name"]
        workflow_id = mmh3.hash64(job_name)[1]

        invalid_task_raw_ids = set()

        # group by task name
        # - count number of instances
        # - compare with row.instance_num

        # Check to inspect if the data is noisy
        # def check(pdf):
        #     a = pdf["instance_name"].nunique()
        #     b = pdf["instance_name"].astype(np.int64).min()
        #     c = pdf["instance_name"].astype(np.int64).max()
        #     d = pdf["instance_num"].min()
        #     e = pdf["instance_num"].max()
        #     f = pdf["instance_name"].count()
        #     if d != e or b < 0 or c >= e or a != d or a != f:
        #         print("Noisy data! {}, {}, {}, {}, {}, {}".format(a, b, c, d, e, f))
        #
        # df.groupby("task_name").apply(check)

        for row in df.itertuples(index=False):
            if None in row:
                print(row, flush=True)
            task_name = row.task_name
            instance_name = str(row.instance_name)
            memory_requested = row.plan_mem
            resources_requested = row.plan_cpu
            resource_id = row.machine_id

            splits = task_name.split("_")

            if splits[0] == "task":
                cleaned_task_name = splits[1]
                task_type = "bag"
                raw_parents = []
            else:
                cleaned_task_name = splits[0][1:]
                task_type = str(splits[0][0])
                raw_parents = [x for x in splits[1:] if x.isdigit()]

            if resource_id is None:
                resource_id = -1
            else:
                resource_id = mmh3.hash64(row.machine_id)[1]

            if row.end_time is None or math.isnan(row.end_time):
                invalid_task_raw_ids.add(cleaned_task_name)
                continue

            if row.start_time is None or math.isnan(row.start_time):
                invalid_task_raw_ids.add(cleaned_task_name)
                continue

            if memory_requested is None or math.isnan(memory_requested):
                memory_requested = -1

            if resources_requested is None or math.isnan(resources_requested):
                avg_cpu = row.cpu_avg
                if avg_cpu is None or math.isnan(avg_cpu):
                    invalid_task_raw_ids.add(cleaned_task_name)
                    continue
                else:
                    resources_requested = avg_cpu

            this_task_id = mmh3.hash64(job_name + "@" + cleaned_task_name +
                                       "@" + instance_name)[1]

            if cleaned_task_name not in raw_id_to_instances:
                raw_id_to_instances[cleaned_task_name] = row.instance_num

            if row.instance_num > 10:
                # Create parent and child tasks
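                # For wide "waves" (more than 10 instances of the same task), bracket the wave
                # with a dummy parent and a dummy child task so that dependencies can point at
                # one node per wave instead of at every individual instance.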
                raw_parent_id = cleaned_task_name + "_p"
                parent_task_id = mmh3.hash64(job_name + "@" + raw_parent_id +
                                             "@" + "0")[1]
                if parent_task_id not in tasks:
                    tasks[parent_task_id] = Task(
                        id=parent_task_id,
                        type="dummy",
                        submission_site=0,
                        runtime=0,
                        ts_submit=row.start_time * 1000,
                        # We convert time from seconds to milliseconds.
                        resource_amount_requested=1,
                        parents=raw_parents,
                        workflow_id=workflow_id,
                        wait_time=0,
                        resource_type='core',
                        resource=-1,
                        memory_requested=-1)
                    raw_id_to_instances[raw_parent_id] = 1

                raw_child_id = cleaned_task_name + "_c"
                child_task_id = mmh3.hash64(job_name + "@" + raw_child_id +
                                            "@" + "0")[1]
                if child_task_id not in tasks:
                    tasks[child_task_id] = Task(
                        id=child_task_id,
                        type="dummy",
                        submission_site=0,
                        runtime=0,
                        ts_submit=row.start_time * 1000,
                        # We convert time from seconds to milliseconds.
                        resource_amount_requested=1,
                        parents=[cleaned_task_name],
                        workflow_id=workflow_id,
                        wait_time=0,
                        resource_type='core',
                        resource=-1,
                        memory_requested=-1,
                        params="child")
                    raw_id_to_instances[raw_child_id] = 1

                raw_parents = [raw_parent_id]

            this_task = Task(
                id=this_task_id,
                type=task_type,
                submission_site=0,
                runtime=(row.end_time - row.start_time) * 1000,
                ts_submit=row.start_time *
                1000,  # We convert time from seconds to milliseconds.
                resource_amount_requested=resources_requested,
                parents=raw_parents,
                workflow_id=workflow_id,
                params=task_name + " $ " + instance_name + " $ " +
                str(row.instance_num) + " $ " + job_name,
                wait_time=0,
                resource_type='core',
                resource=resource_id,
                memory_requested=memory_requested)

            tasks[this_task_id] = this_task

        for task_id, task in tasks.items():
            task.parents = [
                p for p in task.parents if p not in invalid_task_raw_ids
            ]
            parents = []
            for raw_parent_id in task.parents:
                # If previous wave has a child and this task is not that child.
                # refer to the child instead of the wave.
                if raw_parent_id + "_c" in raw_id_to_instances and task.params is not "child":
                    raw_parent_id = raw_parent_id + "_c"

                # We might hit an edge case where a parent was not recorded by the system of Alibaba
                # (e.g. bug or the tracing stopped)
                if raw_parent_id not in raw_id_to_instances:
                    continue

                parent_instances = raw_id_to_instances[raw_parent_id]

                proper_parent_ids = []
                for x in range(parent_instances):
                    # Alibaba tasks specify instance_nums, however these tasks may not necessarily be in the data
                    # So we need to check if they are actually encountered.
                    hash = mmh3.hash64(job_name + "@" + raw_parent_id + "@" +
                                       str(x))[1]
                    if hash in tasks:
                        proper_parent_ids.append(hash)

                parents.extend(proper_parent_ids)
                for proper_id in proper_parent_ids:
                    tasks[proper_id].children.add(task_id)

            # task.params = None
            task.parents = parents

        # ze_best = pd.concat(pandas_dataframes)
        parquet_dicts = [task.get_parquet_dict() for task in tasks.values()]
        if len(tasks) > 0:
            ret = pd.DataFrame(parquet_dicts)
        else:  # If no task was valid, return an empty DF with the columns set. Otherwise Spark goes boom.
            ret = pd.DataFrame(columns=Task.get_parquet_meta_dict().keys())
        return ret

    @F.pandas_udf(returnType=Task.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def container_to_task(df):
        row = df.iloc[0, :]
        start_time = df["time_stamp"].min() * 1000
        stop_time = df["time_stamp"].max() * 1000
        task_id = mmh3.hash64(row["container_id"])[1]
        workflow_id = mmh3.hash64(row["app_du"])[1]

        task = Task(
            id=task_id,
            type="long running",
            parents=[],
            ts_submit=start_time,  # Time was already converted from seconds to milliseconds above.
            submission_site=0,
            runtime=(stop_time - start_time),
            resource_amount_requested=row["cpu_request"],
            memory_requested=row["mem_size"],
            workflow_id=workflow_id,
            wait_time=0,
            resource=mmh3.hash64(row["machine_id"])[1])

        return pd.DataFrame([task.get_parquet_dict()])

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        # Rename instances
        # This allows instance names to be derived using just the task name and number of instances of the task.
        task_window = Window.partitionBy("job_name",
                                         "task_name").orderBy("start_time")
        # Subtract 1 because row_number() starts at 1; this makes the instance index start at 0,
        # which makes later iteration more intuitive.
        # We are using instance name as an index in a particular job and task.
        instances_renamed = batch_instance.withColumn(
            "instance_name",
            (F.row_number().over(task_window) - F.lit(1)).cast(StringType()))

        tasks_unconverted = instances_renamed.join(
            batch_task.select("job_name", "task_name", "instance_num",
                              "plan_cpu", "plan_mem"),
            on=["job_name", "task_name"],
            how="inner")

        # 100% this line is the issue.
        tasks_converted = tasks_unconverted.groupby("job_name").apply(
            clean_tasks_of_workflow)

        # if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        #     tasks_converted.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite")

        long_running_tasks = container_meta.groupBy("container_id").apply(
            container_to_task)

        all_tasks = tasks_converted.union(long_running_tasks).dropna()

        try:
            all_tasks.printSchema()
            all_tasks.write.parquet(os.path.join(TARGET_DIR,
                                                 Task.output_path()),
                                    mode="overwrite")
        except Exception as e:
            print(e, flush=True)
            raise e

    @F.pandas_udf(returnType=TaskState.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def task_states_from_instances(df):
        task_states = []

        workflow_id = mmh3.hash64(df.loc[0, "job_name"])[1]

        for index, row in df.iterrows():
            job_name = row["job_name"]
            task_name = row["task_name"]
            instance_name = row["instance_name"]

            splits = task_name.split("_")
            just_task_name = splits[0][
                1:]  # The first letter is irrelevant as it corresponds to nature of task (map or reduce)
            # and has nothing to do with the structure of the workflow.

            this_task_id = mmh3.hash64(job_name + "@" + just_task_name + "@" +
                                       instance_name)[1]

            this_task_state = TaskState(ts_start=row["start_time"] * 1000,
                                        ts_end=row["end_time"] * 1000,
                                        workflow_id=workflow_id,
                                        task_id=this_task_id,
                                        resource_id=mmh3.hash64(
                                            row["machine_id"])[1],
                                        cpu_rate=row["cpu_avg"],
                                        canonical_memory_usage=row["mem_avg"],
                                        maximum_cpu_rate=row["cpu_max"],
                                        maximum_memory_usage=row["mem_max"])

            parquet_dict = this_task_state.get_parquet_dict()
            if None in parquet_dict.values() or any(
                    isinstance(v, float) and np.isnan(v)
                    for v in parquet_dict.values()):
                print(parquet_dict)
                raise RuntimeError(parquet_dict)
            task_states.append(parquet_dict)

        return pd.DataFrame(task_states)

    @F.pandas_udf(returnType=TaskState.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def task_states_from_container_usage(df):
        machine_id = mmh3.hash64(df.loc[0, "machine_id"])[1]

        def convert(cont_df):
            task_states = []

            prev_end_time = cont_df.loc[0, "start_time"] * 1000
            container_id = mmh3.hash64(cont_df.loc[0, "container_id"])[1]
            app_id = mmh3.hash64(cont_df.loc[0, "app_du"])[1]

            sorted_df = cont_df.sort_values("time_stamp")
            for index, row in sorted_df.iterrows():
                this_end_time = row["time_stamp"] * 1000

                this_task_state = TaskState(
                    ts_start=prev_end_time,
                    ts_end=this_end_time,
                    workflow_id=app_id,
                    task_id=container_id,
                    resource_id=machine_id,
                    cpu_rate=row["cpu_util_percent"],
                    canonical_memory_usage=row["mem_util_percent"],
                    maximum_disk_bandwidth=row["disk_io_percent"],
                    network_in=row["net_in"],
                    network_out=row["net_out"])

                prev_end_time = this_end_time

                parquet_dict = this_task_state.get_parquet_dict()
                task_states.append(parquet_dict)
                if None in parquet_dict.values() or any(
                        isinstance(v, float) and np.isnan(v)
                        for v in parquet_dict.values()):
                    print(parquet_dict)
                    raise ArithmeticError(parquet_dict)

            return pd.DataFrame(task_states)

        return df.groupby("container_id").apply(convert).reset_index(
            drop=True).fillna(-1)

    # Now, derive workflows from tasks
    @F.pandas_udf(returnType=Workflow.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def compute_workflow_stats(df):
        tasks = []

        for index, row in df.iterrows():
            this_task = Task(
                id=row["id"],
                type=row["type"],
                ts_submit=row["ts_submit"],
                # We convert time from seconds to milliseconds.
                submission_site=0,
                runtime=row["runtime"],
                resource_amount_requested=row["resource_amount_requested"],
                memory_requested=row["memory_requested"],
                parents=row["parents"],
                workflow_id=row["workflow_id"],
                wait_time=row["wait_time"],
                resource=row["resource_used"])
            # print(this_task.get_parquet_dict())
            tasks.append(this_task)

        workflow = Workflow(id=df.loc[0, "workflow_id"],
                            ts_submit=df["ts_submit"].min(),
                            tasks=tasks,
                            scheduler_description="Fuxi",
                            workflow_domain="Industrial",
                            workflow_application_name="MapReduce",
                            workflow_appliation_field="Internet Services")

        try:
            workflow.compute_critical_path()
        except toposort.CircularDependencyError:  # TODO: Some have cyclic dependencies. Check if this is us, or the data (again)
            pass

        return pd.DataFrame([workflow.get_parquet_dict()])

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        tasks_df = spark.read.parquet(
            os.path.join(TARGET_DIR, Task.output_path())
        )  # Spark does not know it can read these parquet files back, so tell it explicitly
        workflow_df = tasks_df.groupBy("workflow_id").apply(
            compute_workflow_stats)

        workflow_df.write.parquet(os.path.join(TARGET_DIR,
                                               Workflow.output_path()),
                                  mode="overwrite",
                                  compression="snappy")

    def machine_meta_to_resources(row):
        resource = Resource(
            id=mmh3.hash64(row["machine_id"])[1],
            type="cpu",
            num_resources=float(row["cpu_num"]),
            memory=row["mem_size"],
        )
        resource_dict = resource.get_json_dict()
        del resource_dict["events"]
        return SparkRow(**resource_dict)

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource DF\n ######")
        resource_df = machine_meta.rdd.map(machine_meta_to_resources).toDF(
            Resource.get_spark_type())
        resource_df.write.parquet(os.path.join(TARGET_DIR,
                                               Resource.output_path()),
                                  mode="overwrite",
                                  compression="snappy")

    print("######\n Start parsing Workload\n ######")
    if "tasks_df" not in locals():
        tasks_df = spark.read.parquet(
            os.path.join(TARGET_DIR, Task.output_path())
        )  # Spark does not know it can read these parquet files back, so tell it explicitly
    json_dict = Workload.get_json_dict_from_spark_task_dataframe(
        tasks_df, domain="Industrial", authors=["Alibaba 2018"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)
    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")
Example No. 12
def parse_askalon_file(askalon_file):
    os.makedirs(TARGET_DIR, exist_ok=True)

    workflow_index = 0
    invalid_workflow_count = 0
    workflow_start = None
    invalid_workflow = False

    final_task_list = []
    final_taskstate_list = []
    final_workflow_list = []

    tasks = []
    task_by_id = dict()
    task_state_list = []
    with open(askalon_file, 'r') as asklon_trace:
        for line in asklon_trace.readlines():
            if line.startswith('#'):
                if not line.startswith('# Started:'):
                    continue

                workflow_date = re.search('# Started: (.+),', line).group(1)

                # Filter out "# Started: Thu Jan 01 01:00:00 CET 1970, and did not finish (yet)"
                if int(workflow_date[-4:]) == 1970:
                    invalid_workflow = True
                    continue

                if workflow_date.find('CEST ') >= 0:
                    timezone_diff = 7200
                    workflow_date = workflow_date.replace('CEST ', '')
                elif workflow_date.find('CET') >= 0:
                    timezone_diff = 3600
                    workflow_date = workflow_date.replace('CET ', '')
                else:
                    raise Exception(
                        'Line "{}"" does not follow expected CEST or CET format'
                        .format(line))

                # Create a workflow based on observed tasks before starting a new.
                if tasks:
                    # Since we cannot trust the logs (our validator proved this)
                    # We will have to add both the children and parents manually to be safe.
                    for t in tasks:
                        for child_id in t.children:
                            task_by_id[child_id].parents.add(t.id)
                        for parent_id in t.parents:
                            task_by_id[parent_id].children.add(t.id)

                    workflow = get_workflow_from_tasks(tasks, workflow_start,
                                                       askalon_file,
                                                       workflow_index)

                    final_workflow_list.append(workflow)

                    final_task_list.extend(tasks)
                    final_taskstate_list.extend(task_state_list)

                    workflow_index += 1

                    tasks = []
                    task_state_list = []

                workflow_start = int(
                    ((datetime.strptime(workflow_date, DATETIME_FORMAT) -
                      EPOCH).total_seconds() - timezone_diff) * 1000)
                invalid_workflow = False  # new workflow begins, reset flag
            else:
                if invalid_workflow:  # skip reading tasks, advance to the next workflow
                    # print("Found invalid workflow, skipping")
                    invalid_workflow_count += 1
                    continue

                task, task_state = task_info_from_line(line, workflow_start,
                                                       workflow_index)

                if task:
                    if task.runtime < 0:
                        invalid_workflow = True
                        invalid_workflow_count += 1
                        tasks = []
                        task_state_list = []
                        continue

                    tasks.append(task)
                    task_by_id[task.id] = task
                    task_state_list.append(task_state)
                else:
                    invalid_workflow = True
                    invalid_workflow_count += 1
                    tasks = []
                    task_state_list = []

    print(workflow_index, invalid_workflow_count)

    # Flush the last workflow, if any.
    if tasks and not invalid_workflow:
        # Since we cannot trust the logs (our validator proved this)
        # We will have to add both the children and parents manually to be safe.
        for t in tasks:
            for child_id in t.children:
                task_by_id[child_id].parents.add(t.id)
            for parent_id in t.parents:
                task_by_id[parent_id].children.add(t.id)

        final_task_list.extend(tasks)
        final_taskstate_list.extend(task_state_list)

        workflow = get_workflow_from_tasks(tasks, workflow_start, askalon_file,
                                           workflow_index)
        final_workflow_list.append(workflow)

        workflow_index += 1

    task_df = pd.DataFrame([t.get_parquet_dict() for t in final_task_list])

    # Offset so the first task arrives at 0 (and thus the first workflow as well)
    min_ts_submit = task_df["ts_submit"].min()
    task_df["ts_submit"] = task_df["ts_submit"] - min_ts_submit

    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(),
                                    "part.0.parquet"),
                       engine="pyarrow")

    # Write task states
    task_state_df = pd.DataFrame(
        [ts.get_parquet_dict() for ts in final_taskstate_list])
    # offset the times
    task_state_df["ts_start"] = task_state_df["ts_start"] - min_ts_submit
    task_state_df["ts_end"] = task_state_df["ts_end"] - min_ts_submit

    os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()),
                exist_ok=True)
    task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(),
                                          "part.0.parquet"),
                             engine="pyarrow")

    # Write the workflow dataframe
    workflow_df = pd.DataFrame(
        [w.get_parquet_dict() for w in final_workflow_list])

    # Also offset workflow_df
    workflow_df["ts_submit"] = workflow_df["ts_submit"] - min_ts_submit

    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    workload_description = ""
    if "bwa" in askalon_file.lower():
        workload_description = "BWA (short for Burroughs-Wheeler Alignment tool) is a genomics analysis workflow, courtesy of Scott Emrich and Notre Dame Bioinformatics Laboratory. It maps low-divergent sequences against a large reference genome, such as the human genome."
    elif "wien2k" in askalon_file.lower():
        workload_description = "Wien2k uses a full-potential Linearized Augmented Plane Wave (LAPW) approach for the computation of crystalline solids."

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain="Engineering",
        authors=["Radu Prodan", "Alexandru Iosup"],
        workload_description=workload_description)

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
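The "# Started:" header handling above boils down to: strip the timezone token, parse the local timestamp, and subtract the UTC offset before converting to epoch milliseconds. A standalone sketch of that conversion follows; the DATETIME_FORMAT shown is an assumption about the module-level constant used by the parser.

from datetime import datetime

DATETIME_FORMAT = "%a %b %d %H:%M:%S %Y"  # assumed shape of the module-level constant
EPOCH = datetime(1970, 1, 1)


def started_line_to_epoch_ms(workflow_date, timezone_diff_seconds):
    # workflow_date is e.g. "Thu Aug 25 14:03:01 2016" after the "CEST "/"CET " token is removed.
    local = datetime.strptime(workflow_date, DATETIME_FORMAT)
    return int(((local - EPOCH).total_seconds() - timezone_diff_seconds) * 1000)


print(started_line_to_epoch_ms("Thu Aug 25 14:03:01 2016", 7200))  # 7200 s = CEST offset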
Example No. 13
def parse(path_to_dir):
    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    # Convert times that are in microseconds (and do not fit in a long) to milliseconds
    convert_micro_to_milliseconds = F.udf(lambda x: x / 1000)

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\n Start parsing TaskState\n ######")
        task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
            os.path.join(path_to_dir, 'task_usage', '*.csv'))
        # task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
        #     'fake_task_usage.csv')
        oldColumns = task_usage_df.schema.names
        newColumns = ["ts_start",
                      "ts_end",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "cpu_rate",
                      "memory_consumption",
                      "assigned_memory_usage",
                      "unmapped_page_cache",
                      "total_page_cache",
                      "max_memory_usage",
                      "mean_disk_io_time",
                      "mean_local_disk_space_usage",
                      "max_cpu_rate",
                      "max_disk_io_time",
                      "cycles_per_instruction",
                      "memory_accesses_per_instruction",
                      "sample_portion",
                      "aggregation_type",
                      "sampled_cpu_usage", ]

        task_usage_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                               range(len(oldColumns)), task_usage_df)

        # Drop columns with too low level details
        task_usage_df = task_usage_df.drop('memory_accesses_per_instruction')
        task_usage_df = task_usage_df.drop('cycles_per_instruction')
        task_usage_df = task_usage_df.drop('unmapped_page_cache')
        task_usage_df = task_usage_df.drop('total_page_cache')

        # Convert the timestamps from microseconds to milliseconds and cast them to long.
        task_usage_df = task_usage_df.withColumn('ts_start', convert_micro_to_milliseconds(F.col('ts_start')))
        task_usage_df = task_usage_df.withColumn('ts_start', F.col('ts_start').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('ts_end', convert_micro_to_milliseconds(F.col('ts_end')))
        task_usage_df = task_usage_df.withColumn('ts_end', F.col('ts_end').cast(T.LongType()))

        # Some fields have weird symbols in them, clean those.
        truncate_at_lt_symbol_udf = F.udf(lambda x: re.sub(r'[^0-9.eE\-+]', '', str(x)) if x is not None else x)
        task_usage_df = task_usage_df.withColumn('workflow_id', truncate_at_lt_symbol_udf(F.col('workflow_id')))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', truncate_at_lt_symbol_udf(F.col('max_cpu_rate')))

        # Now that the columns have been sanitized, cast them to the right type
        task_usage_df = task_usage_df.withColumn('workflow_id', F.col('workflow_id').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', F.col('max_cpu_rate').cast(T.FloatType()))

        task_usage_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite",
                                    compression="snappy")
        print("######\n Done parsing TaskState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):

        if 'task_usage_df' not in locals():
            task_usage_df = spark.read.parquet(os.path.join(TARGET_DIR, TaskState.output_path()))

        print("######\n Start parsing Tasks\n ######")
        task_df = spark.read.format('com.databricks.spark.csv').options(inferschema="true", mode="FAILFAST",
                                                                        parserLib="univocity").load(
            os.path.join(path_to_dir, 'task_events', '*.csv'))

        oldColumns = task_df.schema.names
        newColumns = ["ts_submit",
                      "missing_info",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "event_type",
                      "user_id",
                      "scheduler",
                      "nfrs",
                      "resources_requested",
                      "memory_requested",
                      "disk_space_request",
                      "machine_restrictions", ]

        task_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                         range(len(oldColumns)), task_df)

        task_df = task_df.withColumn('ts_submit', convert_micro_to_milliseconds(F.col('ts_submit')))
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit').cast(T.LongType()))

        # Filter tasks that never reached completion
        task_df.createOrReplaceTempView("task_table")
        task_df = spark.sql("""WITH filtered_tasks AS (
        SELECT DISTINCT t1.workflow_id AS workflow_id, t1.id AS id
            FROM task_table t1
            WHERE t1.event_type IN(0, 1, 4)
            group by t1.workflow_id, t1.id
            having count(distinct event_type) = 3
        )
    SELECT t.*
    FROM task_table t INNER JOIN filtered_tasks f
    ON t.id = f.id AND t.workflow_id = f.workflow_id""")

        task_aggregation_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("type", T.StringType(), True),
            T.StructField("ts_submit", T.LongType(), True),
            T.StructField("submission_site", T.LongType(), True),
            T.StructField("runtime", T.LongType(), True),
            T.StructField("resource_type", T.StringType(), True),
            T.StructField("resource_amount_requested", T.DoubleType(), True),
            T.StructField("parents", T.ArrayType(T.LongType()), True),
            T.StructField("children", T.ArrayType(T.LongType()), True),
            T.StructField("user_id", T.LongType(), True),
            T.StructField("group_id", T.LongType(), True),
            T.StructField("nfrs", T.StringType(), True),
            T.StructField("wait_time", T.LongType(), True),
            T.StructField("params", T.StringType(), True),
            T.StructField("memory_requested", T.DoubleType(), True),
            T.StructField("network_io_time", T.DoubleType(), True),
            T.StructField("disk_space_requested", T.DoubleType(), True),
            T.StructField("energy_consumption", T.DoubleType(), True),
            T.StructField("resource_used", T.StringType(), True),
        ])

        # Compute based on the event type
        @F.pandas_udf(returnType=task_aggregation_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_aggregated_task_usage_metrics(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            task_submit_time = df[df['event_type'] == 0]['ts_submit'].min(skipna=True)
            task_start_time = df[df['event_type'] == 1]['ts_submit'].min(skipna=True)
            task_finish_time = df[df['event_type'] == 4]['ts_submit'].max(skipna=True)

            # min()/max() with skipna=True return NaN rather than None when an event is missing.
            if any(pd.isnull(t) for t in (task_submit_time, task_start_time, task_finish_time)):
                return None

            task_resource_request = df['resources_requested'].max(skipna=True)
            task_memory_request = df['memory_requested'].max(skipna=True)
            task_priority = df['nfrs'].max(skipna=True)
            task_disk_space_requested = df['disk_space_request'].max(skipna=True)

            task_machine_id_list = df.resource_id.unique()

            task_waittime = int(task_start_time) - int(task_submit_time)
            task_runtime = int(task_finish_time) - int(task_start_time)

            def default(o):
                if isinstance(o, np.int64):
                    return int(o)

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "type": "",  # Unknown
                "ts_submit": task_submit_time,
                "submission_site": -1,  # Unknown
                "runtime": task_runtime,
                "resource_type": "core",  # Fields are called CPU, but they are core count (see Google documentation)
                "resource_amount_requested": task_resource_request,
                "parents": [],
                "children": [],
                "user_id": mmh3.hash64(get_first_non_value_in_column("user_id"))[0],
                "group_id": -1,
                "nfrs": json.dumps({"priority": task_priority}, default=default),
                "wait_time": task_waittime,
                "params": "{}",
                "memory_requested": task_memory_request,
                "network_io_time": -1,  # Unknown
                "disk_space_requested": task_disk_space_requested,
                "energy_consumption": -1,  # Unknown
                "resource_used": json.dumps(task_machine_id_list, default=default),
            }

            return pd.DataFrame(data_dict, index=[0])

        task_df = task_df.groupBy(["workflow_id", "id"]).apply(compute_aggregated_task_usage_metrics)
        task_df.explain(True)

        # Now add disk IO time - This cannot be done in the previous Pandas UDF function as
        # accessing another dataframe in the apply function is not allowed
        disk_io_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("disk_io_time", T.DoubleType(), True),
        ])

        @F.pandas_udf(returnType=disk_io_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_disk_io_time(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            disk_io_time = ((df['ts_end'] - df['ts_start']) * df['mean_disk_io_time']).sum(skipna=True) / 1000
            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "disk_io_time": disk_io_time
            }

            return pd.DataFrame(data_dict, index=[0])

        disk_io_df = task_usage_df.select(['workflow_id', 'id', 'mean_disk_io_time', 'ts_end', 'ts_start']).groupBy(
            ["workflow_id", "id"]).apply(compute_disk_io_time)
        disk_io_df.explain(True)

        join_condition = (task_df.workflow_id == disk_io_df.workflow_id) & (task_df.id == disk_io_df.id)
        task_df = task_df.join(disk_io_df, ["workflow_id", "id"])

        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy")
        print("######\n Done parsing Tasks\n ######")
    else:
        task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource\n ######")
        # Parse the machine information in the traces, these should match with the resource_ids in task_usage
        resources_structtype = T.StructType([  # Using StringTypes as we drop those columns
            T.StructField("time", T.StringType(), False),
            T.StructField("id", T.LongType(), False),
            T.StructField("attribute_name", T.StringType(), False),
            T.StructField("attribute_value", T.StringType(), False),
            T.StructField("attribute_deleted", T.StringType(), False),
        ])

        resource_df = spark.read.format('com.databricks.spark.csv').schema(resources_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_attributes', '*.csv'))

        resource_df = resource_df.select(["id"])  # Only keep the ID, the rest we do not need.

        # Since the information in the traces is completely opaque, we use the educated guess from Amvrosiadis et al.
        # in their ATC 2018 article.
        resource_df = resource_df.withColumn('type', F.lit("core"))
        resource_df = resource_df.withColumn('num_resources', F.lit(8))
        resource_df = resource_df.withColumn('proc_model', F.lit("AMD Opteron Barcelona"))
        resource_df = resource_df.withColumn('memory', F.lit(-1))
        resource_df = resource_df.withColumn('disk_space', F.lit(-1))
        resource_df = resource_df.withColumn('network', F.lit(-1))
        resource_df = resource_df.withColumn('os', F.lit(""))
        resource_df = resource_df.withColumn('details', F.lit("{}"))

        # Write the resource_df to the specified location
        resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Resource\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, ResourceState.output_path())):
        print("######\n Start parsing ResourceState\n ######")
        resource_events_structtype = T.StructType([
            T.StructField("timestamp", T.DecimalType(20, 0), False),
            T.StructField("machine_id", T.LongType(), False),
            T.StructField("event_type", T.IntegerType(), False),
            T.StructField("platform_id", T.StringType(), False),
            T.StructField("available_resources", T.FloatType(), False),
            T.StructField("available_memory", T.FloatType(), False),
        ])

        resource_event_df = spark.read.format('com.databricks.spark.csv').schema(resource_events_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_events', '*.csv'))

        resource_event_df = resource_event_df.withColumn('timestamp', convert_micro_to_milliseconds(F.col('timestamp')))
        resource_event_df = resource_event_df.withColumn('timestamp', F.col('timestamp').cast(T.LongType()))

        resource_event_df = resource_event_df.withColumn('available_disk_space', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_disk_io_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_network_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_1_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_5_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_15_minute', F.lit(-1))

        # Write the resource_df to the specified location
        resource_event_df.write.parquet(os.path.join(TARGET_DIR, ResourceState.output_path()), mode="overwrite",
                                        compression="snappy")
        print("######\n Done parsing ResourceState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\n Start parsing Workflows\n ######")
        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

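        # Grouped-map pandas UDF: Spark passes the tasks of each workflow as a single pandas
        # DataFrame, and the UDF must return a one-row DataFrame matching workflow_structype.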
        @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            workflow_id = df['workflow_id'].iloc[0]  # Renamed to avoid shadowing the built-in id()
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1  # We do not know the task dependencies, so -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = ""
            total_resources = df['resource_amount_requested'].sum()  # TODO or assigned?
            total_memory_usage = df['memory_requested'].sum()  # TODO or consumption, or assigned?
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id, "ts_submit": ts_submit, 'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks, 'nfrs': nfrs, 'scheduler': scheduler,
                'total_resources': total_resources, 'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage, 'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        # Create and write the workflow dataframe
        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)

        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Workflows\n ######")

    print("######\n Start parsing Workload\n ######")
    json_dict = Workload.get_json_dict_from_spark_task_dataframe(task_df,
                                                                 domain="Industrial",
                                                                 start_date="2011-05-01",
                                                                 end_date="2011-05-30",
                                                                 authors=["Google"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # json cannot serialize numpy int64 values (needed e.g. on 32-bit Python), so convert
        # them to plain ints and raise TypeError for anything else, as json.dumps expects.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")