def parse_workload(pegasus_db_path, filename=NAME, workload_domain="", workload_description=""): if not os.path.exists(TARGET_DIR): os.makedirs(TARGET_DIR) conn = sqlite3.connect(pegasus_db_path) c = conn.cursor() workflows = parse_workflows(c) for w in workflows: w.compute_critical_path() # Write the workflow objects to parquet os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True) workflow_df = pd.DataFrame([workflow.get_parquet_dict() for workflow in workflows]) workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"), engine="pyarrow") # Write all tasks to parquet os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df = pd.DataFrame([task.get_parquet_dict() for wf in workflows for task in wf.tasks]) # Make sure the first workflow is submitted at time 0 min_submit_time = task_df["ts_submit"].min() task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time) pyarrow_task_schema = Task.get_pyarrow_schema() table = pa.Table.from_pandas(task_df, schema=pyarrow_task_schema, preserve_index=False) # Pandas does not know the different between an empty list and a list with integers # Thus, type mismatches will occur. We are writing the task tables using pyarrow directly # using a schema. pq.write_table(table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet")) # generate workload description authors_list = [] w = Workload(workflows, workload_domain, authors_list, workload_description) # Write a json dict with the workload properties json_dict = Workload.get_json_dict_from_pandas_task_dataframe(task_df, domain="Scientific", authors=["Pegasus Team"], workload_description="" # TODO fill in ) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) raise TypeError file.write(json.dumps(json_dict, default=default)) conn.close()
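# Illustrative sketch (not part of the original parser): parse_workload writes tasks
# through pyarrow with an explicit schema because pandas cannot tell an all-empty list
# column apart from a list-of-int column. The demo below is hypothetical and relies on
# the module's existing pandas/pyarrow imports.
def _empty_list_schema_demo():
    demo_df = pd.DataFrame({"parents": [[], []]})  # every row has an empty dependency list
    demo_schema = pa.schema([pa.field("parents", pa.list_(pa.int64()))])
    # Without the explicit schema the inferred Arrow type would be list<null>, which
    # clashes with partitions whose tasks do have parent ids.
    return pa.Table.from_pandas(demo_df, schema=demo_schema, preserve_index=False)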
def parse(source_dir, interarrival_times):
    inter_arrivals = []
    with open(interarrival_times, 'r') as arrival_file:
        for line in arrival_file:
            inter_arrivals.append(int(line))

    dax_files = os.listdir(source_dir)
    files_to_read = []
    arrival_time = 0
    for index, f in enumerate(sorted(dax_files)):
        # Ilyushkin et al. parsed the first two hundred DAX files.
        # The folder should contain 200 files, but one folder contains more than 200 files,
        # of which just the first 200 were used by Ilyushkin et al.
        if index == 200:
            break

        files_to_read.append(read_file(source_dir, f, index, arrival_time, TARGET_DIR))
        arrival_time += inter_arrivals[index]

    meta_dict = Workflow.get_parquet_meta_dict()
    meta_dict["id"] = np.int64  # Add the id because we are not using a grouped dataframe here.
    meta_dict = OrderedDict(sorted(meta_dict.items(), key=lambda t: t[0]))

    wta_tasks = pd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()))

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        wta_tasks,
        domain="Scientific",
        authors=["Alexey Ilyushkin", "Ahmed Ali-Eldin", "Nikolas Herbst",
                 "Alessandro Vittorio Papadopoulos", "Bogdan Ghit", "Dick H. J. Epema",
                 "Alexandru Iosup"],
        workload_description="This workload was used in the 2017 ICPE paper titled \"An experimental performance evaluation of autoscaling policies for complex workflows\" by Ilyushkin et al. It features a combination of LIGO, SIPHT, and Montage workflows executed on the DAS-5 supercomputer."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
def parse(root, workflowhub_file_path_from_root):
    path = os.path.join(root, workflowhub_file_path_from_root)

    # Set the target dir to use the same folder structure as WorkflowHub.
    # Use splitext so only the ".json" extension is removed; str.rstrip(".json") would strip
    # any trailing characters from the set {., j, s, o, n} and mangle some trace names.
    global TARGET_DIR
    TARGET_DIR = os.path.join(os.path.dirname(os.getcwd()), 'output_parquet', 'workflowhub',
                              os.path.splitext(workflowhub_file_path_from_root)[0])

    with open(path) as json_file:
        json_data = json.load(json_file)
        authors = [json_data['author']['name']]

    meta_dict = Workflow.get_parquet_meta_dict()
    meta_dict["id"] = np.int64  # Add the id because we are not using a grouped dataframe here.
    meta_dict = OrderedDict(sorted(meta_dict.items(), key=lambda t: t[0]))

    workflow_df = dd.from_delayed(parse_and_return_task_dataframe(path), meta=meta_dict)
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path()), engine="pyarrow",
                           compute=True)

    wta_tasks = dd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()), engine="pyarrow",
                                index=False)

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_dask_task_dataframe(
        wta_tasks,
        domain="Scientific",
        authors=authors,
        workload_description="Workload downloaded from WorkflowHub, see http://workflowhub.isi.edu/."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
def parse(gwf_filename): os.makedirs(TARGET_DIR, exist_ok=True) gwf_tasks = pd.read_csv(gwf_filename, skipinitialspace=True, dtype={ "WorkflowID": np.int64, "JobID": np.int64, "SubmitTime": np.int64, "Runtime": np.int64, "NProcs": np.int32, "Dependencies": np.str, }) del gwf_tasks["ReqNProcs "] gwf_tasks.columns = [ "workflow_id", "id", "ts_submit", "runtime", "resource_amount_requested", "dependencies" ] gwf_tasks['resource_amount_requested'] = gwf_tasks[ 'resource_amount_requested'].astype(np.float64) gwf_tasks = gwf_tasks.assign(ts_submit=lambda x: x['ts_submit'] * 1000 ) # Convert the submit time to milliseconds. gwf_tasks_with_parents = gwf_tasks.assign( parents=gwf_tasks["dependencies"].str.split().apply( lambda l: [np.int64(i) for i in l] if type(l) is list else [])) del gwf_tasks_with_parents[ "dependencies"] # We need to recompute these (and the column name is wrong), so delete. # Add columns not present in the trace. gwf_tasks_with_parents["type"] = np.str("composite") gwf_tasks_with_parents["resource_type"] = np.str("thread") gwf_tasks_with_parents["submission_site"] = np.int32(0) gwf_tasks_with_parents["user_id"] = np.int32(-1) gwf_tasks_with_parents["group_id"] = np.int32(-1) gwf_tasks_with_parents["nfrs"] = np.str(" ") gwf_tasks_with_parents["wait_time"] = np.int64(-1) gwf_tasks_with_parents["params"] = np.str("{}") gwf_tasks_with_parents["memory_requested"] = np.int64(-1) gwf_tasks_with_parents["disk_io_time"] = np.int64(-1) gwf_tasks_with_parents["disk_space_requested"] = np.int64(-1) gwf_tasks_with_parents["energy_consumption"] = np.int64(-1) gwf_tasks_with_parents["network_io_time"] = np.int64(-1) gwf_tasks_with_parents["resource_used"] = np.str("[]") # We need to make sure that all pandas dataframes follow the same column order. # The dask dataframe is build up using different pandas dataframes. So they must match. gwf_tasks_with_parents = gwf_tasks_with_parents[sorted( gwf_tasks_with_parents.columns)] gwf_tasks_with_children = gwf_tasks_with_parents.groupby("workflow_id").apply(compute_children) \ .reset_index(drop=True) # Make sure the first task has ts_submit of zero. min_submit_time = gwf_tasks_with_children["ts_submit"].min() task_df_final = gwf_tasks_with_children.assign( ts_submit=lambda x: x['ts_submit'] - min_submit_time) os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df_final.to_parquet( os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet")) # Compute workflow properties specific to this trace workflow_df = task_df_final.groupby("workflow_id").apply( compute_workflow_features).reset_index(drop=True) workflow_df = workflow_df.rename(columns={"workflow_id": "id"}) workflow_df = workflow_df[sorted(workflow_df.columns)] os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True) workflow_df.to_parquet( os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet")) # Write a json dict with the workload properties json_dict = Workload.get_json_dict_from_pandas_task_dataframe( task_df_final, domain="Industrial", authors=[ "Shenjun Ma", "Alexey Ilyushkin", "Alexander Stegehuis", "Alexandru Iosup" ], workload_description= "Chronos is a trace from Shell's Chronos IoT production system. It contains pipelines where sensor data is obtained, checked if values are within range (e.g. temperature, operational status, etc.), and the outcomes are written to persistent storage." 
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"),
              "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
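# Illustrative sketch (not part of the original parser): how the GWF "Dependencies" column
# is turned into a parents list above. Whitespace-separated ids become int64 lists and
# missing values (NaN after str.split) become []. The sample values are hypothetical and
# the demo relies on the module's existing pandas/numpy imports.
def _dependencies_to_parents_demo():
    demo = pd.DataFrame({"dependencies": ["3 5", None, "7"]})
    parents = demo["dependencies"].str.split().apply(
        lambda l: [np.int64(i) for i in l] if type(l) is list else [])
    return list(parents)  # [[3, 5], [], [7]]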
def parse(): os.makedirs(TARGET_DIR, exist_ok=True) task_counter = 0 workflow_counter = 0 processed_workflows = [] final_workflows = [] final_tasks = [] task_offset = 0 workflow_offset = None for wi_row in WORKFLOW_INVOCATIONS.itertuples(): flag = False # only use one execution of a workflow if wi_row[4] in processed_workflows: continue # check if workflow contains cycles workflow_row = WORKFLOWS.loc[(WORKFLOWS["id"] == getattr( wi_row, "workflow_id"))] if workflow_row.iloc[0]["has_cycles"] == "t": continue # workflows contain a number of workflow steps but this is not the ID of their actual execution # this list is used to tie the workflow steps to their actual execution ID step_job_ids = [] tasks_in_workflow = [] workflow_index = wi_row[4] # check if workflow id is null if pd.isnull(workflow_index): continue df = WORKFLOW_INVOKE_STEPS.loc[( WORKFLOW_INVOKE_STEPS["workflow_invocation_id"] == getattr( wi_row, "id"))] # check if workflow is not empty if df.empty: processed_workflows.append(workflow_index) continue for wis_row in df.itertuples(): # check if entry in WF_INVOKE_STEPS has the same wf_invocation_id if getattr(wis_row, "workflow_invocation_id") == getattr(wi_row, "id"): # check if required fields are not empty if check_if_empty(getattr(wis_row, "workflow_step_id"), getattr(wis_row, "job_id")): processed_workflows.append(workflow_index) flag = True break # get step id and corresponding execution id step_job_pair = [ getattr(wis_row, "workflow_step_id"), getattr(wis_row, "job_id") ] step_job_ids.append(step_job_pair) job_id = getattr(wis_row, "job_id") submit_time = int(((datetime.strptime( getattr(wis_row, "create_time"), DATETIME_FORMAT) - EPOCH).total_seconds()) * 1000) job_metrics = METRICS.loc[(METRICS["job_id"] == job_id)] runtime = job_metrics.loc[ (job_metrics["metric_name"] == "runtime_seconds"), 'metric_value'] * 1000 memory = job_metrics.loc[(job_metrics["metric_name"] == "memory.memsw.max_usage_in_bytes"), 'metric_value'] cpu_time = job_metrics.loc[( job_metrics["metric_name"] == "cpuacct.usage"), 'metric_value'] # check if any required fields are empty if runtime.empty or memory.empty or cpu_time.empty: processed_workflows.append(workflow_index) flag = True break # used to find the task with lowest submit time, this time will be used ass offset if task_offset == 0: task_offset = submit_time elif submit_time < task_offset: task_offset = submit_time runtime = runtime.iloc[0] memory = memory.iloc[0] cpu_time = cpu_time.iloc[0] / 1000000 if cpu_time > runtime: cpu_time = runtime task = Task(np.int64(job_id), "Composite", submit_time, 0, runtime, 1, None, workflow_index, -1, "cpu-time", resource=cpu_time, memory_requested=memory) task_counter += 1 tasks_in_workflow.append(task) flag = False # if flag is true, a task in the workflow is not usable to we skip it if flag: processed_workflows.append((workflow_index)) continue # compute children of tasks final_tasks.extend(compute_children(step_job_ids, tasks_in_workflow)) workflow_submit_time = int( ((datetime.strptime(getattr(wi_row, "create_time"), DATETIME_FORMAT) - EPOCH).total_seconds()) * 1000) # find smallest workflow submit time as offset if workflow_offset is None: workflow_offset = workflow_submit_time elif workflow_submit_time < workflow_offset: workflow_offset = workflow_submit_time workflow = Workflow(workflow_index, workflow_submit_time, tasks_in_workflow, "core", "Engineering", "Galaxy", "Biological Engineering") workflow.compute_critical_path() processed_workflows.append(workflow_index) 
final_workflows.append(workflow) workflow_counter += 1 # apply offset for x in final_tasks: x.ts_submit = x.ts_submit - task_offset # apply offset for y in final_workflows: y.ts_submit = y.ts_submit - workflow_offset # make tasks dataframe task_df = pd.DataFrame([t.get_parquet_dict() for t in final_tasks]) # create parquet file in specified folder os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"), engine="pyarrow") # make workflows dataframe workflow_df = pd.DataFrame([w.get_parquet_dict() for w in final_workflows]) # create parquet file in specified folder os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True) workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"), engine="pyarrow") json_dict = Workload.get_json_dict_from_pandas_task_dataframe( task_df, domain="Biological Engineering", authors=["Jaro Bosch", "Laurens Versluis"], workload_description= "Traces from different biomedical research workflows, executed on the public Galaxy server in Europe." ) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open( os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) raise TypeError file.write(json.dumps(json_dict, default=default))
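# Illustrative sketch (not part of the original parser): why the parsers pass a default=
# handler to json.dumps. Values pulled out of pandas frames are numpy scalars, which the
# stdlib json encoder does not serialize on its own. The sample value is hypothetical.
def _numpy_json_demo():
    def default(o):
        if isinstance(o, np.int64):
            return int(o)
        raise TypeError

    # json.dumps({"task_count": np.int64(42)}) would raise TypeError without the handler.
    return json.dumps({"task_count": np.int64(42)}, default=default)  # '{"task_count": 42}'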
def parse_and_return_task_dataframe(file_path): global TARGET_DIR with open(file_path) as trace: json_data = json.load(trace) workflow = json_data['workflow'] tasks = workflow['jobs'] machines = workflow['machines'] date = json_data['createdAt'] # Convert the ts_submit to seconds instead of a datetime string task_date = dateparser.parse(date) EPOCH = datetime(1970, 1, 1, tzinfo=task_date.tzinfo) ts_submit = int((task_date - EPOCH).total_seconds() * 1000) resource_by_id = dict() for machine in machines: machine_id = mmh3.hash64("machine:{0}".format(machine['machine_code'].strip()))[0] machine = machine["machine"] num_cpus = machine['cpu']['count'] details = { "cpu_vendor": machine['cpu']['vendor'], "architecture": machine['architecture'] } memory_in_gb = int(machine['memory']) / float(1024 * 1024) res = Resource(machine_id, "cluster_node", num_cpus, machine['release'], memory_in_gb, -1, -1, machine['system'], details) resource_by_id[machine_id] = res task_list = [] task_state_list = [] inputs_per_taskid = dict() outputs_per_taskid = dict() outputs_matched = dict() task_per_taskid = dict() input_file_data_per_task_id = dict() output_file_data_per_task_id = dict() for task in tasks: task_id = mmh3.hash64("task:{}".format(str(task['name']).strip()))[0] print(task_id) task_files = task['files'] if 'files' in task else [] task_type = task['type'] task_cores = task['cores'] if 'cores' in task else 1 task_memory = task['memory'] if 'memory' in task else -1 task_runtime = task['runtime'] * 1000 if 'runtime' in task else -1 task_dependencies = [mmh3.hash64("task:{}".format(str(p).strip()))[0] for p in task['parents']] task_parameters = {"arguments": task['arguments']} if 'arguments' in task else {} task_machine = mmh3.hash64("machine:{0}".format(task['machine'].strip()))[0] if 'machine' in task else None task_resource = resource_by_id[task_machine].id if 'machine' in task else -1 # Convert energy to Wh from KWh task_total_energy_consumption = float(task['energy']) * 1000 if 'energy' in task else -1 t = Task(task_id, task_type, ts_submit, -1, task_runtime, task_cores, task_dependencies, 0, -1, params=task_parameters, resource=task_resource, energy_consumption=task_total_energy_consumption, resource_type="core") task_per_taskid[task_id] = t task_list.append(t) # Parse the data transfers for file_item in task_files: # Apparently not all traces were parsed into version 0.2 despite them being in the # folders for 0.2. To this end we need a check for the file name and size fields. 
file_name = file_item['name'] if 'name' in file_item else file_item['fileId'] file_size = file_item['size'] if 'size' in file_item else -1 # Store the incoming and outgoing data to this task in separate dicts if file_item['link'] == "input": if task_id not in inputs_per_taskid: inputs_per_taskid[task_id] = set() input_file_data_per_task_id[task_id] = dict() inputs_per_taskid[task_id].add(file_name) try: input_file_data_per_task_id[task_id][file_name] = file_size except: print(file_item) exit(-1) elif file_item['link'] == "output": if task_id not in outputs_per_taskid: outputs_per_taskid[task_id] = set() outputs_matched[task_id] = dict() output_file_data_per_task_id[task_id] = dict() output_file_data_per_task_id[task_id] = dict() outputs_per_taskid[task_id].add(file_name) outputs_matched[task_id][file_name] = False output_file_data_per_task_id[task_id][file_name] = file_size # Create a task state for the entire duration with task_state = TaskState(ts_submit, ts_submit + task_runtime, 0, task_id, -1, canonical_memory_usage=task_memory) task_state_list.append(task_state) # Make sure the earliest task starts at 0. min_ts_submit = min(task.ts_submit for task in task_list) for task in task_list: # Update the time task.ts_submit -= min_ts_submit for parent in task.parents: # Also since we have all parent info, set them in the same loop task_per_taskid[parent].children.add(task.id) # Offset task states too for taskstate in task_state_list: taskstate.ts_start -= min_ts_submit taskstate.ts_end -= min_ts_submit data_transfer_id = 0 # Since tasks can output files with the same name as other tasks, we must loop over a task's parents # and match the output names against input names. for task in task_list: # For every task we have if task.id not in inputs_per_taskid: continue inputs = inputs_per_taskid[task.id] # We loop over the parents (no need to check children, they will come later) for dep in task.parents: outputs = outputs_per_taskid[dep] if dep in outputs_per_taskid else set() overlap = inputs.intersection(outputs) # Check for overlap if len(overlap) > 0: # We have input-output pairs, loop to construct datatransfers for file_name in overlap: # Get the size and construct a datatransfer object. data_size = output_file_data_per_task_id[dep][file_name] datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id, data_size) # Assign it to the tasks task_per_taskid[dep].datatransfers.append(datatransfer) task.datatransfers.append(datatransfer) outputs_matched[dep][file_name] = True # Remove the file from the input as it's covered. Do NOT remove it from output, # the same output file may be used by another task (fan-out structure). inputs.remove(file_name) data_transfer_id += 1 # Loop over the remaining input files. Since we do not have a source, we assume them are present # on the filesystem beforehand. for file_name in inputs: data_size = input_file_data_per_task_id[task.id][file_name] datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size) task.datatransfers.append(datatransfer) data_transfer_id += 1 # Loop over the outputs and create a datatransfer for those that are not matched yet # These are likely files with final results, not having an destination. 
for task_id in outputs_matched.keys(): for file_name in outputs_matched[task_id].keys(): if not outputs_matched[task_id][file_name]: task = task_per_taskid[task_id] data_size = output_file_data_per_task_id[task_id][file_name] datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size) task.datatransfers.append(datatransfer) filename_for_this_partition = "part.0.parquet" # Write all tasks to parquet os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list]) task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), filename_for_this_partition), engine="pyarrow") # Write all task states to parquet os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True) task_state_df = pd.DataFrame([task_state.get_parquet_dict() for task_state in task_state_list]) task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(), filename_for_this_partition), engine="pyarrow") # Write all data transfers to parquet if any(len(task.datatransfers) for task in task_list): os.makedirs(os.path.join(TARGET_DIR, Datatransfer.output_path()), exist_ok=True) datatransfer_df = pd.DataFrame( [datatransfer.get_parquet_dict() for task_item in task_list for datatransfer in task_item.datatransfers]) datatransfer_df.to_parquet( os.path.join(TARGET_DIR, Datatransfer.output_path(), filename_for_this_partition), engine="pyarrow") # Write the workflows to parquet wf_agnostic_df = compute_characteristics(task_df) workflow_ts_submit = task_df["ts_submit"].min() # Determine the application name and field application_names = { "epigenomics": ("Epigenomics", "Bioinformatics"), "montage": ("Montage", "Astronomy"), "soykb": ("SoyKB", "Bioinformatics"), } application_name = "" application_field = "" for key in application_names.keys(): if key in file_path: application_name = application_names[key][0] application_field = application_names[key][1] workflow = Workflow(0, workflow_ts_submit, task_list, "Pegasus", "Scientific", application_name, application_field) workflow.compute_critical_path() wf_df = pd.DataFrame([workflow.get_parquet_dict()]) return wf_df
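# Illustrative sketch (not part of the original parser): the data-transfer reconstruction
# above boils down to set intersection between a parent's output file names and a child's
# input file names. The file names below are hypothetical.
def _io_matching_demo():
    parent_outputs = {"ref.fasta", "aligned.bam"}
    child_inputs = {"aligned.bam", "config.json"}
    matched = child_inputs & parent_outputs    # becomes a parent-to-child Datatransfer
    unmatched = child_inputs - parent_outputs  # assumed to be pre-staged on the filesystem
    return matched, unmatched                  # ({'aligned.bam'}, {'config.json'})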
def parse(path_to_dir): global TARGET_DIR TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1]) if 'DAS5' in os.environ: # If we want to execute it on the DAS-5 super computer print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster")) spark = SparkSession.builder \ .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \ .appName("WTA parser") \ .config("spark.executor.memory", "28G") \ .config("spark.executor.cores", "8") \ .config("spark.executor.instances", "10") \ .config("spark.driver.memory", "40G") \ .config("spark.sql.execution.arrow.enabled", "true") \ .getOrCreate() else: findspark.init(spark_home="<path to spark>") spark = SparkSession.builder \ .master("local[8]") \ .appName("WTA parser") \ .config("spark.executor.memory", "20G") \ .config("spark.driver.memory", "8G") \ .getOrCreate() if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())): print("######\nStart parsing Tasks\n######") task_df = spark.read.format('com.databricks.spark.csv').options( header='true', inferschema='true').load( os.path.join(path_to_dir, '*.csv.processed')) # Drop the pref table, saving memory and filter out unsuccessful jobs as their information is not reliable task_df = task_df.drop('pref').filter( task_df.status == ":instance.status/success").drop( 'status').cache() @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR) def sub_two_datetimes(s1, s2): arr = [] for i in s1.keys(): d1 = datetime.datetime.strptime(s1[i], '%a %b %d %H:%M:%S %Z %Y') d2 = datetime.datetime.strptime(s2[i], '%a %b %d %H:%M:%S %Z %Y') arr.append(int((d2 - d1).total_seconds() * 1000)) return pd.Series(arr) task_df = task_df \ .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \ .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time'))) @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR) def date_time_to_unix(series): arr = [] epoch = datetime.datetime.utcfromtimestamp(0) for i in series.keys(): arr.append( np.int64((datetime.datetime.strptime( series[i], '%a %b %d %H:%M:%S %Z %Y') - epoch).total_seconds() * 1000)) return pd.Series(arr) task_df = task_df.withColumn( 'submit-time', date_time_to_unix(F.col('submit-time'))).withColumnRenamed( 'submit-time', "ts_submit").drop('start-time').drop('end-time').cache() min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0] task_df = task_df.withColumn('ts_submit', F.col('ts_submit') - F.lit(min_ts)) @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR) def convert_to_kb(v): return v * 1024 task_df = task_df.withColumn('memory', convert_to_kb( task_df.memory)).withColumnRenamed("memory", "memory_consumption") @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR) def string_to_int(v): arr = [] for i in v.keys(): arr.append(mmh3.hash(v[i], signed=True)) return pd.Series(arr) @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR) def string_to_long(v): arr = [] for i in v.keys(): arr.append(mmh3.hash64(v[i], signed=True)[0]) return pd.Series(arr) @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR) def assign_workflow_ids(v): arr = [] for i in v.keys(): if v[i]: arr.append(mmh3.hash64(v[i], signed=True)[0]) else: arr.append( mmh3.hash64(uuid4().bytes, signed=True) [0]) # Assign a UUID, collision chance is negligible. 
return pd.Series(arr) task_df = task_df.withColumn('user', string_to_int( task_df.user)).withColumnRenamed("user", "user_id") task_df = task_df.withColumn('job-uuid', string_to_long( F.col('job-uuid'))).withColumnRenamed( 'job-uuid', 'task_id') type_udf = F.udf(lambda x: "Independent" if x is None else "Composite", T.StringType()) task_df = task_df.withColumn('type', type_udf(task_df.simset)) task_df = task_df.withColumn('simset', assign_workflow_ids( F.col('simset'))).withColumnRenamed( 'simset', "workflow_id") task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested') task_df = task_df.withColumnRenamed('instance', 'resource_used') # Set the static items that are not present in the trace task_df = task_df.withColumn('submission_site', F.lit(0)) task_df = task_df.withColumn('parents', F.array().cast(T.ArrayType(T.LongType()))) task_df = task_df.withColumn('children', F.array().cast(T.ArrayType(T.LongType()))) task_df = task_df.withColumn('group_id', F.lit(0)) task_df = task_df.withColumn('nfrs', F.lit("{}")) task_df = task_df.withColumn('params', F.lit("{}")) task_df = task_df.withColumn('memory_requested', F.lit(-1)) task_df = task_df.withColumn('network_io_time', F.lit(-1)) task_df = task_df.withColumn('disk_io_time', F.lit(-1)) task_df = task_df.withColumn('disk_space_requested', F.lit(-1)) task_df = task_df.withColumn('energy_consumption', F.lit(-1)) os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy") print("######\nDone parsing Tasks\n######") if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())): print("######\nStart parsing TaskState\n######") if 'task_df' not in locals(): task_df = spark.read.parquet( os.path.join(TARGET_DIR, Task.output_path())) task_state_structtype = T.StructType([ T.StructField("ts_start", T.LongType(), False), T.StructField("ts_end", T.LongType(), False), T.StructField("workflow_id", T.LongType(), False), T.StructField("task_id", T.LongType(), False), T.StructField("resource_id", T.LongType(), False), T.StructField("cpu_rate", T.DoubleType(), False), T.StructField("canonical_memory_usage", T.DoubleType(), False), T.StructField("assigned_memory", T.DoubleType(), False), T.StructField("minimum_memory_usage", T.DoubleType(), False), T.StructField("maximum_memory_usage", T.DoubleType(), False), T.StructField("disk_io_time", T.DoubleType(), False), T.StructField("maximum_disk_bandwidth", T.DoubleType(), False), T.StructField("local_disk_space_usage", T.DoubleType(), False), T.StructField("maximum_cpu_rate", T.DoubleType(), False), T.StructField("maximum_disk_io_time", T.DoubleType(), False), T.StructField("sample_rate", T.DoubleType(), False), T.StructField("sample_portion", T.DoubleType(), False), T.StructField("sampled_cpu_usage", T.DoubleType(), False), T.StructField("network_io_time", T.DoubleType(), False), T.StructField("maximum_network_bandwidth", T.DoubleType(), False), ]) @F.pandas_udf(returnType=task_state_structtype, functionType=F.PandasUDFType.GROUPED_MAP) def compute_task_states(df): workflow_id = df['workflow_id'].iloc[0] task_id = df['task_id'].iloc[0] ts_start = df['ts_submit'].min() ts_end = ts_start + df['runtime'].max() resource_id = df['resource_used'].iloc[0] cpu_rate = -1 canonical_memory_usage = df['memory_consumption'].mean() assigned_memory = -1 minimum_memory_usage = df['memory_consumption'].min() maximum_memory_usage = df['memory_consumption'].max() disk_io_time = -1 
maximum_disk_bandwidth = -1 local_disk_space_usage = -1 maximum_cpu_rate = -1 maximum_disk_io_time = -1 sample_rate = -1 sample_portion = -1 sampled_cpu_usage = -1 network_io_time = -1 maximum_network_bandwidth = -1 data_dict = { "ts_start": ts_start, "ts_end": ts_end, "workflow_id": workflow_id, "task_id": task_id, "resource_id": resource_id, "cpu_rate": cpu_rate, "canonical_memory_usage": canonical_memory_usage, "assigned_memory": assigned_memory, "minimum_memory_usage": minimum_memory_usage, "maximum_memory_usage": maximum_memory_usage, "disk_io_time": disk_io_time, "maximum_disk_bandwidth": maximum_disk_bandwidth, "local_disk_space_usage": local_disk_space_usage, "maximum_cpu_rate": maximum_cpu_rate, "maximum_disk_io_time": maximum_disk_io_time, "sample_rate": sample_rate, "sample_portion": sample_portion, "sampled_cpu_usage": sampled_cpu_usage, "network_io_time": network_io_time, "maximum_network_bandwidth": maximum_network_bandwidth, } return pd.DataFrame(data_dict, index=[0]) task_state_df = task_df.groupBy(['workflow_id', 'task_id']).apply(compute_task_states) os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True) task_state_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite", compression="snappy") print("######\nDone parsing TaskState\n######") if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())): print("######\nStart parsing Resources\n######") if 'task_df' not in locals(): task_df = spark.read.parquet( os.path.join(TARGET_DIR, Task.output_path())) resource_id_column = [ i.resource_used for i in task_df.select('resource_used').distinct().collect() ] resources = [] for resource_id in resource_id_column: resources.append( Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1, '').get_parquet_dict()) resource_df = pd.DataFrame(resources) os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()), exist_ok=True) resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(), 'part.0.parquet'), engine="pyarrow") print("######\nDone parsing Resources\n######") if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())): print("######\nStart parsing Workflows\n######") if 'task_df' not in locals(): task_df = spark.read.parquet( os.path.join(TARGET_DIR, Task.output_path())) workflow_structype = T.StructType([ T.StructField("id", T.LongType(), False), T.StructField("ts_submit", T.LongType(), False), T.StructField("task_count", T.IntegerType(), False), T.StructField("critical_path_length", T.LongType(), False), T.StructField("critical_path_task_count", T.IntegerType(), False), T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False), T.StructField("nfrs", T.StringType(), False), T.StructField("scheduler", T.StringType(), False), T.StructField("total_resources", T.DoubleType(), False), T.StructField("total_memory_usage", T.DoubleType(), False), T.StructField("total_network_usage", T.LongType(), False), T.StructField("total_disk_space_usage", T.LongType(), False), T.StructField("total_energy_consumption", T.LongType(), False), ]) @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP) def compute_workflow_stats(df): id = df['workflow_id'].iloc[0] ts_submit = df['ts_submit'].min() task_count = len(df) critical_path_length = -1 critical_path_task_count = -1 approx_max_concurrent_tasks = -1 nfrs = "{}" scheduler = "Cook" total_resources = df['resource_amount_requested'].sum() total_memory_usage = df['memory_consumption'].sum() total_network_usage = -1 
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)
        workflow_df.explain(True)
        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()),
                                  mode="overwrite", compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Workload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()),
                                     engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(pandas_task_df,
                                                                  domain="Industrial",
                                                                  start_date=None,
                                                                  end_date=None,
                                                                  authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"),
              "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
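# Illustrative sketch (not part of the original parser): the pandas UDFs above convert
# Cook's textual timestamps to epoch milliseconds. The same conversion on a single,
# hypothetical timestamp string, using the module's existing datetime/numpy imports.
def _cook_timestamp_demo():
    epoch = datetime.datetime.utcfromtimestamp(0)
    ts = datetime.datetime.strptime("Mon Jan 07 10:30:00 UTC 2019", '%a %b %d %H:%M:%S %Z %Y')
    return np.int64((ts - epoch).total_seconds() * 1000)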
def read_file(source_dir, dax_file, count, arrival_time, target_dir): tree = ET.parse(os.path.join(source_dir, dax_file)) adag = tree.getroot() dependencies = {} def id_to_int(id): return int(non_decimal.sub('', id)) for child in adag.findall('{http://pegasus.isi.edu/schema/DAX}child'): task_id = mmh3.hash64("workflow:{}_task:{}".format(count, id_to_int(child.attrib['ref'])))[0] if task_id not in dependencies: dependencies[task_id] = [] for parent in child: parent_id = id_to_int(parent.attrib['ref']) dependencies[task_id].append(parent_id) tasks = adag.findall('{http://pegasus.isi.edu/schema/DAX}job') task_list = [] inputs_per_taskid = dict() # Contains input file names per task id outputs_per_taskid = dict() # Contains output file names per task id outputs_matched = dict() # A dictionary to check of outputs have been matched with an input input_file_data_per_task_id = dict() # Dict with file sizes per file name for input files output_file_data_per_task_id = dict() # Dict with file sizes per file name for output files task_per_taskid = dict() for task in tasks: # Since all tasks start at 0, in every file, we add the workflow id and hash it to get task id task_id = mmh3.hash64("workflow:{}_task:{}".format(count, id_to_int(task.attrib['id'])))[0] # Ilyushkin et al. added an attribute field called profile, containing the runtime of that particular task. runtime = int(float(task.attrib['runtime']) * 1000) if task_id in dependencies: task_dependencies = [mmh3.hash64("workflow:{}_task:{}".format(count, dependency))[0] for dependency in dependencies[task_id]] else: task_dependencies = [] # Site ids are unknown, all tasks in the work to use 1 core. t = Task(task_id, "", arrival_time, 0, runtime, 1, task_dependencies, count, -1, resource_type="core", resource=-1) task_list.append(t) task_per_taskid[task_id] = t # Parse the data transfers for data_item in task.findall("{http://pegasus.isi.edu/schema/DAX}uses"): # Store the incoming and outgoing data to this task in separate dicts if data_item.attrib['link'] == "input": if task_id not in inputs_per_taskid: inputs_per_taskid[task_id] = set() input_file_data_per_task_id[task_id] = dict() inputs_per_taskid[task_id].add(data_item.attrib['file']) input_file_data_per_task_id[task_id][data_item.attrib['file']] = data_item.attrib['size'] elif data_item.attrib['link'] == "output": if task_id not in outputs_per_taskid: outputs_per_taskid[task_id] = set() outputs_matched[task_id] = dict() output_file_data_per_task_id[task_id] = dict() output_file_data_per_task_id[task_id] = dict() outputs_per_taskid[task_id].add(data_item.attrib['file']) outputs_matched[task_id][data_item.attrib['file']] = False output_file_data_per_task_id[task_id][data_item.attrib['file']] = data_item.attrib['size'] # Set children for task in task_list: for parent_id in task.parents: task_per_taskid[parent_id].children.add(task.id) data_transfer_id = 0 # Since tasks can output files with the same name as other tasks, we must loop over a task's parents # and match the output names against input names. for task in task_list: # For every task we have inputs = inputs_per_taskid[task.id] for dep in task.parents: # We loop over the parents (no need to check children, they will come later) outputs = outputs_per_taskid[dep] overlap = inputs.intersection(outputs) # Check for overlap if len(overlap) > 0: # We have input - output pairs, loop over them to construct datatransfers for file_name in overlap: # Get the size and construct a datatransfer object. 
data_size = output_file_data_per_task_id[dep][file_name] datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id, data_size) # Assign it to the tasks task_per_taskid[dep].datatransfers.append(datatransfer) task.datatransfers.append(datatransfer) outputs_matched[dep][file_name] = True # Remove the file from the input as it's covered. Do NOT remove it from output, the same output # file may be used by another task (fan-out structure). inputs.remove(file_name) data_transfer_id += 1 # Now, loop over the remaining input files. Since we do not have a source, we assume them are present # on the filesystem beforehand. for file_name in inputs: data_size = input_file_data_per_task_id[task.id][file_name] datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size) task.datatransfers.append(datatransfer) data_transfer_id += 1 # Loop over the outputs and create a datatransfer for those that are not matched yet # These are likely files with final results, not having an destination. for task_id in outputs_matched.keys(): for file_name in outputs_matched[task_id].keys(): if not outputs_matched[task_id][file_name]: task = task_per_taskid[task_id] data_size = output_file_data_per_task_id[task_id][file_name] datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size) task.datatransfers.append(datatransfer) filename_for_this_partition = "part.{}.parquet".format(count) os.makedirs(os.path.join(target_dir, Task.output_path()), exist_ok=True) task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list]) # Make sure the first workflow is submitted at time 0 min_submit_time = task_df["ts_submit"].min() task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time) # Make sure the columns are in the right order task_df = task_df[sorted(task_df.columns)] task_df.to_parquet(os.path.join(target_dir, Task.output_path(), filename_for_this_partition), engine="pyarrow") os.makedirs(os.path.join(target_dir, Datatransfer.output_path()), exist_ok=True) datatransfer_df = pd.DataFrame( [datatransfer.get_parquet_dict() for task in task_list for datatransfer in task.datatransfers]) datatransfer_df.to_parquet(os.path.join(target_dir, Datatransfer.output_path(), filename_for_this_partition), engine="pyarrow") wf_agnostic_df = compute_characteristics(task_df) workflow_ts_submit = task_df["ts_submit"].min() application_names = { "_lig": ("LIGO", "Physics"), "_sip": ("SIPHT", "Bioinformatics"), "_mon": ("Montage", "Astronomy"), } application_name = "" application_field = "" for key in application_names.keys(): if key in dax_file: application_name = application_names[key][0] application_field = application_names[key][1] break workflow = Workflow(count, workflow_ts_submit, task_list, "", "Scientific", application_name, application_field) workflow.compute_critical_path() wf_df = pd.concat([wf_agnostic_df, pd.DataFrame( {"id": pd.Series([np.int64(count)], dtype=np.int64), "ts_submit": pd.Series([np.int64(workflow_ts_submit)], dtype=np.int64), "critical_path_length": pd.Series([np.int32(workflow.critical_path_length)], dtype=np.int32), "critical_path_task_count": pd.Series([np.int32(workflow.critical_path_task_count)], dtype=np.int32), "approx_max_concurrent_tasks": pd.Series([np.int32(workflow.max_concurrent_tasks)], dtype=np.int32), "scheduler": pd.Series([np.str("")], dtype=np.str), })], axis=1) wf_df["nfrs"] = np.str("{}") wf_df = wf_df[sorted(wf_df.columns)] os.makedirs(os.path.join(target_dir, Workflow.output_path()), exist_ok=True) 
wf_df.to_parquet(os.path.join(target_dir, Workflow.output_path(), filename_for_this_partition), engine="pyarrow")
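# Illustrative sketch (not part of the original parser): task ids in read_file are mmh3
# hashes of "workflow:<count>_task:<id>", so the same DAX node gets a distinct but
# reproducible 64-bit id in every workflow instance. The ids below are hypothetical
# inputs, not values from any trace.
def _dax_task_id_demo():
    first = mmh3.hash64("workflow:{}_task:{}".format(0, 5))[0]
    second = mmh3.hash64("workflow:{}_task:{}".format(1, 5))[0]
    assert first != second  # same DAX node id, different workflow instance
    return first, second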
def parse_lanl_file(lanl_file): task_list = [] task_by_id = {} df = pd.read_csv(lanl_file, parse_dates=["submission_time", "start_date", "end_date"], infer_datetime_format=True) task_df = df[df['object_event'] == "JOBEND"] earliest_date = df['submission_time'].min() latest_date = df["end_date"].max() for index, row in task_df.iterrows(): id = str( mmh3.hash64("task:{}".format(str(row["object_id"]).strip()))[0]) # Task time fields submission_time_task = row["submission_time"] start_time_task = row["start_date"] end_time_task = row["end_date"] # Task cpu consumption fields num_nodes = int(row["nodes_requested"]) num_cpus_per_node = row["dedicated_processors_per_task"] # Task dependency fields extension_string = str(row["resource_manager_extension_string"]) # Find dependencies match = re.search('DEPEND=([\w,:.]+);?', extension_string) if not match: dependencies = set() else: dependencies = match.group(1) dependencies = set( str(mmh3.hash64("task:{}".format(str(dep).strip()))[0]) for dep in dependencies.split("&")) task_wait_time = int( (start_time_task - submission_time_task).total_seconds() * 1000) task_runtime = int( (end_time_task - start_time_task).total_seconds() * 1000) task = Task(id, "Atomic", start_time_task, -1, task_runtime, num_nodes * num_cpus_per_node, dependencies, -1, task_wait_time, resource_type="core", resource=-1) # Convert the ts_submit to seconds instead of a datetime string EPOCH = datetime(1970, 1, 1, tzinfo=task.ts_submit.tzinfo) task.ts_submit = int((task.ts_submit - EPOCH).total_seconds() * 1000) # Set the wallclock limit task.nfrs["runtime_limit"] = row["wallclock_limit"] task_by_id[id] = task task_list.append(task) min_ts_submit = min(task.ts_submit for task in task_list) # For every task, add itself the the children of its parents for task in task_list: task.ts_submit -= min_ts_submit # Make sure the first task in the trace starts at 0 invalid_parents = set() for parent_id in task.parents: # Chop of the prepend of *: (e.g. jobsuccess:) actual_parent_id = parent_id[str(parent_id).find(":") + 1:] if actual_parent_id in task_by_id: # If this doesn't fire, the task probably failed, we filter those out. parent = task_by_id[actual_parent_id] parent.children.add(task.id) else: invalid_parents.add(parent_id) # Remove invalid parents if invalid_parents: task.parents -= invalid_parents # Find start tasks and assign workflow ids workflow_id = 0 for task in task_list: if task.workflow_id == -1: root_parents = task.get_root_parents(task_by_id) if root_parents: # If there are start tasks, propogate from them for root_parent_id in root_parents: actual_root_id = root_parent_id[str(root_parent_id). 
                        find(":") + 1:]
                    task_by_id[actual_root_id].set_workflow_id_propagating(
                        task_by_id, workflow_id)
            else:  # Else it's a single job so just set the property directly
                task.workflow_id = workflow_id
            workflow_id += 1

    # Now that everything has been computed, we write the tasks to parquet files
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])

    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"),
                       engine="pyarrow")

    workflows = dict()
    # Based on workflow ids, construct the workflow objects
    for task in task_list:
        if task.workflow_id in workflows:
            workflow = workflows[task.workflow_id]
        else:
            workflow = Workflow(workflow_id, None, [], "", "Scientific", "Uncategorized",
                                "Uncategorized")
            workflows[task.workflow_id] = workflow

        if not workflow.ts_submit:
            workflow.ts_submit = task.ts_submit
        else:
            workflow.ts_submit = min(workflow.ts_submit, task.ts_submit)

        workflow.tasks.append(task)
        workflow.task_count = len(workflow.tasks)

    for w in workflows.values():
        w.compute_critical_path(strip_colon=True)

    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True)
    workflow_df = pd.DataFrame([workflow.get_parquet_dict() for workflow in workflows.values()])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"),
                           engine="pyarrow")

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain="Engineering",
        start_date=str(earliest_date),
        end_date=str(latest_date),
        authors=["George Amvrosiadis", "Jun Woo Park", "Gregory R. Ganger", "Garth A. Gibson",
                 "Elisabeth Baseman", "Nathan DeBardeleben"],
        workload_description="This workload was published by Amvrosiadis et al. as part of their ATC 2018 paper titled \"On the diversity of cluster workloads and its impact on research results\". It is the Trinity trace from the Los Alamos National Laboratory."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"),
              "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
def parse_askalon_file(askalon_file): if not os.path.exists(TARGET_DIR): os.makedirs(TARGET_DIR) workflows = [] with open(askalon_file, 'r') as asklon_trace: data = json.load(asklon_trace) for wf in data: workflows.append(parse_workflow(wf, askalon_file)) for w in workflows: w.compute_critical_path() # Write the workflow objects to parquet os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True) workflow_df = pd.DataFrame( [workflow.get_parquet_dict() for workflow in workflows]) workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"), engine="pyarrow") # Write all tasks to parquet os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df = pd.DataFrame( [task.get_parquet_dict() for wf in workflows for task in wf.tasks]) # Make sure the first workflow is submitted at time 0 min_submit_time = task_df["ts_submit"].min() task_df = task_df.assign( ts_submit=lambda x: x['ts_submit'] - min_submit_time) pyarrow_task_schema = Task.get_pyarrow_schema() table = pa.Table.from_pandas(task_df, schema=pyarrow_task_schema, preserve_index=False) # Pandas does not know the different between an empty list and a list with integers # Thus, type mismatches will occur. We are writing the task tables using pyarrow directly # using a schema. pq.write_table( table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet")) # generate workload description authors_list = ["Roland Matha", "Radu Prodan"] # generate workload description workload_description = "" if "bwa" in askalon_file.lower(): workload_description = "BWA (short for Burroughs-Wheeler Alignment tool) is a genomics analysis workflow, courtesy of Scott Emrich and Notre Dame Bioinformatics Laboratory. It maps low-divergent sequences against a large reference genome, such as the human genome." elif "wien2k" in askalon_file.lower(): workload_description = "Wien2k uses a full-potential Linearized Augmented Plane Wave (LAPW) approach for the computation of crystalline solids." workload_domain = "Scientific" w = Workload(workflows, workload_domain, authors_list, workload_description) # Write a json dict with the workload properties json_dict = Workload.get_json_dict_from_pandas_task_dataframe( task_df, domain=workload_domain, authors=authors_list, workload_description=workload_description) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open( os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) raise TypeError file.write(json.dumps(json_dict, default=default))
def parse(path_to_dir): global TARGET_DIR TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[-1]) if "DAS5" in os.environ: # If we want to execute it on the DAS-5 super computer print("We are on DAS5, {0} is master.".format(os.environ["HOSTNAME"] + ".ib.cluster")) spark = SparkSession.builder \ .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \ .appName("WTA parser") \ .config("spark.executor.memory", "28G") \ .config("spark.executor.cores", "8") \ .config("spark.executor.instances", "10") \ .config("spark.driver.memory", "256G") \ .config("spark.driver.maxResultSize", "40G") \ .config("spark.network.timeout", "100000s") \ .config("spark.rpc.askTimeout", "100000s") \ .config("spark.default.parallelism", "2000") \ .config("spark.sql.execution.arrow.enabled", "true") \ .config("spark.cleaner.periodicGC.interval", "5s") \ .getOrCreate() else: import findspark findspark.init("<path_to_spark>") spark = SparkSession.builder \ .master("local[4]") \ .appName("WTA parser") \ .config("spark.executor.memory", "2G") \ .config("spark.driver.memory", "2G") \ .getOrCreate() machine_meta = spark.read.csv(os.path.join(path_to_dir, "machine_meta.csv"), schema=StructType([ StructField("machine_id", StringType(), True), StructField("time_stamp", LongType(), True), StructField("failure_domain_1", LongType(), True), StructField("failure_domain_2", StringType(), True), StructField("cpu_num", LongType(), True), StructField("mem_size", LongType(), True), StructField("status", StringType(), True) ])) machine_usage = spark.read.csv(os.path.join(path_to_dir, "machine_usage.csv"), schema=StructType([ StructField("machine_id", StringType(), True), StructField("time_stamp", DoubleType(), True), StructField("cpu_util_percent", LongType(), True), StructField("mem_util_percent", LongType(), True), StructField("mem_gps", DoubleType(), True), StructField("mkpi", LongType(), True), StructField("net_in", DoubleType(), True), StructField("net_out", DoubleType(), True), StructField("disk_io_percent", DoubleType(), True) ])) container_meta = spark.read.csv( os.path.join(path_to_dir, "container_meta.csv"), schema=StructType([ StructField("container_id", StringType(), True), StructField("machine_id", StringType(), True), StructField("time_stamp", LongType(), True), StructField("app_du", StringType(), True), StructField("status", StringType(), True), StructField("cpu_request", LongType(), True), StructField("cpu_limit", LongType(), True), StructField("mem_size", DoubleType(), True) ])) container_usage = spark.read.csv(os.path.join(path_to_dir, "container_usage.csv"), schema=StructType([ StructField("container_id", StringType(), True), StructField("machine_id", StringType(), True), StructField("time_stamp", DoubleType(), True), StructField("cpu_util_percent", LongType(), True), StructField("mem_util_percent", LongType(), True), StructField("cpi", DoubleType(), True), StructField("mem_gps", DoubleType(), True), StructField("mpki", LongType(), True), StructField("net_in", DoubleType(), True), StructField("net_out", DoubleType(), True), StructField("disk_io_percent", DoubleType(), True) ])) batch_task = spark.read.csv(os.path.join(path_to_dir, "batch_task.csv"), schema=StructType([ StructField("task_name", StringType(), True), StructField("instance_num", LongType(), True), StructField("job_name", StringType(), True), StructField("task_type", StringType(), True), StructField("status", StringType(), True), StructField("start_time", LongType(), True), StructField("end_time", LongType(), True), 
StructField("plan_cpu", DoubleType(), True), StructField("plan_mem", DoubleType(), True) ])) batch_instance = spark.read.csv( os.path.join(path_to_dir, "batch_instance.csv"), schema=StructType([ StructField("instance_name", StringType(), True), StructField("task_name", StringType(), True), StructField("job_name", StringType(), True), StructField("task_type", StringType(), True), StructField("status", StringType(), True), StructField("start_time", LongType(), True), StructField("end_time", LongType(), True), StructField("machine_id", StringType(), True), StructField("seq_no", LongType(), True), StructField("total_seq_no", LongType(), True), StructField("cpu_avg", DoubleType(), True), StructField("cpu_max", DoubleType(), True), StructField("mem_avg", DoubleType(), True), StructField("mem_max", DoubleType(), True) ])) @F.pandas_udf(returnType=Task.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP) def clean_tasks_of_workflow(df): tasks = dict() raw_id_to_instances = dict() job_name = df.loc[0, "job_name"] workflow_id = mmh3.hash64(job_name)[1] invalid_task_raw_ids = set() # group by task name # - count number of instances # - compare with row.instance_num # Check to inspect if the data is noisy # def check(pdf): # a = pdf["instance_name"].nunique() # b = pdf["instance_name"].astype(np.int64).min() # c = pdf["instance_name"].astype(np.int64).max() # d = pdf["instance_num"].min() # e = pdf["instance_num"].max() # f = pdf["instance_name"].count() # if d != e or b < 0 or c >= e or a != d or a != f: # print("Noisy data! {}, {}, {}, {}, {}, {}".format(a, b, c, d, e, f)) # # df.groupby("task_name").apply(check) for row in df.itertuples(index=False): if None in row: print(row, flush=True) task_name = row.task_name instance_name = str(row.instance_name) memory_requested = row.plan_mem resources_requested = row.plan_cpu resource_id = row.machine_id splits = task_name.split("_") if splits[0] == "task": cleaned_task_name = splits[1] task_type = "bag" raw_parents = [] else: cleaned_task_name = splits[0][1:] task_type = str(splits[0][0]) raw_parents = [x for x in splits[1:] if x.isdigit()] if resource_id is None: resource_id = -1 else: resource_id = mmh3.hash64(row.machine_id)[1] if row.end_time is None or math.isnan(row.end_time): invalid_task_raw_ids.add(cleaned_task_name) continue if row.start_time is None or math.isnan(row.start_time): invalid_task_raw_ids.add(cleaned_task_name) continue if memory_requested is None or math.isnan(memory_requested): memory_requested = -1 if resources_requested is None or math.isnan(resources_requested): avg_cpu = row.cpu_avg if avg_cpu is None or math.isnan(avg_cpu): invalid_task_raw_ids.add(cleaned_task_name) continue else: resources_requested = avg_cpu this_task_id = mmh3.hash64(job_name + "@" + cleaned_task_name + "@" + instance_name)[1] if cleaned_task_name not in raw_id_to_instances: raw_id_to_instances[cleaned_task_name] = row.instance_num if row.instance_num > 10: # Create parent and child tasks raw_parent_id = cleaned_task_name + "_p" parent_task_id = mmh3.hash64(job_name + "@" + raw_parent_id + "@" + "0")[1] if parent_task_id not in tasks: tasks[parent_task_id] = Task( id=parent_task_id, type="dummy", submission_site=0, runtime=0, ts_submit=row.start_time * 1000, # We convert time from seconds to milliseconds. 
resource_amount_requested=1, parents=raw_parents, workflow_id=workflow_id, wait_time=0, resource_type='core', resource=-1, memory_requested=-1) raw_id_to_instances[raw_parent_id] = 1 raw_child_id = cleaned_task_name + "_c" child_task_id = mmh3.hash64(job_name + "@" + raw_child_id + "@" + "0")[1] if child_task_id not in tasks: tasks[child_task_id] = Task( id=child_task_id, type="dummy", submission_site=0, runtime=0, ts_submit=row.start_time * 1000, # We convert time from seconds to milliseconds. resource_amount_requested=1, parents=[cleaned_task_name], workflow_id=workflow_id, wait_time=0, resource_type='core', resource=-1, memory_requested=-1, params="child") raw_id_to_instances[raw_child_id] = 1 raw_parents = [raw_parent_id] this_task = Task( id=this_task_id, type=task_type, submission_site=0, runtime=(row.end_time - row.start_time) * 1000, ts_submit=row.start_time * 1000, # We convert time from seconds to milliseconds. resource_amount_requested=resources_requested, parents=raw_parents, workflow_id=workflow_id, params=task_name + " $ " + instance_name + " $ " + str(row.instance_num) + " $ " + job_name, wait_time=0, resource_type='core', resource=resource_id, memory_requested=memory_requested) tasks[this_task_id] = this_task for task_id, task in tasks.items(): task.parents = [ p for p in task.parents if p not in invalid_task_raw_ids ] parents = [] for raw_parent_id in task.parents: # If the previous wave has a dummy child task and this task is not that child, # refer to the child instead of the wave. if raw_parent_id + "_c" in raw_id_to_instances and task.params != "child": raw_parent_id = raw_parent_id + "_c" # We might hit an edge case where a parent was not recorded by Alibaba's system # (e.g. a bug, or the tracing stopped) if raw_parent_id not in raw_id_to_instances: continue parent_instances = raw_id_to_instances[raw_parent_id] proper_parent_ids = [] for x in range(parent_instances): # Alibaba tasks specify an instance_num, but not every instance necessarily appears in the data, # so we check that each candidate parent instance was actually encountered. parent_hash = mmh3.hash64(job_name + "@" + raw_parent_id + "@" + str(x))[1] if parent_hash in tasks: proper_parent_ids.append(parent_hash) parents.extend(proper_parent_ids) for proper_id in proper_parent_ids: tasks[proper_id].children.add(task_id) # task.params = None task.parents = parents parquet_dicts = [task.get_parquet_dict() for task in tasks.values()] if len(tasks) > 0: ret = pd.DataFrame(parquet_dicts) else: # If no task was valid, return an empty DF with the columns set. Otherwise Spark cannot infer the schema and fails. ret = pd.DataFrame(columns=Task.get_parquet_meta_dict().keys()) return ret @F.pandas_udf(returnType=Task.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP) def container_to_task(df): row = df.iloc[0, :] start_time = df["time_stamp"].min() * 1000 stop_time = df["time_stamp"].max() * 1000 task_id = mmh3.hash64(row["container_id"])[1] workflow_id = mmh3.hash64(row["app_du"])[1] task = Task( id=task_id, type="long running", parents=[], ts_submit=start_time, # We convert time from seconds to milliseconds.
submission_site=0, runtime=(stop_time - start_time), resource_amount_requested=row["cpu_request"], memory_requested=row["mem_size"], workflow_id=workflow_id, wait_time=0, resource=mmh3.hash64(row["machine_id"])[1]) return pd.DataFrame([task.get_parquet_dict()]) if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())): # Rename instances # This allows instance names to be derived using just the task name and the number of instances of the task. task_window = Window.partitionBy("job_name", "task_name").orderBy("start_time") # Subtract 1 because row_number starts at 1; instance names then become 0-based indices. # We are using the instance name as an index within a particular job and task. instances_renamed = batch_instance.withColumn( "instance_name", (F.row_number().over(task_window) - F.lit(1)).cast(StringType())) tasks_unconverted = instances_renamed.join( batch_task.select("job_name", "task_name", "instance_num", "plan_cpu", "plan_mem"), on=["job_name", "task_name"], how="inner") tasks_converted = tasks_unconverted.groupby("job_name").apply( clean_tasks_of_workflow) long_running_tasks = container_meta.groupBy("container_id").apply( container_to_task) all_tasks = tasks_converted.union(long_running_tasks).dropna() try: all_tasks.printSchema() all_tasks.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite") except Exception as e: print(e, flush=True) raise @F.pandas_udf(returnType=TaskState.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP) def task_states_from_instances(df): task_states = [] workflow_id = mmh3.hash64(df.loc[0, "job_name"])[1] for index, row in df.iterrows(): job_name = row["job_name"] task_name = row["task_name"] instance_name = row["instance_name"] splits = task_name.split("_") just_task_name = splits[0][1:] # The first letter only encodes the nature of the task (e.g. map or reduce) # and has nothing to do with the structure of the workflow.
this_task_id = mmh3.hash64(job_name + "@" + just_task_name + "@" + instance_name)[1] this_task_state = TaskState(ts_start=row["start_time"] * 1000, ts_end=row["end_time"] * 1000, workflow_id=workflow_id, task_id=this_task_id, resource_id=mmh3.hash64( row["machine_id"])[1], cpu_rate=row["cpu_avg"], canonical_memory_usage=row["mem_avg"], maximum_cpu_rate=row["cpu_max"], maximum_memory_usage=row["mem_max"]) if None in this_task_state.get_parquet_dict().values() or np.isnan( this_task_state.get_parquet_dict().values()): print(this_task_state.get_parquet_dict()) raise RuntimeError(this_task_state.get_parquet_dict()) task_states.append(this_task_state.get_parquet_dict()) return pd.DataFrame(task_states) @F.pandas_udf(returnType=TaskState.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP) def task_states_from_container_usage(df): machine_id = mmh3.hash64(df.loc[0, "machine_id"])[1] def convert(cont_df): task_states = [] prev_end_time = cont_df.loc[0, "start_time"] * 1000 container_id = mmh3.hash64(cont_df.loc[0, "container_id"])[1] app_id = mmh3.hash64(cont_df.loc[0, "app_du"])[1] sorted_df = df.sort_values("time_stamp") for index, row in sorted_df.iterrows(): this_end_time = row["time_stamp"] * 1000 this_task_state = TaskState( ts_start=prev_end_time, ts_end=this_end_time, workflow_id=app_id, task_id=container_id, resource_id=machine_id, cpu_rate=row["cpu_util_percent"], canonical_memory_usage=row["mem_util_percent"], maximum_disk_bandwidth=row["disk_io_percent"], network_in=row["net_in"], network_out=row["net_out"]) prev_end_time = this_end_time task_states.append(this_task_state.get_parquet_dict()) if None in this_task_state.get_parquet_dict().values( ) or np.isnan(this_task_state.get_parquet_dict().values()): print(this_task_state.get_parquet_dict()) raise ArithmeticError(this_task_state.get_parquet_dict()) return pd.DataFrame(task_states) return df.groupby("container_id").apply(convert).reset_index( drop=True).fillna(-1) # Now, derive workflows from tasks @F.pandas_udf(returnType=Workflow.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP) def compute_workflow_stats(df): tasks = [] for index, row in df.iterrows(): this_task = Task( id=row["id"], type=row["type"], ts_submit=row["ts_submit"], # We convert time from seconds to milliseconds. submission_site=0, runtime=row["runtime"], resource_amount_requested=row["resource_amount_requested"], memory_requested=row["memory_requested"], parents=row["parents"], workflow_id=row["workflow_id"], wait_time=row["wait_time"], resource=row["resource_used"]) # print(this_task.get_parquet_dict()) tasks.append(this_task) workflow = Workflow(id=df.loc[0, "workflow_id"], ts_submit=df["ts_submit"].min(), tasks=tasks, scheduler_description="Fuxi", workflow_domain="Industrial", workflow_application_name="MapReduce", workflow_appliation_field="Internet Services") try: workflow.compute_critical_path() except toposort.CircularDependencyError: # TODO: Some have cyclic dependencies. 
Check if this is us, or the data (again) pass return pd.DataFrame([workflow.get_parquet_dict()]) if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())): tasks_df = spark.read.parquet( os.path.join(TARGET_DIR, Task.output_path()) ) # Spark doesn't understand it can now read from files, so tell him workflow_df = tasks_df.groupBy("workflow_id").apply( compute_workflow_stats) workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite", compression="snappy") def machine_meta_to_resources(row): resource = Resource( id=mmh3.hash64(row["machine_id"])[1], type="cpu", num_resources=float(row["cpu_num"]), memory=row["mem_size"], ) resource_dict = resource.get_json_dict() del resource_dict["events"] return SparkRow(**resource_dict) if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())): print("######\n Start parsing Resource DF\n ######") resource_df = machine_meta.rdd.map(machine_meta_to_resources).toDF( Resource.get_spark_type()) resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite", compression="snappy") print("######\n Start parsing Workload\n ######") if "tasks_df" not in locals(): tasks_df = spark.read.parquet( os.path.join(TARGET_DIR, Task.output_path()) ) # Spark doesn't understand it can now read from parquet files, so tell him json_dict = Workload.get_json_dict_from_spark_task_dataframe( tasks_df, domain="Industrial", authors=["Alibaba 2018"]) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open( os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) file.write(json.dumps(json_dict, default=default)) print("######\n Done parsing Workload\n ######")
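# The helpers below are an illustrative, standalone sketch and are not called by the
# Alibaba parser above. They restate the batch_task naming convention that
# clean_tasks_of_workflow assumes: a task name such as "M2_1_3" denotes task "2" of type
# "M" whose parents within the same job are tasks "1" and "3", while a name starting with
# "task_" is an independent bag of instances with no recorded dependencies. The names
# parse_alibaba_task_name and make_wta_task_id are hypothetical and chosen for the example.
import mmh3


def parse_alibaba_task_name(task_name):
    """Return (cleaned_task_name, task_type, raw_parents) for a batch_task name."""
    splits = task_name.split("_")
    if splits[0] == "task":
        # Independent task: no DAG structure is encoded in the name.
        return splits[1], "bag", []
    # The first character encodes the task type; the remaining characters are the task number.
    return splits[0][1:], splits[0][0], [x for x in splits[1:] if x.isdigit()]


def make_wta_task_id(job_name, cleaned_task_name, instance_name):
    """Derive the deterministic 64-bit task id used by the parser above."""
    return mmh3.hash64(job_name + "@" + cleaned_task_name + "@" + str(instance_name))[1]


# Examples:
#   parse_alibaba_task_name("M2_1_3")   -> ("2", "M", ["1", "3"])
#   parse_alibaba_task_name("task_ABC") -> ("ABC", "bag", [])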
def parse_askalon_file(askalon_file): os.makedirs(TARGET_DIR, exist_ok=True) workflow_index = 0 invalid_workflow_count = 0 workflow_start = None invalid_workflow = False final_task_list = [] final_taskstate_list = [] final_workflow_list = [] tasks = [] task_by_id = dict() task_state_list = [] with open(askalon_file, 'r') as asklon_trace: for line in asklon_trace.readlines(): if line.startswith('#'): if not line.startswith('# Started:'): continue workflow_date = re.search('# Started: (.+),', line).group(1) if int( workflow_date[-4:] ) == 1970: # filter out "# Started: Thu Jan 01 01:00:00 CET 1970, and did not finish (yet)" invalid_workflow = True continue if workflow_date.find('CEST ') >= 0: timezone_diff = 7200 workflow_date = workflow_date.replace('CEST ', '') elif workflow_date.find('CET') >= 0: timezone_diff = 3600 workflow_date = workflow_date.replace('CET ', '') else: raise Exception( 'Line "{}"" does not follow expected CEST or CET format' .format(line)) # Create a workflow based on observed tasks before starting a new. if tasks: # Since we cannot trust the logs (our validator proved this) # We will have to add both the children and parents manually to be safe. for t in tasks: for child_id in t.children: task_by_id[child_id].parents.add(t.id) for parent_id in t.parents: task_by_id[parent_id].children.add(t.id) workflow = get_workflow_from_tasks(tasks, workflow_start, askalon_file, workflow_index) final_workflow_list.append(workflow) final_task_list.extend(tasks) final_taskstate_list.extend(task_state_list) workflow_index += 1 tasks = [] task_state_list = [] workflow_start = int( ((datetime.strptime(workflow_date, DATETIME_FORMAT) - EPOCH).total_seconds() - timezone_diff) * 1000) invalid_workflow = False # new workflow begins, reset flag else: if invalid_workflow: # skip reading tasks, advance to the next workflow # print("Found invalid workflow, skipping") invalid_workflow_count += 1 continue task, task_state = task_info_from_line(line, workflow_start, workflow_index) if task: if task.runtime < 0: invalid_workflow = True invalid_workflow_count += 1 tasks = [] task_state_list = [] continue tasks.append(task) task_by_id[task.id] = task task_state_list.append(task_state) else: invalid_workflow = True invalid_workflow_count += 1 tasks = [] task_state_list = [] print(workflow_index, invalid_workflow_count) # Flush the last workflow, if any. if tasks and not invalid_workflow: # Since we cannot trust the logs (our validator proved this) # We will have to add both the children and parents manually to be safe. 
for t in tasks: for child_id in t.children: task_by_id[child_id].parents.add(t.id) for parent_id in t.parents: task_by_id[parent_id].children.add(t.id) final_task_list.extend(tasks) final_taskstate_list.extend(task_state_list) workflow = get_workflow_from_tasks(tasks, workflow_start, askalon_file, workflow_index) final_workflow_list.append(workflow) workflow_index += 1 task_df = pd.DataFrame([t.get_parquet_dict() for t in final_task_list]) # Offset the first task arriving at 0 (and thus the thus the first workflow) min_ts_submit = task_df["ts_submit"].min() task_df["ts_submit"] = task_df["ts_submit"] - min_ts_submit os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True) task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"), engine="pyarrow") # Write task states task_state_df = pd.DataFrame( [ts.get_parquet_dict() for ts in final_taskstate_list]) # offset the times task_state_df["ts_start"] = task_state_df["ts_start"] - min_ts_submit task_state_df["ts_end"] = task_state_df["ts_end"] - min_ts_submit os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True) task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(), "part.0.parquet"), engine="pyarrow") # Write the workflow dataframe workflow_df = pd.DataFrame( [w.get_parquet_dict() for w in final_workflow_list]) # Also offset workflow_df workflow_df["ts_submit"] = workflow_df["ts_submit"] - min_ts_submit os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True) workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"), engine="pyarrow") workload_description = "" if "bwa" in askalon_file.lower(): workload_description = "BWA (short for Burroughs-Wheeler Alignment tool) is a genomics analysis workflow, courtesy of Scott Emrich and Notre Dame Bioinformatics Laboratory. It maps low-divergent sequences against a large reference genome, such as the human genome." elif "wien2k" in askalon_file.lower(): workload_description = "Wien2k uses a full-potential Linearized Augmented Plane Wave (LAPW) approach for the computation of crystalline solids." # Write a json dict with the workload properties json_dict = Workload.get_json_dict_from_pandas_task_dataframe( task_df, domain="Engineering", authors=["Radu Prodan", "Alexandru Iosup"], workload_description=workload_description) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open( os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) file.write(json.dumps(json_dict, default=default))
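# Illustrative sketch, not called by parse_askalon_file above: how a "# Started: ..."
# header is converted to epoch milliseconds. The CEST/CET marker is stripped, the fixed
# offset (7200 s for CEST, 3600 s for CET) is subtracted, and the result is scaled to
# milliseconds. The format string and epoch below are assumptions made for this example;
# the module-level constants DATETIME_FORMAT and EPOCH used above may differ.
from datetime import datetime

_EXAMPLE_DATETIME_FORMAT = "%a %b %d %H:%M:%S %Y"  # e.g. "Thu Jun 07 10:15:30 2012"
_EXAMPLE_EPOCH = datetime(1970, 1, 1)


def askalon_start_to_millis(workflow_date):
    """Convert e.g. 'Thu Jun 07 10:15:30 CEST 2012' to UTC epoch milliseconds."""
    if "CEST " in workflow_date:
        timezone_diff = 7200
        workflow_date = workflow_date.replace("CEST ", "")
    elif "CET " in workflow_date:
        timezone_diff = 3600
        workflow_date = workflow_date.replace("CET ", "")
    else:
        raise ValueError("Expected a CEST or CET timestamp: {}".format(workflow_date))
    seconds = (datetime.strptime(workflow_date, _EXAMPLE_DATETIME_FORMAT)
               - _EXAMPLE_EPOCH).total_seconds()
    return int((seconds - timezone_diff) * 1000)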
def parse(path_to_dir): if 'DAS5' in os.environ: # If we want to execute it on the DAS-5 super computer print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster")) spark = SparkSession.builder \ .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \ .appName("WTA parser") \ .config("spark.executor.memory", "28G") \ .config("spark.executor.cores", "8") \ .config("spark.executor.instances", "10") \ .config("spark.driver.memory", "40G") \ .getOrCreate() else: findspark.init(spark_home="<path_to_spark>") spark = SparkSession.builder \ .master("local[8]") \ .appName("WTA parser") \ .config("spark.executor.memory", "20G") \ .config("spark.driver.memory", "8G") \ .getOrCreate() # Convert times which are in microseconds and do not fit in a long to milliseconds convert_micro_to_milliseconds = F.udf(lambda x: x / 1000) if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())): print("######\n Start parsing TaskState\n ######") task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load( os.path.join(path_to_dir, 'task_usage', '*.csv')) # task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load( # 'fake_task_usage.csv') oldColumns = task_usage_df.schema.names newColumns = ["ts_start", "ts_end", "workflow_id", "id", "resource_id", "cpu_rate", "memory_consumption", "assigned_memory_usage", "unmapped_page_cache", "total_page_cache", "max_memory_usage", "mean_disk_io_time", "mean_local_disk_space_usage", "max_cpu_rate", "max_disk_io_time", "cycles_per_instruction", "memory_accesses_per_instruction", "sample_portion", "aggregation_type", "sampled_cpu_usage", ] task_usage_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), task_usage_df) # Drop columns with too low level details task_usage_df = task_usage_df.drop('memory_accesses_per_instruction') task_usage_df = task_usage_df.drop('cycles_per_instruction') task_usage_df = task_usage_df.drop('unmapped_page_cache') task_usage_df = task_usage_df.drop('total_page_cache') # Conver the timestamps from micro to milliseconds and cast them to long. task_usage_df = task_usage_df.withColumn('ts_start', convert_micro_to_milliseconds(F.col('ts_start'))) task_usage_df = task_usage_df.withColumn('ts_start', F.col('ts_start').cast(T.LongType())) task_usage_df = task_usage_df.withColumn('ts_end', convert_micro_to_milliseconds(F.col('ts_end'))) task_usage_df = task_usage_df.withColumn('ts_end', F.col('ts_end').cast(T.LongType())) # Some fields have weird symbols in them, clean those. 
truncate_at_lt_symbol_udf = F.udf(lambda x: re.sub('[^0-9.eE\-+]', '', str(x)) if x is not None else x) task_usage_df = task_usage_df.withColumn('workflow_id', truncate_at_lt_symbol_udf(F.col('workflow_id'))) task_usage_df = task_usage_df.withColumn('max_cpu_rate', truncate_at_lt_symbol_udf(F.col('max_cpu_rate'))) # Now that the columns have been sanitized, cast them to the right type task_usage_df = task_usage_df.withColumn('workflow_id', F.col('workflow_id').cast(T.LongType())) task_usage_df = task_usage_df.withColumn('max_cpu_rate', F.col('max_cpu_rate').cast(T.FloatType())) task_usage_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite", compression="snappy") print("######\n Done parsing TaskState\n ######") if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())): if 'task_usage_df' not in locals(): task_usage_df = spark.read.parquet(os.path.join(TARGET_DIR, TaskState.output_path())) print("######\n Start parsing Tasks\n ######") task_df = spark.read.format('com.databricks.spark.csv').options(inferschema="true", mode="FAILFAST", parserLib="univocity").load( os.path.join(path_to_dir, 'task_events', '*.csv')) oldColumns = task_df.schema.names newColumns = ["ts_submit", "missing_info", "workflow_id", "id", "resource_id", "event_type", "user_id", "scheduler", "nfrs", "resources_requested", "memory_requested", "disk_space_request", "machine_restrictions", ] task_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), task_df) task_df = task_df.withColumn('ts_submit', convert_micro_to_milliseconds(F.col('ts_submit'))) task_df = task_df.withColumn('ts_submit', F.col('ts_submit').cast(T.LongType())) # Filter tasks that never reached completion task_df.createOrReplaceTempView("task_table") task_df = spark.sql("""WITH filtered_tasks AS ( SELECT DISTINCT t1.workflow_id AS workflow_id, t1.id AS id FROM task_table t1 WHERE t1.event_type IN(0, 1, 4) group by t1.workflow_id, t1.id having count(distinct event_type) = 3 ) SELECT t.* FROM task_table t INNER JOIN filtered_tasks f ON t.id = f.id AND t.workflow_id = f.workflow_id""") task_aggregation_structtype = T.StructType([ T.StructField("workflow_id", T.LongType(), True), T.StructField("id", T.LongType(), True), T.StructField("type", T.StringType(), True), T.StructField("ts_submit", T.LongType(), True), T.StructField("submission_site", T.LongType(), True), T.StructField("runtime", T.LongType(), True), T.StructField("resource_type", T.StringType(), True), T.StructField("resource_amount_requested", T.DoubleType(), True), T.StructField("parents", T.ArrayType(T.LongType()), True), T.StructField("children", T.ArrayType(T.LongType()), True), T.StructField("user_id", T.LongType(), True), T.StructField("group_id", T.LongType(), True), T.StructField("nfrs", T.StringType(), True), T.StructField("wait_time", T.LongType(), True), T.StructField("params", T.StringType(), True), T.StructField("memory_requested", T.DoubleType(), True), T.StructField("network_io_time", T.DoubleType(), True), T.StructField("disk_space_requested", T.DoubleType(), True), T.StructField("energy_consumption", T.DoubleType(), True), T.StructField("resource_used", T.StringType(), True), ]) # Compute based on the event type @F.pandas_udf(returnType=task_aggregation_structtype, functionType=F.PandasUDFType.GROUPED_MAP) def compute_aggregated_task_usage_metrics(df): def get_first_non_value_in_column(column_name): s = df[column_name] idx = s.first_valid_index() return s.loc[idx] if idx is not None 
else None task_workflow_id = get_first_non_value_in_column("workflow_id") task_id = get_first_non_value_in_column("id") task_submit_time = df[df['event_type'] == 0]['ts_submit'].min(skipna=True) task_start_time = df[df['event_type'] == 1]['ts_submit'].min(skipna=True) task_finish_time = df[df['event_type'] == 4]['ts_submit'].max(skipna=True) if None in [task_start_time, task_submit_time, task_finish_time]: return None task_resource_request = df['resources_requested'].max(skipna=True) task_memory_request = df['memory_requested'].max(skipna=True) task_priority = df['nfrs'].max(skipna=True) task_disk_space_requested = df['disk_space_request'].max(skipna=True) task_machine_id_list = df.resource_id.unique() task_waittime = int(task_start_time) - int(task_submit_time) task_runtime = int(task_finish_time) - int(task_start_time) def default(o): if isinstance(o, np.int64): return int(o) data_dict = { "workflow_id": task_workflow_id, "id": task_id, "type": "", # Unknown "ts_submit": task_submit_time, "submission_site": -1, # Unknown "runtime": task_runtime, "resource_type": "core", # Fields are called CPU, but they are core count (see Google documentation) "resource_amount_requested": task_resource_request, "parents": [], "children": [], "user_id": mmh3.hash64(get_first_non_value_in_column("user_id"))[0], "group_id": -1, "nfrs": json.dumps({"priority": task_priority}, default=default), "wait_time": task_waittime, "params": "{}", "memory_requested": task_memory_request, "network_io_time": -1, # Unknown "disk_space_requested": task_disk_space_requested, "energy_consumption": -1, # Unknown "resource_used": json.dumps(task_machine_id_list, default=default), } return pd.DataFrame(data_dict, index=[0]) task_df = task_df.groupBy(["workflow_id", "id"]).apply(compute_aggregated_task_usage_metrics) task_df.explain(True) # Now add disk IO time - This cannot be done in the previous Pandas UDF function as # accessing another dataframe in the apply function is not allowed disk_io_structtype = T.StructType([ T.StructField("workflow_id", T.LongType(), True), T.StructField("id", T.LongType(), True), T.StructField("disk_io_time", T.DoubleType(), True), ]) @F.pandas_udf(returnType=disk_io_structtype, functionType=F.PandasUDFType.GROUPED_MAP) def compute_disk_io_time(df): def get_first_non_value_in_column(column_name): s = df[column_name] idx = s.first_valid_index() return s.loc[idx] if idx is not None else None task_workflow_id = get_first_non_value_in_column("workflow_id") task_id = get_first_non_value_in_column("id") disk_io_time = ((df['ts_end'] - df['ts_start']) * df['mean_disk_io_time']).sum(skipna=True) / 1000 data_dict = { "workflow_id": task_workflow_id, "id": task_id, "disk_io_time": disk_io_time } return pd.DataFrame(data_dict, index=[0]) disk_io_df = task_usage_df.select(['workflow_id', 'id', 'mean_disk_io_time', 'ts_end', 'ts_start']).groupBy( ["workflow_id", "id"]).apply(compute_disk_io_time) disk_io_df.explain(True) join_condition = (task_df.workflow_id == disk_io_df.workflow_id) & (task_df.id == disk_io_df.id) task_df = task_df.join(disk_io_df, ["workflow_id", "id"]) task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy") print("######\n Done parsing Tasks\n ######") else: task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path())) if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())): print("######\n Start parsing Resource\n ######") # Parse the machine information in the traces, these should match with the 
resource_ids in task_usage resources_structtype = T.StructType([ # Using StringTypes as we drop those columns T.StructField("time", T.StringType(), False), T.StructField("id", T.LongType(), False), T.StructField("attribute_name", T.StringType(), False), T.StructField("attribute_value", T.StringType(), False), T.StructField("attribute_deleted", T.StringType(), False), ]) resource_df = spark.read.format('com.databricks.spark.csv').schema(resources_structtype).options( mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_attributes', '*.csv')) resource_df = resource_df.select(["id"]) # Only keep the ID, the rest we do not need. # Since the information in the traces is completely opaque, we use the educated guess from Amvrosiadis et al. # in their ATC 2018 article. resource_df = resource_df.withColumn('type', F.lit("core")) resource_df = resource_df.withColumn('num_resources', F.lit(8)) resource_df = resource_df.withColumn('proc_model', F.lit("AMD Opteron Barcelona")) resource_df = resource_df.withColumn('memory', F.lit(-1)) resource_df = resource_df.withColumn('disk_space', F.lit(-1)) resource_df = resource_df.withColumn('network', F.lit(-1)) resource_df = resource_df.withColumn('os', F.lit("")) resource_df = resource_df.withColumn('details', F.lit("{}")) # Write the resource_df to the specified location resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite", compression="snappy") print("######\n Done parsing Resource\n ######") if not os.path.exists(os.path.join(TARGET_DIR, ResourceState.output_path())): print("######\n Start parsing ResourceState\n ######") resource_events_structtype = T.StructType([ T.StructField("timestamp", T.DecimalType(20, 0), False), T.StructField("machine_id", T.LongType(), False), T.StructField("event_type", T.IntegerType(), False), T.StructField("platform_id", T.StringType(), False), T.StructField("available_resources", T.FloatType(), False), T.StructField("available_memory", T.FloatType(), False), ]) resource_event_df = spark.read.format('com.databricks.spark.csv').schema(resource_events_structtype).options( mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_events', '*.csv')) resource_event_df = resource_event_df.withColumn('timestamp', convert_micro_to_milliseconds(F.col('timestamp'))) resource_event_df = resource_event_df.withColumn('timestamp', F.col('timestamp').cast(T.LongType())) resource_event_df = resource_event_df.withColumn('available_disk_space', F.lit(-1)) resource_event_df = resource_event_df.withColumn('available_disk_io_bandwidth', F.lit(-1)) resource_event_df = resource_event_df.withColumn('available_network_bandwidth', F.lit(-1)) resource_event_df = resource_event_df.withColumn('average_load_1_minute', F.lit(-1)) resource_event_df = resource_event_df.withColumn('average_load_5_minute', F.lit(-1)) resource_event_df = resource_event_df.withColumn('average_load_15_minute', F.lit(-1)) # Write the resource_df to the specified location resource_event_df.write.parquet(os.path.join(TARGET_DIR, ResourceState.output_path()), mode="overwrite", compression="snappy") print("######\n Done parsing ResourceState\n ######") if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())): print("######\n Start parsing Workflows\n ######") workflow_structype = T.StructType([ T.StructField("id", T.LongType(), False), T.StructField("ts_submit", T.LongType(), False), T.StructField("task_count", T.IntegerType(), False), T.StructField("critical_path_length", T.LongType(), False), 
T.StructField("critical_path_task_count", T.IntegerType(), False), T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False), T.StructField("nfrs", T.StringType(), False), T.StructField("scheduler", T.StringType(), False), T.StructField("total_resources", T.DoubleType(), False), T.StructField("total_memory_usage", T.DoubleType(), False), T.StructField("total_network_usage", T.LongType(), False), T.StructField("total_disk_space_usage", T.LongType(), False), T.StructField("total_energy_consumption", T.LongType(), False), ]) @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP) def compute_workflow_stats(df): id = df['workflow_id'].iloc[0] ts_submit = df['ts_submit'].min() task_count = len(df) critical_path_length = -1 # We do not know the task dependencies, so -1 critical_path_task_count = -1 approx_max_concurrent_tasks = -1 nfrs = "{}" scheduler = "" total_resources = df['resource_amount_requested'].sum() # TODO or assigned? total_memory_usage = df['memory_requested'].sum() # TODO or consumption, or assigned? total_network_usage = -1 total_disk_space_usage = -1 total_energy_consumption = -1 data_dict = { "id": id, "ts_submit": ts_submit, 'task_count': task_count, 'critical_path_length': critical_path_length, 'critical_path_task_count': critical_path_task_count, 'approx_max_concurrent_tasks': approx_max_concurrent_tasks, 'nfrs': nfrs, 'scheduler': scheduler, 'total_resources': total_resources, 'total_memory_usage': total_memory_usage, 'total_network_usage': total_network_usage, 'total_disk_space_usage': total_disk_space_usage, 'total_energy_consumption': total_energy_consumption } return pd.DataFrame(data_dict, index=[0]) # Create and write the workflow dataframe workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats) workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite", compression="snappy") print("######\n Done parsing Workflows\n ######") print("######\n Start parsing Workload\n ######") json_dict = Workload.get_json_dict_from_spark_task_dataframe(task_df, domain="Industrial", start_date="2011-05-01", end_date="2011-05-30", authors=["Google"]) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) file.write(json.dumps(json_dict, default=default)) print("######\n Done parsing Workload\n ######")