def machine_meta_to_resources(row):
    resource = Resource(
        id=mmh3.hash64(row["machine_id"])[1],
        type="cpu",
        num_resources=float(row["cpu_num"]),
        memory=row["mem_size"],
    )
    resource_dict = resource.get_json_dict()
    del resource_dict["events"]
    return SparkRow(**resource_dict)
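# Usage sketch (not part of the original snippet): the row mapper above is applied to the
# machine_meta DataFrame via Spark, mirroring the Alibaba parser further below; the
# `machine_meta` variable and the output path here are assumptions for illustration only.
resource_df = machine_meta.rdd.map(machine_meta_to_resources).toDF(Resource.get_spark_type())
resource_df.write.parquet("resources.parquet", mode="overwrite", compression="snappy")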
def generateMap(self, size, resources, spots, repartition, totalRes, delta):
    totalSize = size[0] * size[1]
    cases = list(range(0, totalSize))
    # Reserve the HQ and transmitter tiles. Tiles are linearized row-major
    # (index = x * size[1] + y), matching the coordinate decoding below.
    hqX, hqY = settings.DEFAULT_HQ_POS
    cases.remove(hqX * size[1] + hqY)
    hqX, hqY = settings.DEFAULT_TRANSMITTER_POS
    cases.remove(hqX * size[1] + hqY)
    numpy.random.seed(self._seed)
    resList = []
    for i, res in enumerate(resources):
        # Compute the number of spots for this resource type
        spotNumber = self.getRandomDelta(totalSize * spots[i], delta)
        # Spread the total amount uniformly over the spots
        amountBySpot = self.getRandomDelta(totalRes * repartition[i], delta) / spotNumber
        print(amountBySpot)
        for j in range(0, spotNumber):
            # Pick a free tile; cast to int so list.remove() and the arithmetic below
            # operate on a plain scalar instead of a 1-element numpy array.
            position = int(numpy.random.choice(cases))
            cases.remove(position)
            resList.append(
                Resource((position // size[1] + settings.BORDER_TILES_NUM,
                          position % size[1] + settings.BORDER_TILES_NUM),
                         (1, 1), res, amountBySpot))
    return resList
def parse_resources(c):
    resources = {}
    c.execute("SELECT * FROM host")
    for host_id, wf_id, site, hostname, ip, uname, total_memory in c.fetchall():
        details = {"ip": ip, "hostname": hostname}
        resources[host_id] = Resource(host_id, site, -1, -1, total_memory, -1, -1, uname, details)
    return resources
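# Usage sketch (not part of the original snippet): parse_resources expects a DB cursor over a
# schema containing a "host" table, e.g. the SQLite database produced by Pegasus monitord;
# the database file name below is an assumption for illustration only.
import sqlite3

conn = sqlite3.connect("workflow-run/monitord.db")
resources_by_host_id = parse_resources(conn.cursor())
conn.close()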
def __init__(self, **attributes):
    for attr_name, attr_value in attributes.items():
        setattr(self, attr_name, attr_value)
    # self.nb_machine = nb_machine
    # self.nb_jobs = nb_jobs
    # self.problem = problem
    self.resource_list = []
    self.jobs_list = []
    # Create Resources
    for i in range(self.nb_machine):
        self.resource_list.append(Resource(i))
    # Create jobs
    for i in range(self.nb_jobs):
        self.jobs_list.append(Job(i, self.problem[i], self.resource_list))
def __init__(self, nom):
    self.data, self.optimum = loader(name=nom)
    self.nb_machine = self.data['nb_machine']
    self.nb_jobs = self.data['nb_jobs']
    self.problem = self.data['problem']
    self.nom = nom
    self.resource_list = []
    self.jobs_list = []
    self.makeSpan = -1
    self.criticalPath = []
    self.state = "Not Solved"
    # Create Resources
    for i in range(self.nb_machine):
        self.resource_list.append(Resource(i))
    # Create jobs
    for i in range(self.nb_jobs):
        self.jobs_list.append(Job(i, self.problem[i], self.resource_list, self))
def parse_and_return_task_dataframe(file_path):
    global TARGET_DIR
    with open(file_path) as trace:
        json_data = json.load(trace)

    workflow = json_data['workflow']
    tasks = workflow['jobs']
    machines = workflow['machines']
    date = json_data['createdAt']

    # Convert the submit time from a datetime string to milliseconds since the epoch
    task_date = dateparser.parse(date)
    EPOCH = datetime(1970, 1, 1, tzinfo=task_date.tzinfo)
    ts_submit = int((task_date - EPOCH).total_seconds() * 1000)

    resource_by_id = dict()
    for machine in machines:
        machine_id = mmh3.hash64("machine:{0}".format(machine['machine_code'].strip()))[0]
        machine = machine["machine"]
        num_cpus = machine['cpu']['count']
        details = {
            "cpu_vendor": machine['cpu']['vendor'],
            "architecture": machine['architecture']
        }
        memory_in_gb = int(machine['memory']) / float(1024 * 1024)
        res = Resource(machine_id, "cluster_node", num_cpus, machine['release'], memory_in_gb, -1, -1,
                       machine['system'], details)
        resource_by_id[machine_id] = res

    task_list = []
    task_state_list = []
    inputs_per_taskid = dict()
    outputs_per_taskid = dict()
    outputs_matched = dict()
    task_per_taskid = dict()
    input_file_data_per_task_id = dict()
    output_file_data_per_task_id = dict()

    for task in tasks:
        task_id = mmh3.hash64("task:{}".format(str(task['name']).strip()))[0]
        print(task_id)
        task_files = task['files'] if 'files' in task else []
        task_type = task['type']
        task_cores = task['cores'] if 'cores' in task else 1
        task_memory = task['memory'] if 'memory' in task else -1
        task_runtime = task['runtime'] * 1000 if 'runtime' in task else -1
        task_dependencies = [mmh3.hash64("task:{}".format(str(p).strip()))[0] for p in task['parents']]
        task_parameters = {"arguments": task['arguments']} if 'arguments' in task else {}
        task_machine = mmh3.hash64("machine:{0}".format(task['machine'].strip()))[0] if 'machine' in task else None
        task_resource = resource_by_id[task_machine].id if 'machine' in task else -1
        # Convert energy to Wh from KWh
        task_total_energy_consumption = float(task['energy']) * 1000 if 'energy' in task else -1

        t = Task(task_id, task_type, ts_submit, -1, task_runtime, task_cores, task_dependencies, 0, -1,
                 params=task_parameters, resource=task_resource,
                 energy_consumption=task_total_energy_consumption, resource_type="core")
        task_per_taskid[task_id] = t
        task_list.append(t)

        # Parse the data transfers
        for file_item in task_files:
            # Apparently not all traces were parsed into version 0.2 despite them being in the
            # folders for 0.2. To this end we need a check for the file name and size fields.
            file_name = file_item['name'] if 'name' in file_item else file_item['fileId']
            file_size = file_item['size'] if 'size' in file_item else -1

            # Store the incoming and outgoing data of this task in separate dicts
            if file_item['link'] == "input":
                if task_id not in inputs_per_taskid:
                    inputs_per_taskid[task_id] = set()
                    input_file_data_per_task_id[task_id] = dict()

                inputs_per_taskid[task_id].add(file_name)
                try:
                    input_file_data_per_task_id[task_id][file_name] = file_size
                except:
                    print(file_item)
                    exit(-1)
            elif file_item['link'] == "output":
                if task_id not in outputs_per_taskid:
                    outputs_per_taskid[task_id] = set()
                    outputs_matched[task_id] = dict()
                    output_file_data_per_task_id[task_id] = dict()

                outputs_per_taskid[task_id].add(file_name)
                outputs_matched[task_id][file_name] = False
                output_file_data_per_task_id[task_id][file_name] = file_size

        # Create a task state for the entire duration of the task
        task_state = TaskState(ts_submit, ts_submit + task_runtime, 0, task_id, -1,
                               canonical_memory_usage=task_memory)
        task_state_list.append(task_state)

    # Make sure the earliest task starts at 0.
    min_ts_submit = min(task.ts_submit for task in task_list)
    for task in task_list:
        task.ts_submit -= min_ts_submit  # Update the time
        for parent in task.parents:  # Also, since we have all parent info, set the children in the same loop
            task_per_taskid[parent].children.add(task.id)

    # Offset task states too
    for taskstate in task_state_list:
        taskstate.ts_start -= min_ts_submit
        taskstate.ts_end -= min_ts_submit

    data_transfer_id = 0
    # Since tasks can output files with the same name as other tasks, we must loop over a task's parents
    # and match the output names against input names.
    for task in task_list:  # For every task we have
        if task.id not in inputs_per_taskid:
            continue
        inputs = inputs_per_taskid[task.id]

        # We loop over the parents (no need to check children, they will come later)
        for dep in task.parents:
            outputs = outputs_per_taskid[dep] if dep in outputs_per_taskid else set()
            overlap = inputs.intersection(outputs)  # Check for overlap

            if len(overlap) > 0:  # We have input-output pairs, loop to construct data transfers
                for file_name in overlap:
                    # Get the size and construct a datatransfer object.
                    data_size = output_file_data_per_task_id[dep][file_name]
                    datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id, data_size)

                    # Assign it to both tasks
                    task_per_taskid[dep].datatransfers.append(datatransfer)
                    task.datatransfers.append(datatransfer)
                    outputs_matched[dep][file_name] = True

                    # Remove the file from the input as it's covered. Do NOT remove it from the output;
                    # the same output file may be used by another task (fan-out structure).
                    inputs.remove(file_name)
                    data_transfer_id += 1

        # Loop over the remaining input files. Since we do not have a source, we assume they are present
        # on the filesystem beforehand.
        for file_name in inputs:
            data_size = input_file_data_per_task_id[task.id][file_name]
            datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
            task.datatransfers.append(datatransfer)
            data_transfer_id += 1

    # Loop over the outputs and create a datatransfer for those that are not matched yet.
    # These are likely files with final results, not having a destination.
    for task_id in outputs_matched.keys():
        for file_name in outputs_matched[task_id].keys():
            if not outputs_matched[task_id][file_name]:
                task = task_per_taskid[task_id]
                data_size = output_file_data_per_task_id[task_id][file_name]
                datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                task.datatransfers.append(datatransfer)

    filename_for_this_partition = "part.0.parquet"

    # Write all tasks to parquet
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])
    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), filename_for_this_partition), engine="pyarrow")

    # Write all task states to parquet
    os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True)
    task_state_df = pd.DataFrame([task_state.get_parquet_dict() for task_state in task_state_list])
    task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(), filename_for_this_partition),
                             engine="pyarrow")

    # Write all data transfers to parquet
    if any(len(task.datatransfers) for task in task_list):
        os.makedirs(os.path.join(TARGET_DIR, Datatransfer.output_path()), exist_ok=True)
        datatransfer_df = pd.DataFrame(
            [datatransfer.get_parquet_dict() for task_item in task_list for datatransfer in task_item.datatransfers])
        datatransfer_df.to_parquet(
            os.path.join(TARGET_DIR, Datatransfer.output_path(), filename_for_this_partition), engine="pyarrow")

    # Write the workflows to parquet
    wf_agnostic_df = compute_characteristics(task_df)
    workflow_ts_submit = task_df["ts_submit"].min()

    # Determine the application name and field
    application_names = {
        "epigenomics": ("Epigenomics", "Bioinformatics"),
        "montage": ("Montage", "Astronomy"),
        "soykb": ("SoyKB", "Bioinformatics"),
    }

    application_name = ""
    application_field = ""
    for key in application_names.keys():
        if key in file_path:
            application_name = application_names[key][0]
            application_field = application_names[key][1]

    workflow = Workflow(0, workflow_ts_submit, task_list, "Pegasus", "Scientific", application_name,
                        application_field)
    workflow.compute_critical_path()

    wf_df = pd.DataFrame([workflow.get_parquet_dict()])

    return wf_df
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1])

    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path to spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        print("######\nStart parsing Tasks\n######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            header='true', inferschema='true').load(os.path.join(path_to_dir, '*.csv.processed'))

        # Drop the pref table to save memory, and filter out unsuccessful jobs as their information is not reliable
        task_df = task_df.drop('pref').filter(task_df.status == ":instance.status/success").drop('status').cache()

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def sub_two_datetimes(s1, s2):
            arr = []
            for i in s1.keys():
                d1 = datetime.datetime.strptime(s1[i], '%a %b %d %H:%M:%S %Z %Y')
                d2 = datetime.datetime.strptime(s2[i], '%a %b %d %H:%M:%S %Z %Y')
                arr.append(int((d2 - d1).total_seconds() * 1000))
            return pd.Series(arr)

        task_df = task_df \
            .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \
            .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time')))

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def date_time_to_unix(series):
            arr = []
            epoch = datetime.datetime.utcfromtimestamp(0)
            for i in series.keys():
                arr.append(np.int64(
                    (datetime.datetime.strptime(series[i], '%a %b %d %H:%M:%S %Z %Y')
                     - epoch).total_seconds() * 1000))
            return pd.Series(arr)

        task_df = task_df.withColumn('submit-time', date_time_to_unix(F.col('submit-time'))) \
            .withColumnRenamed('submit-time', "ts_submit") \
            .drop('start-time').drop('end-time').cache()

        min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0]
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit') - F.lit(min_ts))

        @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR)
        def convert_to_kb(v):
            return v * 1024

        task_df = task_df.withColumn('memory', convert_to_kb(task_df.memory)) \
            .withColumnRenamed("memory", "memory_consumption")

        @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
        def string_to_int(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash(v[i], signed=True))
            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def string_to_long(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash64(v[i], signed=True)[0])
            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    # Assign a UUID, collision chance is negligible.
                    arr.append(mmh3.hash64(uuid4().bytes, signed=True)[0])
            return pd.Series(arr)

        task_df = task_df.withColumn('user', string_to_int(task_df.user)).withColumnRenamed("user", "user_id")
        task_df = task_df.withColumn('job-uuid', string_to_long(F.col('job-uuid'))) \
            .withColumnRenamed('job-uuid', 'task_id')

        type_udf = F.udf(lambda x: "Independent" if x is None else "Composite", T.StringType())
        task_df = task_df.withColumn('type', type_udf(task_df.simset))

        task_df = task_df.withColumn('simset', assign_workflow_ids(F.col('simset'))) \
            .withColumnRenamed('simset', "workflow_id")
        task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested')
        task_df = task_df.withColumnRenamed('instance', 'resource_used')

        # Set the static items that are not present in the trace
        task_df = task_df.withColumn('submission_site', F.lit(0))
        task_df = task_df.withColumn('parents', F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('children', F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('group_id', F.lit(0))
        task_df = task_df.withColumn('nfrs', F.lit("{}"))
        task_df = task_df.withColumn('params', F.lit("{}"))
        task_df = task_df.withColumn('memory_requested', F.lit(-1))
        task_df = task_df.withColumn('network_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_space_requested', F.lit(-1))
        task_df = task_df.withColumn('energy_consumption', F.lit(-1))

        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy")
        print("######\nDone parsing Tasks\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\nStart parsing TaskState\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

        task_state_structtype = T.StructType([
            T.StructField("ts_start", T.LongType(), False),
            T.StructField("ts_end", T.LongType(), False),
            T.StructField("workflow_id", T.LongType(), False),
            T.StructField("task_id", T.LongType(), False),
            T.StructField("resource_id", T.LongType(), False),
            T.StructField("cpu_rate", T.DoubleType(), False),
            T.StructField("canonical_memory_usage", T.DoubleType(), False),
            T.StructField("assigned_memory", T.DoubleType(), False),
            T.StructField("minimum_memory_usage", T.DoubleType(), False),
            T.StructField("maximum_memory_usage", T.DoubleType(), False),
            T.StructField("disk_io_time", T.DoubleType(), False),
            T.StructField("maximum_disk_bandwidth", T.DoubleType(), False),
            T.StructField("local_disk_space_usage", T.DoubleType(), False),
            T.StructField("maximum_cpu_rate", T.DoubleType(), False),
            T.StructField("maximum_disk_io_time", T.DoubleType(), False),
            T.StructField("sample_rate", T.DoubleType(), False),
            T.StructField("sample_portion", T.DoubleType(), False),
            T.StructField("sampled_cpu_usage", T.DoubleType(), False),
            T.StructField("network_io_time", T.DoubleType(), False),
            T.StructField("maximum_network_bandwidth", T.DoubleType(), False),
        ])

        @F.pandas_udf(returnType=task_state_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_task_states(df):
            workflow_id = df['workflow_id'].iloc[0]
            task_id = df['task_id'].iloc[0]
            ts_start = df['ts_submit'].min()
            ts_end = ts_start + df['runtime'].max()
            resource_id = df['resource_used'].iloc[0]
            cpu_rate = -1
            canonical_memory_usage = df['memory_consumption'].mean()
            assigned_memory = -1
            minimum_memory_usage = df['memory_consumption'].min()
            maximum_memory_usage = df['memory_consumption'].max()
            disk_io_time = -1
            maximum_disk_bandwidth = -1
            local_disk_space_usage = -1
            maximum_cpu_rate = -1
            maximum_disk_io_time = -1
            sample_rate = -1
            sample_portion = -1
            sampled_cpu_usage = -1
            network_io_time = -1
            maximum_network_bandwidth = -1

            data_dict = {
                "ts_start": ts_start,
                "ts_end": ts_end,
                "workflow_id": workflow_id,
                "task_id": task_id,
                "resource_id": resource_id,
                "cpu_rate": cpu_rate,
                "canonical_memory_usage": canonical_memory_usage,
                "assigned_memory": assigned_memory,
                "minimum_memory_usage": minimum_memory_usage,
                "maximum_memory_usage": maximum_memory_usage,
                "disk_io_time": disk_io_time,
                "maximum_disk_bandwidth": maximum_disk_bandwidth,
                "local_disk_space_usage": local_disk_space_usage,
                "maximum_cpu_rate": maximum_cpu_rate,
                "maximum_disk_io_time": maximum_disk_io_time,
                "sample_rate": sample_rate,
                "sample_portion": sample_portion,
                "sampled_cpu_usage": sampled_cpu_usage,
                "network_io_time": network_io_time,
                "maximum_network_bandwidth": maximum_network_bandwidth,
            }

            return pd.DataFrame(data_dict, index=[0])

        task_state_df = task_df.groupBy(['workflow_id', 'task_id']).apply(compute_task_states)

        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True)
        task_state_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite",
                                    compression="snappy")
        print("######\nDone parsing TaskState\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\nStart parsing Resources\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

        resource_id_column = [i.resource_used for i in task_df.select('resource_used').distinct().collect()]

        resources = []
        for resource_id in resource_id_column:
            resources.append(Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1, '').get_parquet_dict())

        resource_df = pd.DataFrame(resources)

        os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()), exist_ok=True)
        resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(), 'part.0.parquet'), engine="pyarrow")
        print("######\nDone parsing Resources\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\nStart parsing Workflows\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = "Cook"
            total_resources = df['resource_amount_requested'].sum()
            total_memory_usage = df['memory_consumption'].sum()
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption,
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)
        workflow_df.explain(True)
        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Workload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()), engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(pandas_task_df,
                                                                  domain="Industrial",
                                                                  start_date=None,
                                                                  end_date=None,
                                                                  authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
def parse_workflow(wf, filename):
    workflow_id = string2numeric_hash(wf['name'] + '-(' + wf['id'] + ')')
    workflow_domain = ""  # The domain the workflow belongs to, e.g. industry, science, etc.
    workflow_application_name = ""  # The name of the application, e.g. Montage, SIPHT
    workflow_appliation_field = ""  # The field of the application, e.g. bioinformatics, astronomy
    if "bwa" in filename.lower():
        workflow_id = string2numeric_hash("bwa" + '-(' + wf['id'] + ')')
        workflow_domain = "science"
        workflow_application_name = "Burrows-Wheeler Alignment tool"
        workflow_appliation_field = "bioinformatics"
    elif "wien2k" in filename.lower():
        workflow_id = string2numeric_hash("wien2k" + '-(' + wf['id'] + ')')
        workflow_domain = "science"
        workflow_application_name = "Wien2k"
        workflow_appliation_field = "materials chemistry"

    resources = {}
    for r in wf['resources']:  # Parse resources for the tasks later
        r_details = r['details']
        operating_system = "Linux"
        details = {}
        details['provider'] = r_details['provider']
        details['instanceType'] = r_details['instanceType']
        events = parse_events(r['events'])
        # id, type, num_resources, proc_model_name, memory, disk_space, network_bandwidth,
        # operating_system, details=None, events=None
        resources[string2numeric_hash(r['id'])] = Resource(
            string2numeric_hash(r['name']), r['type'], r_details['vCPUs'], r_details['physicalProcessor'],
            r_details['memory'], r_details['storage'], r_details['networkPerformance'],
            operating_system, details, events)

    # Create lists of task ids per phase, used to wire up parents below
    if "wien2k" in filename.lower():
        first = []
        second = []
        third = []
        fourth = []
        last = []
        for t in wf['tasks']:
            if "first" in t['name'].lower():
                first.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "second" in t['name'].lower():
                second.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "third" in t['name'].lower():
                third.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "fourth" in t['name'].lower():
                fourth.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "last" in t['name'].lower():
                last.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
    elif "bwa" in filename.lower():
        bwaindex_split1_2 = []
        bwa1aln = []
        bwaconcat = []
        for t in wf['tasks']:
            if "bwa:bwaindex" in t['name'].lower():
                bwaindex_split1_2.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:split1" in t['name'].lower():
                bwaindex_split1_2.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:split2" in t['name'].lower():
                bwaindex_split1_2.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:bwa1aln" in t['name'].lower():
                bwa1aln.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:concat" in t['name'].lower():
                bwaconcat.append(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))

    tasks = []
    for t in wf['tasks']:  # Parse tasks
        if "cloud init" not in t['name'].lower() and "cloud instances" not in t['name'].lower() \
                and "parforiteration" not in t['type'].lower() and "parallel" not in t['type'].lower() \
                and "section" not in t['type'].lower():
            parents = []
            if "wien2k" in filename.lower():
                if "second" in t['name'].lower():
                    parents = first
                if "third" in t['name'].lower():
                    parents = second
                if "fourth" in t['name'].lower():
                    parents = third
                if "last" in t['name'].lower():
                    parents = fourth
            elif "bwa" in filename.lower():
                if "bwa:bwa1aln" in t['name'].lower():
                    parents = bwaindex_split1_2
                if "bwa:concat" in t['name'].lower():
                    parents = bwa1aln
            # print(parents)
            submission_site = string2numeric_hash(t['submissionSite'])
            res = None
            if submission_site != '':
                res = resources[submission_site]

            params = parse_params(t['params'])
            events = parse_events(t['events'])
            wait_time = 0
            if "ACTIVE" in events:
                wait_time = int((parse(events["ACTIVE"]) - parse(t['startTime'])).total_seconds() * 1000)

            # id, type, ts_submit, submission_site, runtime, resource_amount_requested, parents,
            # workflow_id, wait_time, resource_type="cpu", resource=None, datatransfer=None, params=None,
            # events=None, requirements=None, user_id=-1, group_id=-1, memory_requested=-1,
            # disk_space_requested=-1, disk_io_time=-1, network_io_time=-1, energy_consumption=-1
            tasks.append(
                Task(string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')),
                     t['type'],
                     int(parse(t['startTime']).timestamp() * 1000),
                     submission_site,
                     int((parse(t['endTime']) - parse(t['startTime'])).total_seconds() * 1000),
                     1,
                     parents,
                     workflow_id,
                     wait_time,
                     "CPU",
                     res,
                     parse_datatransfers(t['fileTransfers']),
                     params,
                     events))

    ts_start = min(t['startTime'] for t in wf['tasks'])

    # id, ts_submit, tasks, scheduler_description
    return Workflow(workflow_id, int(parse(wf['beginTime']).timestamp() * 1000), tasks, wf['scheduler'],
                    workflow_domain, workflow_application_name, workflow_appliation_field)
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[-1])

    if "DAS5" in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ["HOSTNAME"] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "256G") \
            .config("spark.driver.maxResultSize", "40G") \
            .config("spark.network.timeout", "100000s") \
            .config("spark.rpc.askTimeout", "100000s") \
            .config("spark.default.parallelism", "2000") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .config("spark.cleaner.periodicGC.interval", "5s") \
            .getOrCreate()
    else:
        import findspark
        findspark.init("<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[4]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "2G") \
            .config("spark.driver.memory", "2G") \
            .getOrCreate()

    machine_meta = spark.read.csv(
        os.path.join(path_to_dir, "machine_meta.csv"),
        schema=StructType([
            StructField("machine_id", StringType(), True),
            StructField("time_stamp", LongType(), True),
            StructField("failure_domain_1", LongType(), True),
            StructField("failure_domain_2", StringType(), True),
            StructField("cpu_num", LongType(), True),
            StructField("mem_size", LongType(), True),
            StructField("status", StringType(), True)
        ]))

    machine_usage = spark.read.csv(
        os.path.join(path_to_dir, "machine_usage.csv"),
        schema=StructType([
            StructField("machine_id", StringType(), True),
            StructField("time_stamp", DoubleType(), True),
            StructField("cpu_util_percent", LongType(), True),
            StructField("mem_util_percent", LongType(), True),
            StructField("mem_gps", DoubleType(), True),
            StructField("mkpi", LongType(), True),
            StructField("net_in", DoubleType(), True),
            StructField("net_out", DoubleType(), True),
            StructField("disk_io_percent", DoubleType(), True)
        ]))

    container_meta = spark.read.csv(
        os.path.join(path_to_dir, "container_meta.csv"),
        schema=StructType([
            StructField("container_id", StringType(), True),
            StructField("machine_id", StringType(), True),
            StructField("time_stamp", LongType(), True),
            StructField("app_du", StringType(), True),
            StructField("status", StringType(), True),
            StructField("cpu_request", LongType(), True),
            StructField("cpu_limit", LongType(), True),
            StructField("mem_size", DoubleType(), True)
        ]))

    container_usage = spark.read.csv(
        os.path.join(path_to_dir, "container_usage.csv"),
        schema=StructType([
            StructField("container_id", StringType(), True),
            StructField("machine_id", StringType(), True),
            StructField("time_stamp", DoubleType(), True),
            StructField("cpu_util_percent", LongType(), True),
            StructField("mem_util_percent", LongType(), True),
            StructField("cpi", DoubleType(), True),
            StructField("mem_gps", DoubleType(), True),
            StructField("mpki", LongType(), True),
            StructField("net_in", DoubleType(), True),
            StructField("net_out", DoubleType(), True),
            StructField("disk_io_percent", DoubleType(), True)
        ]))

    batch_task = spark.read.csv(
        os.path.join(path_to_dir, "batch_task.csv"),
        schema=StructType([
            StructField("task_name", StringType(), True),
            StructField("instance_num", LongType(), True),
            StructField("job_name", StringType(), True),
            StructField("task_type", StringType(), True),
            StructField("status", StringType(), True),
            StructField("start_time", LongType(), True),
            StructField("end_time", LongType(), True),
StructField("plan_cpu", DoubleType(), True), StructField("plan_mem", DoubleType(), True) ])) batch_instance = spark.read.csv( os.path.join(path_to_dir, "batch_instance.csv"), schema=StructType([ StructField("instance_name", StringType(), True), StructField("task_name", StringType(), True), StructField("job_name", StringType(), True), StructField("task_type", StringType(), True), StructField("status", StringType(), True), StructField("start_time", LongType(), True), StructField("end_time", LongType(), True), StructField("machine_id", StringType(), True), StructField("seq_no", LongType(), True), StructField("total_seq_no", LongType(), True), StructField("cpu_avg", DoubleType(), True), StructField("cpu_max", DoubleType(), True), StructField("mem_avg", DoubleType(), True), StructField("mem_max", DoubleType(), True) ])) @F.pandas_udf(returnType=Task.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP) def clean_tasks_of_workflow(df): tasks = dict() raw_id_to_instances = dict() job_name = df.loc[0, "job_name"] workflow_id = mmh3.hash64(job_name)[1] invalid_task_raw_ids = set() # group by task name # - count number of instances # - compare with row.instance_num # Check to inspect if the data is noisy # def check(pdf): # a = pdf["instance_name"].nunique() # b = pdf["instance_name"].astype(np.int64).min() # c = pdf["instance_name"].astype(np.int64).max() # d = pdf["instance_num"].min() # e = pdf["instance_num"].max() # f = pdf["instance_name"].count() # if d != e or b < 0 or c >= e or a != d or a != f: # print("Noisy data! {}, {}, {}, {}, {}, {}".format(a, b, c, d, e, f)) # # df.groupby("task_name").apply(check) for row in df.itertuples(index=False): if None in row: print(row, flush=True) task_name = row.task_name instance_name = str(row.instance_name) memory_requested = row.plan_mem resources_requested = row.plan_cpu resource_id = row.machine_id splits = task_name.split("_") if splits[0] == "task": cleaned_task_name = splits[1] task_type = "bag" raw_parents = [] else: cleaned_task_name = splits[0][1:] task_type = str(splits[0][0]) raw_parents = [x for x in splits[1:] if x.isdigit()] if resource_id is None: resource_id = -1 else: resource_id = mmh3.hash64(row.machine_id)[1] if row.end_time is None or math.isnan(row.end_time): invalid_task_raw_ids.add(cleaned_task_name) continue if row.start_time is None or math.isnan(row.start_time): invalid_task_raw_ids.add(cleaned_task_name) continue if memory_requested is None or math.isnan(memory_requested): memory_requested = -1 if resources_requested is None or math.isnan(resources_requested): avg_cpu = row.cpu_avg if avg_cpu is None or math.isnan(avg_cpu): invalid_task_raw_ids.add(cleaned_task_name) continue else: resources_requested = avg_cpu this_task_id = mmh3.hash64(job_name + "@" + cleaned_task_name + "@" + instance_name)[1] if cleaned_task_name not in raw_id_to_instances: raw_id_to_instances[cleaned_task_name] = row.instance_num if row.instance_num > 10: # Create parent and child tasks raw_parent_id = cleaned_task_name + "_p" parent_task_id = mmh3.hash64(job_name + "@" + raw_parent_id + "@" + "0")[1] if parent_task_id not in tasks: tasks[parent_task_id] = Task( id=parent_task_id, type="dummy", submission_site=0, runtime=0, ts_submit=row.start_time * 1000, # We convert time from seconds to milliseconds. 
                            resource_amount_requested=1,
                            parents=raw_parents,
                            workflow_id=workflow_id,
                            wait_time=0,
                            resource_type='core',
                            resource=-1,
                            memory_requested=-1)
                        raw_id_to_instances[raw_parent_id] = 1

                    raw_child_id = cleaned_task_name + "_c"
                    child_task_id = mmh3.hash64(job_name + "@" + raw_child_id + "@" + "0")[1]
                    if child_task_id not in tasks:
                        tasks[child_task_id] = Task(
                            id=child_task_id,
                            type="dummy",
                            submission_site=0,
                            runtime=0,
                            ts_submit=row.start_time * 1000,  # We convert time from seconds to milliseconds.
                            resource_amount_requested=1,
                            parents=[cleaned_task_name],
                            workflow_id=workflow_id,
                            wait_time=0,
                            resource_type='core',
                            resource=-1,
                            memory_requested=-1,
                            params="child")
                        raw_id_to_instances[raw_child_id] = 1

                    raw_parents = [raw_parent_id]

            this_task = Task(
                id=this_task_id,
                type=task_type,
                submission_site=0,
                runtime=(row.end_time - row.start_time) * 1000,
                ts_submit=row.start_time * 1000,  # We convert time from seconds to milliseconds.
                resource_amount_requested=resources_requested,
                parents=raw_parents,
                workflow_id=workflow_id,
                params=task_name + " $ " + instance_name + " $ " + str(row.instance_num) + " $ " + job_name,
                wait_time=0,
                resource_type='core',
                resource=resource_id,
                memory_requested=memory_requested)
            tasks[this_task_id] = this_task

        for task_id, task in tasks.items():
            task.parents = [p for p in task.parents if p not in invalid_task_raw_ids]
            parents = []
            for raw_parent_id in task.parents:
                # If the previous wave has a child and this task is not that child,
                # refer to the child instead of the wave.
                if raw_parent_id + "_c" in raw_id_to_instances and task.params != "child":
                    raw_parent_id = raw_parent_id + "_c"

                # We might hit an edge case where a parent was not recorded by the system of Alibaba
                # (e.g. a bug, or the tracing stopped)
                if raw_parent_id not in raw_id_to_instances:
                    continue

                parent_instances = raw_id_to_instances[raw_parent_id]

                proper_parent_ids = []
                for x in range(parent_instances):
                    # Alibaba tasks specify instance_nums, however these instances may not necessarily be in the data,
                    # so we need to check if they were actually encountered.
                    hash = mmh3.hash64(job_name + "@" + raw_parent_id + "@" + str(x))[1]
                    if hash in tasks:
                        proper_parent_ids.append(hash)

                parents.extend(proper_parent_ids)
                for proper_id in proper_parent_ids:
                    tasks[proper_id].children.add(task_id)

            # task.params = None
            task.parents = parents

        # ze_best = pd.concat(pandas_dataframes)
        parquet_dicts = [task.get_parquet_dict() for task in tasks.values()]
        if len(tasks) > 0:
            ret = pd.DataFrame(parquet_dicts)
        else:  # If no task was valid, return an empty DF with the columns set. Otherwise Spark goes boom.
            ret = pd.DataFrame(columns=Task.get_parquet_meta_dict().keys())
        return ret

    @F.pandas_udf(returnType=Task.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP)
    def container_to_task(df):
        row = df.iloc[0, :]
        start_time = df["time_stamp"].min() * 1000
        stop_time = df["time_stamp"].max() * 1000
        task_id = mmh3.hash64(row["container_id"])[1]
        workflow_id = mmh3.hash64(row["app_du"])[1]

        task = Task(
            id=task_id,
            type="long running",
            parents=[],
            ts_submit=start_time,  # We convert time from seconds to milliseconds.
            submission_site=0,
            runtime=(stop_time - start_time),
            resource_amount_requested=row["cpu_request"],
            memory_requested=row["mem_size"],
            workflow_id=workflow_id,
            wait_time=0,
            resource=mmh3.hash64(row["machine_id"])[1])

        return pd.DataFrame([task.get_parquet_dict()])

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        # Rename instances. This allows instance names to be derived using just
        # the task name and the number of instances of the task.
        task_window = Window.partitionBy("job_name", "task_name").orderBy("start_time")

        # Subtracting 1 because the row number starts at 0. Makes later iteration more intuitive.
        # We are using the instance name as an index within a particular job and task.
        instances_renamed = batch_instance.withColumn(
            "instance_name", (F.row_number().over(task_window) - F.lit(1)).cast(StringType()))

        tasks_unconverted = instances_renamed.join(
            batch_task.select("job_name", "task_name", "instance_num", "plan_cpu", "plan_mem"),
            on=["job_name", "task_name"], how="inner")  # 100% this line is the issue.

        tasks_converted = tasks_unconverted.groupby("job_name").apply(clean_tasks_of_workflow)

        # if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        #     tasks_converted.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite")

        long_running_tasks = container_meta.groupBy("container_id").apply(container_to_task)

        all_tasks = tasks_converted.union(long_running_tasks).dropna()

        try:
            all_tasks.printSchema()
            all_tasks.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite")
        except Exception as e:
            print(e, flush=True)
            raise e

    @F.pandas_udf(returnType=TaskState.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP)
    def task_states_from_instances(df):
        task_states = []
        workflow_id = mmh3.hash64(df.loc[0, "job_name"])[1]

        for index, row in df.iterrows():
            job_name = row["job_name"]
            task_name = row["task_name"]
            instance_name = row["instance_name"]

            splits = task_name.split("_")
            # The first letter is irrelevant as it corresponds to the nature of the task (map or reduce)
            # and has nothing to do with the structure of the workflow.
            just_task_name = splits[0][1:]
            this_task_id = mmh3.hash64(job_name + "@" + just_task_name + "@" + instance_name)[1]

            this_task_state = TaskState(ts_start=row["start_time"] * 1000,
                                        ts_end=row["end_time"] * 1000,
                                        workflow_id=workflow_id,
                                        task_id=this_task_id,
                                        resource_id=mmh3.hash64(row["machine_id"])[1],
                                        cpu_rate=row["cpu_avg"],
                                        canonical_memory_usage=row["mem_avg"],
                                        maximum_cpu_rate=row["cpu_max"],
                                        maximum_memory_usage=row["mem_max"])

            # Sanity check: fail loudly if any field is missing or NaN.
            state_values = this_task_state.get_parquet_dict().values()
            if any(v is None or (isinstance(v, float) and np.isnan(v)) for v in state_values):
                print(this_task_state.get_parquet_dict())
                raise RuntimeError(this_task_state.get_parquet_dict())

            task_states.append(this_task_state.get_parquet_dict())

        return pd.DataFrame(task_states)

    @F.pandas_udf(returnType=TaskState.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP)
    def task_states_from_container_usage(df):
        machine_id = mmh3.hash64(df.loc[0, "machine_id"])[1]

        def convert(cont_df):
            task_states = []
            prev_end_time = cont_df.loc[0, "start_time"] * 1000
            container_id = mmh3.hash64(cont_df.loc[0, "container_id"])[1]
            app_id = mmh3.hash64(cont_df.loc[0, "app_du"])[1]

            sorted_df = cont_df.sort_values("time_stamp")
            for index, row in sorted_df.iterrows():
                this_end_time = row["time_stamp"] * 1000
                this_task_state = TaskState(
                    ts_start=prev_end_time,
                    ts_end=this_end_time,
                    workflow_id=app_id,
                    task_id=container_id,
                    resource_id=machine_id,
                    cpu_rate=row["cpu_util_percent"],
                    canonical_memory_usage=row["mem_util_percent"],
                    maximum_disk_bandwidth=row["disk_io_percent"],
                    network_in=row["net_in"],
                    network_out=row["net_out"])

                prev_end_time = this_end_time
                task_states.append(this_task_state.get_parquet_dict())

                # Sanity check: fail loudly if any field is missing or NaN.
                state_values = this_task_state.get_parquet_dict().values()
                if any(v is None or (isinstance(v, float) and np.isnan(v)) for v in state_values):
                    print(this_task_state.get_parquet_dict())
                    raise ArithmeticError(this_task_state.get_parquet_dict())

            return pd.DataFrame(task_states)

        return df.groupby("container_id").apply(convert).reset_index(drop=True).fillna(-1)

    # Now, derive workflows from tasks
    @F.pandas_udf(returnType=Workflow.get_spark_type(), functionType=F.PandasUDFType.GROUPED_MAP)
    def compute_workflow_stats(df):
        tasks = []
        for index, row in df.iterrows():
            this_task = Task(
                id=row["id"],
                type=row["type"],
                ts_submit=row["ts_submit"],  # Already in milliseconds at this point.
                submission_site=0,
                runtime=row["runtime"],
                resource_amount_requested=row["resource_amount_requested"],
                memory_requested=row["memory_requested"],
                parents=row["parents"],
                workflow_id=row["workflow_id"],
                wait_time=row["wait_time"],
                resource=row["resource_used"])
            # print(this_task.get_parquet_dict())
            tasks.append(this_task)

        workflow = Workflow(id=df.loc[0, "workflow_id"],
                            ts_submit=df["ts_submit"].min(),
                            tasks=tasks,
                            scheduler_description="Fuxi",
                            workflow_domain="Industrial",
                            workflow_application_name="MapReduce",
                            workflow_appliation_field="Internet Services")

        try:
            workflow.compute_critical_path()
        except toposort.CircularDependencyError:
            # TODO: Some have cyclic dependencies. Check if this is us, or the data (again).
            pass

        return pd.DataFrame([workflow.get_parquet_dict()])

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        # Spark doesn't know it can now read the tasks from the parquet files we just wrote, so point it there explicitly.
        tasks_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))
        workflow_df = tasks_df.groupBy("workflow_id").apply(compute_workflow_stats)
        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite",
                                  compression="snappy")

    def machine_meta_to_resources(row):
        resource = Resource(
            id=mmh3.hash64(row["machine_id"])[1],
            type="cpu",
            num_resources=float(row["cpu_num"]),
            memory=row["mem_size"],
        )
        resource_dict = resource.get_json_dict()
        del resource_dict["events"]
        return SparkRow(**resource_dict)

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource DF\n ######")
        resource_df = machine_meta.rdd.map(machine_meta_to_resources).toDF(Resource.get_spark_type())
        resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite",
                                  compression="snappy")

    print("######\n Start parsing Workload\n ######")
    if "tasks_df" not in locals():
        # Spark doesn't know it can read the tasks from the parquet files, so point it there explicitly.
        tasks_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

    json_dict = Workload.get_json_dict_from_spark_task_dataframe(tasks_df, domain="Industrial",
                                                                 authors=["Alibaba 2018"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")
def parse(path_to_dir):
    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    # Convert times which are in microseconds and do not fit in a long to milliseconds
    convert_micro_to_milliseconds = F.udf(lambda x: x / 1000)

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\n Start parsing TaskState\n ######")
        task_usage_df = spark.read.format('com.databricks.spark.csv').options(
            mode="FAILFAST", inferschema="true").load(os.path.join(path_to_dir, 'task_usage', '*.csv'))
        # task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
        #     'fake_task_usage.csv')

        oldColumns = task_usage_df.schema.names
        newColumns = ["ts_start",
                      "ts_end",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "cpu_rate",
                      "memory_consumption",
                      "assigned_memory_usage",
                      "unmapped_page_cache",
                      "total_page_cache",
                      "max_memory_usage",
                      "mean_disk_io_time",
                      "mean_local_disk_space_usage",
                      "max_cpu_rate",
                      "max_disk_io_time",
                      "cycles_per_instruction",
                      "memory_accesses_per_instruction",
                      "sample_portion",
                      "aggregation_type",
                      "sampled_cpu_usage",
                      ]

        task_usage_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                               range(len(oldColumns)), task_usage_df)

        # Drop columns with too low-level details
        task_usage_df = task_usage_df.drop('memory_accesses_per_instruction')
        task_usage_df = task_usage_df.drop('cycles_per_instruction')
        task_usage_df = task_usage_df.drop('unmapped_page_cache')
        task_usage_df = task_usage_df.drop('total_page_cache')

        # Convert the timestamps from micro to milliseconds and cast them to long.
        task_usage_df = task_usage_df.withColumn('ts_start', convert_micro_to_milliseconds(F.col('ts_start')))
        task_usage_df = task_usage_df.withColumn('ts_start', F.col('ts_start').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('ts_end', convert_micro_to_milliseconds(F.col('ts_end')))
        task_usage_df = task_usage_df.withColumn('ts_end', F.col('ts_end').cast(T.LongType()))

        # Some fields have weird symbols in them, clean those.
        truncate_at_lt_symbol_udf = F.udf(lambda x: re.sub(r'[^0-9.eE\-+]', '', str(x)) if x is not None else x)
        task_usage_df = task_usage_df.withColumn('workflow_id', truncate_at_lt_symbol_udf(F.col('workflow_id')))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', truncate_at_lt_symbol_udf(F.col('max_cpu_rate')))

        # Now that the columns have been sanitized, cast them to the right type
        task_usage_df = task_usage_df.withColumn('workflow_id', F.col('workflow_id').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', F.col('max_cpu_rate').cast(T.FloatType()))

        task_usage_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite",
                                    compression="snappy")
        print("######\n Done parsing TaskState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        if 'task_usage_df' not in locals():
            task_usage_df = spark.read.parquet(os.path.join(TARGET_DIR, TaskState.output_path()))

        print("######\n Start parsing Tasks\n ######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            inferschema="true", mode="FAILFAST",
            parserLib="univocity").load(os.path.join(path_to_dir, 'task_events', '*.csv'))

        oldColumns = task_df.schema.names
        newColumns = ["ts_submit",
                      "missing_info",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "event_type",
                      "user_id",
                      "scheduler",
                      "nfrs",
                      "resources_requested",
                      "memory_requested",
                      "disk_space_request",
                      "machine_restrictions",
                      ]

        task_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                         range(len(oldColumns)), task_df)

        task_df = task_df.withColumn('ts_submit', convert_micro_to_milliseconds(F.col('ts_submit')))
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit').cast(T.LongType()))

        # Filter tasks that never reached completion
        task_df.createOrReplaceTempView("task_table")
        task_df = spark.sql("""WITH filtered_tasks AS (
            SELECT DISTINCT t1.workflow_id AS workflow_id, t1.id AS id
            FROM task_table t1
            WHERE t1.event_type IN (0, 1, 4)
            GROUP BY t1.workflow_id, t1.id
            HAVING COUNT(DISTINCT event_type) = 3
        )
        SELECT t.*
        FROM task_table t
        INNER JOIN filtered_tasks f
        ON t.id = f.id AND t.workflow_id = f.workflow_id""")

        task_aggregation_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("type", T.StringType(), True),
            T.StructField("ts_submit", T.LongType(), True),
            T.StructField("submission_site", T.LongType(), True),
            T.StructField("runtime", T.LongType(), True),
            T.StructField("resource_type", T.StringType(), True),
            T.StructField("resource_amount_requested", T.DoubleType(), True),
            T.StructField("parents", T.ArrayType(T.LongType()), True),
            T.StructField("children", T.ArrayType(T.LongType()), True),
            T.StructField("user_id", T.LongType(), True),
            T.StructField("group_id", T.LongType(), True),
            T.StructField("nfrs", T.StringType(), True),
            T.StructField("wait_time", T.LongType(), True),
            T.StructField("params", T.StringType(), True),
            T.StructField("memory_requested", T.DoubleType(), True),
            T.StructField("network_io_time", T.DoubleType(), True),
            T.StructField("disk_space_requested", T.DoubleType(), True),
            T.StructField("energy_consumption", T.DoubleType(), True),
            T.StructField("resource_used", T.StringType(), True),
        ])

        # Compute aggregated task metrics based on the event type
        @F.pandas_udf(returnType=task_aggregation_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_aggregated_task_usage_metrics(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None
            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            task_submit_time = df[df['event_type'] == 0]['ts_submit'].min(skipna=True)
            task_start_time = df[df['event_type'] == 1]['ts_submit'].min(skipna=True)
            task_finish_time = df[df['event_type'] == 4]['ts_submit'].max(skipna=True)

            if None in [task_start_time, task_submit_time, task_finish_time]:
                return None

            task_resource_request = df['resources_requested'].max(skipna=True)
            task_memory_request = df['memory_requested'].max(skipna=True)
            task_priority = df['nfrs'].max(skipna=True)
            task_disk_space_requested = df['disk_space_request'].max(skipna=True)

            task_machine_id_list = df.resource_id.unique()

            task_waittime = int(task_start_time) - int(task_submit_time)
            task_runtime = int(task_finish_time) - int(task_start_time)

            def default(o):
                if isinstance(o, np.int64):
                    return int(o)

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "type": "",  # Unknown
                "ts_submit": task_submit_time,
                "submission_site": -1,  # Unknown
                "runtime": task_runtime,
                "resource_type": "core",  # Fields are called CPU, but they are core counts (see the Google documentation)
                "resource_amount_requested": task_resource_request,
                "parents": [],
                "children": [],
                "user_id": mmh3.hash64(get_first_non_value_in_column("user_id"))[0],
                "group_id": -1,
                "nfrs": json.dumps({"priority": task_priority}, default=default),
                "wait_time": task_waittime,
                "params": "{}",
                "memory_requested": task_memory_request,
                "network_io_time": -1,  # Unknown
                "disk_space_requested": task_disk_space_requested,
                "energy_consumption": -1,  # Unknown
                "resource_used": json.dumps(task_machine_id_list, default=default),
            }

            return pd.DataFrame(data_dict, index=[0])

        task_df = task_df.groupBy(["workflow_id", "id"]).apply(compute_aggregated_task_usage_metrics)
        task_df.explain(True)

        # Now add disk IO time - This cannot be done in the previous Pandas UDF function as
        # accessing another dataframe in the apply function is not allowed
        disk_io_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("disk_io_time", T.DoubleType(), True),
        ])

        @F.pandas_udf(returnType=disk_io_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_disk_io_time(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            disk_io_time = ((df['ts_end'] - df['ts_start']) * df['mean_disk_io_time']).sum(skipna=True) / 1000

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "disk_io_time": disk_io_time,
            }

            return pd.DataFrame(data_dict, index=[0])

        disk_io_df = task_usage_df.select(['workflow_id', 'id', 'mean_disk_io_time', 'ts_end', 'ts_start']).groupBy(
            ["workflow_id", "id"]).apply(compute_disk_io_time)
        disk_io_df.explain(True)

        join_condition = (task_df.workflow_id == disk_io_df.workflow_id) & (task_df.id == disk_io_df.id)
        task_df = task_df.join(disk_io_df, ["workflow_id", "id"])

        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy")
        print("######\n Done parsing Tasks\n ######")
    else:
        task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource\n ######")
        # Parse the machine information in the traces; these should match the resource_ids in task_usage.
        resources_structtype = T.StructType([  # Using StringTypes as we drop those columns anyway
            T.StructField("time", T.StringType(), False),
            T.StructField("id", T.LongType(), False),
            T.StructField("attribute_name", T.StringType(), False),
            T.StructField("attribute_value", T.StringType(), False),
            T.StructField("attribute_deleted", T.StringType(), False),
        ])

        resource_df = spark.read.format('com.databricks.spark.csv').schema(resources_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_attributes', '*.csv'))

        resource_df = resource_df.select(["id"])  # Only keep the ID, the rest we do not need.

        # Since the information in the traces is completely opaque, we use the educated guess from
        # Amvrosiadis et al. in their ATC 2018 article.
        resource_df = resource_df.withColumn('type', F.lit("core"))
        resource_df = resource_df.withColumn('num_resources', F.lit(8))
        resource_df = resource_df.withColumn('proc_model', F.lit("AMD Opteron Barcelona"))
        resource_df = resource_df.withColumn('memory', F.lit(-1))
        resource_df = resource_df.withColumn('disk_space', F.lit(-1))
        resource_df = resource_df.withColumn('network', F.lit(-1))
        resource_df = resource_df.withColumn('os', F.lit(""))
        resource_df = resource_df.withColumn('details', F.lit("{}"))

        # Write the resource_df to the specified location
        resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Resource\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, ResourceState.output_path())):
        print("######\n Start parsing ResourceState\n ######")
        resource_events_structtype = T.StructType([
            T.StructField("timestamp", T.DecimalType(20, 0), False),
            T.StructField("machine_id", T.LongType(), False),
            T.StructField("event_type", T.IntegerType(), False),
            T.StructField("platform_id", T.StringType(), False),
            T.StructField("available_resources", T.FloatType(), False),
            T.StructField("available_memory", T.FloatType(), False),
        ])

        resource_event_df = spark.read.format('com.databricks.spark.csv').schema(resource_events_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_events', '*.csv'))

        resource_event_df = resource_event_df.withColumn('timestamp',
                                                         convert_micro_to_milliseconds(F.col('timestamp')))
        resource_event_df = resource_event_df.withColumn('timestamp', F.col('timestamp').cast(T.LongType()))

        resource_event_df = resource_event_df.withColumn('available_disk_space', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_disk_io_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_network_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_1_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_5_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_15_minute', F.lit(-1))

        # Write the resource_event_df to the specified location
        resource_event_df.write.parquet(os.path.join(TARGET_DIR, ResourceState.output_path()), mode="overwrite",
                                        compression="snappy")
        print("######\n Done parsing ResourceState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\n Start parsing Workflows\n ######")
        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
T.StructField("critical_path_task_count", T.IntegerType(), False), T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False), T.StructField("nfrs", T.StringType(), False), T.StructField("scheduler", T.StringType(), False), T.StructField("total_resources", T.DoubleType(), False), T.StructField("total_memory_usage", T.DoubleType(), False), T.StructField("total_network_usage", T.LongType(), False), T.StructField("total_disk_space_usage", T.LongType(), False), T.StructField("total_energy_consumption", T.LongType(), False), ]) @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP) def compute_workflow_stats(df): id = df['workflow_id'].iloc[0] ts_submit = df['ts_submit'].min() task_count = len(df) critical_path_length = -1 # We do not know the task dependencies, so -1 critical_path_task_count = -1 approx_max_concurrent_tasks = -1 nfrs = "{}" scheduler = "" total_resources = df['resource_amount_requested'].sum() # TODO or assigned? total_memory_usage = df['memory_requested'].sum() # TODO or consumption, or assigned? total_network_usage = -1 total_disk_space_usage = -1 total_energy_consumption = -1 data_dict = { "id": id, "ts_submit": ts_submit, 'task_count': task_count, 'critical_path_length': critical_path_length, 'critical_path_task_count': critical_path_task_count, 'approx_max_concurrent_tasks': approx_max_concurrent_tasks, 'nfrs': nfrs, 'scheduler': scheduler, 'total_resources': total_resources, 'total_memory_usage': total_memory_usage, 'total_network_usage': total_network_usage, 'total_disk_space_usage': total_disk_space_usage, 'total_energy_consumption': total_energy_consumption } return pd.DataFrame(data_dict, index=[0]) # Create and write the workflow dataframe workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats) workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite", compression="snappy") print("######\n Done parsing Workflows\n ######") print("######\n Start parsing Workload\n ######") json_dict = Workload.get_json_dict_from_spark_task_dataframe(task_df, domain="Industrial", start_date="2011-05-01", end_date="2011-05-30", authors=["Google"]) os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True) with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file: # Need this on 32-bit python. def default(o): if isinstance(o, np.int64): return int(o) file.write(json.dumps(json_dict, default=default)) print("######\n Done parsing Workload\n ######")