def compute_workflow_features(task_group):
    wf_agnostic_df = compute_characteristics(task_group)

    tasks = []
    for row in task_group.itertuples():
        task = Task(
            getattr(row, "id"),
            "Composite",
            getattr(row, "ts_submit"),
            0,  # There is just one submission site
            getattr(row, "runtime"),
            getattr(row, "resource_amount_requested"),
            getattr(row, "parents"),
            task_group['workflow_id'].iloc[0],
            -1,
            "thread",
            resource=-1,
        )
        task.children = getattr(row, "children")
        tasks.append(task)

    workflow_ts_submit = task_group["ts_submit"].min()

    workflow = Workflow(0, workflow_ts_submit, tasks, "ANANKE", "Industrial", "Chronos", "IoT")
    workflow.compute_critical_path()

    wf_df = pd.concat([
        wf_agnostic_df,
        pd.DataFrame({
            "id": tasks[0].workflow_id,
            "ts_submit": pd.Series([np.int64(workflow_ts_submit)], dtype=np.int64),
            "critical_path_length": pd.Series([np.int32(workflow.critical_path_length)], dtype=np.int32),
            "critical_path_task_count": pd.Series([np.int32(workflow.critical_path_task_count)], dtype=np.int32),
            "approx_max_concurrent_tasks": pd.Series([np.int32(workflow.max_concurrent_tasks)], dtype=np.int32),
            "scheduler": pd.Series([np.str("ANANKE")], dtype=np.str),
        })
    ], axis=1)

    wf_df["nfrs"] = np.str("{}")

    wf_df = wf_df[sorted(wf_df.columns)]

    return wf_df
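
# Usage sketch (assumption, not part of the original script): compute_workflow_features
# operates on the tasks of a single workflow, so a driver would typically group a task
# DataFrame by workflow_id and apply it per group. The name `task_df` and the presence of
# the columns read above (id, ts_submit, runtime, resource_amount_requested, parents,
# children, workflow_id) are hypothetical here.
def example_build_workflow_df(task_df):
    # One feature row per workflow; concatenate the per-group frames into a single DataFrame.
    per_workflow = [
        compute_workflow_features(group)
        for _, group in task_df.groupby("workflow_id", sort=False)
    ]
    return pd.concat(per_workflow, ignore_index=True)
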
def parse_jobs(c, wf_id):
    child_dict, parent_dict = parse_job_instance_dependencies(c, wf_id)
    resources = parse_resources(c)
    tasks = []

    c.execute(
        "SELECT job_instance_id, job_instance.job_id, host_id, site, user, exec_job_id, submit_file, "
        "type_desc, clustered, max_retries, executable, argv, task_count "
        "FROM job_instance JOIN job ON job_instance.job_id=job.job_id WHERE job.wf_id=?",
        (str(wf_id),))

    for job_instance_id, job_id, host_id, site, user, exec_job_id, submit_file, type_desc, clustered, \
            max_retries, executable, argv, task_count in c.fetchall():
        events = parse_events_by_job_instance_id(c, job_instance_id)

        # Task execution time (seconds converted to milliseconds).
        runtime = (events["JOB_TERMINATED"] - events["EXECUTE"]) * 1000
        # Waiting time between submission and execution (seconds converted to milliseconds).
        waittime = (events["EXECUTE"] - events["SUBMIT"]) * 1000

        # Task(id, type, ts_submit, submission_site, runtime, resource_amount_requested, parents,
        #      workflow_id, wait_time, resource_type="cpu", resource=None, datatransfer=None,
        #      params=None, events=None, requirements=None, user_id=-1, group_id=-1,
        #      memory_requested=-1, disk_space_requested=-1, disk_io_time=-1, network_io_time=-1,
        #      energy_consumption=-1)
        task = Task(job_instance_id, type_desc, events["SUBMIT"], string2numeric_hash(site), runtime, 1,
                    parent_dict.get(job_instance_id, []), wf_id, waittime, "CPU",
                    resources.get(host_id, None), events=events, user_id=string2numeric_hash(user))
        task.children = child_dict.get(job_instance_id, [])

        tasks.append(task)

    return tasks
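
# Usage sketch (assumption, not part of the original script): parse_jobs reads from a
# Pegasus/Stampede-style SQLite database via a cursor. The database file name and the
# existence of a `workflow` table exposing a `wf_id` column are assumptions made here
# for illustration only.
def example_parse_all_workflows(db_path="workflow.stampede.db"):
    import sqlite3
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    # Collect the workflow ids first, then parse the jobs of each workflow with a fresh cursor.
    c.execute("SELECT wf_id FROM workflow")
    tasks_per_workflow = {}
    for (wf_id,) in c.fetchall():
        tasks_per_workflow[wf_id] = parse_jobs(conn.cursor(), wf_id)
    conn.close()
    return tasks_per_workflow
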
def task_info_from_line(line, workflow_start, workflow_index):
    cols = line.split('\t')

    workflow_id = mmh3.hash64("workflow:{}".format(workflow_index))[0]
    task_id = mmh3.hash64("workflow:{}_task:{}".format(workflow_index, int(cols[0])))[0]

    task_submit_time = int(cols[1])
    wait_time = float(cols[2])

    # Weird bug in the trace, sometimes the task submit time is -1, but then the
    # wait time is the unix timestamp...
    if task_submit_time == -1 and wait_time > 1000000000000:
        task_submit_time = wait_time
        wait_time = -1

    submit_time = workflow_start - task_submit_time

    if not submit_time:
        return None, None

    site_id = mmh3.hash64("site:{}".format(str(cols[16]).strip()))[0]
    run_time = int(cols[3])
    n_procs = int(cols[4])
    req_n_procs = int(n_procs)
    used_memory = float(cols[6])
    used_network = float(cols[20])
    disk_space_used = float(cols[21])
    user_id = mmh3.hash64("user:{}".format(str(cols[11]).strip()))[0]
    group_id = mmh3.hash64("group:{}".format(str(cols[12]).strip()))[0]
    job_structure = cols[19]

    match = re.search(r'DAGPrev=([\d,]+);?', job_structure)
    if not match:
        dependencies = set()
    else:
        dependency_string = match.group(1)
        dependencies = set(
            mmh3.hash64("workflow:{}_task:{}".format(workflow_index, int(dep)))[0]
            for dep in dependency_string.split(","))

    task = Task(task_id, "composite", submit_time, site_id, run_time, max(n_procs, req_n_procs), dependencies,
                workflow_id, wait_time=wait_time, user_id=user_id, group_id=group_id, resource=-1)

    task_state = TaskState(submit_time, submit_time + run_time, workflow_id, task_id, -1,
                           canonical_memory_usage=used_memory, local_disk_space_usage=disk_space_used,
                           maximum_network_bandwidth=used_network)

    match = re.search(r'DAGNext=([\d,]+);?', job_structure)
    if not match:
        children = set()
    else:
        children_string = match.group(1)
        children = set(
            mmh3.hash64("workflow:{}_task:{}".format(workflow_index, int(dep)))[0]
            for dep in children_string.split(","))

    task.children = children

    return task, task_state
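
# Usage sketch (assumption, not part of the original script): task_info_from_line parses one
# tab-separated trace line into a (Task, TaskState) pair. The trace file path, the '#'
# comment-line convention, and the way workflow_start and workflow_index are obtained are
# assumptions for illustration only.
def example_parse_trace(path, workflow_start, workflow_index):
    tasks, task_states = [], []
    with open(path) as trace_file:
        for line in trace_file:
            if not line.strip() or line.startswith("#"):
                continue  # skip blank lines and header/comment lines
            task, task_state = task_info_from_line(line, workflow_start, workflow_index)
            if task is None:
                continue  # line was rejected (e.g. zero relative submit time)
            tasks.append(task)
            task_states.append(task_state)
    return tasks, task_states
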