def parse_workload(pegasus_db_path, filename=NAME, workload_domain="", workload_description=""):
    """Convert a Pegasus SQLite trace into the WTA parquet/json layout.

    Reads workflows from the Pegasus database at ``pegasus_db_path``, computes
    each workflow's critical path, and writes under ``TARGET_DIR``:
      * one parquet file with the workflow records,
      * one parquet file with all task records (submit times shifted so the
        earliest task is submitted at time 0),
      * a ``generic_information.json`` with workload-level properties.

    :param pegasus_db_path: path to the Pegasus SQLite database file.
    :param filename: unused; kept for backward compatibility with callers.
    :param workload_domain: domain written to the workload json. Falls back to
        "Scientific" (the previously hard-coded value) when empty.
    :param workload_description: description written to the workload json
        (previously ignored; the default "" matches the old behavior).
    """
    if not os.path.exists(TARGET_DIR):
        os.makedirs(TARGET_DIR)

    conn = sqlite3.connect(pegasus_db_path)
    try:
        cursor = conn.cursor()
        workflows = parse_workflows(cursor)
        for workflow in workflows:
            workflow.compute_critical_path()

        # Write the workflow objects to parquet.
        os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True)
        workflow_df = pd.DataFrame([workflow.get_parquet_dict() for workflow in workflows])
        workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"),
                               engine="pyarrow")

        # Write all tasks to parquet.
        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
        task_df = pd.DataFrame([task.get_parquet_dict() for wf in workflows for task in wf.tasks])

        # Make sure the first workflow is submitted at time 0.
        min_submit_time = task_df["ts_submit"].min()
        task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time)

        # Pandas does not know the difference between an empty list and a list
        # with integers, so type mismatches would occur. Write the task table
        # through pyarrow directly, using an explicit schema.
        pyarrow_task_schema = Task.get_pyarrow_schema()
        table = pa.Table.from_pandas(task_df, schema=pyarrow_task_schema, preserve_index=False)
        pq.write_table(table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"))

        # Construct the workload object. The instance is otherwise unused, but
        # the call is kept in case the constructor has side effects.
        authors_list = []
        w = Workload(workflows, workload_domain, authors_list, workload_description)

        # Write a json dict with the workload properties. The caller-supplied
        # domain/description are honored now; an empty domain falls back to the
        # previously hard-coded "Scientific" so old callers see no change.
        json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
            task_df,
            domain=workload_domain or "Scientific",
            authors=["Pegasus Team"],
            workload_description=workload_description)

        os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
        with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
            # json cannot serialize np.int64 out of the box (needed on
            # 32-bit python, where task counters come back as np.int64).
            def default(o):
                if isinstance(o, np.int64):
                    return int(o)
                raise TypeError

            file.write(json.dumps(json_dict, default=default))
    finally:
        # The original leaked the connection on any error; always close it.
        conn.close()
def parse_askalon_file(askalon_file):
    """Convert an Askalon JSON trace into the WTA parquet/json layout.

    Loads the workflows contained in ``askalon_file``, computes their critical
    paths, and writes workflow records, task records (submit times normalized
    to start at 0), and a ``generic_information.json`` under ``TARGET_DIR``.

    :param askalon_file: path to the Askalon trace (a JSON list of workflows).
    """
    if not os.path.exists(TARGET_DIR):
        os.makedirs(TARGET_DIR)

    # Load the trace and turn every entry into a workflow object.
    with open(askalon_file, 'r') as trace_handle:
        workflows = [parse_workflow(entry, askalon_file) for entry in json.load(trace_handle)]

    for workflow in workflows:
        workflow.compute_critical_path()

    # Workflow records -> parquet.
    workflow_dir = os.path.join(TARGET_DIR, Workflow.output_path())
    os.makedirs(workflow_dir, exist_ok=True)
    workflow_frame = pd.DataFrame([workflow.get_parquet_dict() for workflow in workflows])
    workflow_frame.to_parquet(os.path.join(workflow_dir, "part.0.parquet"), engine="pyarrow")

    # Task records -> parquet.
    task_dir = os.path.join(TARGET_DIR, Task.output_path())
    os.makedirs(task_dir, exist_ok=True)
    task_frame = pd.DataFrame(
        [task.get_parquet_dict() for workflow in workflows for task in workflow.tasks])

    # Normalize submit times so the first workflow is submitted at time 0.
    earliest_submit = task_frame["ts_submit"].min()
    task_frame = task_frame.assign(ts_submit=lambda frame: frame['ts_submit'] - earliest_submit)

    # Pandas does not distinguish an empty list from a list of integers, which
    # would cause type mismatches; write the task table through pyarrow
    # directly, using an explicit schema.
    arrow_table = pa.Table.from_pandas(task_frame, schema=Task.get_pyarrow_schema(),
                                       preserve_index=False)
    pq.write_table(arrow_table, os.path.join(task_dir, "part.0.parquet"))

    authors_list = ["Roland Matha", "Radu Prodan"]

    # Pick a human-readable description based on the trace file name.
    lowered_name = askalon_file.lower()
    if "bwa" in lowered_name:
        workload_description = "BWA (short for Burroughs-Wheeler Alignment tool) is a genomics analysis workflow, courtesy of Scott Emrich and Notre Dame Bioinformatics Laboratory. It maps low-divergent sequences against a large reference genome, such as the human genome."
    elif "wien2k" in lowered_name:
        workload_description = "Wien2k uses a full-potential Linearized Augmented Plane Wave (LAPW) approach for the computation of crystalline solids."
    else:
        workload_description = ""

    workload_domain = "Scientific"
    # The instance is unused, but the call is kept in case the constructor has
    # side effects.
    w = Workload(workflows, workload_domain, authors_list, workload_description)

    # Write a json dict with the workload properties.
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_frame,
        domain=workload_domain,
        authors=authors_list,
        workload_description=workload_description)

    workload_dir = os.path.join(TARGET_DIR, Workload.output_path())
    os.makedirs(workload_dir, exist_ok=True)
    with open(os.path.join(workload_dir, "generic_information.json"), "w") as json_file:
        # json cannot serialize np.int64 out of the box (needed on
        # 32-bit python).
        def default(obj):
            if isinstance(obj, np.int64):
                return int(obj)
            raise TypeError

        json_file.write(json.dumps(json_dict, default=default))