def machine_meta_to_resources(row):
    resource = Resource(
        id=mmh3.hash64(row["machine_id"])[1],
        type="cpu",
        num_resources=float(row["cpu_num"]),
        memory=row["mem_size"],
    )
    resource_dict = resource.get_json_dict()
    del resource_dict["events"]
    return SparkRow(**resource_dict)
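A minimal usage sketch, assuming a SparkSession named spark and a machine_meta DataFrame whose rows expose the machine_id, cpu_num and mem_size fields referenced above (both names are assumptions):

# Hypothetical driver: convert every machine_meta row into a Resource row and
# materialise the result as a Spark DataFrame.
resources_df = spark.createDataFrame(
    machine_meta.rdd.map(machine_meta_to_resources))
resources_df.show(5)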
Example #2
def generateMap(self, size, resources, spots, repartition, totalRes,
                delta):
    totalSize = size[0] * size[1]
    cases = list(range(0, totalSize))
    # Reserve the HQ and transmitter tiles; convert their (x, y) positions to
    # the same flat index layout used below (x * size[1] + y).
    hqX, hqY = settings.DEFAULT_HQ_POS
    cases.remove(hqX * size[1] + hqY)
    hqX, hqY = settings.DEFAULT_TRANSMITTER_POS
    cases.remove(hqX * size[1] + hqY)
    numpy.random.seed(self._seed)
    resList = []
    for i, res in enumerate(resources):
        # compute the number of spots for this resource type
        spotNumber = self.getRandomDelta(totalSize * spots[i], delta)
        amountBySpot = self.getRandomDelta(
            totalRes * repartition[i],
            delta) / spotNumber  # uniform distribution over the spots
        print(amountBySpot)
        for j in range(0, spotNumber):
            # Draw a free tile as a plain int so list.remove() and the
            # arithmetic below behave as expected.
            position = int(numpy.random.choice(cases))
            cases.remove(position)
            resList.append(
                Resource(
                    (int(position / size[1]) + settings.BORDER_TILES_NUM,
                     int(position % size[1]) + settings.BORDER_TILES_NUM),
                    (1, 1), res, amountBySpot))
    return resList
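A hedged invocation sketch; the generator object, map size, resource names and ratios below are purely illustrative assumptions:

# Hypothetical call: a 10x10 map, two resource types, 5% of tiles as spots per
# type, resources split 60/40 over 1000 total units, with 10% random jitter.
res_list = generator.generateMap(size=(10, 10),
                                 resources=["wood", "iron"],
                                 spots=[0.05, 0.05],
                                 repartition=[0.6, 0.4],
                                 totalRes=1000,
                                 delta=0.1)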
Example #3
def parse_resources(c):
    resources = {}
    c.execute("SELECT * FROM host")
    for host_id, wf_id, site, hostname, ip, uname, total_memory in c.fetchall():
        details = {}
        details["ip"] = ip
        details["hostname"] = hostname
        resources[host_id] = Resource(host_id, site, -1, -1, total_memory, -1, -1, uname, details)
    return resources
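A small driver sketch, assuming the host table lives in a SQLite workflow database; the file name below is hypothetical:

import sqlite3

# Open the (hypothetical) workflow database and hand its cursor to the parser.
conn = sqlite3.connect("workflow.stampede.db")
resources = parse_resources(conn.cursor())
print("parsed {} resources".format(len(resources)))
conn.close()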
Example #4
    def __init__(self, **attributes):
        for attr_name, attr_value in attributes.items():
            setattr(self, attr_name, attr_value)
        # self.nb_machine = nb_machine
        # self.nb_jobs = nb_jobs
        # self.problem = problem
        self.resource_list = []
        self.jobs_list = []

        # Create Resources
        for i in range(self.nb_machine):
            self.resource_list.append(Resource(i))
        # Create jobs
        for i in range(self.nb_jobs):
            self.jobs_list.append(Job(i, self.problem[i], self.resource_list))
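Example #5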
    def __init__(self, nom):
        self.data, self.optimum = loader(name=nom)
        self.nb_machine = self.data['nb_machine']
        self.nb_jobs = self.data['nb_jobs']
        self.problem = self.data['problem']
        self.nom = nom
        self.resource_list = []
        self.jobs_list = []
        self.makeSpan = -1
        self.criticalPath = []
        self.state = "Not Solved"
        # Create Resources
        for i in range(self.nb_machine):
            self.resource_list.append(Resource(i))
        # Create jobs
        for i in range(self.nb_jobs):
            self.jobs_list.append(Job(i, self.problem[i], self.resource_list, self))
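The constructors above evidently belong to a job-shop scheduling model. A usage sketch for the second one; the class name JobShopProblem and the instance name "ft06" are assumptions, not names from the original code:

# Hypothetical usage: build the model for a benchmark instance and inspect it.
problem = JobShopProblem("ft06")
print(problem.nb_machine, problem.nb_jobs, problem.state)
print(len(problem.resource_list), len(problem.jobs_list))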
Example #6
def parse_and_return_task_dataframe(file_path):
    global TARGET_DIR
    with open(file_path) as trace:
        json_data = json.load(trace)

        workflow = json_data['workflow']
        tasks = workflow['jobs']
        machines = workflow['machines']
        date = json_data['createdAt']

        # Convert the submission time from a datetime string to milliseconds since the epoch
        task_date = dateparser.parse(date)
        EPOCH = datetime(1970, 1, 1, tzinfo=task_date.tzinfo)
        ts_submit = int((task_date - EPOCH).total_seconds() * 1000)

        resource_by_id = dict()

        for machine in machines:
            machine_id = mmh3.hash64("machine:{0}".format(machine['machine_code'].strip()))[0]
            machine = machine["machine"]
            num_cpus = machine['cpu']['count']
            details = {
                "cpu_vendor": machine['cpu']['vendor'],
                "architecture": machine['architecture']
            }
            memory_in_gb = int(machine['memory']) / float(1024 * 1024)
            res = Resource(machine_id, "cluster_node", num_cpus, machine['release'], memory_in_gb, -1, -1,
                           machine['system'], details)
            resource_by_id[machine_id] = res

        task_list = []
        task_state_list = []
        inputs_per_taskid = dict()
        outputs_per_taskid = dict()
        outputs_matched = dict()

        task_per_taskid = dict()

        input_file_data_per_task_id = dict()
        output_file_data_per_task_id = dict()
        for task in tasks:
            task_id = mmh3.hash64("task:{}".format(str(task['name']).strip()))[0]
            print(task_id)
            task_files = task['files'] if 'files' in task else []
            task_type = task['type']
            task_cores = task['cores'] if 'cores' in task else 1
            task_memory = task['memory'] if 'memory' in task else -1
            task_runtime = task['runtime'] * 1000 if 'runtime' in task else -1
            task_dependencies = [mmh3.hash64("task:{}".format(str(p).strip()))[0] for p in
                                 task['parents']]
            task_parameters = {"arguments": task['arguments']} if 'arguments' in task else {}
            task_machine = mmh3.hash64("machine:{0}".format(task['machine'].strip()))[0] if 'machine' in task else None
            task_resource = resource_by_id[task_machine].id if 'machine' in task else -1
            # Convert energy from kWh to Wh
            task_total_energy_consumption = float(task['energy']) * 1000 if 'energy' in task else -1

            t = Task(task_id, task_type, ts_submit, -1, task_runtime, task_cores, task_dependencies, 0, -1,
                     params=task_parameters, resource=task_resource, energy_consumption=task_total_energy_consumption,
                     resource_type="core")

            task_per_taskid[task_id] = t
            task_list.append(t)

            # Parse the data transfers
            for file_item in task_files:
                # Apparently not all traces were parsed into version 0.2 despite them being in the
                # folders for 0.2. To this end we need a check for the file name and size fields.
                file_name = file_item['name'] if 'name' in file_item else file_item['fileId']
                file_size = file_item['size'] if 'size' in file_item else -1

                # Store the incoming and outgoing data to this task in separate dicts
                if file_item['link'] == "input":
                    if task_id not in inputs_per_taskid:
                        inputs_per_taskid[task_id] = set()
                        input_file_data_per_task_id[task_id] = dict()

                    # Record every input file, not just the first one per task
                    # (mirrors the output branch below).
                    inputs_per_taskid[task_id].add(file_name)

                    try:
                        input_file_data_per_task_id[task_id][file_name] = file_size
                    except Exception:
                        print(file_item)
                        exit(-1)

                elif file_item['link'] == "output":
                    if task_id not in outputs_per_taskid:
                        outputs_per_taskid[task_id] = set()
                        outputs_matched[task_id] = dict()
                        output_file_data_per_task_id[task_id] = dict()

                    outputs_per_taskid[task_id].add(file_name)
                    outputs_matched[task_id][file_name] = False
                    output_file_data_per_task_id[task_id][file_name] = file_size

            # Create a task state spanning the entire duration of the task
            task_state = TaskState(ts_submit, ts_submit + task_runtime, 0, task_id, -1,
                                   canonical_memory_usage=task_memory)
            task_state_list.append(task_state)

        # Make sure the earliest task starts at 0.
        min_ts_submit = min(task.ts_submit for task in task_list)
        for task in task_list:
            # Update the time
            task.ts_submit -= min_ts_submit
            for parent in task.parents:  # Also since we have all parent info, set them in the same loop
                task_per_taskid[parent].children.add(task.id)

        # Offset task states too
        for taskstate in task_state_list:
            taskstate.ts_start -= min_ts_submit
            taskstate.ts_end -= min_ts_submit

        data_transfer_id = 0
        # Since tasks can output files with the same name as other tasks, we must loop over a task's parents
        # and match the output names against input names.
        for task in task_list:  # For every task we have
            if task.id not in inputs_per_taskid: continue
            inputs = inputs_per_taskid[task.id]
            # We loop over the parents (no need to check children, they will come later)
            for dep in task.parents:
                outputs = outputs_per_taskid[dep] if dep in outputs_per_taskid else set()
                overlap = inputs.intersection(outputs)  # Check for overlap
                if len(overlap) > 0:  # We have input-output pairs, loop to construct datatransfers
                    for file_name in overlap:
                        # Get the size and construct a datatransfer object.
                        data_size = output_file_data_per_task_id[dep][file_name]
                        datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id,
                                                    data_size)
                        # Assign it to the tasks
                        task_per_taskid[dep].datatransfers.append(datatransfer)
                        task.datatransfers.append(datatransfer)
                        outputs_matched[dep][file_name] = True

                        # Remove the file from the input as it's covered. Do NOT remove it from output,
                        # the same output file may be used by another task (fan-out structure).
                        inputs.remove(file_name)
                        data_transfer_id += 1

            # Loop over the remaining input files. Since we do not have a source, we assume they are
            # present on the filesystem beforehand.
            for file_name in inputs:
                data_size = input_file_data_per_task_id[task.id][file_name]
                datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                task.datatransfers.append(datatransfer)
                data_transfer_id += 1

        # Loop over the outputs and create a datatransfer for those that are not matched yet
        # These are likely files with final results that have no destination.
        for task_id in outputs_matched.keys():
            for file_name in outputs_matched[task_id].keys():
                if not outputs_matched[task_id][file_name]:
                    task = task_per_taskid[task_id]
                    data_size = output_file_data_per_task_id[task_id][file_name]
                    datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                    task.datatransfers.append(datatransfer)
                    data_transfer_id += 1

        filename_for_this_partition = "part.0.parquet"

        # Write all tasks to parquet
        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
        task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])
        task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), filename_for_this_partition), engine="pyarrow")

        # Write all task states to parquet
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True)
        task_state_df = pd.DataFrame([task_state.get_parquet_dict() for task_state in task_state_list])
        task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(), filename_for_this_partition),
                                 engine="pyarrow")

        # Write all data transfers to parquet
        if any(len(task.datatransfers) for task in task_list):
            os.makedirs(os.path.join(TARGET_DIR, Datatransfer.output_path()), exist_ok=True)
            datatransfer_df = pd.DataFrame(
                [datatransfer.get_parquet_dict() for task_item in task_list for datatransfer in
                 task_item.datatransfers])

            datatransfer_df.to_parquet(
                os.path.join(TARGET_DIR, Datatransfer.output_path(), filename_for_this_partition),
                engine="pyarrow")

        # Write the workflows to parquet
        wf_agnostic_df = compute_characteristics(task_df)
        workflow_ts_submit = task_df["ts_submit"].min()

        # Determine the application name and field
        application_names = {
            "epigenomics": ("Epigenomics", "Bioinformatics"),
            "montage": ("Montage", "Astronomy"),
            "soykb": ("SoyKB", "Bioinformatics"),
        }

        application_name = ""
        application_field = ""
        for key in application_names.keys():
            if key in file_path:
                application_name = application_names[key][0]
                application_field = application_names[key][1]

        workflow = Workflow(0, workflow_ts_submit, task_list, "Pegasus", "Scientific", application_name,
                            application_field)
        workflow.compute_critical_path()

        wf_df = pd.DataFrame([workflow.get_parquet_dict()])

        return wf_df
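A driver sketch, assuming TARGET_DIR is defined elsewhere in the module and that each Pegasus trace is a standalone JSON file; the glob pattern is an assumption:

import glob

# Hypothetical loop over a directory of Pegasus JSON traces.
for trace_path in glob.glob("traces/*.json"):
    wf_df = parse_and_return_task_dataframe(trace_path)
    print(trace_path, len(wf_df))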
Example #7
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1])

    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 supercomputer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] +
                                                      ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path to spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        print("######\nStart parsing Tasks\n######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            header='true', inferschema='true').load(
                os.path.join(path_to_dir, '*.csv.processed'))

        # Drop the pref column to save memory and filter out unsuccessful jobs, as their information is not reliable
        task_df = task_df.drop('pref').filter(
            task_df.status == ":instance.status/success").drop(
                'status').cache()

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def sub_two_datetimes(s1, s2):
            arr = []
            for i in s1.keys():
                d1 = datetime.datetime.strptime(s1[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                d2 = datetime.datetime.strptime(s2[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                arr.append(int((d2 - d1).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df \
            .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \
            .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time')))

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def date_time_to_unix(series):
            arr = []
            epoch = datetime.datetime.utcfromtimestamp(0)
            for i in series.keys():
                arr.append(
                    np.int64((datetime.datetime.strptime(
                        series[i], '%a %b %d %H:%M:%S %Z %Y') -
                              epoch).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df.withColumn(
            'submit-time',
            date_time_to_unix(F.col('submit-time'))).withColumnRenamed(
                'submit-time',
                "ts_submit").drop('start-time').drop('end-time').cache()

        min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0]
        task_df = task_df.withColumn('ts_submit',
                                     F.col('ts_submit') - F.lit(min_ts))

        @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR)
        def convert_to_kb(v):
            return v * 1024

        task_df = task_df.withColumn('memory', convert_to_kb(
            task_df.memory)).withColumnRenamed("memory", "memory_consumption")

        @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
        def string_to_int(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash(v[i], signed=True))

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def string_to_long(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash64(v[i], signed=True)[0])

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    arr.append(
                        mmh3.hash64(uuid4().bytes, signed=True)
                        [0])  # Assign a UUID, collision chance is negligible.

            return pd.Series(arr)

        task_df = task_df.withColumn('user', string_to_int(
            task_df.user)).withColumnRenamed("user", "user_id")
        task_df = task_df.withColumn('job-uuid',
                                     string_to_long(
                                         F.col('job-uuid'))).withColumnRenamed(
                                             'job-uuid', 'task_id')

        type_udf = F.udf(lambda x: "Independent" if x is None else "Composite",
                         T.StringType())
        task_df = task_df.withColumn('type', type_udf(task_df.simset))

        task_df = task_df.withColumn('simset',
                                     assign_workflow_ids(
                                         F.col('simset'))).withColumnRenamed(
                                             'simset', "workflow_id")
        task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested')

        task_df = task_df.withColumnRenamed('instance', 'resource_used')

        # Set the static items that are not present in the trace
        task_df = task_df.withColumn('submission_site', F.lit(0))
        task_df = task_df.withColumn('parents',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('children',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('group_id', F.lit(0))
        task_df = task_df.withColumn('nfrs', F.lit("{}"))
        task_df = task_df.withColumn('params', F.lit("{}"))
        task_df = task_df.withColumn('memory_requested', F.lit(-1))
        task_df = task_df.withColumn('network_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_space_requested', F.lit(-1))
        task_df = task_df.withColumn('energy_consumption', F.lit(-1))

        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()),
                    exist_ok=True)
        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()),
                              mode="overwrite",
                              compression="snappy")
        print("######\nDone parsing Tasks\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\nStart parsing TaskState\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        task_state_structtype = T.StructType([
            T.StructField("ts_start", T.LongType(), False),
            T.StructField("ts_end", T.LongType(), False),
            T.StructField("workflow_id", T.LongType(), False),
            T.StructField("task_id", T.LongType(), False),
            T.StructField("resource_id", T.LongType(), False),
            T.StructField("cpu_rate", T.DoubleType(), False),
            T.StructField("canonical_memory_usage", T.DoubleType(), False),
            T.StructField("assigned_memory", T.DoubleType(), False),
            T.StructField("minimum_memory_usage", T.DoubleType(), False),
            T.StructField("maximum_memory_usage", T.DoubleType(), False),
            T.StructField("disk_io_time", T.DoubleType(), False),
            T.StructField("maximum_disk_bandwidth", T.DoubleType(), False),
            T.StructField("local_disk_space_usage", T.DoubleType(), False),
            T.StructField("maximum_cpu_rate", T.DoubleType(), False),
            T.StructField("maximum_disk_io_time", T.DoubleType(), False),
            T.StructField("sample_rate", T.DoubleType(), False),
            T.StructField("sample_portion", T.DoubleType(), False),
            T.StructField("sampled_cpu_usage", T.DoubleType(), False),
            T.StructField("network_io_time", T.DoubleType(), False),
            T.StructField("maximum_network_bandwidth", T.DoubleType(), False),
        ])

        @F.pandas_udf(returnType=task_state_structtype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_task_states(df):
            workflow_id = df['workflow_id'].iloc[0]
            task_id = df['task_id'].iloc[0]
            ts_start = df['ts_submit'].min()
            ts_end = ts_start + df['runtime'].max()
            resource_id = df['resource_used'].iloc[0]
            cpu_rate = -1
            canonical_memory_usage = df['memory_consumption'].mean()
            assigned_memory = -1
            minimum_memory_usage = df['memory_consumption'].min()
            maximum_memory_usage = df['memory_consumption'].max()
            disk_io_time = -1
            maximum_disk_bandwidth = -1
            local_disk_space_usage = -1
            maximum_cpu_rate = -1
            maximum_disk_io_time = -1
            sample_rate = -1
            sample_portion = -1
            sampled_cpu_usage = -1
            network_io_time = -1
            maximum_network_bandwidth = -1

            data_dict = {
                "ts_start": ts_start,
                "ts_end": ts_end,
                "workflow_id": workflow_id,
                "task_id": task_id,
                "resource_id": resource_id,
                "cpu_rate": cpu_rate,
                "canonical_memory_usage": canonical_memory_usage,
                "assigned_memory": assigned_memory,
                "minimum_memory_usage": minimum_memory_usage,
                "maximum_memory_usage": maximum_memory_usage,
                "disk_io_time": disk_io_time,
                "maximum_disk_bandwidth": maximum_disk_bandwidth,
                "local_disk_space_usage": local_disk_space_usage,
                "maximum_cpu_rate": maximum_cpu_rate,
                "maximum_disk_io_time": maximum_disk_io_time,
                "sample_rate": sample_rate,
                "sample_portion": sample_portion,
                "sampled_cpu_usage": sampled_cpu_usage,
                "network_io_time": network_io_time,
                "maximum_network_bandwidth": maximum_network_bandwidth,
            }

            return pd.DataFrame(data_dict, index=[0])

        task_state_df = task_df.groupBy(['workflow_id',
                                         'task_id']).apply(compute_task_states)
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()),
                    exist_ok=True)
        task_state_df.write.parquet(os.path.join(TARGET_DIR,
                                                 TaskState.output_path()),
                                    mode="overwrite",
                                    compression="snappy")
        print("######\nDone parsing TaskState\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\nStart parsing Resources\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        resource_id_column = [
            i.resource_used
            for i in task_df.select('resource_used').distinct().collect()
        ]

        resources = []
        for resource_id in resource_id_column:
            resources.append(
                Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1,
                         '').get_parquet_dict())

        resource_df = pd.DataFrame(resources)
        os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()),
                    exist_ok=True)
        resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(),
                                            'part.0.parquet'),
                               engine="pyarrow")
        print("######\nDone parsing Resources\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\nStart parsing Workflows\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(),
                          False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = "Cook"
            total_resources = df['resource_amount_requested'].sum()
            total_memory_usage = df['memory_consumption'].sum()
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(
            compute_workflow_stats)
        workflow_df.explain(True)
        workflow_df.write.parquet(os.path.join(TARGET_DIR,
                                               Workflow.output_path()),
                                  mode="overwrite",
                                  compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Ẁorkload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR,
                                                  Task.output_path()),
                                     engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        pandas_task_df,
        domain="Industrial",
        start_date=None,
        end_date=None,
        authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
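A minimal entry point sketch; the original script's command-line handling is not shown here, so the argument parsing below is an assumption:

if __name__ == "__main__":
    import sys
    # Hypothetical CLI: pass the directory holding the *.csv.processed files.
    parse(sys.argv[1])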
def parse_workflow(wf, filename):
    workflow_id = string2numeric_hash(wf['name'] + '-(' + wf['id'] + ')')
    workflow_domain = ""  # The domain the workflow belongs to, e.g. industry, science, etc.
    workflow_application_name = ""  # The name of the application, e.g. Montage, SIPHT
    workflow_appliation_field = ""  # The field of the application, e.g. bioinformatics, astronomy
    if "bwa" in filename.lower():
        workflow_id = string2numeric_hash("bwa" + '-(' + wf['id'] + ')')
        workflow_domain = "science"
        workflow_application_name = "Burroughs-Wheeler Alignment tool"
        workflow_appliation_field = "bioinformatics"
    elif "wien2k" in filename.lower():
        workflow_id = string2numeric_hash("wien2k" + '-(' + wf['id'] + ')')
        workflow_domain = "science"
        workflow_application_name = "Wien2k"
        workflow_appliation_field = "materials chemistry"
    resources = {}
    for r in wf['resources']:  # parse resources for tasks later
        r_details = r['details']
        os = "Linux"
        details = {}
        details['provider'] = r_details['provider']
        details['instanceType'] = r_details['instanceType']
        events = parse_events(r['events'])
        # id, type, num_resources, proc_model_name, memory, disk_space, network_bandwidth, operating_system, details=None, events=None
        resources[string2numeric_hash(r['id'])] = Resource(
            string2numeric_hash(r['name']), r['type'], r_details['vCPUs'],
            r_details['physicalProcessor'], r_details['memory'],
            r_details['storage'], r_details['networkPerformance'], os, details,
            events)
    # create list of tasks for parents
    if "wien2k" in filename.lower():
        first = []
        second = []
        third = []
        fourth = []
        last = []
        for t in wf['tasks']:
            if "first" in t['name'].lower():
                first.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "second" in t['name'].lower():
                second.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "third" in t['name'].lower():
                third.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "fourth" in t['name'].lower():
                fourth.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "last" in t['name'].lower():
                last.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
    elif "bwa" in filename.lower():
        bwaindex_split1_2 = []
        bwa1aln = []
        bwaconcat = []
        for t in wf['tasks']:
            if "bwa:bwaindex" in t['name'].lower():
                bwaindex_split1_2.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:split1" in t['name'].lower():
                bwaindex_split1_2.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:split2" in t['name'].lower():
                bwaindex_split1_2.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:bwa1aln" in t['name'].lower():
                bwa1aln.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:concat" in t['name'].lower():
                bwaconcat.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
    tasks = []
    for t in wf['tasks']:  # parse tasks
        if "cloud init" not in t['name'].lower(
        ) and "cloud instances" not in t['name'].lower(
        ) and "parforiteration" not in t['type'].lower(
        ) and "parallel" not in t['type'].lower(
        ) and "section" not in t['type'].lower():
            parents = []
            if "wien2k" in filename.lower():
                if "second" in t['name'].lower():
                    parents = first
                if "third" in t['name'].lower():
                    parents = second
                if "fourth" in t['name'].lower():
                    parents = third
                if "last" in t['name'].lower():
                    parents = fourth
            elif "bwa" in filename.lower():
                if "bwa:bwa1aln" in t['name'].lower():
                    parents = bwaindex_split1_2
                if "bwa:concat" in t['name'].lower():
                    parents = bwa1aln
            # print(parents)
            submission_site = string2numeric_hash(t['submissionSite'])
            res = None
            if submission_site != '':
                res = resources[submission_site]
            params = parse_params(t['params'])
            events = parse_events(t['events'])
            wait_time = 0
            if "ACTIVE" in events:
                wait_time = int((parse(events["ACTIVE"]) -
                                 parse(t['startTime'])).total_seconds() * 1000)
            # id, type, ts_submit,
            # submission_site, runtime, resource_amount_requested, parents,
            # workflow_id, wait_time, resource_type="cpu", resource=None, datatransfer=None, params=None, events=None, requirements=None, user_id=-1, group_id=-1, memory_requested=-1, disk_space_requested=-1, disk_io_time=-1, network_io_time=-1, energy_consumption=-1
            tasks.append(
                Task(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')),
                    t['type'], int(parse(t['startTime']).timestamp() * 1000),
                    submission_site,
                    int((parse(t['endTime']) -
                         parse(t['startTime'])).total_seconds() * 1000), 1,
                    parents, workflow_id, wait_time, "CPU", res,
                    parse_datatransfers(t['fileTransfers']), params, events))
    ts_start = min(t['startTime'] for t in wf['tasks'])
    # id, ts_submit, tasks, scheduler_description
    return Workflow(workflow_id,
                    int(parse(wf['beginTime']).timestamp() * 1000), tasks,
                    wf['scheduler'], workflow_domain,
                    workflow_application_name, workflow_appliation_field)
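A usage sketch for parse_workflow, assuming each trace file holds a single workflow object with the fields referenced above (name, id, resources, tasks, beginTime, scheduler); the file name is hypothetical:

import json

# Hypothetical: load one wien2k trace and convert it into a Workflow object.
with open("wien2k_run1.json") as f:
    wf_json = json.load(f)
workflow = parse_workflow(wf_json, "wien2k_run1.json")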
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[-1])

    if "DAS5" in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ["HOSTNAME"] +
                                                      ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "256G") \
            .config("spark.driver.maxResultSize", "40G") \
            .config("spark.network.timeout", "100000s") \
            .config("spark.rpc.askTimeout", "100000s") \
            .config("spark.default.parallelism", "2000") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .config("spark.cleaner.periodicGC.interval", "5s") \
            .getOrCreate()
    else:
        import findspark
        findspark.init("<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[4]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "2G") \
            .config("spark.driver.memory", "2G") \
            .getOrCreate()

    machine_meta = spark.read.csv(os.path.join(path_to_dir,
                                               "machine_meta.csv"),
                                  schema=StructType([
                                      StructField("machine_id", StringType(),
                                                  True),
                                      StructField("time_stamp", LongType(),
                                                  True),
                                      StructField("failure_domain_1",
                                                  LongType(), True),
                                      StructField("failure_domain_2",
                                                  StringType(), True),
                                      StructField("cpu_num", LongType(), True),
                                      StructField("mem_size", LongType(),
                                                  True),
                                      StructField("status", StringType(), True)
                                  ]))

    machine_usage = spark.read.csv(os.path.join(path_to_dir,
                                                "machine_usage.csv"),
                                   schema=StructType([
                                       StructField("machine_id", StringType(),
                                                   True),
                                       StructField("time_stamp", DoubleType(),
                                                   True),
                                       StructField("cpu_util_percent",
                                                   LongType(), True),
                                       StructField("mem_util_percent",
                                                   LongType(), True),
                                       StructField("mem_gps", DoubleType(),
                                                   True),
                                       StructField("mkpi", LongType(), True),
                                       StructField("net_in", DoubleType(),
                                                   True),
                                       StructField("net_out", DoubleType(),
                                                   True),
                                       StructField("disk_io_percent",
                                                   DoubleType(), True)
                                   ]))

    container_meta = spark.read.csv(
        os.path.join(path_to_dir, "container_meta.csv"),
        schema=StructType([
            StructField("container_id", StringType(), True),
            StructField("machine_id", StringType(), True),
            StructField("time_stamp", LongType(), True),
            StructField("app_du", StringType(), True),
            StructField("status", StringType(), True),
            StructField("cpu_request", LongType(), True),
            StructField("cpu_limit", LongType(), True),
            StructField("mem_size", DoubleType(), True)
        ]))

    container_usage = spark.read.csv(os.path.join(path_to_dir,
                                                  "container_usage.csv"),
                                     schema=StructType([
                                         StructField("container_id",
                                                     StringType(), True),
                                         StructField("machine_id",
                                                     StringType(), True),
                                         StructField("time_stamp",
                                                     DoubleType(), True),
                                         StructField("cpu_util_percent",
                                                     LongType(), True),
                                         StructField("mem_util_percent",
                                                     LongType(), True),
                                         StructField("cpi", DoubleType(),
                                                     True),
                                         StructField("mem_gps", DoubleType(),
                                                     True),
                                         StructField("mpki", LongType(), True),
                                         StructField("net_in", DoubleType(),
                                                     True),
                                         StructField("net_out", DoubleType(),
                                                     True),
                                         StructField("disk_io_percent",
                                                     DoubleType(), True)
                                     ]))

    batch_task = spark.read.csv(os.path.join(path_to_dir, "batch_task.csv"),
                                schema=StructType([
                                    StructField("task_name", StringType(),
                                                True),
                                    StructField("instance_num", LongType(),
                                                True),
                                    StructField("job_name", StringType(),
                                                True),
                                    StructField("task_type", StringType(),
                                                True),
                                    StructField("status", StringType(), True),
                                    StructField("start_time", LongType(),
                                                True),
                                    StructField("end_time", LongType(), True),
                                    StructField("plan_cpu", DoubleType(),
                                                True),
                                    StructField("plan_mem", DoubleType(), True)
                                ]))

    batch_instance = spark.read.csv(
        os.path.join(path_to_dir, "batch_instance.csv"),
        schema=StructType([
            StructField("instance_name", StringType(), True),
            StructField("task_name", StringType(), True),
            StructField("job_name", StringType(), True),
            StructField("task_type", StringType(), True),
            StructField("status", StringType(), True),
            StructField("start_time", LongType(), True),
            StructField("end_time", LongType(), True),
            StructField("machine_id", StringType(), True),
            StructField("seq_no", LongType(), True),
            StructField("total_seq_no", LongType(), True),
            StructField("cpu_avg", DoubleType(), True),
            StructField("cpu_max", DoubleType(), True),
            StructField("mem_avg", DoubleType(), True),
            StructField("mem_max", DoubleType(), True)
        ]))
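
    # Grouped-map UDF applied per job_name: it decodes the dependency encoding in
    # task_name (a leading "task_" marks an independent bag of tasks; otherwise the
    # first split carries the task id prefixed by a type letter and the remaining
    # numeric splits are the raw parent ids), drops instances with missing
    # timestamps or CPU figures, inserts dummy parent/child tasks around waves with
    # more than 10 instances, and emits one parquet-ready row per Task.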

    @F.pandas_udf(returnType=Task.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def clean_tasks_of_workflow(df):
        tasks = dict()
        raw_id_to_instances = dict()

        job_name = df.loc[0, "job_name"]
        workflow_id = mmh3.hash64(job_name)[1]

        invalid_task_raw_ids = set()

        # group by task name
        # - count number of instances
        # - compare with row.instance_num

        # Check to inspect if the data is noisy
        # def check(pdf):
        #     a = pdf["instance_name"].nunique()
        #     b = pdf["instance_name"].astype(np.int64).min()
        #     c = pdf["instance_name"].astype(np.int64).max()
        #     d = pdf["instance_num"].min()
        #     e = pdf["instance_num"].max()
        #     f = pdf["instance_name"].count()
        #     if d != e or b < 0 or c >= e or a != d or a != f:
        #         print("Noisy data! {}, {}, {}, {}, {}, {}".format(a, b, c, d, e, f))
        #
        # df.groupby("task_name").apply(check)

        for row in df.itertuples(index=False):
            if None in row:
                print(row, flush=True)
            task_name = row.task_name
            instance_name = str(row.instance_name)
            memory_requested = row.plan_mem
            resources_requested = row.plan_cpu
            resource_id = row.machine_id

            splits = task_name.split("_")

            if splits[0] == "task":
                cleaned_task_name = splits[1]
                task_type = "bag"
                raw_parents = []
            else:
                cleaned_task_name = splits[0][1:]
                task_type = str(splits[0][0])
                raw_parents = [x for x in splits[1:] if x.isdigit()]

            if resource_id is None:
                resource_id = -1
            else:
                resource_id = mmh3.hash64(row.machine_id)[1]

            if row.end_time is None or math.isnan(row.end_time):
                invalid_task_raw_ids.add(cleaned_task_name)
                continue

            if row.start_time is None or math.isnan(row.start_time):
                invalid_task_raw_ids.add(cleaned_task_name)
                continue

            if memory_requested is None or math.isnan(memory_requested):
                memory_requested = -1

            if resources_requested is None or math.isnan(resources_requested):
                avg_cpu = row.cpu_avg
                if avg_cpu is None or math.isnan(avg_cpu):
                    invalid_task_raw_ids.add(cleaned_task_name)
                    continue
                else:
                    resources_requested = avg_cpu

            this_task_id = mmh3.hash64(job_name + "@" + cleaned_task_name +
                                       "@" + instance_name)[1]

            if cleaned_task_name not in raw_id_to_instances:
                raw_id_to_instances[cleaned_task_name] = row.instance_num

            if row.instance_num > 10:
                # Create parent and child tasks
                raw_parent_id = cleaned_task_name + "_p"
                parent_task_id = mmh3.hash64(job_name + "@" + raw_parent_id +
                                             "@" + "0")[1]
                if parent_task_id not in tasks:
                    tasks[parent_task_id] = Task(
                        id=parent_task_id,
                        type="dummy",
                        submission_site=0,
                        runtime=0,
                        ts_submit=row.start_time * 1000,
                        # We convert time from seconds to milliseconds.
                        resource_amount_requested=1,
                        parents=raw_parents,
                        workflow_id=workflow_id,
                        wait_time=0,
                        resource_type='core',
                        resource=-1,
                        memory_requested=-1)
                    raw_id_to_instances[raw_parent_id] = 1

                raw_child_id = cleaned_task_name + "_c"
                child_task_id = mmh3.hash64(job_name + "@" + raw_child_id +
                                            "@" + "0")[1]
                if child_task_id not in tasks:
                    tasks[child_task_id] = Task(
                        id=child_task_id,
                        type="dummy",
                        submission_site=0,
                        runtime=0,
                        ts_submit=row.start_time * 1000,
                        # We convert time from seconds to milliseconds.
                        resource_amount_requested=1,
                        parents=[cleaned_task_name],
                        workflow_id=workflow_id,
                        wait_time=0,
                        resource_type='core',
                        resource=-1,
                        memory_requested=-1,
                        params="child")
                    raw_id_to_instances[raw_child_id] = 1

                raw_parents = [raw_parent_id]

            this_task = Task(
                id=this_task_id,
                type=task_type,
                submission_site=0,
                runtime=(row.end_time - row.start_time) * 1000,
                ts_submit=row.start_time *
                1000,  # We convert time from seconds to milliseconds.
                resource_amount_requested=resources_requested,
                parents=raw_parents,
                workflow_id=workflow_id,
                params=task_name + " $ " + instance_name + " $ " +
                str(row.instance_num) + " $ " + job_name,
                wait_time=0,
                resource_type='core',
                resource=resource_id,
                memory_requested=memory_requested)

            tasks[this_task_id] = this_task

        for task_id, task in tasks.items():
            task.parents = [
                p for p in task.parents if p not in invalid_task_raw_ids
            ]
            parents = []
            for raw_parent_id in task.parents:
                # If the previous wave has a child task and this task is not that
                # child, refer to the child instead of the wave.
                if raw_parent_id + "_c" in raw_id_to_instances and task.params != "child":
                    raw_parent_id = raw_parent_id + "_c"

                # We might hit an edge case where a parent was not recorded by the system of Alibaba
                # (e.g. bug or the tracing stopped)
                if raw_parent_id not in raw_id_to_instances:
                    continue

                parent_instances = raw_id_to_instances[raw_parent_id]

                proper_parent_ids = []
                for x in range(parent_instances):
                    # Alibaba tasks specify instance_num, but these instances may not necessarily
                    # appear in the data, so we need to check if they are actually encountered.
                    hash = mmh3.hash64(job_name + "@" + raw_parent_id + "@" +
                                       str(x))[1]
                    if hash in tasks:
                        proper_parent_ids.append(hash)

                parents.extend(proper_parent_ids)
                for proper_id in proper_parent_ids:
                    tasks[proper_id].children.add(task_id)

            # task.params = None
            task.parents = parents

        # ze_best = pd.concat(pandas_dataframes)
        parquet_dicts = [task.get_parquet_dict() for task in tasks.values()]
        if len(tasks) > 0:
            ret = pd.DataFrame(parquet_dicts)
        else:  # If no task was valid, return an empty DF with the columns set. Otherwise Spark goes boom.
            ret = pd.DataFrame(columns=Task.get_parquet_meta_dict().keys())
        return ret
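
    # Grouped-map UDF applied per container_id: it collapses the container_meta rows
    # of a long-running container into a single Task whose ts_submit and runtime
    # span the observed time_stamp range and whose workflow id is derived from app_du.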

    @F.pandas_udf(returnType=Task.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def container_to_task(df):
        row = df.iloc[0, :]
        start_time = df["time_stamp"].min() * 1000
        stop_time = df["time_stamp"].max() * 1000
        task_id = mmh3.hash64(row["container_id"])[1]
        workflow_id = mmh3.hash64(row["app_du"])[1]

        task = Task(
            id=task_id,
            type="long running",
            parents=[],
            ts_submit=start_time,  # We convert time from seconds to milliseconds.
            submission_site=0,
            runtime=(stop_time - start_time),  # Durations must be non-negative.
            resource_amount_requested=row["cpu_request"],
            memory_requested=row["mem_size"],
            workflow_id=workflow_id,
            wait_time=0,
            resource=mmh3.hash64(row["machine_id"])[1])

        return pd.DataFrame([task.get_parquet_dict()])

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        # Rename instances
        # This allows instance names to be derived using just the task name and number of instances of the task.
        task_window = Window.partitionBy("job_name",
                                         "task_name").orderBy("start_time")
        # Subtract 1 because row_number() starts at 1, so instance indices start at 0 and later iteration is more intuitive.
        # We are using instance name as an index in a particular job and task.
        instances_renamed = batch_instance.withColumn(
            "instance_name",
            (F.row_number().over(task_window) - F.lit(1)).cast(StringType()))

        tasks_unconverted = instances_renamed.join(
            batch_task.select("job_name", "task_name", "instance_num",
                              "plan_cpu", "plan_mem"),
            on=["job_name", "task_name"],
            how="inner")

        # 100% this line is the issue.
        tasks_converted = tasks_unconverted.groupby("job_name").apply(
            clean_tasks_of_workflow)

        # if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        #     tasks_converted.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite")

        long_running_tasks = container_meta.groupBy("container_id").apply(
            container_to_task)

        all_tasks = tasks_converted.union(long_running_tasks).dropna()

        try:
            all_tasks.printSchema()
            all_tasks.write.parquet(os.path.join(TARGET_DIR,
                                                 Task.output_path()),
                                    mode="overwrite")
        except Exception as e:
            print(e, flush=True)
            raise e

    @F.pandas_udf(returnType=TaskState.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def task_states_from_instances(df):
        task_states = []

        workflow_id = mmh3.hash64(df.loc[0, "job_name"])[1]

        for index, row in df.iterrows():
            job_name = row["job_name"]
            task_name = row["task_name"]
            instance_name = row["instance_name"]

            splits = task_name.split("_")
            just_task_name = splits[0][
                1:]  # The first letter is irrelevant as it corresponds to nature of task (map or reduce)
            # and has nothing to do with the structure of the workflow.

            this_task_id = mmh3.hash64(job_name + "@" + just_task_name + "@" +
                                       instance_name)[1]

            this_task_state = TaskState(ts_start=row["start_time"] * 1000,
                                        ts_end=row["end_time"] * 1000,
                                        workflow_id=workflow_id,
                                        task_id=this_task_id,
                                        resource_id=mmh3.hash64(
                                            row["machine_id"])[1],
                                        cpu_rate=row["cpu_avg"],
                                        canonical_memory_usage=row["mem_avg"],
                                        maximum_cpu_rate=row["cpu_max"],
                                        maximum_memory_usage=row["mem_max"])

            # np.isnan cannot be applied to a dict_values view, so check the values individually.
            state_values = this_task_state.get_parquet_dict().values()
            if None in state_values or any(
                    isinstance(v, float) and np.isnan(v) for v in state_values):
                print(this_task_state.get_parquet_dict())
                raise RuntimeError(this_task_state.get_parquet_dict())
            task_states.append(this_task_state.get_parquet_dict())

        return pd.DataFrame(task_states)

    @F.pandas_udf(returnType=TaskState.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def task_states_from_container_usage(df):
        machine_id = mmh3.hash64(df.loc[0, "machine_id"])[1]

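        # Per container, consecutive usage samples are turned into back-to-back TaskState intervals.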
        def convert(cont_df):
            task_states = []

            # The per-container groups keep the parent frame's index, so use positional access.
            prev_end_time = cont_df["start_time"].iloc[0] * 1000
            container_id = mmh3.hash64(cont_df["container_id"].iloc[0])[1]
            app_id = mmh3.hash64(cont_df["app_du"].iloc[0])[1]

            sorted_df = cont_df.sort_values("time_stamp")
            for index, row in sorted_df.iterrows():
                this_end_time = row["time_stamp"] * 1000

                this_task_state = TaskState(
                    ts_start=prev_end_time,
                    ts_end=this_end_time,
                    workflow_id=app_id,
                    task_id=container_id,
                    resource_id=machine_id,
                    cpu_rate=row["cpu_util_percent"],
                    canonical_memory_usage=row["mem_util_percent"],
                    maximum_disk_bandwidth=row["disk_io_percent"],
                    network_in=row["net_in"],
                    network_out=row["net_out"])

                prev_end_time = this_end_time

                parquet_dict = this_task_state.get_parquet_dict()
                # np.isnan does not accept dict_values; check each value explicitly.
                if any(v is None or (isinstance(v, float) and np.isnan(v))
                       for v in parquet_dict.values()):
                    print(parquet_dict)
                    raise ArithmeticError(parquet_dict)
                task_states.append(parquet_dict)

            return pd.DataFrame(task_states)

        return df.groupby("container_id").apply(convert).reset_index(
            drop=True).fillna(-1)

    # Now, derive workflows from tasks
    @F.pandas_udf(returnType=Workflow.get_spark_type(),
                  functionType=F.PandasUDFType.GROUPED_MAP)
    def compute_workflow_stats(df):
        tasks = []

        for index, row in df.iterrows():
            this_task = Task(
                id=row["id"],
                type=row["type"],
                ts_submit=row["ts_submit"],  # Timestamps were already converted from seconds to milliseconds.
                submission_site=0,
                runtime=row["runtime"],
                resource_amount_requested=row["resource_amount_requested"],
                memory_requested=row["memory_requested"],
                parents=row["parents"],
                workflow_id=row["workflow_id"],
                wait_time=row["wait_time"],
                resource=row["resource_used"])
            # print(this_task.get_parquet_dict())
            tasks.append(this_task)

        workflow = Workflow(id=df.loc[0, "workflow_id"],
                            ts_submit=df["ts_submit"].min(),
                            tasks=tasks,
                            scheduler_description="Fuxi",
                            workflow_domain="Industrial",
                            workflow_application_name="MapReduce",
                            workflow_appliation_field="Internet Services")

        try:
            workflow.compute_critical_path()
        except toposort.CircularDependencyError:  # TODO: some workflows have cyclic dependencies; check whether this stems from our parsing or from the data itself.
            pass

        return pd.DataFrame([workflow.get_parquet_dict()])

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        tasks_df = spark.read.parquet(
            os.path.join(TARGET_DIR, Task.output_path())
        )  # Read back the task parquet files written above; Spark will not pick them up automatically.
        workflow_df = tasks_df.groupBy("workflow_id").apply(
            compute_workflow_stats)

        workflow_df.write.parquet(os.path.join(TARGET_DIR,
                                               Workflow.output_path()),
                                  mode="overwrite",
                                  compression="snappy")

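    # Map a machine_meta row onto a Resource row; the events field is dropped before building the Spark Row.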
    def machine_meta_to_resources(row):
        resource = Resource(
            id=mmh3.hash64(row["machine_id"])[1],
            type="cpu",
            num_resources=float(row["cpu_num"]),
            memory=row["mem_size"],
        )
        resource_dict = resource.get_json_dict()
        del resource_dict["events"]
        return SparkRow(**resource_dict)

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource DF\n ######")
        resource_df = machine_meta.rdd.map(machine_meta_to_resources).toDF(
            Resource.get_spark_type())
        resource_df.write.parquet(os.path.join(TARGET_DIR,
                                               Resource.output_path()),
                                  mode="overwrite",
                                  compression="snappy")

    print("######\n Start parsing Workload\n ######")
    if "tasks_df" not in locals():
        tasks_df = spark.read.parquet(
            os.path.join(TARGET_DIR, Task.output_path())
        )  # Read back the task parquet files written above; Spark will not pick them up automatically.
    json_dict = Workload.get_json_dict_from_spark_task_dataframe(
        tasks_df, domain="Industrial", authors=["Alibaba 2018"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)
    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # np.int64 values are not JSON serializable; convert them to plain ints.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")
Beispiel #10
0
def parse(path_to_dir):
    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 supercomputer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    # Convert times which are in microseconds and do not fit in a long to milliseconds
    convert_micro_to_milliseconds = F.udf(lambda x: x / 1000)

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\n Start parsing TaskState\n ######")
        task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
            os.path.join(path_to_dir, 'task_usage', '*.csv'))
        # task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
        #     'fake_task_usage.csv')
        oldColumns = task_usage_df.schema.names
        newColumns = ["ts_start",
                      "ts_end",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "cpu_rate",
                      "memory_consumption",
                      "assigned_memory_usage",
                      "unmapped_page_cache",
                      "total_page_cache",
                      "max_memory_usage",
                      "mean_disk_io_time",
                      "mean_local_disk_space_usage",
                      "max_cpu_rate",
                      "max_disk_io_time",
                      "cycles_per_instruction",
                      "memory_accesses_per_instruction",
                      "sample_portion",
                      "aggregation_type",
                      "sampled_cpu_usage", ]

        task_usage_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                               range(len(oldColumns)), task_usage_df)

        # Drop columns whose details are too low-level for this analysis
        task_usage_df = task_usage_df.drop('memory_accesses_per_instruction')
        task_usage_df = task_usage_df.drop('cycles_per_instruction')
        task_usage_df = task_usage_df.drop('unmapped_page_cache')
        task_usage_df = task_usage_df.drop('total_page_cache')

        # Convert the timestamps from microseconds to milliseconds and cast them to long.
        task_usage_df = task_usage_df.withColumn('ts_start', convert_micro_to_milliseconds(F.col('ts_start')))
        task_usage_df = task_usage_df.withColumn('ts_start', F.col('ts_start').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('ts_end', convert_micro_to_milliseconds(F.col('ts_end')))
        task_usage_df = task_usage_df.withColumn('ts_end', F.col('ts_end').cast(T.LongType()))

        # Some fields contain stray symbols; strip every character that cannot be part of a number.
        truncate_at_lt_symbol_udf = F.udf(lambda x: re.sub(r'[^0-9.eE\-+]', '', str(x)) if x is not None else x)
        task_usage_df = task_usage_df.withColumn('workflow_id', truncate_at_lt_symbol_udf(F.col('workflow_id')))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', truncate_at_lt_symbol_udf(F.col('max_cpu_rate')))

        # Now that the columns have been sanitized, cast them to the right type
        task_usage_df = task_usage_df.withColumn('workflow_id', F.col('workflow_id').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', F.col('max_cpu_rate').cast(T.FloatType()))

        task_usage_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite",
                                    compression="snappy")
        print("######\n Done parsing TaskState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):

        if 'task_usage_df' not in locals():
            task_usage_df = spark.read.parquet(os.path.join(TARGET_DIR, TaskState.output_path()))

        print("######\n Start parsing Tasks\n ######")
        task_df = spark.read.format('com.databricks.spark.csv').options(inferschema="true", mode="FAILFAST",
                                                                        parserLib="univocity").load(
            os.path.join(path_to_dir, 'task_events', '*.csv'))

        oldColumns = task_df.schema.names
        newColumns = ["ts_submit",
                      "missing_info",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "event_type",
                      "user_id",
                      "scheduler",
                      "nfrs",
                      "resources_requested",
                      "memory_requested",
                      "disk_space_request",
                      "machine_restrictions", ]

        task_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                         range(len(oldColumns)), task_df)

        task_df = task_df.withColumn('ts_submit', convert_micro_to_milliseconds(F.col('ts_submit')))
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit').cast(T.LongType()))

        # Filter tasks that never reached completion
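        # Event types 0, 1 and 4 correspond to SUBMIT, SCHEDULE and FINISH in the Google trace format.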
        task_df.createOrReplaceTempView("task_table")
        task_df = spark.sql("""WITH filtered_tasks AS (
        SELECT DISTINCT t1.workflow_id AS workflow_id, t1.id AS id
            FROM task_table t1
            WHERE t1.event_type IN (0, 1, 4)
            GROUP BY t1.workflow_id, t1.id
            HAVING COUNT(DISTINCT event_type) = 3
        )
    SELECT t.*
    FROM task_table t INNER JOIN filtered_tasks f
    ON t.id = f.id AND t.workflow_id = f.workflow_id""")

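        # Output schema of the per-(workflow_id, id) aggregation performed by the grouped-map UDF below.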
        task_aggregation_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("type", T.StringType(), True),
            T.StructField("ts_submit", T.LongType(), True),
            T.StructField("submission_site", T.LongType(), True),
            T.StructField("runtime", T.LongType(), True),
            T.StructField("resource_type", T.StringType(), True),
            T.StructField("resource_amount_requested", T.DoubleType(), True),
            T.StructField("parents", T.ArrayType(T.LongType()), True),
            T.StructField("children", T.ArrayType(T.LongType()), True),
            T.StructField("user_id", T.LongType(), True),
            T.StructField("group_id", T.LongType(), True),
            T.StructField("nfrs", T.StringType(), True),
            T.StructField("wait_time", T.LongType(), True),
            T.StructField("params", T.StringType(), True),
            T.StructField("memory_requested", T.DoubleType(), True),
            T.StructField("network_io_time", T.DoubleType(), True),
            T.StructField("disk_space_requested", T.DoubleType(), True),
            T.StructField("energy_consumption", T.DoubleType(), True),
            T.StructField("resource_used", T.StringType(), True),
        ])

        # Compute based on the event type
        @F.pandas_udf(returnType=task_aggregation_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_aggregated_task_usage_metrics(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            task_submit_time = df[df['event_type'] == 0]['ts_submit'].min(skipna=True)
            task_start_time = df[df['event_type'] == 1]['ts_submit'].min(skipna=True)
            task_finish_time = df[df['event_type'] == 4]['ts_submit'].max(skipna=True)

            # min()/max() return NaN rather than None on empty selections, so use pd.isnull here.
            if any(pd.isnull(t) for t in (task_submit_time, task_start_time, task_finish_time)):
                # A grouped-map pandas UDF must return a DataFrame; return an empty one with the expected columns.
                return pd.DataFrame(columns=[field.name for field in task_aggregation_structtype])

            task_resource_request = df['resources_requested'].max(skipna=True)
            task_memory_request = df['memory_requested'].max(skipna=True)
            task_priority = df['nfrs'].max(skipna=True)
            task_disk_space_requested = df['disk_space_request'].max(skipna=True)

            task_machine_id_list = df.resource_id.unique()

            task_waittime = int(task_start_time) - int(task_submit_time)
            task_runtime = int(task_finish_time) - int(task_start_time)

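            # np.int64 is not JSON serializable; convert it to a plain int.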
            def default(o):
                if isinstance(o, np.int64):
                    return int(o)

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "type": "",  # Unknown
                "ts_submit": task_submit_time,
                "submission_site": -1,  # Unknown
                "runtime": task_runtime,
                "resource_type": "core",  # Fields are called CPU, but they are core count (see Google documentation)
                "resource_amount_requested": task_resource_request,
                "parents": [],
                "children": [],
                "user_id": mmh3.hash64(get_first_non_value_in_column("user_id"))[0],
                "group_id": -1,
                "nfrs": json.dumps({"priority": task_priority}, default=default),
                "wait_time": task_waittime,
                "params": "{}",
                "memory_requested": task_memory_request,
                "network_io_time": -1,  # Unknown
                "disk_space_requested": task_disk_space_requested,
                "energy_consumption": -1,  # Unknown
                "resource_used": json.dumps(task_machine_id_list, default=default),
            }

            return pd.DataFrame(data_dict, index=[0])

        task_df = task_df.groupBy(["workflow_id", "id"]).apply(compute_aggregated_task_usage_metrics)
        task_df.explain(True)

        # Now add disk IO time - This cannot be done in the previous Pandas UDF function as
        # accessing another dataframe in the apply function is not allowed
        disk_io_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("disk_io_time", T.DoubleType(), True),
        ])

        @F.pandas_udf(returnType=disk_io_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_disk_io_time(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

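            # Integrate mean_disk_io_time over each measurement interval; the division by 1000 converts the millisecond durations to seconds.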
            disk_io_time = ((df['ts_end'] - df['ts_start']) * df['mean_disk_io_time']).sum(skipna=True) / 1000
            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "disk_io_time": disk_io_time
            }

            return pd.DataFrame(data_dict, index=[0])

        disk_io_df = task_usage_df.select(['workflow_id', 'id', 'mean_disk_io_time', 'ts_end', 'ts_start']).groupBy(
            ["workflow_id", "id"]).apply(compute_disk_io_time)
        disk_io_df.explain(True)

        task_df = task_df.join(disk_io_df, ["workflow_id", "id"])

        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy")
        print("######\n Done parsing Tasks\n ######")
    else:
        task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource\n ######")
        # Parse the machine information in the traces; these should match the resource_ids in task_usage.
        resources_structtype = T.StructType([  # Columns other than the id are typed as strings since they are dropped below
            T.StructField("time", T.StringType(), False),
            T.StructField("id", T.LongType(), False),
            T.StructField("attribute_name", T.StringType(), False),
            T.StructField("attribute_value", T.StringType(), False),
            T.StructField("attribute_deleted", T.StringType(), False),
        ])

        resource_df = spark.read.format('com.databricks.spark.csv').schema(resources_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_attributes', '*.csv'))

        resource_df = resource_df.select(["id"])  # Only keep the ID; the rest is not needed.

        # Since the information in the traces is completely opaque, we use the educated guess from Amvrosiadis et al.
        # in their ATC 2018 article.
        resource_df = resource_df.withColumn('type', F.lit("core"))
        resource_df = resource_df.withColumn('num_resources', F.lit(8))
        resource_df = resource_df.withColumn('proc_model', F.lit("AMD Opteron Barcelona"))
        resource_df = resource_df.withColumn('memory', F.lit(-1))
        resource_df = resource_df.withColumn('disk_space', F.lit(-1))
        resource_df = resource_df.withColumn('network', F.lit(-1))
        resource_df = resource_df.withColumn('os', F.lit(""))
        resource_df = resource_df.withColumn('details', F.lit("{}"))

        # Write the resource_df to the specified location
        resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Resource\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, ResourceState.output_path())):
        print("######\n Start parsing ResourceState\n ######")
        resource_events_structtype = T.StructType([
            T.StructField("timestamp", T.DecimalType(20, 0), False),
            T.StructField("machine_id", T.LongType(), False),
            T.StructField("event_type", T.IntegerType(), False),
            T.StructField("platform_id", T.StringType(), False),
            T.StructField("available_resources", T.FloatType(), False),
            T.StructField("available_memory", T.FloatType(), False),
        ])

        resource_event_df = spark.read.format('com.databricks.spark.csv').schema(resource_events_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_events', '*.csv'))

        resource_event_df = resource_event_df.withColumn('timestamp', convert_micro_to_milliseconds(F.col('timestamp')))
        resource_event_df = resource_event_df.withColumn('timestamp', F.col('timestamp').cast(T.LongType()))

        resource_event_df = resource_event_df.withColumn('available_disk_space', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_disk_io_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_network_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_1_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_5_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_15_minute', F.lit(-1))

        # Write the resource_df to the specified location
        resource_event_df.write.parquet(os.path.join(TARGET_DIR, ResourceState.output_path()), mode="overwrite",
                                        compression="snappy")
        print("######\n Done parsing ResourceState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\n Start parsing Workflows\n ######")
        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1  # We do not know the task dependencies, so -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = ""
            total_resources = df['resource_amount_requested'].sum()  # TODO or assigned?
            total_memory_usage = df['memory_requested'].sum()  # TODO or consumption, or assigned?
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id, "ts_submit": ts_submit, 'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks, 'nfrs': nfrs, 'scheduler': scheduler,
                'total_resources': total_resources, 'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage, 'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        # Create and write the workflow dataframe
        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)

        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Workflows\n ######")

    print("######\n Start parsing Workload\n ######")
    json_dict = Workload.get_json_dict_from_spark_task_dataframe(task_df,
                                                                 domain="Industrial",
                                                                 start_date="2011-05-01",
                                                                 end_date="2011-05-30",
                                                                 authors=["Google"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # np.int64 values are not JSON serializable; convert them to plain ints.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")