def analyse(self): requtilization = 0 actutilization = 0 software = {"Unmanaged": 0} nnodes = [] reqdurations = [] actdurations = [] nleases = len(self.workload.get_leases()) for lease in self.workload.get_leases(): if lease.start.requested == "Unspecified": start = lease.submit_time else: start = lease.start.requested if start + lease.duration.requested > self.starttime + self.utilization_length: reqduration = (self.starttime + self.utilization_length - start).seconds else: reqduration = lease.duration.requested.seconds if lease.duration.known != None: if start + lease.duration.known > self.starttime + self.utilization_length: actduration = (self.starttime + self.utilization_length - start).seconds else: actduration = lease.duration.known.seconds else: actduration = reqduration for res in lease.requested_resources.values(): for i in range(1,res.get_ninstances("CPU") + 1): requtilization += (res.get_quantity_instance("CPU", i) / 100.0) * reqduration actutilization += (res.get_quantity_instance("CPU", i) / 100.0) * actduration nnodes.append(len(lease.requested_resources)) reqdurations.append(lease.duration.requested.seconds) if lease.duration.known != None: actdurations.append(lease.duration.known.seconds) if isinstance(lease.software, UnmanagedSoftwareEnvironment): software["Unmanaged"] += 1 elif isinstance(lease.software, DiskImageSoftwareEnvironment): image = lease.software.image_id software[image] = software.setdefault(image, 0) +1 if self.site != None: max_utilization = 0 duration = self.utilization_length.seconds for res in self.site.nodes.get_all_nodes().values(): for i in range(1,res.get_ninstances("CPU") + 1): max_utilization += (res.get_quantity_instance("CPU", i)/100.0) * duration if self.verbose: reqdurd = {} nnodesd = {} for reqduration in reqdurations: reqdurd[reqduration] = reqdurd.setdefault(reqduration, 0) +1 for n in nnodes: nnodesd[n] = nnodesd.setdefault(n, 0) +1 print actutilization print max_utilization print "Requested utilization: %.2f%%" % 
((requtilization / max_utilization) * 100.0) print " Actual utilization: %.2f%%" % ((actutilization / max_utilization) * 100.0) print #sorted_images = sorted(software.iteritems(), key=operator.itemgetter(1), reverse=True) print "NODES" print "-----" print_percentiles(nnodes) print if self.verbose: print "NODES (distribution)" print "--------------------" print_distribution(nnodesd, nleases) print print "REQUESTED DURATIONS" print "-------------------" print_percentiles(reqdurations) print if self.verbose: print "REQUESTED DURATIONS (distribution)" print "----------------------------------" print_distribution(reqdurd, nleases) print print "ACTUAL DURATIONS" print "----------------" print_percentiles(actdurations) print print "IMAGES" print "------" print_distribution(software, nleases) #for image, count in sorted_images: # print "%s: %i (%.2f%%)" % (image, count, (float(count)/nleases)*100) print
def run(self):
    """Convert an SWF trace file into a Haizea LWF lease workload.

    Reads the SWF file named by --in, filters jobs by submission-time
    interval (--from / --interval-length), optional queue list (--queues)
    and processor scaling (--scale), emits one <lease-request> element per
    surviving job, and writes the resulting XML to --out. Finally prints
    slowdown percentiles, user count, and utilization statistics.
    """
    self.parse_options()

    infile = self.opt.inf
    outfile = self.opt.outf

    # Interval of the trace to convert, as offsets from the trace start.
    from_time = Parser.DateTimeDeltaFromString(self.opt.from_time)
    if self.opt.interval_length == None:
        to_time = None
    else:
        to_time = from_time + Parser.DateTimeDeltaFromString(self.opt.interval_length)

    root = ET.Element("lease-workload")
    root.set("name", infile)
    description = ET.SubElement(root, "description")
    description.text = "Created with haizea-swf2lwf %s" % " ".join(self.argv[1:])

    # If a site description was given, embed it in the workload and remember
    # its node count for the capacity computation at the end.
    # NOTE(review): site_num_nodes is referenced unconditionally below when
    # computing total_capacity, so omitting --site raises a NameError there
    # — confirm whether --site is meant to be mandatory.
    if self.opt.site != None:
        site_elem = ET.parse(self.opt.site).getroot()
        site_num_nodes = int(site_elem.find("nodes").find("node-set").get("numnodes"))
        root.append(site_elem)

    time = TimeDelta(seconds=0)  # NOTE(review): appears unused in this method
    requests = ET.SubElement(root, "lease-requests")

    slowdowns = []
    users = set()
    utilization = 0
    utilization_no_ramp = 0

    # With no explicit interval length, run to the submit time of the last
    # job in the trace (SWF lines are sorted by ascending submit time).
    if to_time == None:
        swf = open(infile, 'r')
        lines = swf.readlines()
        lastline = lines[-1]
        to_time = TimeDelta(seconds=int(lastline.split()[1]))
        swf.close()

    # First/last 5% of the interval is treated as ramp-up/ramp-down and
    # excluded from the "no ramp" utilization figure.
    no_ramp_cutoff = from_time + ((to_time - from_time) * 0.05)

    infile = open(infile, "r")

    for line in infile:
        # Skip SWF header/comment lines (';' prefix) and blank lines.
        if line[0]!=';' and len(line.strip()) != 0:
            fields = line.split()

            # Unpack the job's attributes. Field semantics are taken from the
            # SWF documentation at
            # http://www.cs.huji.ac.il/labs/parallel/workload/swf.html

            # Job Number -- a counter field, starting from 1.
            job_number = int(fields[0])

            # Submit Time -- in seconds, relative to the start of the log.
            # Lines are sorted by ascending submit time.
            submit_time = int(fields[1])

            # Wait Time -- in seconds, between submission and actual start.
            wait_time = int(fields[2])

            # Run Time -- wall-clock seconds the job ran. A value of 0 means
            # the job ran for less than half a second.
            run_time = int(fields[3])

            # Number of Allocated Processors.
            num_processors_allocated = int(fields[4])

            # Average CPU Time Used per processor, in seconds.
            avg_cpu_time = float(fields[5])

            # Used Memory -- average kilobytes per processor.
            used_memory = int(fields[6])

            # Requested Number of Processors.
            num_processors_requested = int(fields[7])

            # Requested Time -- runtime or per-processor CPU time in seconds,
            # depending on the log's header; often the user runtime estimate.
            time_requested = int(fields[8])

            # Requested Memory -- kilobytes per processor.
            mem_requested = int(fields[9])

            # Status -- 1 completed, 0 failed, 5 cancelled; -1 in models.
            status = int(fields[10])

            # User ID.
            user_id = int(fields[11])

            # Group ID.
            group_id = int(fields[12])

            # Executable (Application) Number.
            exec_number = int(fields[13])

            # Queue Number -- by convention, 0 denotes interactive jobs.
            queue = int(fields[14])

            # Partition Number -- e.g. which machine in a cluster was used.
            partition = int(fields[15])

            # Preceding Job Number -- job that must terminate before this one
            # may start (workload feedback).
            prec_job = int(fields[16])

            # Think Time from Preceding Job -- seconds between the preceding
            # job's termination and this job's submittal.
            prec_job_thinktime = int(fields[17])

            # Check if we have to skip this job
            submit_time = TimeDelta(seconds=submit_time)
            if submit_time < from_time:
                continue
            if to_time != None and submit_time > to_time:
                break
            if run_time < 0 and status==5:
                # This is a job that got cancelled while waiting in the queue
                continue
            if self.opt.queues != None:
                queues = [int(q) for q in self.opt.queues.split(",")]
                if queue not in queues:
                    # Job was submitted to a queue we're filtering out
                    continue

            # Prefer the requested processor count; fall back to allocated.
            if num_processors_requested == -1:
                num_processors = num_processors_allocated
            else:
                num_processors = num_processors_requested

            if self.opt.scale != None:
                num_processors = int(num_processors/int(self.opt.scale))

            lease_request = ET.SubElement(requests, "lease-request")
            # Make submission time relative to starting time of trace
            lease_request.set("arrival", str(submit_time - from_time))
            if run_time == 0:
                # As specified in the SWF documentation, a runtime of 0 means
                # the job ran for less than a second, so we round up to 1.
                run_time = 1
            realduration = ET.SubElement(lease_request, "realduration")
            realduration.set("time", str(TimeDelta(seconds=run_time)))

            lease = ET.SubElement(lease_request, "lease")
            lease.set("id", `job_number`)

            nodes = ET.SubElement(lease, "nodes")
            node_set = ET.SubElement(nodes, "node-set")
            node_set.set("numnodes", `num_processors`)
            # Each requested node is one full CPU (amount 100 == 100%).
            res = ET.SubElement(node_set, "res")
            res.set("type", "CPU")
            res.set("amount", "100")
            # Memory: --memory option wins; otherwise convert the SWF's
            # per-processor kilobytes to megabytes; otherwise fail.
            res = ET.SubElement(node_set, "res")
            res.set("type", "Memory")
            if self.opt.mem != None:
                res.set("amount", self.opt.mem)
            elif mem_requested != -1:
                res.set("amount", `mem_requested / 1024`)
            else:
                print "Cannot convert this file. Job #%i does not specify requested memory, and --memory parameter not specified" % job_number
                exit(-1)

            # Bounded slowdown: runtimes under 10s are clamped to 10s so
            # tiny jobs don't dominate the statistic.
            if wait_time != -1:
                if run_time < 10:
                    run_time2 = 10.0
                else:
                    run_time2 = float(run_time)
                slowdown = (wait_time + run_time2) / run_time2
                slowdowns.append(slowdown)

            if not user_id in users:
                users.add(user_id)

            # Total utilization
            utilization += run_time * num_processors

            # Removing ramp-up and ramp-down effects
            # NOTE(review): submit_time is a TimeDelta while run_time is an
            # int; this arithmetic relies on mx.DateTime's delta/number
            # semantics — confirm units before restructuring.
            if wait_time != -1 and submit_time + run_time >= no_ramp_cutoff:
                start_in_interval = max(no_ramp_cutoff, submit_time)
                end_in_interval = min(to_time, submit_time + run_time)
                time_in_interval = end_in_interval - start_in_interval
                utilization_no_ramp += time_in_interval * num_processors

            # Empty <start> element: presumably a best-effort (unspecified
            # start) lease — confirm against the LWF schema.
            start = ET.SubElement(lease, "start")

            lease.set("preemptible", self.opt.preemptible)
            lease.set("user", `user_id`)
            duration_elem = ET.SubElement(lease, "duration")
            duration_elem.set("time", str(TimeDelta(seconds=time_requested)))

            # No software environment specified. The annotator would have to be used to
            # add one (or an image file when running a simulation).
            software = ET.SubElement(lease, "software")
            diskimage = ET.SubElement(software, "none")

            # Add unused SWF attributes to the extra section, for future reference.
            extra = ET.SubElement(lease, "extra")
            attr = ET.SubElement(extra, "attr")
            attr.set("name", "SWF_waittime")
            attr.set("value", `wait_time`)
            attr = ET.SubElement(extra, "attr")
            attr.set("name", "SWF_runtime")
            attr.set("value", `run_time`)
            attr = ET.SubElement(extra, "attr")
            attr.set("name", "SWF_avgcputime")
            attr.set("value", `avg_cpu_time`)
            attr = ET.SubElement(extra, "attr")
            attr.set("name", "SWF_queue")
            attr.set("value", `queue`)
            attr = ET.SubElement(extra, "attr")
            attr.set("name", "SWF_group")
            attr.set("value", `group_id`)
            attr = ET.SubElement(extra, "attr")
            attr.set("name", "SWF_execnumber")
            attr.set("value", `exec_number`)

    # Serialize the workload and release both files.
    tree = ET.ElementTree(root)
    outfile = open(outfile, "w")
    tree.write(outfile)
    infile.close()
    outfile.close()

    slowdowns.sort()

    # Capacity of the whole site over the converted interval, in
    # processor-seconds (requires --site; see note above).
    total_capacity = site_num_nodes * (to_time - from_time).seconds
    print utilization, total_capacity
    utilization = float(utilization) / float(total_capacity)
    utilization_no_ramp = float(utilization_no_ramp) / float(total_capacity)

    if len(slowdowns) > 0:
        print "SLOWDOWNS"
        print "---------"
        print_percentiles(slowdowns)
        print

    print "USERS"
    print "-----"
    print "Number of users: %i" % len(users)
    print

    print "UTILIZATION"
    print "-----------"
    print "Utilization: %.2f%%" % (utilization * 100)
    if utilization_no_ramp != 0:
        print "Utilization (no ramp-up/ramp-down): %.2f%%" % (utilization_no_ramp * 100)