Esempio n. 1
0
    def analyse(self):
        requtilization = 0
        actutilization = 0
        software = {"Unmanaged": 0}
        nnodes = []
        reqdurations = []
        actdurations = []
        nleases = len(self.workload.get_leases())
        for lease in self.workload.get_leases():
            if lease.start.requested == "Unspecified":
                start = lease.submit_time
            else:
                start = lease.start.requested

            if start + lease.duration.requested > self.starttime + self.utilization_length:
                reqduration = (self.starttime + self.utilization_length - start).seconds
            else: 
                reqduration = lease.duration.requested.seconds

            if lease.duration.known != None:
                if start + lease.duration.known > self.starttime + self.utilization_length:
                    actduration = (self.starttime + self.utilization_length - start).seconds
                else: 
                    actduration = lease.duration.known.seconds
            else:
                actduration = reqduration
                            
            for res in lease.requested_resources.values():
                for i in range(1,res.get_ninstances("CPU") + 1):
                    requtilization += (res.get_quantity_instance("CPU", i) / 100.0) * reqduration
                    actutilization += (res.get_quantity_instance("CPU", i) / 100.0) * actduration

            nnodes.append(len(lease.requested_resources))
            reqdurations.append(lease.duration.requested.seconds)
            
            if lease.duration.known != None:
                actdurations.append(lease.duration.known.seconds)
            
            if isinstance(lease.software, UnmanagedSoftwareEnvironment):
                software["Unmanaged"] += 1
            elif isinstance(lease.software, DiskImageSoftwareEnvironment):
                image = lease.software.image_id
                software[image] = software.setdefault(image, 0) +1
                
        if self.site != None:
            max_utilization = 0
            duration = self.utilization_length.seconds
            for res in self.site.nodes.get_all_nodes().values():
                for i in range(1,res.get_ninstances("CPU") + 1):
                    max_utilization += (res.get_quantity_instance("CPU", i)/100.0) * duration
                    
        if self.verbose:
            reqdurd = {}
            nnodesd = {}
            for reqduration in reqdurations:
                reqdurd[reqduration] = reqdurd.setdefault(reqduration, 0) +1        
            for n in nnodes:
                nnodesd[n] = nnodesd.setdefault(n, 0) +1        
                    
                    
        print actutilization
        print max_utilization
        print "Requested utilization: %.2f%%" % ((requtilization / max_utilization) * 100.0)
        print "   Actual utilization: %.2f%%" % ((actutilization / max_utilization) * 100.0)
        print
        #sorted_images = sorted(software.iteritems(), key=operator.itemgetter(1), reverse=True)
        print "NODES"
        print "-----"
        print_percentiles(nnodes)
        print
        if self.verbose:
            print "NODES (distribution)"
            print "--------------------"
            print_distribution(nnodesd, nleases)
            print
        print "REQUESTED DURATIONS"
        print "-------------------"
        print_percentiles(reqdurations)
        print
        if self.verbose:
            print "REQUESTED DURATIONS (distribution)"
            print "----------------------------------"
            print_distribution(reqdurd, nleases)
            print
        print "ACTUAL DURATIONS"
        print "----------------"
        print_percentiles(actdurations)
        print
        print "IMAGES"
        print "------"
        print_distribution(software, nleases)
        #for image, count in sorted_images:
        #    print "%s: %i (%.2f%%)" % (image, count, (float(count)/nleases)*100)
        print        
Esempio n. 2
0
    def run(self):            
        self.parse_options()

        infile = self.opt.inf
        outfile = self.opt.outf
        
        from_time = Parser.DateTimeDeltaFromString(self.opt.from_time)
        if self.opt.interval_length == None:
            to_time = None
        else:
            to_time = from_time + Parser.DateTimeDeltaFromString(self.opt.interval_length)

        root = ET.Element("lease-workload")
        root.set("name", infile)
        description = ET.SubElement(root, "description")
        description.text = "Created with haizea-swf2lwf %s" % " ".join(self.argv[1:])

        if self.opt.site != None:
            site_elem = ET.parse(self.opt.site).getroot()
            site_num_nodes = int(site_elem.find("nodes").find("node-set").get("numnodes"))
            root.append(site_elem)
        
        time = TimeDelta(seconds=0)
        requests = ET.SubElement(root, "lease-requests")
        
        slowdowns = []
        users = set()
        utilization = 0
        utilization_no_ramp = 0
        if to_time == None:
            swf = open(infile, 'r')
            lines = swf.readlines()
            lastline = lines[-1]
            to_time = TimeDelta(seconds=int(lastline.split()[1]))
            swf.close()

        no_ramp_cutoff = from_time + ((to_time - from_time) * 0.05)

        infile = open(infile, "r")
        for line in infile:
            if line[0]!=';' and len(line.strip()) != 0:
                fields = line.split()
                
                # Unpack the job's attributes. The description of each field is
                # taken from the SWF documentation at
                # http://www.cs.huji.ac.il/labs/parallel/workload/swf.html
                
                # Job Number -- a counter field, starting from 1. 
                job_number = int(fields[0])
                
                # Submit Time -- in seconds. The earliest time the log refers to is zero, 
                # and is the submittal time the of the first job. The lines in the log are 
                # sorted by ascending submittal times. It makes sense for jobs to also be 
                # numbered in this order.
                submit_time = int(fields[1])

                # Wait Time -- in seconds. The difference between the job's submit time 
                # and the time at which it actually began to run. Naturally, this is only 
                # relevant to real logs, not to models.
                wait_time = int(fields[2])

                # Run Time -- in seconds. The wall clock time the job was running (end 
                # time minus start time).
                # We decided to use ``wait time'' and ``run time'' instead of the equivalent 
                # ``start time'' and ``end time'' because they are directly attributable to 
                # the scheduler and application, and are more suitable for models where only 
                # the run time is relevant.
                # Note that when values are rounded to an integral number of seconds (as 
                # often happens in logs) a run time of 0 is possible and means the job ran 
                # for less than 0.5 seconds. On the other hand it is permissable to use 
                # floating point values for time fields.
                run_time = int(fields[3])
                
                # Number of Allocated Processors -- an integer. In most cases this is also 
                # the number of processors the job uses; if the job does not use all of them, 
                # we typically don't know about it.
                num_processors_allocated = int(fields[4])
                
                # Average CPU Time Used -- both user and system, in seconds. This is the 
                # average over all processors of the CPU time used, and may therefore be 
                # smaller than the wall clock runtime. If a log contains the total CPU time 
                # used by all the processors, it is divided by the number of allocated 
                # processors to derive the average.
                avg_cpu_time = float(fields[5])
                
                # Used Memory -- in kilobytes. This is again the average per processor.
                used_memory = int(fields[6])
                
                # Requested Number of Processors.
                num_processors_requested = int(fields[7])
                
                # Requested Time. This can be either runtime (measured in wallclock seconds), 
                # or average CPU time per processor (also in seconds) -- the exact meaning 
                # is determined by a header comment. In many logs this field is used for 
                # the user runtime estimate (or upper bound) used in backfilling. If a log 
                # contains a request for total CPU time, it is divided by the number of 
                # requested processors.
                time_requested = int(fields[8])
                
                # Requested Memory (again kilobytes per processor).
                mem_requested = int(fields[9])
                
                # Status 1 if the job was completed, 0 if it failed, and 5 if cancelled. 
                # If information about chekcpointing or swapping is included, other values 
                # are also possible. See usage note below. This field is meaningless for 
                # models, so would be -1.
                status = int(fields[10])
                
                # User ID -- a natural number, between one and the number of different users.
                user_id = int(fields[11])
                
                # Group ID -- a natural number, between one and the number of different groups. 
                # Some systems control resource usage by groups rather than by individual users.
                group_id = int(fields[12])
                
                # Executable (Application) Number -- a natural number, between one and the number 
                # of different applications appearing in the workload. in some logs, this might 
                # represent a script file used to run jobs rather than the executable directly; 
                # this should be noted in a header comment.
                exec_number = int(fields[13])
                
                # Queue Number -- a natural number, between one and the number of different 
                # queues in the system. The nature of the system's queues should be explained 
                # in a header comment. This field is where batch and interactive jobs should 
                # be differentiated: we suggest the convention of denoting interactive jobs by 0.
                queue = int(fields[14])
                
                # Partition Number -- a natural number, between one and the number of different 
                # partitions in the systems. The nature of the system's partitions should be 
                # explained in a header comment. For example, it is possible to use partition 
                # numbers to identify which machine in a cluster was used.
                partition = int(fields[15])
                
                # Preceding Job Number -- this is the number of a previous job in the workload, 
                # such that the current job can only start after the termination of this preceding 
                # job. Together with the next field, this allows the workload to include feedback 
                # as described below.
                prec_job = int(fields[16])

                # Think Time from Preceding Job -- this is the number of seconds that should elapse 
                # between the termination of the preceding job and the submittal of this one. 
                prec_job_thinktime = int(fields[17])

                                
                # Check if we have to skip this job
                
                submit_time = TimeDelta(seconds=submit_time)
                
                if submit_time < from_time:
                    continue
                
                if to_time != None and submit_time > to_time:
                    break
                
                if run_time < 0 and status==5:
                    # This is a job that got cancelled while waiting in the queue
                    continue
                
                if self.opt.queues != None:
                    queues = [int(q) for q in self.opt.queues.split(",")]
                    if queue not in queues:
                        # Job was submitted to a queue we're filtering out
                        continue              
                    
                if num_processors_requested == -1:
                    num_processors = num_processors_allocated
                else:
                    num_processors = num_processors_requested
        
                if self.opt.scale != None:
                    num_processors = int(num_processors/int(self.opt.scale))
                    
                lease_request = ET.SubElement(requests, "lease-request")
                # Make submission time relative to starting time of trace
                lease_request.set("arrival", str(submit_time - from_time))

                if run_time == 0:
                    # As specified in the SWF documentation, a runtime of 0 means
                    # the job ran for less than a second, so we round up to 1.
                    run_time = 1 
                realduration = ET.SubElement(lease_request, "realduration")
                realduration.set("time", str(TimeDelta(seconds=run_time)))
                
                lease = ET.SubElement(lease_request, "lease")
                lease.set("id", `job_number`)

                
                nodes = ET.SubElement(lease, "nodes")
                node_set = ET.SubElement(nodes, "node-set")
                node_set.set("numnodes", `num_processors`)
                res = ET.SubElement(node_set, "res")
                res.set("type", "CPU")
                res.set("amount", "100")

                res = ET.SubElement(node_set, "res")
                res.set("type", "Memory")
                if self.opt.mem != None:
                    res.set("amount", self.opt.mem)
                elif mem_requested != -1:
                    res.set("amount", `mem_requested / 1024`)
                else:
                    print "Cannot convert this file. Job #%i does not specify requested memory, and --memory parameter not specified" % job_number
                    exit(-1)
                    
                if wait_time != -1:
                    if run_time < 10:
                        run_time2 = 10.0
                    else:
                        run_time2 = float(run_time)
                    slowdown = (wait_time + run_time2) / run_time2
                    slowdowns.append(slowdown)
                
                if not user_id in users:
                    users.add(user_id)

                # Total utilization
                utilization += run_time * num_processors

                # Removing ramp-up and ramp-down effects
                if wait_time != -1 and submit_time + run_time >= no_ramp_cutoff:
                    start_in_interval = max(no_ramp_cutoff, submit_time)
                    end_in_interval = min(to_time, submit_time + run_time)
                    time_in_interval = end_in_interval - start_in_interval
                    utilization_no_ramp += time_in_interval * num_processors

                start = ET.SubElement(lease, "start")
                lease.set("preemptible", self.opt.preemptible)
                lease.set("user", `user_id`)

                duration_elem = ET.SubElement(lease, "duration")
                duration_elem.set("time", str(TimeDelta(seconds=time_requested)))

                # No software environment specified. The annotator would have to be used to
                # add one (or an image file when running a simulation).
                software = ET.SubElement(lease, "software")
                diskimage = ET.SubElement(software, "none")
                
                # Add unused SWF attributes to the extra section, for future reference.
                extra = ET.SubElement(lease, "extra")
                attr = ET.SubElement(extra, "attr")
                attr.set("name", "SWF_waittime")
                attr.set("value", `wait_time`)
                attr = ET.SubElement(extra, "attr")
                attr.set("name", "SWF_runtime")
                attr.set("value", `run_time`)
                attr = ET.SubElement(extra, "attr")
                attr.set("name", "SWF_avgcputime")
                attr.set("value", `avg_cpu_time`)
                attr = ET.SubElement(extra, "attr")
                attr.set("name", "SWF_queue")
                attr.set("value", `queue`)
                attr = ET.SubElement(extra, "attr")
                attr.set("name", "SWF_group")
                attr.set("value", `group_id`)
                attr = ET.SubElement(extra, "attr")
                attr.set("name", "SWF_execnumber")
                attr.set("value", `exec_number`)
                    
        tree = ET.ElementTree(root)
        
        outfile = open(outfile, "w")
        tree.write(outfile)
        
        infile.close()
        outfile.close()
        
        slowdowns.sort()
        total_capacity = site_num_nodes * (to_time - from_time).seconds
        print utilization, total_capacity
        utilization = float(utilization) / float(total_capacity)
        utilization_no_ramp = float(utilization_no_ramp) / float(total_capacity)
        
        if len(slowdowns) > 0:
            print "SLOWDOWNS"
            print "---------"
            print_percentiles(slowdowns)
            print 
        print "USERS"
        print "-----"
        print "Number of users: %i" % len(users)
        print 
        print "UTILIZATION"
        print "-----------"
        print "Utilization: %.2f%%" % (utilization * 100)
        if utilization_no_ramp != 0:
            print "Utilization (no ramp-up/ramp-down): %.2f%%" % (utilization_no_ramp * 100)