Example #1
0
    def __init__(self, *args, **kwargs):
        """Initialise simulator state, loggers, utility functions and coscheduling.

        All positional and keyword arguments are forwarded unchanged to
        ``ClusterBaseSystem.__init__``.  The keyword arguments read here are:

        sleep_interval   -- stored as ``self.sleep_interval`` (default 0)
        cluster_fraction -- stored as ``self.fraction`` (default 1)
        c_trace_start    -- stored as ``self.sim_start`` (default 0)
        c_trace_end      -- stored as ``self.sim_end`` (default ``sys.maxint``)
        anchor           -- stored as ``self.anchor`` (default 0)
        cjob             -- stored as ``self.workload_file`` (default None)
        outputlog        -- appended to ``MACHINE_NAME + "-"`` to form
                            ``self.output_log`` (default "")
        bgjob            -- stored as ``self.bgjob``; coscheduling is disabled
                            when it is falsy
        coscheduling     -- two-item sequence; item 0 becomes
                            ``self.cosched_scheme_remote`` and item 1 becomes
                            ``self.cosched_scheme`` (default ``(0, 0)``).
                            Coscheduling is enabled only when both items are
                            "hold" or "yield".
        vicinity         -- stored as ``self.mate_vicinity`` (default 0)
        mate_ratio       -- stored as ``self.mate_ratio`` (default 0)

        Side effects: creates a ``ComponentProxy("event-manager")``, registers
        this instance as ``local_components["cluster-system"]`` and, when
        coscheduling is enabled, talks to the ``REMOTE_QUEUE_MANAGER``
        component.  A failure to reach that component is logged and turns
        coscheduling off instead of aborting construction.
        """
        ClusterBaseSystem.__init__(self, *args, **kwargs)

        self.sleep_interval = kwargs.get("sleep_interval", 0)

        self.fraction = kwargs.get("cluster_fraction", 1)
        self.sim_start = kwargs.get("c_trace_start", 0)
        self.sim_end = kwargs.get("c_trace_end", sys.maxint)
        self.anchor = kwargs.get("anchor", 0)

        self.workload_file = kwargs.get("cjob")
        self.output_log = MACHINE_NAME + "-" + kwargs.get("outputlog", "")
        self.bgjob = kwargs.get("bgjob")

        self.event_manager = ComponentProxy("event-manager")

        # *AdjEst*: the setting is compared as text, so only the strings
        # "True" and "true" switch walltime prediction on.
        walltime_prediction = get_histm_config("walltime_prediction", False)
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("walltime_prediction= %s" % (walltime_prediction,))
        self.walltime_prediction = walltime_prediction in ("True", "true")

        self.time_stamps = [('I', '0', 0, {})]
        self.cur_time_index = 0
        self.queues = SimQueueDict(policy=None)

        self.unsubmitted_job_spec_dict = {}   # {jobid: jobspec}

        self.num_running = 0
        self.num_waiting = 0
        self.num_busy = 0
        self.num_end = 0
        self.total_job = 0
        self.total_nodes = len(self.all_nodes)

        self.init_queues()

        # PBS-style logger
        self.pbslog = PBSlogger(self.output_log)

        # Debug logger.
        # NOTE(review): output_log as built above always contains the "-"
        # separator, so the ".debug" fallback looks unreachable unless
        # init_queues() rewrites output_log -- confirm before removing it.
        if self.output_log:
            self.dbglog = PBSlogger(self.output_log + "-debug")
        else:
            self.dbglog = PBSlogger(".debug")

        # finish tag
        self.finished = False

        # register this component under the local alias "cluster-system"
        local_components["cluster-system"] = self

        # accumulated capacity loss
        self.capacity_loss = 0

        # job ids starting at the current time stamp; used for calculating
        # capacity loss
        self.starting_jobs = []

        self.user_utility_functions = {}
        self.builtin_utility_functions = {}

        self.define_builtin_utility_functions()
        self.define_user_utility_functions()

        self.cosched_scheme_tup = kwargs.get("coscheduling", (0, 0))
        self.cosched_scheme = self.cosched_scheme_tup[1]
        self.cosched_scheme_remote = self.cosched_scheme_tup[0]
        self.mate_vicinity = kwargs.get("vicinity", 0)
        self.mate_ratio = kwargs.get("mate_ratio", 0)

        # Coscheduling needs a valid scheme on both sides and a bgjob value.
        valid_cosched_schemes = ("hold", "yield")
        self.coscheduling = bool(
            self.bgjob
            and self.cosched_scheme in valid_cosched_schemes
            and self.cosched_scheme_remote in valid_cosched_schemes
        )

        self.mate_job_dict = {}
        # Share of this cluster's jobs that have a mate; reported below.
        proportion = 0.0

        if self.coscheduling:
            self.jobid_qtime_pairs = self.init_jobid_qtime_pairs()
            try:
                self.remote_jobid_qtime_pairs = ComponentProxy(REMOTE_QUEUE_MANAGER).get_jobid_qtime_pairs()
            except Exception:
                # "except Exception" (not a bare except) so that
                # KeyboardInterrupt / SystemExit still propagate.
                self.logger.error("fail to connect to remote queue-manager component!")
                self.coscheduling = False

        # Re-check the flag: when the remote lookup above failed there is no
        # remote job list, so mate matching is skipped and mate_job_dict
        # stays empty, exactly as when coscheduling was never enabled.
        if self.coscheduling:
            if self.mate_vicinity:
                print("start init mate job dict, vicinity= %s" % (self.mate_vicinity,))
                self.init_mate_job_dict_by_vicinity()
            elif self.mate_ratio:
                print("start init mate job dict, mate_ratio= %s" % (self.mate_ratio,))
                self.init_mate_job_dict_by_ratio(self.mate_ratio)
            else:
                self.logger.error("fail to initialize mate job dict!")

            # Guard against an empty workload (total_job == 0), which used to
            # raise ZeroDivisionError here.
            if self.total_job:
                proportion = float(len(self.mate_job_dict)) / self.total_job

        # holding job ids and the resources they hold
        self.job_hold_dict = {}

        # jobid -> time (sec) of the job's first hold
        self.first_hold_time_dict = {}

        # first yielding time of yield jobs, for calculating the extra
        # waiting time
        self.first_yield_hold_time_dict = {}

        # ids of currently yielding jobs; updated dynamically
        self.yielding_job_list = []

        if self.coscheduling:
            # The remote side receives the inverse mapping (value -> key).
            remote_mate_job_dict = dict((v, k) for k, v in self.mate_job_dict.items())
            try:
                ComponentProxy(REMOTE_QUEUE_MANAGER).set_mate_job_dict(remote_mate_job_dict)
            except Exception:
                self.logger.error("failed to connect to remote queue-manager component!")
                self.coscheduling = False
            print("number of mate job pairs: %s, proportion in cluster jobs: %s%%"
                  % (len(self.mate_job_dict), round(proportion * 100, 1)))

        self.max_holding_sys_util = DEFAULT_MAX_HOLDING_SYS_UTIL
Example #2
0
 def __init__(self, *args, **kwargs):
     """Run the ClusterBaseSystem initialiser, then make ClusterProcessGroup
     the item class of ``self.process_groups``.

     All positional and keyword arguments are passed through unchanged.
     """
     ClusterBaseSystem.__init__(self, *args, **kwargs)
     # The attribute is (re)assigned after base-class setup.
     groups = self.process_groups
     groups.item_cls = ClusterProcessGroup
Example #3
0
 def __setstate__(self, state):
     """Restore this object from ``state`` via ClusterBaseSystem.__setstate__,
     then set ``self.process_groups.item_cls`` to ClusterProcessGroup.
     """
     ClusterBaseSystem.__setstate__(self, state)
     # Assigned after the base-class restore, as in __init__.
     groups = self.process_groups
     groups.item_cls = ClusterProcessGroup
 def __setstate__(self, state):
     """Rebuild the instance from ``state``.

     Delegates to ClusterBaseSystem.__setstate__ and afterwards makes
     ClusterProcessGroup the item class of ``self.process_groups``.
     """
     ClusterBaseSystem.__setstate__(self, state)
     setattr(self.process_groups, "item_cls", ClusterProcessGroup)
 def __init__(self, *args, **kwargs):
     """Construct the system.

     Forwards every argument to ClusterBaseSystem.__init__ and afterwards
     makes ClusterProcessGroup the item class of ``self.process_groups``.
     """
     ClusterBaseSystem.__init__(self, *args, **kwargs)
     setattr(self.process_groups, "item_cls", ClusterProcessGroup)
Example #6
0
 def __getstate__(self):
     """Return a new dict holding the state reported by
     ClusterBaseSystem.__getstate__.

     No keys are added at this level; a "cluster_system_version" entry is
     currently not emitted.
     """
     return dict(ClusterBaseSystem.__getstate__(self))
Example #7
0
 def __getstate__(self):
     """Build the state mapping for this object.

     The result is a fresh dict populated from
     ClusterBaseSystem.__getstate__; nothing is added to it here (no
     "cluster_system_version" key is written).
     """
     base_state = ClusterBaseSystem.__getstate__(self)
     snapshot = dict(base_state)
     return snapshot
Example #8
0
    def __init__(self, *args, **kwargs):
        """Initialise simulator state, loggers, utility functions and coscheduling.

        All positional and keyword arguments are forwarded unchanged to
        ``ClusterBaseSystem.__init__``.  The keyword arguments read here are:

        sleep_interval   -- stored as ``self.sleep_interval`` (default 0)
        cluster_fraction -- stored as ``self.fraction`` (default 1)
        c_trace_start    -- stored as ``self.sim_start`` (default 0)
        c_trace_end      -- stored as ``self.sim_end`` (default ``sys.maxint``)
        anchor           -- stored as ``self.anchor`` (default 0)
        cjob             -- stored as ``self.workload_file`` (default None)
        outputlog        -- appended to ``MACHINE_NAME + "-"`` to form
                            ``self.output_log`` (default "")
        bgjob            -- stored as ``self.bgjob``; coscheduling is disabled
                            when it is falsy
        coscheduling     -- two-item sequence; item 0 becomes
                            ``self.cosched_scheme_remote`` and item 1 becomes
                            ``self.cosched_scheme`` (default ``(0, 0)``).
                            Coscheduling is enabled only when both items are
                            "hold" or "yield".
        vicinity         -- stored as ``self.mate_vicinity`` (default 0)
        mate_ratio       -- stored as ``self.mate_ratio`` (default 0)

        Side effects: creates a ``ComponentProxy("event-manager")``, registers
        this instance as ``local_components["cluster-system"]`` and, when
        coscheduling is enabled, talks to the ``REMOTE_QUEUE_MANAGER``
        component.  A failure to reach that component is logged and turns
        coscheduling off instead of aborting construction.
        """
        ClusterBaseSystem.__init__(self, *args, **kwargs)

        self.sleep_interval = kwargs.get("sleep_interval", 0)

        self.fraction = kwargs.get("cluster_fraction", 1)
        self.sim_start = kwargs.get("c_trace_start", 0)
        self.sim_end = kwargs.get("c_trace_end", sys.maxint)
        self.anchor = kwargs.get("anchor", 0)

        self.workload_file = kwargs.get("cjob")
        self.output_log = MACHINE_NAME + "-" + kwargs.get("outputlog", "")
        self.bgjob = kwargs.get("bgjob")

        self.event_manager = ComponentProxy("event-manager")

        # *AdjEst*: the setting is compared as text, so only the strings
        # "True" and "true" switch walltime prediction on.
        walltime_prediction = get_histm_config("walltime_prediction", False)
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("walltime_prediction= %s" % (walltime_prediction,))
        self.walltime_prediction = walltime_prediction in ("True", "true")

        self.time_stamps = [('I', '0', 0, {})]
        self.cur_time_index = 0
        self.queues = SimQueueDict(policy=None)

        self.unsubmitted_job_spec_dict = {}   # {jobid: jobspec}

        self.num_running = 0
        self.num_waiting = 0
        self.num_busy = 0
        self.num_end = 0
        self.total_job = 0
        self.total_nodes = len(self.all_nodes)

        self.init_queues()

        # PBS-style logger
        self.pbslog = PBSlogger(self.output_log)

        # Debug logger.
        # NOTE(review): output_log as built above always contains the "-"
        # separator, so the ".debug" fallback looks unreachable unless
        # init_queues() rewrites output_log -- confirm before removing it.
        if self.output_log:
            self.dbglog = PBSlogger(self.output_log + "-debug")
        else:
            self.dbglog = PBSlogger(".debug")

        # finish tag
        self.finished = False

        # register this component under the local alias "cluster-system"
        local_components["cluster-system"] = self

        # accumulated capacity loss
        self.capacity_loss = 0

        # job ids starting at the current time stamp; used for calculating
        # capacity loss
        self.starting_jobs = []

        self.user_utility_functions = {}
        self.builtin_utility_functions = {}

        self.define_builtin_utility_functions()
        self.define_user_utility_functions()

        self.cosched_scheme_tup = kwargs.get("coscheduling", (0, 0))
        self.cosched_scheme = self.cosched_scheme_tup[1]
        self.cosched_scheme_remote = self.cosched_scheme_tup[0]
        self.mate_vicinity = kwargs.get("vicinity", 0)
        self.mate_ratio = kwargs.get("mate_ratio", 0)

        # Coscheduling needs a valid scheme on both sides and a bgjob value.
        valid_cosched_schemes = ("hold", "yield")
        self.coscheduling = bool(
            self.bgjob
            and self.cosched_scheme in valid_cosched_schemes
            and self.cosched_scheme_remote in valid_cosched_schemes
        )

        self.mate_job_dict = {}
        # Share of this cluster's jobs that have a mate; reported below.
        proportion = 0.0

        if self.coscheduling:
            self.jobid_qtime_pairs = self.init_jobid_qtime_pairs()
            try:
                self.remote_jobid_qtime_pairs = ComponentProxy(REMOTE_QUEUE_MANAGER).get_jobid_qtime_pairs()
            except Exception:
                # "except Exception" (not a bare except) so that
                # KeyboardInterrupt / SystemExit still propagate.
                self.logger.error("fail to connect to remote queue-manager component!")
                self.coscheduling = False

        # Re-check the flag: when the remote lookup above failed there is no
        # remote job list, so mate matching is skipped and mate_job_dict
        # stays empty, exactly as when coscheduling was never enabled.
        if self.coscheduling:
            if self.mate_vicinity:
                print("start init mate job dict, vicinity= %s" % (self.mate_vicinity,))
                self.init_mate_job_dict_by_vicinity()
            elif self.mate_ratio:
                print("start init mate job dict, mate_ratio= %s" % (self.mate_ratio,))
                self.init_mate_job_dict_by_ratio(self.mate_ratio)
            else:
                self.logger.error("fail to initialize mate job dict!")

            # Guard against an empty workload (total_job == 0), which used to
            # raise ZeroDivisionError here.
            if self.total_job:
                proportion = float(len(self.mate_job_dict)) / self.total_job

        # holding job ids and the resources they hold
        self.job_hold_dict = {}

        # jobid -> time (sec) of the job's first hold
        self.first_hold_time_dict = {}

        # first yielding time of yield jobs, for calculating the extra
        # waiting time
        self.first_yield_hold_time_dict = {}

        # ids of currently yielding jobs; updated dynamically
        self.yielding_job_list = []

        if self.coscheduling:
            # The remote side receives the inverse mapping (value -> key).
            remote_mate_job_dict = dict((v, k) for k, v in self.mate_job_dict.items())
            try:
                ComponentProxy(REMOTE_QUEUE_MANAGER).set_mate_job_dict(remote_mate_job_dict)
            except Exception:
                self.logger.error("failed to connect to remote queue-manager component!")
                self.coscheduling = False
            print("number of mate job pairs: %s, proportion in cluster jobs: %s%%"
                  % (len(self.mate_job_dict), round(proportion * 100, 1)))

        self.max_holding_sys_util = DEFAULT_MAX_HOLDING_SYS_UTIL