def __init__(self, spec):
    """Populate a simulated job from the ``spec`` mapping.

    ``jobid`` is passed through ``int()`` and ``runtime`` through
    ``float()`` (to seed ``remain_time``), so a ``spec`` whose values
    cannot be converted makes the constructor raise.  Every other key is
    optional and falls back to the default given in its ``spec.get`` call.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = 'job'

    # The following fields are initialized at the beginning of simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')
    self.state = lookup("state", "invisible")
    self.system_state = ''

    # Counters and flags that always start from the same fixed value.
    self.starttime = self.arrival_time = self.failure_time = 0
    self.has_resources = self.is_runnable = self.is_visible = False
    self.args = []
    self.progress = 0
    self.recovery_opt = lookup("recovery_opt", RECOVERYOPT)
    self.checkpoint = 1
    self.location = []
def __init__(self, spec, logger):
    """Create a process group from ``spec`` and remember ``logger``.

    The ``args`` entry of ``spec`` (default: empty list) is joined with
    single spaces and stored as one string.  Keys that are absent from
    ``spec`` yield ``None`` unless a different fallback is given below.
    """
    Data.__init__(self, spec)
    self.tag = "process group"

    # Attributes copied straight from the spec; None when the key is absent.
    for key in (
        "cobalt_log_file", "cwd", "executable", "id", "jobid", "kernel",
        "kerneloptions", "mode", "size", "stderr", "stdin", "stdout",
        "true_mpi_args", "umask",
    ):
        setattr(self, key, spec.get(key))

    # Attributes with a non-None fallback.
    self.args = " ".join(spec.get("args", []))
    self.env = spec.get("env", {})
    self.location = spec.get("location", [])
    self.user = spec.get("user", "")

    # State that is not read from the spec.
    self.exit_status = None
    self.head_pid = None
    self.nodefile = None

    self.logger = logger
def __init__(self, spec):
    """Create a process group from ``spec``.

    ``args`` is stored exactly as given in ``spec`` (default: empty list);
    it is not joined into a string.  Keys that are absent from ``spec``
    yield ``None`` unless a different fallback is given below.
    """
    Data.__init__(self, spec)
    self.tag = "process group"

    # Attributes copied straight from the spec; None when the key is absent.
    for key in (
        "cobalt_log_file", "cwd", "executable", "id", "jobid", "kernel",
        "kerneloptions", "mode", "size", "stderr", "stdin", "stdout",
        "umask", "starttime", "walltime", "killtime", "resid", "runid",
        "forker",
    ):
        setattr(self, key, spec.get(key))

    # Attributes with a non-None fallback.
    self.args = spec.get("args", [])
    self.env = spec.get("env", {})
    self.location = spec.get("location", [])
    self.user = spec.get("user", "")

    # State that is not read from the spec.
    self.exit_status = None
    self.head_pid = None
    self.nodefile = None
def __init__(self, spec):
    """Initialize a new partition."""
    Data.__init__(self, spec)

    # The keys below are popped from a copy, so the mapping handed in by
    # the caller keeps all of its entries.
    spec = spec.copy()
    take = spec.pop
    self.scheduled = take("scheduled", False)
    self.name = take("name", None)
    self.functional = take("functional", False)
    self.queue = take("queue", "default")
    self.size = take("size", None)

    # These hold Partition objects.
    self._parents = set()
    self._children = set()
    self._all_children = set()

    self.state = take("state", "idle")
    self.tag = spec.get("tag", "partition")
    self.bridge_partition = None
    self.node_cards = spec.get("node_cards", [])
    self.switches = spec.get("switches", [])

    # Nothing is reserved, in use or awaiting cleanup at creation time.
    self.reserved_until = False
    self.reserved_by = None
    self.used_by = None
    self.cleanup_pending = False

    # This holds partition names.
    self._wiring_conflicts = set()

    self.backfill_time = None
    self.draining = False
    self._update_node_cards()
def __init__(self, spec):
    """Initialize a new partition."""
    Data.__init__(self, spec)

    # Pop from a private copy so the caller's mapping is left untouched.
    spec = spec.copy()
    take = spec.pop
    self.scheduled = take("scheduled", False)
    self.name = take("name", None)
    self.functional = take("functional", False)
    self.queue = take("queue", "default")
    self.size = take("size", None)

    # Relationship sets: these hold Partition objects.
    self._parents = set()
    self._children = set()
    self._all_children = set()

    self.state = take("state", "idle")
    self.tag = spec.get("tag", "partition")
    self.bridge_partition = None
    self.node_cards = spec.get("node_cards", [])
    self.switches = spec.get("switches", [])

    # Reservation / usage bookkeeping starts out empty.
    self.reserved_until = False
    self.reserved_by = None
    self.used_by = None
    self.cleanup_pending = False

    # This set holds partition names (not Partition objects).
    self._wiring_conflicts = set()

    self.backfill_time = None
    self.draining = False
    self._update_node_cards()
def __init__(self, spec):
    """Register a service described by ``spec``.

    ``name`` and ``location`` are popped without a default, so both keys
    must be present in ``spec``.  ``stamp`` records ``time.time()`` at the
    moment of construction.
    """
    Data.__init__(self, spec)

    # Pop from a copy so the caller's mapping keeps its entries.
    remaining = spec.copy()
    self.name = remaining.pop("name")
    self.location = remaining.pop("location")
    self.tag = remaining.get("tag", "service")
    self.stamp = time.time()
def __init__(self, spec):
    """Register a service described by ``spec``.

    Both ``name`` and ``location`` are required: they are popped without a
    fallback.  ``tag`` defaults to ``"service"`` and ``stamp`` is set to
    the current ``time.time()``.
    """
    Data.__init__(self, spec)

    # Work on a copy; the pops must not strip keys from the caller's dict.
    remaining = spec.copy()
    self.name = remaining.pop("name")
    self.location = remaining.pop("location")
    self.tag = remaining.get("tag", "service")
    self.stamp = time.time()
def __init__(self, spec):
    """Populate a simulated job, including its pricing bookkeeping.

    ``jobid`` goes through ``int()``, ``runtime`` through ``float()`` (to
    seed ``remain_time``) and ``score`` through ``float()``; values that
    cannot be converted make the constructor raise.  All other keys are
    optional and fall back to the default in their ``spec.get`` call.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = 'job'

    # The following fields are initialized at the beginning of simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.walltime_p = lookup("walltime_p")  # *AdjEst*
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')

    # Coscheduling only:
    #   last_hold  - time (unix sec) at which the job began its latest hold
    #   hold_time  - length of the period during which the job is holding
    #   yield_time - time the job first yields (spec key is 'first_yield')
    self.last_hold = lookup('last_hold', 0)
    self.hold_time = 0
    self.yield_time = lookup('first_yield', 0)

    self.state = lookup("state", "invisible")
    self.system_state = ''
    self.starttime = 0
    self.has_resources = False
    self.is_runnable = lookup("is_runnable", False)
    self.is_visible = False
    self.score = float(lookup("score", 0.0))
    self.attrs = lookup("attrs", {})
    self.args = []
    self.progress = 0
    self.recovering = False

    # Original location read from the job trace, used for job reservation.
    self.location = lookup('location', '')

    # Pricing / slowdown-quote fields (section tagged "samnickolay").
    self.pricing_queue_position = self.original_pricing_queue_position = -1
    self.max_price = self.max_slowdown = -1
    self.price_slowdown_quotes = []
    self.originally_realtime = False
    self.quoted_price = self.quoted_slowdown = -1
    self.quoted_slowdown_time = 1.1
    self.estimated_slowdown_at_runtime = -1
    self.in_high_priority_queue = False
    self.original_log_runtime = lookup("original_log_runtime", 0)
def __init__(self, spec):
    """Initialize the record from ``spec``, defaulting every missing key.

    NOTE(review): the fields set here (machine, type, datetime, unixtime,
    jobid, location) do not look like partition fields, although this
    constructor used to be documented as creating a partition -- confirm
    what the enclosing class represents.
    """
    Data.__init__(self, spec)

    # Only read access follows; the copy keeps the caller's mapping isolated.
    fields = spec.copy()
    self.machine = fields.get("machine", 0)
    self.type = fields.get("type", "I")
    self.datetime = fields.get("datetime", None)
    self.unixtime = fields.get("unixtime", None)
    self.jobid = fields.get("jobid", 0)
    self.location = fields.get("location", {})
def __init__(self, spec):
    """Create a resource from ``spec``.

    ``size`` is always 1 and ``tag`` is always ``"Resource"``; neither is
    read from ``spec``.  The remaining attributes come from ``spec`` with
    the defaults shown below.
    """
    Data.__init__(self, spec)
    lookup = spec.get

    # Fixed values.
    self.tag = "Resource"
    self.size = 1

    # Values taken from the spec.
    self.functional = lookup("functional", False)
    self.name = lookup("name", None)
    self.queue = lookup("queue", "default")
    self.scheduled = lookup("scheduled", False)
    self.state = lookup("state", "idle")
    self.attributes = lookup("attributes", {})
def __init__(self, spec):
    """Set up a reservation from ``spec``.

    ``name`` and ``start`` are read with ``spec[...]`` and therefore must
    be present.  When ``spec`` carries no ``queue``, the queue name is
    derived from the reservation name as ``"R.<name>"``.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = lookup("tag", "reservation")
    self.cycle = lookup("cycle")
    self.users = lookup("users", "")
    self.createdQueue = False
    self.partitions = lookup("partitions", "")

    # Mandatory entries.
    self.name = spec['name']
    self.start = spec['start']

    fallback_queue = "R.%s" % self.name
    self.queue = lookup("queue", fallback_queue)
    self.duration = lookup("duration")
def __init__(self, spec):
    """Set up a reservation from ``spec``.

    ``name`` and ``start`` are required (indexed directly on ``spec``).
    The queue defaults to ``"R.<name>"`` when ``spec`` has no ``queue``.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = lookup("tag", "reservation")
    self.cycle = lookup("cycle")
    self.users = lookup("users", "")
    self.createdQueue = False
    self.partitions = lookup("partitions", "")

    # Required keys: indexing (not .get) is used on purpose here.
    self.name = spec['name']
    self.start = spec['start']

    fallback_queue = "R.%s" % self.name
    self.queue = lookup("queue", fallback_queue)
    self.duration = lookup("duration")
def __init__(self, spec):
    """Populate a simulated job from the ``spec`` mapping.

    ``jobid`` goes through ``int()``, ``runtime`` through ``float()`` (to
    seed ``remain_time``) and ``score`` through ``float()``; values that
    cannot be converted make the constructor raise.  All other keys are
    optional and fall back to the default in their ``spec.get`` call.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = 'job'

    # The following fields are initialized at the beginning of simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.walltime_p = lookup("walltime_p")  # *AdjEst*
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')

    # Coscheduling only:
    #   last_hold  - time (unix sec) at which the job began its latest hold
    #   hold_time  - length of the period during which the job is holding
    #   yield_time - time the job first yields (spec key is 'first_yield')
    self.last_hold = lookup('last_hold', 0)
    self.hold_time = 0
    self.yield_time = lookup('first_yield', 0)

    self.state = lookup("state", "invisible")
    self.system_state = ''
    self.starttime = 0
    self.has_resources = False
    self.is_runnable = lookup("is_runnable", False)
    self.is_visible = False
    self.score = float(lookup("score", 0.0))
    self.attrs = lookup("attrs", {})
    self.args = []
    self.progress = 0
    self.recovering = False
    self.location = []
    self.torus = lookup("torus", False)
def __init__(self, spec):
    """Set up a reservation, assigning a cycle id when it is cyclic.

    ``name`` and ``start`` are read with ``spec[...]`` and must be present.
    The queue defaults to ``"R.<name>"``.  ``cycle_id`` is only set for
    reservations with a truthy ``cycle``; otherwise it is ``None``.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = lookup("tag", "reservation")
    self.cycle = lookup("cycle")
    self.users = lookup("users", "")
    self.createdQueue = False
    self.partitions = lookup("partitions", "")

    # Mandatory entries.
    self.name = spec['name']
    self.start = spec['start']

    fallback_queue = "R.%s" % self.name
    self.queue = lookup("queue", fallback_queue)
    self.duration = lookup("duration")
    self.res_id = lookup("res_id")
    self.cycle_id_gen = bgsched_cycle_id_gen

    # The default argument is evaluated before the lookup, so for a cyclic
    # reservation the generator's get() is called even when ``spec``
    # already carries a "cycle_id".
    self.cycle_id = (
        lookup("cycle_id", self.cycle_id_gen.get()) if self.cycle else None
    )

    self.running = False
    self.project = lookup("project", None)
def __init__(self, spec):
    """Set up a reservation, assigning a cycle id when it is cyclic.

    ``name`` and ``start`` are required (indexed directly on ``spec``).
    The queue defaults to ``"R.<name>"``.  Non-cyclic reservations get
    ``cycle_id = None``.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = lookup("tag", "reservation")
    self.cycle = lookup("cycle")
    self.users = lookup("users", "")
    self.createdQueue = False
    self.partitions = lookup("partitions", "")

    # Required keys: indexing (not .get) is used on purpose here.
    self.name = spec['name']
    self.start = spec['start']

    fallback_queue = "R.%s" % self.name
    self.queue = lookup("queue", fallback_queue)
    self.duration = lookup("duration")
    self.res_id = lookup("res_id")
    self.cycle_id_gen = bgsched_cycle_id_gen

    # For a cyclic reservation the generator's get() runs unconditionally,
    # because it is evaluated as the default argument of the lookup.
    self.cycle_id = (
        lookup("cycle_id", self.cycle_id_gen.get()) if self.cycle else None
    )

    self.running = False
    self.project = lookup("project", None)
def __init__(self, spec):
    """Populate a simulated job, including its I/O figures.

    ``jobid`` goes through ``int()``, ``runtime`` through ``float()`` (to
    seed ``remain_time``) and ``score`` through ``float()``; values that
    cannot be converted make the constructor raise.  All other keys are
    optional and fall back to the default in their ``spec.get`` call.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = 'job'

    # The following fields are initialized at the beginning of simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.walltime_p = lookup("walltime_p")  # *AdjEst*
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')

    # Coscheduling only:
    #   last_hold  - time (unix sec) at which the job began its latest hold
    #   hold_time  - length of the period during which the job is holding
    #   yield_time - time the job first yields (spec key is 'first_yield')
    self.last_hold = lookup('last_hold', 0)
    self.hold_time = 0
    self.yield_time = lookup('first_yield', 0)

    self.state = lookup("state", "invisible")
    self.system_state = ''
    self.starttime = 0
    self.has_resources = False
    self.is_runnable = lookup("is_runnable", False)
    self.is_visible = False
    self.score = float(lookup("score", 0.0))
    self.attrs = lookup("attrs", {})
    self.args = []
    self.progress = 0
    self.recovering = False

    # Original location read from the job trace, used for job reservation.
    self.location = lookup('location', '')

    # I/O figures from the spec, each defaulting to 0.
    self.io_cnt = lookup('io_cnt', 0)
    self.io_size = lookup('io_size', 0)
    self.io_frac = lookup('io_frac', 0)
def __init__(self, spec):
    """Create a process group from ``spec``.

    ``args`` is stored exactly as given in ``spec`` (default: empty list).
    Keys that are absent from ``spec`` yield ``None`` unless a different
    fallback is given below.
    """
    Data.__init__(self, spec)
    self.tag = "process group"

    # Attributes copied straight from the spec; None when the key is absent.
    for key in (
        "cobalt_log_file", "cwd", "executable", "id", "jobid", "kernel",
        "kerneloptions", "ion_kerneloptions", "mode", "size", "stderr",
        "stdin", "stdout", "umask", "starttime", "walltime", "killtime",
        "resid", "runid", "forker", "ranks_per_node", "subblock_parent",
        "corner", "extents",
    ):
        setattr(self, key, spec.get(key))

    # Attributes with a non-None fallback.
    self.args = spec.get("args", [])
    self.env = spec.get("env", {})
    self.ion_kernel = spec.get("ion_kernel", "default")
    self.location = spec.get("location", [])
    self.user = spec.get("user", "")
    self.subblock = spec.get("subblock", False)
    self.attrs = spec.get("attrs", {})

    # State that is not read from the spec.
    self.exit_status = None
    self.head_pid = None
    self.nodefile = None
def __init__(self, spec):
    """Record the job described by ``spec`` and fork the child that runs it.

    The constructor copies the job description out of ``spec``, resolves
    ``self.user`` through ``pwd``, chooses the stdout/stderr log paths and
    calls ``os.fork()``.

    * Parent: returns immediately with ``self.pid`` set to the child's pid.
    * Child: writes a nodefile, exports the ``COBALT_*`` / ``USER`` /
      ``HOME`` variables, switches supplementary groups, gid and uid,
      applies the umask, redirects stderr and stdout to the log files,
      changes into ``self.cwd``, writes the cobalt log and finally replaces
      itself with ``self.executable`` via ``os.execl``.  Most of these steps
      are best effort: a failure is logged and the child carries on.

    Raises:
        ProcessGroupCreationError: ``pwd.getpwnam`` has no entry for
            ``self.user``.
    """
    Data.__init__(self, spec)

    # Pop from a copy so the caller's mapping keeps its entries.
    spec = spec.copy()
    self.tag = spec.get("tag", "process-group")
    # 0o22 is the value formerly written with the Python-2-only literal
    # 022; this spelling is accepted by Python 2.6+ and Python 3 alike.
    self.umask = spec.get('umask', 0o22)
    self.name = spec.pop("name", None)
    self.location = spec.pop("location", None)
    self.state = spec.pop("state", 'running')
    self.user = spec.pop("user", None)
    self.stdout = spec.pop("stdout", None)
    self.stderr = spec.pop("stderr", None)
    self.cobalt_log_file = spec.get('cobalt_log_file')
    self.executable = spec.pop("executable", None)
    self.jobid = spec.pop("jobid", None)
    self.path = spec.pop("path", None)
    self.cwd = spec.pop("cwd", None)
    self.args = spec.pop("args", [])
    self.env = spec.pop("env", None)
    self.stdin = spec.pop("stdin", None)
    self.kerneloptions = spec.pop("kerneloptions", None)
    self.job_size = spec.pop("size", None)
    self.id = spec.get("id")
    self.mpi_system_id = None
    self.exit_status = None
    self.log = logging.getLogger('pg')

    # Only the passwd lookup can raise KeyError, so only it is guarded.
    try:
        tmp_info = pwd.getpwnam(self.user)
    except KeyError:
        raise ProcessGroupCreationError("user/group")
    userid = tmp_info[2]
    groupid = tmp_info[3]
    home_dir = tmp_info[5]

    # NOTE(review): tempfile.mktemp() only returns a name and is documented
    # as race-prone.  It is kept because the files are opened later, by the
    # child, after it has switched to the job's uid.
    if self.stdout is not None:
        self.outlog = self.stdout
    else:
        self.outlog = tempfile.mktemp()
    if self.stderr is not None:
        self.errlog = self.stderr
    else:
        self.errlog = tempfile.mktemp()

    self.pid = os.fork()
    if self.pid:
        # Parent process: the child does all remaining work.
        return

    # ------------------------------------------------------------------
    # Child process from here on.
    # ------------------------------------------------------------------

    # Create a nodefile in the temp directory, one location per line.
    self.t = tempfile.NamedTemporaryFile()
    self.t.write("\n".join(self.location) + '\n')
    self.t.flush()

    os.environ['COBALT_NODEFILE'] = self.t.name
    os.environ["COBALT_JOBID"] = str(self.jobid)
    os.environ["COBALT_PARTNAME"] = self.location[0]
    os.environ["COBALT_JOBSIZE"] = str(self.job_size)
    os.environ['USER'] = self.user
    os.environ['HOME'] = home_dir

    # Supplementary groups: every group that lists the user as a member.
    supplementary_group_ids = [
        group.gr_gid for group in grp.getgrall() if self.user in group.gr_mem
    ]
    try:
        os.setgroups([])
        os.setgroups(supplementary_group_ids)
    except:
        # Deliberately best effort: log and continue.
        self.log.error("Failed to set supplementary groups for PG %s",
                       self.jobid, exc_info=True)

    # Group first, then user: once the uid is dropped the gid can no
    # longer be changed.
    try:
        os.setgid(groupid)
        os.setuid(userid)
    except OSError:
        self.log.error("Failed to change userid/groupid for PG %s"
                       % (self.jobid))
        # NOTE(review): sys.exit() raises SystemExit inside the forked
        # child, which unwinds through the caller's frames; os._exit() may
        # be what is wanted here.  Left unchanged pending confirmation.
        sys.exit(0)

    try:
        os.umask(self.umask)
    except:
        self.log.error("Failed to set umask to %s" % self.umask)

    # Redirect stderr, then stdout, onto the job's log files.
    try:
        err = open(self.errlog, 'a')
        os.dup2(err.fileno(), sys.__stderr__.fileno())
    except IOError:
        self.log.error(
            "Job %s/%s: Failed to open stderr file %s. Stderr will be lost"
            % (self.jobid, self.user, self.errlog))
    except OSError:
        self.log.error(
            "Job %s/%s: Failed to chmod or dup2 file %s. Stderr will be lost"
            % (self.jobid, self.user, self.errlog))

    try:
        out = open(self.outlog, 'a')
        os.dup2(out.fileno(), sys.__stdout__.fileno())
    except IOError:
        self.log.error(
            "Job %s/%s: Failed to open stdout file %s. Stdout will be lost"
            % (self.jobid, self.user, self.outlog))
    except OSError:
        # Fixed: this message used to name self.errlog although the file
        # that failed is the stdout log.
        self.log.error(
            "Job %s/%s: Failed to chmod or dup2 file %s. Stdout will be lost"
            % (self.jobid, self.user, self.outlog))

    # execl() takes the path followed by argv, and argv[0] is the program
    # name -- hence the executable appears twice.
    cmd = [self.executable, self.executable] + self.args

    chdir_error = ""
    try:
        os.chdir(self.cwd)
    except:
        self.log.error("Job %s/%s: unable to set cwd to %s"
                       % (self.jobid, self.user, self.cwd))
        chdir_error = "unable to set cwd to %s" % self.cwd

    try:
        cobalt_log_file = open(self.cobalt_log_file or "/dev/null", "a")

        def emit(text):
            # Writes exactly what ``print >> cobalt_log_file, text`` wrote:
            # the text followed by one newline.
            cobalt_log_file.write(text + "\n")

        try:
            if chdir_error:
                emit(chdir_error + "\n")
            emit("%s\n" % " ".join(cmd[1:]))
            emit("called with environment:\n")
            for key in os.environ:
                emit("%s=%s" % (key, os.environ[key]))
            emit("\n")
        finally:
            # Close even when a write fails, so the descriptor is not
            # carried into the exec'd program.
            cobalt_log_file.close()
    except:
        self.log.error("Job %s/%s: unable to open cobaltlog file %s"
                       % (self.jobid, self.user, self.cobalt_log_file))

    try:
        os.execl(*cmd)
    except Exception:
        self.log.error(
            "Job %s/%s: Something went wrong in starting the script job."
            % (self.jobid, self.user), exc_info=1)
        # Leave without running the parent's cleanup handlers in the child.
        os._exit(1)
def __init__(self, spec):
    """Record the job described by ``spec`` and fork the child that runs it.

    The constructor copies the job description out of ``spec``, resolves
    ``self.user`` through ``pwd``, chooses the stdout/stderr log paths and
    calls ``os.fork()``.

    * Parent: returns immediately with ``self.pid`` set to the child's pid.
    * Child: writes a nodefile, exports the ``COBALT_*`` / ``USER`` /
      ``HOME`` variables, switches supplementary groups, gid and uid,
      applies the umask, redirects stderr and stdout to the log files,
      changes into ``self.cwd``, writes the cobalt log and finally replaces
      itself with ``self.executable`` via ``os.execl``.  Most of these steps
      are best effort: a failure is logged and the child carries on.

    Raises:
        ProcessGroupCreationError: ``pwd.getpwnam`` has no entry for
            ``self.user``.
    """
    Data.__init__(self, spec)

    # Pop from a copy so the caller's mapping keeps its entries.
    spec = spec.copy()
    self.tag = spec.get("tag", "process-group")
    # 0o22 is the value formerly written with the Python-2-only literal
    # 022; this spelling is accepted by Python 2.6+ and Python 3 alike.
    self.umask = spec.get('umask', 0o22)
    self.name = spec.pop("name", None)
    self.location = spec.pop("location", None)
    self.state = spec.pop("state", 'running')
    self.user = spec.pop("user", None)
    self.stdout = spec.pop("stdout", None)
    self.stderr = spec.pop("stderr", None)
    self.cobalt_log_file = spec.get('cobalt_log_file')
    self.executable = spec.pop("executable", None)
    self.jobid = spec.pop("jobid", None)
    self.path = spec.pop("path", None)
    self.cwd = spec.pop("cwd", None)
    self.args = spec.pop("args", [])
    self.env = spec.pop("env", None)
    self.stdin = spec.pop("stdin", None)
    self.kerneloptions = spec.pop("kerneloptions", None)
    self.job_size = spec.pop("size", None)
    self.id = spec.get("id")
    self.mpi_system_id = None
    self.exit_status = None
    self.log = logging.getLogger('pg')

    # Only the passwd lookup can raise KeyError, so only it is guarded.
    try:
        tmp_info = pwd.getpwnam(self.user)
    except KeyError:
        raise ProcessGroupCreationError("user/group")
    userid = tmp_info[2]
    groupid = tmp_info[3]
    home_dir = tmp_info[5]

    # NOTE(review): tempfile.mktemp() only returns a name and is documented
    # as race-prone.  It is kept because the files are opened later, by the
    # child, after it has switched to the job's uid.
    if self.stdout is not None:
        self.outlog = self.stdout
    else:
        self.outlog = tempfile.mktemp()
    if self.stderr is not None:
        self.errlog = self.stderr
    else:
        self.errlog = tempfile.mktemp()

    self.pid = os.fork()
    if self.pid:
        # Parent process: the child does all remaining work.
        return

    # ------------------------------------------------------------------
    # Child process from here on.
    # ------------------------------------------------------------------

    # Create a nodefile in the temp directory, one location per line.
    self.t = tempfile.NamedTemporaryFile()
    self.t.write("\n".join(self.location) + '\n')
    self.t.flush()

    os.environ['COBALT_NODEFILE'] = self.t.name
    os.environ["COBALT_JOBID"] = str(self.jobid)
    os.environ["COBALT_PARTNAME"] = self.location[0]
    os.environ["COBALT_JOBSIZE"] = str(self.job_size)
    os.environ['USER'] = self.user
    os.environ['HOME'] = home_dir

    # Supplementary groups: every group that lists the user as a member.
    supplementary_group_ids = [
        group.gr_gid for group in grp.getgrall() if self.user in group.gr_mem
    ]
    try:
        os.setgroups([])
        os.setgroups(supplementary_group_ids)
    except:
        # Deliberately best effort: log and continue.
        self.log.error("Failed to set supplementary groups for PG %s",
                       self.jobid, exc_info=True)

    # Group first, then user: once the uid is dropped the gid can no
    # longer be changed.
    try:
        os.setgid(groupid)
        os.setuid(userid)
    except OSError:
        self.log.error("Failed to change userid/groupid for PG %s"
                       % (self.jobid))
        # NOTE(review): sys.exit() raises SystemExit inside the forked
        # child, which unwinds through the caller's frames; os._exit() may
        # be what is wanted here.  Left unchanged pending confirmation.
        sys.exit(0)

    try:
        os.umask(self.umask)
    except:
        self.log.error("Failed to set umask to %s" % self.umask)

    # Redirect stderr, then stdout, onto the job's log files.
    try:
        err = open(self.errlog, 'a')
        os.dup2(err.fileno(), sys.__stderr__.fileno())
    except IOError:
        self.log.error(
            "Job %s/%s: Failed to open stderr file %s. Stderr will be lost"
            % (self.jobid, self.user, self.errlog))
    except OSError:
        self.log.error(
            "Job %s/%s: Failed to chmod or dup2 file %s. Stderr will be lost"
            % (self.jobid, self.user, self.errlog))

    try:
        out = open(self.outlog, 'a')
        os.dup2(out.fileno(), sys.__stdout__.fileno())
    except IOError:
        self.log.error(
            "Job %s/%s: Failed to open stdout file %s. Stdout will be lost"
            % (self.jobid, self.user, self.outlog))
    except OSError:
        # Fixed: this message used to name self.errlog although the file
        # that failed is the stdout log.
        self.log.error(
            "Job %s/%s: Failed to chmod or dup2 file %s. Stdout will be lost"
            % (self.jobid, self.user, self.outlog))

    # execl() takes the path followed by argv, and argv[0] is the program
    # name -- hence the executable appears twice.
    cmd = [self.executable, self.executable] + self.args

    chdir_error = ""
    try:
        os.chdir(self.cwd)
    except:
        self.log.error("Job %s/%s: unable to set cwd to %s"
                       % (self.jobid, self.user, self.cwd))
        chdir_error = "unable to set cwd to %s" % self.cwd

    try:
        cobalt_log_file = open(self.cobalt_log_file or "/dev/null", "a")

        def emit(text):
            # Writes exactly what ``print >> cobalt_log_file, text`` wrote:
            # the text followed by one newline.
            cobalt_log_file.write(text + "\n")

        try:
            if chdir_error:
                emit(chdir_error + "\n")
            emit("%s\n" % " ".join(cmd[1:]))
            emit("called with environment:\n")
            for key in os.environ:
                emit("%s=%s" % (key, os.environ[key]))
            emit("\n")
        finally:
            # Close even when a write fails, so the descriptor is not
            # carried into the exec'd program.
            cobalt_log_file.close()
    except:
        self.log.error("Job %s/%s: unable to open cobaltlog file %s"
                       % (self.jobid, self.user, self.cobalt_log_file))

    try:
        os.execl(*cmd)
    except Exception:
        self.log.error(
            "Job %s/%s: Something went wrong in starting the script job."
            % (self.jobid, self.user), exc_info=1)
        # Leave without running the parent's cleanup handlers in the child.
        os._exit(1)