class SSScheduler:
    def __init__(self, algoname, database):
        self.logger = SSLogger('Scheduler')
        if algoname == 'CE':
            self.algo = SSCEAlgorithm()
        elif algoname == 'CS':
            self.algo = SSCSAlgorithm()
        elif algoname == 'SS':
            self.algo = SSSSAlgorithm()
        else:
            self.logger.error('No such algorithm, use CE/CS/SS.')
        self.db = database
        self.logger.succ('Algorithm %s used for resource allocation' % self.algo.name)

    # the scheduling algorithm happens here
    # returns (allocation, est): a dict daemon -> jobspec, and the estimated wall time
    def nextJob(self):
        # no node or no job: cannot schedule
        if len(self.db.pendingJobs) and len(self.db.cluster.nodes):
            jobid = self.db.mostPriorJob()
            # (parallelism, alpha, dict(scale -> {time, ipcs, mbws}))
            profile = self.db.getProfile(jobid)
            # the scheduling algorithm decides the order in which to try different
            # scales, and may try only some of them (CE only tries 1x, E);
            # candidates share the same data structure as profile
            candidates = self.algo.sortCandidates(profile)
            # try to allocate for each scale; on success, break
            allocation, est = None, None
            for parallelism, scale, mode, alpha, ipcs, mbws, toprofile in candidates:
                N, C, W, B = self.algo.calculateResourceDemand(
                    parallelism, scale, mode, alpha, ipcs, mbws)
                if N <= 0:  # N <= 0 means not feasible
                    continue
                # resource allocation; None if the resources are not available
                # allocation is a dict: daemon -> jobspec (see Protocol)
                allocation = self.db.allocateFor(jobid, N, C, W, B, scale, mode, toprofile)
                if allocation:
                    est = self.algo.estimate(profile, scale, W)
                    self.db.jobStart(jobid, est)
                    break
            if not allocation:
                self.db.jobStuck(jobid)
            return (allocation, est)
        return (None, None)
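
# Shape sketch (field values are hypothetical) of one candidate tuple consumed
# by the loop in nextJob() above; sortCandidates() returns a list of these:
candidate = (
    16,     # parallelism: cores requested by the job
    2,      # scale: spread over 2x the minimal node count
    'E',    # mode tag interpreted by the algorithm (e.g. the 'E' that CE tries)
    0.9,    # alpha: tolerable performance loss factor
    None,   # ipcs: ipc-vs-ways curve from the profile (None if unprofiled)
    None,   # mbws: membw-vs-ways curve from the profile
    True,   # toprofile: this run should produce a new profile entry
)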
# the per-node daemon (class name inferred from the 'Daemon' logger tag)
class SSDaemon:
    def __init__(self):
        self.net = SSWorkerNetwork()
        self.prtl = SSProtocol()
        self.logger = SSLogger('Daemon')
        self.jobrunners = []
        # without this wait, the first send fails to connect; root cause unknown
        time.sleep(1)
        # greet the master: I am a daemon
        self.net.sendObj(self.prtl.greeting('daemon', self.net.hostname))
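
# A possible replacement for the sleep(1) workaround above (a sketch, not the
# original fix): retry the greeting with exponential backoff instead of a fixed
# wait. _greet_with_retry is a hypothetical helper, not part of the sources.
def _greet_with_retry(net, prtl, attempts=5, delay=0.2):
    for i in range(attempts):
        try:
            net.sendObj(prtl.greeting('daemon', net.hostname))
            return True
        except OSError:
            time.sleep(delay * (2 ** i))  # back off and retry
    return False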
# base class shared by SSCEAlgorithm/SSCSAlgorithm/SSSSAlgorithm
# (the class name SSAlgorithm is inferred from the 'Algorithm' logger tag)
class SSAlgorithm:
    def __init__(self, name):
        self.name = name
        self.total_cores = CFG.CLUSTER['core_per_node']
        self.total_ways = CFG.CLUSTER['llcway_per_node']
        self.total_membw = CFG.CLUSTER['membw_per_node']
        self.logger = SSLogger(name='Algorithm')
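
# A minimal sketch (not from the original sources) of the interface SSScheduler
# expects from a concrete algorithm. The policy below (only try scale 1, pack
# cores, grant the full LLC and bandwidth) is a placeholder for illustration;
# the real SSCEAlgorithm/SSCSAlgorithm/SSSSAlgorithm implement their own logic.
class SketchAlgorithm(SSAlgorithm):
    def __init__(self):
        super().__init__('Sketch')

    # profile is (parallelism, alpha, {scale: {'time', 'ipcs', 'mbws'}} or None);
    # return candidate tuples (parallelism, scale, mode, alpha, ipcs, mbws, toprofile)
    def sortCandidates(self, profile):
        parallelism, alpha, prof = profile
        entry = (prof or {}).get(1)  # only consider the 1x scale
        ipcs = entry['ipcs'] if entry else None
        mbws = entry['mbws'] if entry else None
        return [(parallelism, 1, 'E', alpha, ipcs, mbws, entry is None)]

    # return N nodes plus the per-node demand (C cores, W ways, B bandwidth)
    def calculateResourceDemand(self, parallelism, scale, mode, alpha, ipcs, mbws):
        C = min(parallelism, self.total_cores)
        N = (parallelism + C - 1) // C  # nodes needed to pack all processes
        return (N, C, self.total_ways, self.total_membw)

    # return (estimated time, estimated speedup); recorded by jobStart/jobFinish
    def estimate(self, profile, scale, W):
        _, _, prof = profile
        entry = (prof or {}).get(scale)
        return (entry['time'], 1.0) if entry else (-1, 1.0)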
class SSDatabase:
    def __init__(self, algorithm, simulationClock=None, logToFile=True):
        # the file used to store history, in json format
        self.logToFile = logToFile
        if self.logToFile:
            self.historyFilename = 'JobLogs/%s_%s_%s_%s.txt' % (
                CFG.DB['history_prefix'],
                'sim' if simulationClock else 'run',
                algorithm,
                datetime.utcnow().strftime('%Y%m%d-%H%M%S'))
        # the file used to store profiles, in json format
        self.profileFilename = CFG.DB['profile_fname']
        # a simulated clock for simulation
        self.simulationClock = simulationClock
        self.jobToReq = dict()
        self.logger = SSLogger('Database')
        self.cluster = SSCluster()
        # job id, incremented by one per job
        self.jobid = 0
        # jobid -> jobattr
        # jobattr is a dict, e.g.,
        #   'jobname': 'MG16'    specifies the executable binary
        #   'framework': 'MPI'   used to build the running command
        #   'parallelism': 16    how many cores are needed
        #   'alpha':             a factor indicating the tolerable performance loss
        self.jobidToJobattr = dict()
        # the resources a job is using
        self.jobidToResource = dict()
        # the daemons a job is running on
        self.jobidToDaemons = dict()
        # the returns of a job
        self.jobidToReturns = dict()
        # a priority criterion used for scheduling
        # jobid -> (current priority, stride, last check timestamp)
        self.jobidToPriority = dict()
        # profile data for programs
        # a program is the executable binary of a job; TODO: a more accurate signature
        # currently the jobname 'MG16' is used as the program signature
        # self.progToProfile['MG16'] is a dict:
        #   profile[scale factor] = {'time': execution time,
        #                            'ipcs': ipc-ways curve, 'mbws': membw-ways curve}
        self.progToProfile = dict()
        self.loadProfileFromFile()
        # three lists: pending, running, finished
        self.pendingJobs = []
        self.runningJobs = []
        self.completedJobs = []
        # job history (submit/start/finish/allocation), keyed by jobid
        self.history = dict()

    def loadProfileFromFile(self):
        # if no such file, create an empty one
        if not os.path.exists(self.profileFilename):
            with open(self.profileFilename, 'w+') as fw:
                fw.write('')
        # each line in the file is a dict:
        #   {'prog': prog, 'scale': scale, 'value': value}
        # stored as self.progToProfile[prog][scale] = value
        cnt = 0
        with open(self.profileFilename, 'r') as fr:
            for line in fr.readlines():
                if len(line.strip()) == 0:
                    continue
                kv = json.loads(line)
                cnt += 1
                prog, scale, value = kv['prog'], kv['scale'], kv['value']
                if prog not in self.progToProfile:
                    self.progToProfile[prog] = dict()
                self.progToProfile[prog][scale] = value
        self.logger.info('Profile loaded, %d entries in total.' % cnt)

    def getTimestampNow(self):
        if self.simulationClock:
            return self.simulationClock.now()
        else:
            return datetime.utcnow().timestamp()

    def addDaemon(self, daemon, hostname):
        self.cluster.addNode(daemon, hostname)
        self.logger.debug('New daemon:', daemon, 'at', hostname)

    def addUserJob(self, job):
        jobid = self.jobid
        self.logger.debug('Job added [%d]: %s' % (jobid, job))
        self.jobidToJobattr[jobid] = job
        # add to the pending list
        self.pendingJobs.append(jobid)
        self.jobidToPriority[jobid] = {
            'value': 0,
            'stride': CFG.DB['default_stride'],
            'lastcheck': self.getTimestampNow()
        }
        # record the submit time
        self.history[jobid] = {
            'submitTime': self.getTimestampNow(),
            'jobattr': job
        }
        self.jobid += 1
        return jobid

    def jobStart(self, jobid, est=-1):
        self.cluster.resourceAlloc(self.jobidToResource[jobid], jobid)
        self.jobidToDaemons[jobid] = [
            x for x, _, _ in self.jobidToResource[jobid]
        ]
        self.jobidToReturns[jobid] = []
        self.pendingJobs.remove(jobid)
        self.runningJobs.append(jobid)
        # restore the priority stride of all jobs
        for _, p in self.jobidToPriority.items():
            p['stride'] = CFG.DB['default_stride']
        self.history[jobid]['startTime'] = self.getTimestampNow()
        self.history[jobid]['estTime'] = est
        self.logger.info(
            'job [%d] (%s) starts, scale %d, resource req:' %
            (jobid, self.jobidToJobattr[jobid]['jobname'],
             self.history[jobid]['scale']),
            self.history[jobid]['NCWB'],
            ', on nodes:', self.history[jobid]['nodelist'],
            'NewProfiling' if self.history[jobid]['toprofile'] else 'InDB')

    def jobFinish(self, jobid):
        # record the end time
        self.history[jobid]['finishTime'] = self.getTimestampNow()
        jobtime = int(100 * (self.history[jobid]['finishTime'] -
                             self.history[jobid]['startTime'])) / 100
        # the returns from all daemons
        returns = self.jobidToReturns[jobid]
        # check the exit codes, which should all be 0
        exitcode = 0
        for ret in returns:
            ec = ret.get('exitcode', 0)
            if ec != 0:
                exitcode = ec
                break
        # if there is an estimation (est_time, est_speedup), report the est_time
        est = self.history[jobid]['estTime'][0] if self.history[jobid]['estTime'] else -1
        if exitcode != 0:
            self.logger.error(
                'job [%d] (%s) finishes after %.2f seconds (%.2f est), with exitcode %d'
                % (jobid, self.jobidToJobattr[jobid]['jobname'], jobtime, est, exitcode))
        else:
            self.logger.info(
                'job [%d] (%s) finishes after %.2f seconds (%.2f est), with exitcode %d'
                % (jobid, self.jobidToJobattr[jobid]['jobname'], jobtime, est, exitcode))
        # log the execution record
        if self.logToFile:
            with open(self.historyFilename, 'a') as fw:
                fw.write('JOBID %5d: %s\n' % (jobid, json.dumps(self.history[jobid])))
        # update the profile
        if self.history[jobid]['toprofile']:
            scale = self.history[jobid]['scale']
            prog = self.jobidToJobattr[jobid]['jobname']
            # profile[scale factor] = {'time': execution time,
            #                          'ipcs': ipc-ways curve, 'mbws': membw-ways curve}
            if prog not in self.progToProfile:
                self.progToProfile[prog] = dict()
            # several concurrent profiling runs may produce the same scale;
            # only the first one is used (?? or should we use the last one ??)
            if scale not in self.progToProfile[prog]:
                wcnt = CFG.CLUSTER['llcway_per_node'] + 1
                ipcs = [0] * wcnt
                mbws = [0] * wcnt
                ret_cnt = [0] * wcnt
                # average over all daemons
                for ret in returns:
                    if 'ipcs' not in ret:
                        continue
                    for w in range(1, wcnt):
                        ipc, mbw = ret['ipcs'][w], ret['mbws'][w]
                        if ipc > 0 and mbw > 0:
                            ipcs[w] += ipc
                            mbws[w] += mbw
                            ret_cnt[w] += 1
                for w in range(1, wcnt):
                    ipcs[w] = int(10000 * ipcs[w] / ret_cnt[w]) / 10000 if ret_cnt[w] > 0 else -1
                    mbws[w] = int(10000 * mbws[w] / ret_cnt[w]) / 10000 if ret_cnt[w] > 0 else -1
                self.progToProfile[prog][scale] = {
                    'time': jobtime,
                    'ipcs': ipcs,
                    'mbws': mbws
                }
                # log to file
                with open(self.profileFilename, 'a') as fw:
                    fw.write(json.dumps({
                        'prog': prog,
                        'scale': scale,
                        'value': self.progToProfile[prog][scale]
                    }))
                    fw.write('\n')
                self.logger.debug('profile:', self.progToProfile[prog][scale])
        # update the other data structures
        self.cluster.resourceFree(self.jobidToResource[jobid])
        self.jobidToDaemons.pop(jobid)
        self.completedJobs.append(jobid)
        self.runningJobs.remove(jobid)

    # a message should be received from each daemon before the job is really completed
    def daemonFinishJob(self, dae, jobid, jobreturns):
        self.jobidToDaemons[jobid].remove(dae)
        self.jobidToReturns[jobid].append(jobreturns)
        if len(self.jobidToDaemons[jobid]) == 0:
            self.jobFinish(jobid)

    def jobStuck(self, jobid):
        # decrease its priority stride
        self.jobidToPriority[jobid]['stride'] = CFG.DB['slow_stride']

    def mostPriorJob(self):
        # update the priority of all jobs
        now = self.getTimestampNow()
        for _, p in self.jobidToPriority.items():
            p['value'] += p['stride'] * (now - p['lastcheck'])
            p['lastcheck'] = now
        # sort pending jobs by priority (highest first)
        self.pendingJobs.sort(
            key=lambda x: self.jobidToPriority[x]['value'] - x, reverse=True)
        return self.pendingJobs[0]

    # return the current profile of the program corresponding to jobid
    def getProfile(self, jobid):
        attr = self.jobidToJobattr[jobid]
        prog = attr['jobname']
        return (attr['parallelism'], attr['alpha'],
                self.progToProfile.get(prog, None))

    # find an allocation (None if not found)
    # scale and mode are recorded in history; the NCWB values already imply them
    def allocateFor(self, jobid, N, C, W, B, scale, mode, toprofile):
        # some jobs cannot scale out
        if self.jobidToJobattr[jobid]['framework'] == 'TensorFlow':
            # for now we use only single-node tensorflow programs
            if scale != 1:
                return None
        # do not allow big jobs (half the machine) to spread
        if N > 32 and scale > 1 and N / scale > 0.5 * len(self.cluster.nodes):
            return None
        # try to allocate the resources
        perNodeReq = {'C': C, 'W': W, 'B': B}
        resourceAllocation = self.cluster.search(N, perNodeReq)
        if resourceAllocation:
            self.jobidToResource[jobid] = resourceAllocation
            alloc = []
            affinity = dict()
            for daemon, _, _ in resourceAllocation:
                affinity[self.cluster.nodes[daemon]['hostname']] = \
                    self.cluster.nodes[daemon]['core']
            nodelist = sorted(affinity.keys())
            leadnode = nodelist[0]
            for daemon, _, _ in resourceAllocation:
                jobspec = {
                    'jobid': jobid,
                    'jobattr': self.jobidToJobattr[jobid],
                    'coremap': self.cluster.nodes[daemon]['core'],
                    'llcwaymap': self.cluster.nodes[daemon]['llcway'],
                    'leadnode': leadnode,
                    'toprofile': toprofile
                }
                jobspec['affinity'] = affinity
                alloc.append((daemon, jobspec))
            self.history[jobid]['allocation'] = alloc
            self.history[jobid]['nodelist'] = nodelist
            self.history[jobid]['NCWB'] = (N, C, W, B)
            self.history[jobid]['scale'] = scale
            self.history[jobid]['mode'] = mode
            self.history[jobid]['toprofile'] = toprofile
            return alloc
        else:
            return None
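
# Worked example (stride values are hypothetical, standing in for
# CFG.DB['default_stride'] and CFG.DB['slow_stride']) of the priority aging in
# mostPriorJob() and jobStuck(): priority grows by stride * elapsed seconds,
# and a stuck job's stride is lowered so it stops overtaking schedulable jobs.
default_stride, slow_stride = 10, 1
prio_A = default_stride * 30                     # pending 30 s, never stuck
prio_B = default_stride * 10 + slow_stride * 20  # stuck after 10 s of the same 30 s
assert prio_A > prio_B  # A is tried first although both arrived together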
class SSNetwork:
    def __init__(self, mode='worker'):
        self.logger = SSLogger('Network', info=False, echo=False)
        self.hostname = socket.gethostname()
        # mode: master or worker
        self.mode = mode
        # selector used for non-blocking io
        self.sel = selectors.DefaultSelector()
        # connections, client -> connection
        self.connections = dict()
        # object buffer for each connection, connection -> object list
        # buf[0] is the trailing string (without a '#' EOC end flag)
        # buf[1] and after are completed commands
        self.objectBuffer = dict()
        # constant values
        self.EOC = CFG.NET['eoc']
        self.CONNECTION_BROKEN = CFG.NET['broken_conn_str']
        self.NEW_CONNECTION = CFG.NET['new_conn_str']
        self.SS_MASTER = socket.gethostbyname(CFG.NET['master_hostname'])
        self.SS_PORT = CFG.NET['master_port']
        self.BACK_LOG = CFG.NET['master_backlog']
        # master and worker:
        # the master connects to all workers; a worker only connects to the
        # master, and there are no inter-worker connections
        if mode == 'master':
            lsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # IPv4 and TCP
            lsock.bind(('', self.SS_PORT))  # accept from any address
            lsock.listen(self.BACK_LOG)
            # use the selector for non-blocking IO; only watch READ,
            # and assume the socket is always writable
            lsock.setblocking(False)
            self.sel.register(lsock, selectors.EVENT_READ, data=self.NEW_CONNECTION)
            self.logger.info('Master started on %s' % socket.gethostname())
        else:
            # worker: both daemons and user frontends
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setblocking(False)
            sock.connect_ex((self.SS_MASTER, self.SS_PORT))
            self.sel.register(sock, selectors.EVENT_READ, data='master')
            self.connections['master'] = sock  # only connects to the master
            self.objectBuffer[sock] = ['']
            self.logger.info('Daemon started on %s' % socket.gethostname())

    # send an object to a destination
    # 1. serialize the object to a string with json; only basic python types are supported
    # 2. append EOC to the string so that object strings can be separated on the remote side
    # 3. sendall; we do not split a string over multiple sends.
    #    sendall is blocking but works in our case
    def sendObjTo(self, destination, obj=None):
        wrapMsg = json.dumps(obj) + self.EOC
        self.connections[destination].sendall(wrapMsg.encode('utf-8'))

    # receive an object from anywhere
    # return value: (source, object received)
    # 1. pick an object from any buffer and return both the source and the object
    # 2. check for new connections; if any, connect
    # 3. check for new data from the network; if any, buffer it
    def recvObj(self, timeout=1):
        # find a connection that has objects, and return its first pending object
        for client, conn in self.connections.items():
            buf = self.objectBuffer[conn]
            if len(buf) >= 2:
                obj = buf.pop(1)
                if obj == self.CONNECTION_BROKEN:
                    # all objects from the broken connection have been received
                    self.logger.info(client, 'lost connection')
                    assert buf[0] == ''  # there should be no trailing incomplete string
                    self.objectBuffer.pop(conn)  # remove the broken connection's entry
                    self.connections.pop(client)  # also remove the connection
                return (client, obj)
        sourcelist = []
        # check whether there is something to read from the sockets
        events = self.sel.select(timeout=timeout)
        for key, mask in events:
            assert mask & selectors.EVENT_READ
            # new connection; only the master should receive this
            if key.data is self.NEW_CONNECTION:
                assert self.mode == 'master'
                sock = key.fileobj
                conn, addr = sock.accept()
                conn.setblocking(False)
                # use addr to distinguish clients
                self.sel.register(conn, selectors.EVENT_READ, data=addr)
                # TODO: re-connect workers
                if addr in self.connections:
                    print('Currently we do not handle this case')
                    assert False
                self.connections[addr] = conn  # record the new connection
                self.objectBuffer[conn] = ['']
                # for a new connection event, no data needs to be received
            else:
                # key.data is the 'addr' of a client that has data pending
                sourcelist.append(key.data)
        # now read data from all connections that have data
        for source in sourcelist:
            conn = self.connections[source]
            # the entry should have been added when the connection was built
            assert conn in self.objectBuffer
            buf = self.objectBuffer[conn]
            # receive whatever it can:
            # a string that may hold <1, =1, or >1 dumped objects
            s = conn.recv(1024).decode('utf-8')
            if len(s) == 0:
                # connection broken
                # NOTE!!! DO NOT pop the connection from the buffer immediately,
                # since it may still have unread objects
                # (i.e., don't do self.objectBuffer.pop(conn) here);
                # the connection can, however, be unregistered
                self.sel.unregister(conn)
                # append the broken info to the object buffer
                buf.append(self.CONNECTION_BROKEN)
            else:
                # normal string: split by EOC; each slice is an object or a piece of one
                ss = s.split(self.EOC)
                if len(ss) == 1:
                    buf[0] = buf[0] + ss[0]  # still incomplete
                else:
                    # the previously incomplete buf[0] is now complete; de-serialize it
                    buf.append(json.loads(buf[0] + ss[0]))
                    for i in range(1, len(ss) - 1):  # middle slices must be complete
                        buf.append(json.loads(ss[i]))
                    buf[0] = ss[-1]  # the tail may be incomplete
        return (None, None)  # nothing received
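
# Standalone sketch of the message framing recvObj() implements, assuming the
# EOC marker is '#' (as hinted by the buffer comment above): JSON objects are
# concatenated with EOC, and buf[0] always holds the incomplete trailing piece.
import json

EOC = '#'

def feed(buf, chunk):
    ss = chunk.split(EOC)
    if len(ss) == 1:
        buf[0] += ss[0]  # no EOC seen: the object is still incomplete
    else:
        buf.append(json.loads(buf[0] + ss[0]))  # the pending piece completes
        for piece in ss[1:-1]:                  # middle slices are whole objects
            buf.append(json.loads(piece))
        buf[0] = ss[-1]                         # the tail may be incomplete

buf = ['']
feed(buf, '{"a": 1}#{"b"')  # one whole object plus the start of another
feed(buf, ': 2}#')          # the rest of the second object
assert buf == ['', {'a': 1}, {'b': 2}]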
class SSJobRunner(threading.Thread):
    def __init__(self, hostname, jobspec, name='JobRunner'):
        super().__init__()
        self.jobspec = jobspec
        self.hostname = hostname
        self.logger = SSLogger(name)
        # results for the parent
        self.returns = dict()

    # cores[i] = jobid means the i-th core is used by jobid
    # ways[i] = jobid means the i-th way is used by jobid
    # return the ssh command lists that program CAT via 'pqos -e; pqos -a', e.g.
    #   sudo pqos -e "llc:1=0xffff0;llc:2=0x0000f"
    #   sudo pqos -a "llc:1=0-7;llc:2=8-27"
    def getCATString(self, cores, ways):
        self.logger.debug('cores:', cores)
        self.logger.debug('ways:', ways)
        jobids = set(ways)  # the jobs that are explicitly using the LLC
        if -1 in jobids:
            jobids.remove(-1)  # -1 means no job
        if len(jobids) == 0:
            # no CAT, reset; for LLC-unaware policies like CE and CS
            return [['ssh', 'root@' + self.hostname, 'pqos -R']]
        # give the spare ways to jobs
        jobids = list(jobids)
        for i, jid in enumerate(ways):
            if jid == -1:
                ways[i] = jobids[i % len(jobids)]
        # sort the ways by jobid so that each COS uses a contiguous range
        ways.sort()
        # make the CAT decision
        cos = []
        for jobid in jobids:
            cos.append({'cores': [], 'ways': []})
            for c, jid in enumerate(cores):
                if jobid == jid:
                    cos[-1]['cores'].append(c)
            for w, jid in enumerate(ways):
                if jobid == jid:
                    cos[-1]['ways'].append(w)
        pqosE = []
        pqosA = []
        for i, c in enumerate(cos):
            s = hex(int(''.join(['1' if x in c['ways'] else '0'
                                 for x in range(19, -1, -1)]), 2))
            pqosE.append('llc:%d=%s' % (i + 1, s))
            pqosA.append('llc:%d=%s' % (i + 1, ','.join(str(x) for x in c['cores'])))
        pqosEcmd = 'pqos -e "%s"' % (';'.join(pqosE))
        pqosAcmd = 'pqos -a "%s"' % (';'.join(pqosA))
        return [['ssh', 'root@' + self.hostname, pqosEcmd],
                ['ssh', 'root@' + self.hostname, pqosAcmd]]

    # the command used to launch the program
    def getLaunchString(self, jobspec):
        fm = jobspec['jobattr']['framework']
        if fm == 'MPI' or fm == 'Spark':
            # only the lead node launches MPI/Spark jobs
            if jobspec['leadnode'] != self.hostname:
                return (None, None)
        elif fm == 'TensorFlow':
            assert len(jobspec['affinity']) == 1
        affs = dict()
        for host, corelist in jobspec['affinity'].items():
            affs[host] = []
            for c, jid in enumerate(corelist):
                if jobspec['jobid'] == jid:
                    affs[host].append(str(c))
        envs = dict()
        # running commands
        # the jobname format is prog-nproc, e.g. mg-16, bfs-32
        prog, nproc = jobspec['jobattr']['jobname'].split('-')
        assert int(nproc) == jobspec['jobattr']['parallelism']
        prog = prog.lower()
        # env vars for MPI on bic
        if fm == 'MPI':
            envs['I_MPI_SHM_LMT'] = 'shm'
            envs['I_MPI_DAPL_PROVIDER'] = 'ofa-v2-ib0'
        if prog in ['mg', 'lu', 'ep', 'cg']:  # four NPB programs
            exePath = '%s/%s.D.%s' % (CFG.RUN['exe_path']['npb'], prog, nproc)
            # mpirun -host bic05 -env I_MPI_PIN_PROCESSOR_LIST=1,2,3,4,5,6,7,9 -n 8 ./mg.D.16 :
            #        -host bic06 -env I_MPI_PIN_PROCESSOR_LIST=15,16,17,18,19,20,21,22 -n 8 ./mg.D.16
            exeCmd = ['mpirun']
            for host, corelist in affs.items():
                exeCmd.extend(['-host', host,
                               '-env', 'I_MPI_PIN_PROCESSOR_LIST=%s' % ','.join(corelist),
                               '-n', str(len(corelist)), exePath, ':'])
            exeCmd.pop(-1)
        elif prog in ['ts', 'wc', 'nw']:  # three spark programs
            spark_master = str(int(jobspec['leadnode'][-2:]))
            exePath = '%s/%s.sh' % (CFG.RUN['exe_path']['spark'], prog)
            exeCmd = [exePath, spark_master]
            for host, corelist in affs.items():
                exeCmd.extend([host, str(len(corelist)), ','.join(corelist)])
            self.logger.warn('%s: %s' % (prog, ' '.join(exeCmd)))
        elif prog in ['bfs']:  # graph program
            exePath = CFG.RUN['exe_path'][prog]
            exeCmd = ['mpirun']
            for host, corelist in affs.items():
                exeCmd.extend(['-host', host,
                               '-env', 'I_MPI_PIN_PROCESSOR_LIST=%s' % ','.join(corelist),
                               '-n', str(len(corelist)), exePath, '24', '16', ':'])
            exeCmd.pop(-1)
        elif prog in ['hc', 'bw']:  # two speccpu programs
            exePath = CFG.RUN['exe_path'][prog]
            exeCmd = ['mpirun']
            for host, corelist in affs.items():
                exeCmd.extend(['-host', host,
                               '-env', 'I_MPI_PIN_PROCESSOR_LIST=%s' % ','.join(corelist),
                               '-n', str(len(corelist)), exePath, ':'])
            exeCmd.pop(-1)
            # TODO: wrap the jobs and move this cd operation into wrapper scripts
            os.chdir(CFG.RUN['exe_dir'][prog])
        elif prog in ['gan', 'rnn']:  # two tensorflow programs
            exePath = CFG.RUN['exe_path'][prog]
            assert len(affs) == 1  # should run on only one node
            for host, corelist in affs.items():
                exeCmd = [exePath, str(len(corelist)), ','.join(corelist)]
        else:
            assert False
        return (envs, exeCmd)

    def getProfileString(self, jobspec):
        if not jobspec['toprofile'] or jobspec['leadnode'] != self.hostname:
            return None
        return [CFG.RUN['deploy_path'] + 'SSmonitor.py']

    def run(self):
        jobname = self.jobspec['jobattr']['jobname']
        self.logger.debug('Run:', jobname)
        # CAT configuration
        catCmds = self.getCATString(self.jobspec['coremap'], self.jobspec['llcwaymap'])
        for catCmd in catCmds:
            subprocess.run(catCmd, stdout=subprocess.DEVNULL)
        # start the profiler (if needed)
        profCmd = self.getProfileString(self.jobspec)
        if profCmd:
            self.logger.debug('PROF CMD:', ' '.join(profCmd))
            pProfiler = subprocess.Popen(profCmd, stdout=subprocess.PIPE)
        # run the executable
        envs, exeCmd = self.getLaunchString(self.jobspec)
        if envs:
            for k, v in envs.items():
                os.environ[k] = v
        if exeCmd:
            self.logger.debug('EXE CMD:', ' '.join(exeCmd))
            pRun = subprocess.run(exeCmd, stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL)
            self.returns['exitcode'] = pRun.returncode
            self.logger.debug('EXE Done:', exeCmd)
        # back to the deploy path
        os.chdir(CFG.RUN['deploy_path'])
        # terminate the profiler, sort out the result,
        # and return it to the daemon to be sent to the master
        if profCmd:
            self.logger.debug('check profile results')
            pProfiler.terminate()
            # profiler output columns: llcway ipc mbw
            ipcs, mbws = [], []
            for _ in range(0, 1 + CFG.CLUSTER['llcway_per_node']):
                ipcs.append([])
                mbws.append([])
            for line in pProfiler.stdout:
                ss = line.decode('utf-8').strip().split()
                if len(ss) != 3:
                    break
                w, ipc, mbw = int(ss[0]), float(ss[1]), float(ss[2])
                ipcs[w].append(ipc)
                mbws[w].append(mbw)
            for w in range(0, 1 + CFG.CLUSTER['llcway_per_node']):
                ipcs[w] = numpy.average(ipcs[w]) if len(ipcs[w]) else -1
                mbws[w] = numpy.average(mbws[w]) if len(mbws[w]) else -1
            # linear interpolation between the sampled way counts
            for i in range(0, len(CFG.PROF['sample_ways']) - 1):
                cur_w, next_w = CFG.PROF['sample_ways'][i], CFG.PROF['sample_ways'][i + 1]
                for k in range(min(cur_w, next_w) + 1, max(cur_w, next_w)):
                    ipcs[k] = ipcs[next_w] + (ipcs[cur_w] - ipcs[next_w]) / (cur_w - next_w) * (k - next_w)
                    mbws[k] = mbws[next_w] + (mbws[cur_w] - mbws[next_w]) / (cur_w - next_w) * (k - next_w)
            self.logger.warn(ipcs)
            self.returns['ipcs'] = ipcs
            self.returns['mbws'] = mbws
            self.logger.echo('RETURNS', self.returns)
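
# Worked example (hypothetical way assignment) of the bitmask getCATString()
# builds: a COS that owns ways 0-3 of a 20-way LLC becomes mask 0xf, which is
# what the 'llc:<cos>=<mask>' strings passed to pqos -e encode.
ways_of_cos = [0, 1, 2, 3]
mask = hex(int(''.join('1' if w in ways_of_cos else '0'
                       for w in range(19, -1, -1)), 2))
assert mask == '0xf'
print('pqos -e "llc:1=%s"' % mask)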
class SSMaster:
    def __init__(self, algoname='CE', alpha=0.9):
        self.MIN_DAEMONS = 8
        self.net = SSMasterNetwork()
        self.db = SSDatabase(algorithm=algoname)
        self.sched = SSScheduler(algoname=algoname, database=self.db)
        self.default_alpha = alpha
        self.prtl = SSProtocol()
        self.logger = SSLogger('Master')
        self.parser = SSParser()
        self.users = []
        self.daemons = []

    def isclean(self):
        return not (len(self.db.pendingJobs) or len(self.db.runningJobs))

    def addJobSequence(self, jobstring):
        for n in jobstring.split(','):
            n = n.strip()
            exe = n.split('-')[0]
            if exe in ['gan', 'rnn']:
                fm = 'TensorFlow'
            elif exe in ['ts', 'nw', 'wc']:
                fm = 'Spark'
            else:
                fm = 'MPI'
            self.db.addUserJob({'jobname': n,
                                'framework': fm,
                                'parallelism': int(n[-2:]),
                                'alpha': self.default_alpha})

    def parse(self):
        self.parser.addRecords(self.parser.loadHistory(self.db.history))
        return self.parser.getBasicStats(self.parser.selectRecords())

    # the main loop
    def run(self):
        # try to get a new message and act accordingly
        client, msg = self.net.recvObj(timeout=1)
        if client:
            # connection broken
            if msg == self.net.CONNECTION_BROKEN:
                # client lost
                if client in self.users:
                    self.users.remove(client)
                if client in self.daemons:
                    self.logger.error('No handler for a lost daemon !!')
                    self.daemons.remove(client)
                    # TODO: remove it from the database, let the scheduler reschedule
            # normal messages
            elif self.prtl.isgreeting(msg):
                # new client
                if msg['role'] == 'user':
                    self.logger.debug('New User from', client)
                    self.users.append(client)  # user, for interaction
                elif msg['role'] == 'daemon':
                    self.logger.debug('New Daemon from', client)
                    self.daemons.append(client)  # a daemon runs on each node
                    self.db.addDaemon(client, msg['hostname'])
            elif self.prtl.isjobfinish(msg):
                # NOTE: only one daemon of the job has finished;
                # all daemons must finish for the job to really finish
                self.db.daemonFinishJob(client, msg['jobid'], msg['returns'])
        # wait until all daemons have connected
        if len(self.daemons) < self.MIN_DAEMONS:
            return
        # try to schedule a job, ignoring the estimated time
        allocation, _ = self.sched.nextJob()
        if allocation:
            for daemon, jobspec in allocation:
                self.net.sendObjTo(daemon, self.prtl.newjob(jobspec))
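
# Usage sketch (hypothetical entry point, not in the original sources): the
# master is driven by calling run() repeatedly; each call handles at most one
# message and tries to dispatch one job once MIN_DAEMONS daemons have greeted.
if __name__ == '__main__':
    master = SSMaster(algoname='SS', alpha=0.9)
    master.addJobSequence('mg-16, lu-32')
    while not master.isclean():
        master.run()
    print(master.parse())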
class SSSimulator:
    def __init__(self, alg='CE'):
        self.MIN_DAEMONS = 1
        self.clock = SimulationClock()
        self.db = SSDatabase(algorithm=alg, simulationClock=self.clock, logToFile=False)
        self.sched = SSScheduler(algoname=alg, database=self.db)
        self.logger = SSLogger('Simulator')
        self.parser = SSParser()
        self.users = []
        self.daemons = []
        self.trace = []
        self.pendingJobs = dict()
        self.runningJobs = dict()

    def isclean(self):
        return not (len(self.db.pendingJobs) or len(self.db.runningJobs) or len(self.trace))

    def addTrace(self, trace):
        self.trace.extend(trace)
        self.trace.sort(key=lambda x: x[2])

    def loadTrace(self, fname):
        with open(fname, 'r') as fr:
            for line in fr.readlines():
                program, nproc, submittime, duration = line.strip().split(',')
                self.trace.append(
                    (program, int(nproc), float(submittime), float(duration)))
        self.trace.sort(key=lambda x: x[2])
        jobs = ', '.join([x[0] for x in self.trace])
        self.logger.info('Job trace: ', jobs)

    def addFakeDaemons(self, prefix, cnt):
        for i in range(0, cnt):
            fakeDaemon = prefix + str(i)
            self.daemons.append(fakeDaemon)
            self.db.addDaemon(fakeDaemon, fakeDaemon)

    # the main loop
    def run(self, alpha=0.9):
        done_cnt = 0
        next_time = [x[2] for x in self.trace]
        heapq.heapify(next_time)
        while not self.isclean():
            # jobs arriving now; the trace is already sorted by submit time
            while len(self.trace) and self.trace[0][2] <= self.clock.now():
                n = self.trace[0][0]
                exe = n.split('-')[0]
                if exe in ['gan', 'rnn']:
                    fm = 'TensorFlow'
                elif exe in ['ts', 'nw', 'wc']:
                    fm = 'Spark'
                else:
                    fm = 'MPI'
                jobid = self.db.addUserJob({
                    'jobname': n,
                    'framework': fm,
                    'parallelism': self.trace[0][1],
                    'alpha': alpha
                })
                self.pendingJobs[jobid] = self.trace[0]
                self.trace.pop(0)
            # try to start new jobs now:
            # schedule jobs and get their estimated runtimes
            while True:
                allocation, est = self.sched.nextJob()
                self.logger.debug(allocation)
                if allocation:
                    if est is None:
                        print(allocation)
                    assert est
                    daemons = []
                    for daemon, jobspec in allocation:
                        jobid = jobspec['jobid']
                        daemons.append(daemon)
                    # compute the duration and the finish time
                    jt = self.pendingJobs[jobid]
                    # if the trace provides no duration, use the estimated time;
                    # otherwise scale the trace duration by the estimated speedup
                    est_time = est[0] if jt[3] == 0 else jt[3] * est[1]
                    et = self.clock.now() + est_time
                    # runningJobs[id] = (finish time, daemons)
                    self.runningJobs[jobid] = (et, list(daemons))
                    heapq.heappush(next_time, et + 1)
                else:
                    break
            # handle jobs finishing now
            flag = True
            while flag:
                flag = False
                for jobid, v in self.runningJobs.items():
                    # job finished
                    if v[0] <= self.clock.now():
                        for daemon in v[1]:
                            self.db.daemonFinishJob(daemon, jobid, {'exitcode': 0})
                        self.runningJobs.pop(jobid)
                        done_cnt += 1
                        if done_cnt % 500 == 0:
                            print('Simulation done for %d jobs' % done_cnt)
                        flag = True
                        break
            # advance the clock to the next event
            if len(next_time):
                self.clock.ticksto(heapq.heappop(next_time))
            else:
                self.clock.tick()

    def parse(self):
        self.parser.addRecords(self.parser.loadHistory(self.db.history))
        return self.parser.getBasicStats(self.parser.selectRecords())

    def show(self):
        self.parser.showSchedFig(self.parser.selectRecords())
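
# Usage sketch (hypothetical trace, not in the original sources): a whole
# simulation driven from an in-memory trace of (program, nproc, submittime,
# duration) tuples, with fake daemons standing in for real nodes.
if __name__ == '__main__':
    sim = SSSimulator(alg='SS')
    sim.addFakeDaemons('node', 8)
    sim.addTrace([('mg-16', 16, 0.0, 120.0), ('lu-32', 32, 5.0, 300.0)])
    sim.run(alpha=0.9)
    print(sim.parse())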