Example #1
    def queryNodeAvg(self, node, minutes=5):
        cpu_rlt, mem_rlt = 0, 0
        if node not in self.nodes or not self.nodes[node]:
            logger.warning("queryNodeAvg: Node {} is not in cache ({})".format(
                node, list(self.nodes.keys())))
            return 0
        start_ts = self.nodes[node]['last_ts'] - minutes * 60  # computed after the guard to avoid a KeyError
        if start_ts < self.nodes[node][
                'first_ts'] - minutes * 60 * 0.2:  # 20% relax on period inclusion
            logger.warning(
                "queryNodeAvg: Node {} requested period {}- is not completely in cache ({}-{})"
                .format(node, start_ts, self.nodes[node]['first_ts'],
                        self.nodes[node]['last_ts']))
            #return 0      # still return a value from the partial data

        for uid, user_usage in self.node_user[node].items():
            #logger.debug("\t{}:{}".format(node, self.node_user[node][uid]))
            user = MyTool.getUser(uid)
            idx = bisect_left(user_usage, (start_ts, ))
            seq = user_usage[idx:]

            try:
                # usage is evenly distributed over time, so a plain mean suffices
                # TODO: this is off when the node is down and not sending data
                cpu_rlt += mean(
                    [usage[InMemCache.CPU_IDX] for (ts, usage) in seq])
                #mem_rlt += mean([usage[InMemCache.RSS_IDX] for (ts, usage) in seq])
            except BaseException as e:
                logger.error("queryNodeAvg: {} uid={} usage={} start={} idx={}".format(
                    e, uid, user_usage, start_ts, idx))
        logger.debug("\tnode={}:cpu_rlt={}, mem_rlt={}".format(
            self.nodes[node], cpu_rlt, mem_rlt))
        return cpu_rlt
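The slicing in this example leans on Python tuple ordering: bisect_left with the one-element key (start_ts, ) locates the first sample whose timestamp is >= start_ts, because a shorter tuple sorts before a longer one with the same prefix. A minimal, self-contained sketch with made-up samples:

from bisect import bisect_left

# sorted per-user samples stored as (timestamp, usage_record) tuples
samples = [(100, 'a'), (160, 'b'), (220, 'c'), (280, 'd')]

# (160,) sorts before (160, 'b'), so bisect_left returns the index
# of the first sample with ts >= 160
idx = bisect_left(samples, (160,))
print(samples[idx:])  # [(160, 'b'), (220, 'c'), (280, 'd')]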
Example #2
    def getJobByName_cluster(job_name, cluster, fields):
        start, stop, df = SlurmDBQuery.readJobTable(cluster, fld_lst=fields)
        df = df[df['job_name'] == job_name]
        df['state'] = df['state'].map(lambda x: SLURM_STATE_DICT.get(x, x))
        df['user'] = df['id_user'].map(MyTool.getUser)
        df['duration'] = df['time_end'] - df['time_start']
        df['duration'] = df['duration'].map(
            lambda x: x if x > 0 else 0)  # clamp negative durations of unfinished jobs
        df = df.fillna('Not Defined')
        lst = df.to_dict(orient='records')
        return lst
Example #3
def gendata_all(fs, start='', stop='', topN=5):
    if fs not in FileSystems:
        logger.warning("gendata_all: Unknown file system: {}".format(fs))
        return [], []
    dDict = gendata_fs_history(fs, start, stop)
    if not dDict:
        return [], []
    # pick top users by their usage at the latest timestamp
    last_ts = max(dDict)
    top_df = dDict[last_ts].nlargest(topN, ["fc", "bc"], keep='first')
    top_uid1 = top_df['uid'].tolist()
    top_df = dDict[last_ts].nlargest(topN, ["bc", "fc"], keep='first')
    top_uid2 = top_df['uid'].tolist()

    df = pandas.concat(dDict, names=['ts', 'idx'])
    dfg = df.groupby('uid')
    # for each uid, dfg.get_group
    uid2seq1 = []  # [{'name': user, 'data': [[ts, fc], ...]}, ...]
    for uid in top_uid1:
        uidDf = dfg.get_group(uid).reset_index()
        uname = MyTool.getUser(uid, True)
        uid2seq1.append({
            'name': uname,
            'data': uidDf.loc[:, ['ts', 'fc']].values.tolist()
        })

    uid2seq2 = []  # [{'name': user, 'data': [[ts, bc], ...]}, ...]
    for uid in top_uid2:
        uidDf = dfg.get_group(uid).reset_index()
        uname = MyTool.getUser(uid, True)
        uid2seq2.append({
            'name': uname,
            'data': uidDf.loc[:, ['ts', 'bc']].values.tolist()
        })

    return uid2seq1, uid2seq2
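As a side note, pandas.concat over a dict of per-timestamp frames stacks the dict keys into the first level of a MultiIndex (named 'ts' above), which is what lets groupby('uid') collect one time series per user. A tiny sketch with invented data:

import pandas

dDict = {1000: pandas.DataFrame({'uid': [1, 2], 'fc': [5, 7]}),
         1060: pandas.DataFrame({'uid': [1, 2], 'fc': [6, 9]})}
df = pandas.concat(dDict, names=['ts', 'idx'])   # MultiIndex levels: (ts, idx)
seq = df.groupby('uid').get_group(1).reset_index()
print(seq[['ts', 'fc']].values.tolist())         # [[1000, 5], [1060, 6]]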
Example #4
    def queryNode(self, node, start_ts=None, end_ts=None):
        cpu_rlt, mem_rlt, io_rlt = [], [], []

        if node not in self.nodes:
            logger.info("queryNode: Node {} is not in cache ({})".format(
                node, list(self.nodes.keys())))
            return None, [], [], []
        if start_ts and start_ts < self.nodes[node][
                'first_ts'] - 300:  # five minutes gap is allowed
            logger.info(
                "queryNode: Node {} period {}-{} is not completely in cache ({}-{})"
                .format(node, start_ts, end_ts, self.nodes[node]['first_ts'],
                        self.nodes[node]['last_ts']))
            return None, [], [], []
        # else start_ts==None or start_ts >= self.nodes[node]['first_ts']-300

        for uid, user_usage in self.node_user[node].items():
            #logger.debug("\t{}:{}".format(node, self.node_user[node][uid]))
            user = MyTool.getUser(uid)
            seq = user_usage
            if start_ts and start_ts >= self.nodes[node]['first_ts'] - 300:
                idx = bisect_left(seq, (start_ts, ))
                seq = user_usage[idx:]
            if end_ts and end_ts >= self.nodes[node]['last_ts'] - 300:
                idx = bisect_right(seq, (end_ts, ))
                seq = seq[:idx]  # slice the already-trimmed seq, not user_usage
            cpu_rlt.append({
                'name': user,
                'data': [[ts * 1000, usage[InMemCache.CPU_IDX]]
                         for (ts, usage) in seq]
            })
            mem_rlt.append({
                'name': user,
                'data': [[ts * 1000, usage[InMemCache.RSS_IDX]]
                         for (ts, usage) in seq]
            })
            io_rlt.append({
                'name': user,
                'data': [[ts * 1000, usage[11]]
                         for (ts, usage) in seq]  # 11: IO field (no named index constant)
            })

        logger.debug("\tnode={}:cpu_rlt={}".format(self.nodes[node], cpu_rlt))
        return self.nodes[node], cpu_rlt, mem_rlt, io_rlt
Example #5
    def sacct_getReport(
            criteria,
            days=3,
            output='JobID,JobName,AllocCPUS,State,ExitCode,User,NodeList,Start,End',
            skipJobStep=True):
        #print('sacct_getReport {} {} {}'.format(criteria, days, skipJobStep))
        if days:
            t = date.today() + timedelta(days=-days)
            startDate = t.isoformat()  # YYYY-MM-DD
            criteria = ['-S', startDate] + criteria

        #NOTE: the Constraints field is problematic here
        field_str, sacct_rlt = SlurmCmdQuery.sacctCmd(criteria, output)
        keys = field_str.split(sep='|')
        jobs = []
        jid_idx = keys.index('JobID')
        for line in sacct_rlt:
            ff = line.split(sep='|')
            if (skipJobStep and '.' in ff[jid_idx]):
                continue  # a '.' indicates a job step; under what circumstances should these be broken out?
            # ID forms: 508550_0.extern, 508550_[111-626%20] (array job), 511269+0, 511269+0.extern, 511269+0.0
            if ('.' in ff[jid_idx]):
                ff0 = ff[jid_idx].split(sep='.')[0]
            else:
                ff0 = ff[jid_idx]

            m = re.fullmatch(r'(\d+)([_\+])(.*)', ff0)
            if not m:
                jid = int(ff0)
            else:
                jid = int(m.group(1))
            if ff[3].startswith('CANCELLED by '):  # assumes 'State' is the 4th output field
                uid = ff[3].rsplit(' ', 1)[1]
                uname = MyTool.getUser(uid)
                ff[3] = '%s (%s)' % (ff[3], uname)
            job = dict(zip(keys, ff))
            jobs.append(job)

        if 'AllocTRES' in output:
            for job in jobs:
                job['AllocGPUS'] = MyTool.getTresGPUCount(job['AllocTRES'])

        return jobs
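The fullmatch above strips array-task (_) and heterogeneous-job (+) suffixes down to the numeric base job ID. An illustrative run over the ID forms listed in the comment:

import re

for jobid in ['508550_0.extern', '508550_[111-626%20]', '511269+0', '508123']:
    base = jobid.split(sep='.')[0]                 # drop any job-step suffix
    m = re.fullmatch(r'(\d+)([_\+])(.*)', base)
    jid = int(m.group(1)) if m else int(base)
    print(jobid, '->', jid)                        # each prints its numeric base job ID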
Example #6
    def getNodeRunJobs(self, node, start, stop):
        df = pandas.read_csv(CSV_DIR + "slurm_cluster_job_table.csv",
                             usecols=[
                                 'id_job', 'id_user', 'nodelist',
                                 'nodes_alloc', 'state', 'time_start',
                                 'time_end', 'time_suspended'
                             ])
        start, stop, df = MyTool.getDFBetween(df, 'time_start', start, stop)
        df = df[df['nodes_alloc'] > 0]

        #jobs running on node
        if node:
            criterion = df['nodelist'].map(lambda x: node in MyTool.nl2flat(x))
            df = df[criterion]
        df['user'] = df['id_user'].map(MyTool.getUser)  # set unconditionally; the column is returned below

        return df[[
            'id_job', 'user', 'time_start', 'time_end', 'time_suspended'
        ]]
Example #7
def display_job_GPU(jid):
    ts = int(time.time())
    job = PyslurmQuery.getCurrJob(jid)
    if not job:
        print("{} Job {} does not exist or already stops running.".format(
            MyTool.getTsString(ts), jid))
        return
    j_gpu = PyslurmQuery.getJobAllocGPU(job)
    #print(j_gpu)
    if not j_gpu:
        print("{} Job {} does not allocate any GPU.".format(
            MyTool.getTsString(ts), jid))
        return

    print("{} Job {} of {} run for {},\talloc {} GPUs on {} GPU nodes.".format(
        MyTool.getTsString(ts), jid, MyTool.getUser(job['user_id']),
        datetime.timedelta(seconds=ts - job['start_time']),
        sum([len(g_lst) for g_lst in j_gpu.values()]),
        sum([1 for g_lst in j_gpu.values() if g_lst])))
    gpu_union = reduce(lambda rlt, curr: rlt.union(set(curr)), j_gpu.values(),
                       set())
    #print(gpu_union)
    gpu_data = BrightRestClient().getGPU(list(j_gpu.keys()),
                                         job['start_time'],
                                         list(gpu_union),
                                         msec=False)
    #print(gpu_data.keys())
    print("\t{:12}{:>6}{:>20}{:>25}".format("Node", "GPU", "Job avg util",
                                            "Avg util (5,10,30min)"))
    for node_name, gpu_list in j_gpu.items():
        for gid in gpu_list:
            g_data = gpu_data['{}.gpu{}'.format(node_name, gid)]
            g_avg = MyTool.getTimeSeqAvg(g_data, job['start_time'], ts)
            g_avg1 = MyTool.getTimeSeqAvg(g_data, ts - 5 * 60, ts)
            g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
            g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
            print("\t{:12}{:6}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".format(
                node_name, gid, g_avg * 100, g_avg1 * 100, g_avg2 * 100,
                g_avg3 * 100))
    return
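MyTool.getTimeSeqAvg is not shown on this page; a plausible stand-in (an assumption, not the real helper) would average the values of a (timestamp, value) sequence falling inside the window:

def time_seq_avg(seq, start_ts, stop_ts):
    # hypothetical stand-in for MyTool.getTimeSeqAvg: plain mean of the
    # values whose timestamps fall inside [start_ts, stop_ts]
    vals = [v for ts, v in seq if start_ts <= ts <= stop_ts]
    return sum(vals) / len(vals) if vals else 0.0

print(time_seq_avg([(0, 0.2), (60, 0.4), (120, 0.9)], 50, 130))  # 0.65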
Example #8
def gendata_fs(yyyymmdd, fs, ansible_users={}, anon=False):
    if fs not in FileSystems:
        return 'Unknown file system: {}'.format(fs)

    label, dataDir, suffix, uidx, fcx, bcx, rge = FileSystems[fs]
    ff = sorted(glob.glob(dataDir + '/2*' + suffix))
    idx = 0
    for x, f in enumerate(ff):
        if yyyymmdd in os.path.basename(f):  #filename without dir
            idx = x
            break
    else:
        idx = len(ff) - 1
        logger.info('Date {}:{} not found; using most recent {} instead.'.format(
            fs, yyyymmdd, ff[-1]))
        yyyymmdd = getDateFromFileName(rge, os.path.basename(ff[-1]))

    #calculate delta and cut_off
    pre = read_file(ff[idx - 1], uidx, [fcx, bcx])  # note: wraps to the newest file when idx == 0
    curr = read_file(ff[idx], uidx, [fcx, bcx])
    # uid: [delta_fc, delta_bc, curr_fc, curr_bc]
    delta = {
        k: minus_list(curr.get(k, None), pre.get(k, None)) + curr.get(k, [0, 0])
        for k in (set(pre) | set(curr))
    }
    t_dfc = sum([v[0] for v in delta.values()])
    t_dbc = sum([v[1] for v in delta.values()])

    # find N50 wrt file count: the first rank where the cumulative delta passes 50%
    s = 0
    uid2x = {}
    cutoff = None
    ranked = sorted(((v[0], uid) for uid, v in delta.items()), reverse=True)
    for x, (dfc, uid) in enumerate(ranked):
        s += dfc
        if cutoff is None and 2 * s > t_dfc:
            cutoff = x
        uid2x[uid] = x
    if cutoff is not None and 4 * cutoff > len(delta):
        cutoff = 2  # the N50 group spans over a quarter of users; cap the highlight at the top 3

    #non_home_user = [(uid, MyTool.getUser(uid)) for uid, v in delta.items() if v[2]==0 or v[3]==0]
    #MyTool.logTmp("{} non_home_user={}".format(fs, non_home_user), time.time())
    r = []
    for uid, v in delta.items():
        if uid < 1000:  # skip system accounts
            continue
        if 0 == v[2] or 0 == v[3]:  # curr_fc == 0 or curr_bc == 0, skip
            continue

        d = {
            'x': v[3],
            'y': v[2],
            'z': log(max(2**20, v[1]), 2) - 19,  # bubble size: log2 scale, floored at 2**20
            'dfb': v[1],
            'dfc': v[0],
            'id': uid
        }
        uname = MyTool.getUser(uid, fakeName=False)  # slurm user name
        if not uname:  # no slurm user name
            uname = ansible_users.get(uid, None)  # ansible user name
            uname = "User_{}".format(uid) if not uname else uname
            d['name'] = anonimize(uname) if anon else uname
            d['marker'] = {'fillColor': 'rgba(255,225,0,0.5)'}
            r.append(d)
            #r.append({'x':v[3], 'y':v[2], 'z':log(max(2**20, v[1]),2)-19, 'dfb':v[1], 'dfc':v[0], 'name':'{}'.format(uname), 'id':uid, 'marker':{'fillColor': 'rgba(255,225,0,0.5)'}})
        else:
            d['name'] = anonimize(uname) if anon else uname
            if cutoff and uid2x[uid] <= cutoff:
                d['marker'] = {'fillColor': 'rgba(236,124,181,0.9)'}
                #r.append({'x':v[3], 'y':v[2], 'z':log(max(2**20, v[1]),2)-19, 'dfb':v[1], 'dfc':v[0], 'name':'{}'.format(uname), 'marker':{'fillColor': 'rgba(236,124,181,0.9)'}})
            #else:
            #r.append({'x':v[3], 'y':v[2], 'z':log(max(2**20, v[1]),2)-19, 'dfb':v[1], 'dfc':v[0], 'name':'{}'.format(uname)})
            r.append(d)

    return [label, r, yyyymmdd]
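The cutoff above is an N50-style statistic: rank users by delta file count and take the first rank at which the running sum passes half the total. Reduced to its core with made-up numbers:

deltas = {101: 40, 102: 25, 103: 20, 104: 10, 105: 5}  # uid -> delta file count
total = sum(deltas.values())
s, cutoff = 0, None
for rank, (dfc, uid) in enumerate(sorted(((d, u) for u, d in deltas.items()),
                                         reverse=True)):
    s += dfc
    if cutoff is None and 2 * s > total:
        cutoff = rank        # first rank past the 50% mark
print(cutoff)                # 1 -> the top two users hold over half the delta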
Example #9
    def getProcsByUser(self, hostname, msg_ts, msg_procs, pre_ts, pre_procs,
                       uid2cpuCnt):

        # [[user, uid, alloc_cores, proc_cnt, totCPURate, totRss, totVMS, procs, totIOBps, totCPUTime], ...]
        procsByUser = []
        # uid -> [[pid, CPURate, create_time, user_time, system_time, rss, vms, cmdline, IOBps], ...]
        uid2procs = DDict(list)
        for pid, proc in msg_procs.items():
            if pid in pre_procs:  # continue proc
                pre_proc = pre_procs[pid]
                c0 = pre_proc['cpu']['user_time'] + pre_proc['cpu']['system_time']
                i0 = pre_proc['io']['read_bytes'] + pre_proc['io']['write_bytes']
                d = msg_ts - pre_ts
            else:  # new proc
                c0 = 0.0
                i0 = 0
                d = msg_ts - proc['create_time']
            if d < 0.1:
                logger.warning(
                    "The time period between {} and {} is too small; use 0.1 to calculate the CPU rate"
                    .format(msg_ts, pre_ts))
                d = 0.1
            CPURate = (proc['cpu']['user_time'] + proc['cpu']['system_time'] -
                       c0) / d  #TODO: replace this cheap trick to avoid div0
            if d < 1:
                logger.warning(
                    "The time period between {} and {} is too small; use 1 to calculate IOBps"
                    .format(msg_ts, pre_ts))
                d = 1
            IOBps = int(
                (proc['io']['read_bytes'] + proc['io']['write_bytes'] - i0) / d)

            #add jid 12/09/2019, add io_read, write 12/13/2019
            proc_lst = [
                pid, CPURate, proc['create_time'], proc['cpu']['user_time'],
                proc['cpu']['system_time'], proc['mem']['rss'],
                proc['mem']['vms'], proc['cmdline'], IOBps, proc['jid'],
                proc['num_fds'], proc['io']['read_bytes'],
                proc['io']['write_bytes'], proc['uid']
            ]
            uid2procs[proc['uid']].append(proc_lst)

        # get summary over processes of uid
        # proc: [pid, CPURate, create_time, user_time, system_time, rss, vms, cmdline, IOBps]
        for uid, procs in uid2procs.items():
            totCPUTime = sum([proc[3] + proc[4] for proc in procs])
            totCPURate = sum([proc[1] for proc in procs])
            totRSS = sum([proc[5] for proc in procs])
            totVMS = sum([proc[6] for proc in procs])
            totIOBps = sum([proc[8] for proc in procs])
            procsByUser.append([
                MyTool.getUser(uid), uid,
                uid2cpuCnt.get(uid, 0),
                len(procs), totCPURate, totRSS, totVMS, procs, totIOBps,
                totCPUTime
            ])

        return procsByUser
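The per-process CPU rate in this example is simply the change in user+system CPU seconds divided by the elapsed wall-clock seconds between two polls. A stripped-down sketch of that calculation (names are illustrative):

def cpu_rate(curr_cpu, prev_cpu, curr_ts, prev_ts, floor=0.1):
    # CPU usage in cores: delta CPU-seconds over delta wall-seconds;
    # the floor guards against a near-zero polling interval
    elapsed = max(curr_ts - prev_ts, floor)
    return (curr_cpu - prev_cpu) / elapsed

print(cpu_rate(curr_cpu=12.0, prev_cpu=6.0, curr_ts=100, prev_ts=90))  # 0.6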