def hostperf2point(self, msg):
     #{'load': [0.29, 0.29, 0.44], 'cpu_times': {'iowait': 553.4, 'idle': 6050244.96, 'user': 12374.76, 'system': 2944.12}, 'proc_total': 798, 'hdr': {'hostname': 'ccalin007', 'msg_process': 'cluster_host_mon', 'msg_type': 'cluster/hostperf', 'msg_ts': 1541096161.66126}, 'mem': {'available': 196645462016, 'used': 8477605888, 'cached': 3937718272, 'free': 192701874176, 'total': 201179480064, 'buffers': 5869568}, 'net_io': {'rx_err': 0, 'rx_packets': 6529000, 'rx_bytes': 5984570284, 'tx_err': 0, 'tx_drop': 0, 'tx_bytes': 6859935273, 'tx_packets': 6987776, 'rx_drop': 0}, 'proc_run': 1, 'disk_io': {'write_bytes': 7890793472, 'read_count': 130647, 'write_count': 221481, 'read_time': 19938, 'read_bytes': 2975410176, 'write_time': 6047344}}
     ts    = msg['hdr']['msg_ts']
     point           = {'measurement':'cpu_load', 'time': int(ts)}
     point['tags']   = {'hostname'   : msg['hdr']['hostname']}
     point['fields'] = MyTool.flatten(MyTool.sub_dict(msg, ['cpu_times', 'mem', 'net_io', 'disk_io']))
     point['fields'].update ({'load_1min':msg['load'][0], 'load_5min':msg['load'][1], 'load_15min':msg['load'][2], 'proc_total':msg['proc_total'], 'proc_run': msg['proc_run']})
     return [point]
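# The MyTool helpers used throughout these converters are defined elsewhere in
# the project; the sketch below captures their assumed semantics, inferred only
# from how they are called here.
def sub_dict(d, keys, default=None):
    # project d onto keys; without a default, missing keys are skipped
    if default is None:
        return {k: d[k] for k in keys if k in d}
    return {k: d.get(k, default) for k in keys}

def flatten(d, sep='_'):
    # flatten one level of nesting: {'mem': {'used': 1}} -> {'mem_used': 1}
    rlt = {}
    for key, val in d.items():
        if isinstance(val, dict):
            for sub_key, sub_val in val.items():
                rlt[key + sep + sub_key] = sub_val
        else:
            rlt[key] = val
    return rlt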
 def createProcInfoPoint (self, node, proc, end_time=None):
     #change 01/17/2021: change to one_month.node_proc_info, hostname+jid as tag
     point                     = {'measurement':'node_proc_info2', 'time': int(proc['create_time'])}
     point['tags']             = {'hostname':node, 'pid': proc['pid']}
     point['fields']           = MyTool.flatten(MyTool.sub_dict(proc, ['ppid', 'uid', 'jid', 'name', 'cmdline']))
     if end_time:
        point['fields']['end_time'] = end_time
        point['fields']['status']   = proc['status']
        point['fields'].update (MyTool.flatten(MyTool.sub_dict(proc, ['mem', 'io', 'num_fds', 'cpu'], default=0)))
     return point
    def getUserReport_hourly(cluster, start='', stop='', top=5, account=None):
        # get top 5 user for each resource
        fname = "{}/{}_{}".format(CSV_DIR, cluster,
                                  "assoc_usage_day_table.csv")
        df = pandas.read_csv(
            fname,
            usecols=['id', 'id_tres', 'alloc_secs', 'time_start'],
            dtype={'time_start': int})
        st, stp, df = MyTool.getDFBetween(df, 'time_start', start,
                                          stop)  #constrain by time
        sumDf = df.groupby(['id_tres', 'id']).sum()  #sum over user
        fname1 = "{}/{}_{}".format(CSV_DIR, cluster, "assoc_table.csv")
        userDf = pandas.read_csv(fname1,
                                 usecols=['id_assoc', 'user', 'acct'],
                                 index_col=0)
        sumDf = sumDf.join(userDf, on='id')
        if account:
            sumDf = sumDf[sumDf['acct'] == account]
        cpuIdx = sumDf.loc[(1, )].nlargest(top, 'alloc_secs').index
        memIdx = sumDf.loc[(2, )].nlargest(top, 'alloc_secs').index
        nodeIdx = sumDf.loc[(4, )].nlargest(top, 'alloc_secs').index
        gpuIdx = sumDf.loc[(1001, )].nlargest(top, 'alloc_secs').index

        #refine top users' data using hour_table
        fname2 = "{}/{}_{}".format(CSV_DIR, cluster,
                                   "assoc_usage_hour_table.csv")
        df = pandas.read_csv(
            fname2, usecols=['id', 'id_tres', 'time_start', 'alloc_secs'])
        st, stp, df = MyTool.getDFBetween(df, 'time_start', start, stop)
        # get top users data only
        dfg = df.groupby(['id_tres', 'id'])
        tresSer = {
            1: [],
            2: [],
            4: [],
            1001: []
        }  # {1: [{'data': [[ms,value],...], 'name': uid},...], 2:...}
        idxSer = {1: cpuIdx, 2: memIdx, 4: nodeIdx, 1001: gpuIdx}
        for tres in [1, 2, 4, 1001]:
            for uid in idxSer[tres]:
                topDf = dfg.get_group((tres, uid)).copy()  # copy to avoid pandas chained-assignment warnings
                topDf['ts_ms'] = topDf['time_start'] * 1000
                topDf['alloc_ratio'] = topDf['alloc_secs'] / 3600
                topLst = topDf[['ts_ms', 'alloc_ratio']].values.tolist()
                tresSer[tres].append({
                    'data': topLst,
                    'name': "{}({})".format(userDf.loc[uid, 'user'],
                                            userDf.loc[uid, 'acct'])
                })

        return st, stp, tresSer
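# MyTool.getDFBetween is used by most of the report functions; a plausible
# sketch of its contract, assuming it filters on a timestamp column and returns
# the effective bounds along with the filtered frame:
def getDFBetween(df, col, start=None, stop=None):
    if start:
        df = df[df[col] >= int(start)]
    if stop:
        df = df[df[col] <= int(stop)]
    if df.empty:
        return 0, 0, df
    return int(df[col].min()), int(df[col].max()), df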
    def slurmReservation2point (self, ts, name, item, points):
#{'andras_test': {'accounts': [], 'burst_buffer': [], 'core_cnt': 28, 'end_time': 1591885549, 'features': [], 'flags': 'MAINT,SPEC_NODES', 'licenses': {}, 'node_cnt': 1, 'node_list': 'worker1010', 'partition': None, 'start_time': 1560349549, 'tres_str': ['cpu=28'], 'users': ['root', 'apataki', 'ifisk', 'carriero', 'ntrikoupis']}}
        MyTool.remove_dict_empty(item)

        point           = {'measurement':'slurm_reservation', 'time': ts}
        point['tags']   = {'name': name}
        point['fields'] = item

        MyTool.dict_complex2str(point['fields'])
        points.append(point)

        return points
    def slurmQOS2point (self, ts, name, item, points):
#{'description': 'cca', 'flags': 0, 'grace_time': 0, 'grp_jobs': 4294967295, 'grp_submit_jobs': 4294967295, 'grp_tres': '1=6000', 'grp_tres_mins': None, 'grp_tres_run_mins': None, 'grp_wall': 4294967295, 'max_jobs_pu': 4294967295, 'max_submit_jobs_pu': 4294967295, 'max_tres_mins_pj': None, 'max_tres_pj': None, 'max_tres_pn': None, 'max_tres_pu': '1=840', 'max_tres_run_mins_pu': None, 'max_wall_pj': 10080, 'min_tres_pj': None, 'name': 'cca', 'preempt_mode': 'OFF', 'priority': 15, 'usage_factor': 1.0, 'usage_thres': 4294967295.0}

        MyTool.remove_dict_empty(item)

        point           = {'measurement':'slurm_qos', 'time': ts}
        point['tags']   = {'name': item.pop('name')}
        point['fields'] = item

        MyTool.dict_complex2str(point['fields'])
        points.append(point)

        return points
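# Usage sketch: the point dictionaries built by these *2point methods follow the
# JSON point format of the influxdb-python client, so flushing them could look
# like this (host, port, and database name are placeholders):
from influxdb import InfluxDBClient

client = InfluxDBClient(host='localhost', port=8086, database='slurmdb')
points = []   # filled by repeated slurmReservation2point / slurmQOS2point calls
client.write_points(points, time_precision='s')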
def display_node_GPU(node_name):
    ts = int(time.time())
    node = PyslurmQuery.getNode(node_name)
    if not node:
        print("{} Node {} does not exist.".format(MyTool.getTsString(ts),
                                                  node_name))
        return
    if 'gpu' not in node['features']:
        print("{} Node {} does not have GPUs.".format(MyTool.getTsString(ts),
                                                      node_name))
        return

    jobs = PyslurmQuery.getNodeAllocJobs(node_name, node)
    gpu_total, gpu_used = MyTool.getGPUCount(node['gres'], node['gres_used'])
    print("{}: Node {} up {},\t{} GPUs ({} used), {} allocated jobs.".format(
        MyTool.getTsString(ts), node_name,
        datetime.timedelta(seconds=ts - node['boot_time']), gpu_total,
        gpu_used,
        len(jobs) if jobs else 0))

    jid2gpu = {job['job_id']: PyslurmQuery.getJobAllocGPUonNode(job, node)
               for job in jobs}
    if jid2gpu:
        job_gpu = reduce(operator.add, jid2gpu.values())
        start_ts = min(
            [job['start_time'] for job in jobs if job['gres_detail']])
        gpu_data = BrightRestClient().getNodeGPU(node_name,
                                                 start_ts,
                                                 job_gpu,
                                                 msec=False)
    else:
        gpu_data = {}

    jid2job = {job['job_id']: job for job in jobs}
    gid2jid = defaultdict(list)
    for jid, gpu_list in jid2gpu.items():
        for gid in gpu_list:
            gid2jid[gid].append(jid)
    print("\t{:6}{:10}{:>20}{:>20}{:>25}".format("GPU", "Jid", "Job run time",
                                                 "Job avg util",
                                                 "Avg util (5,10,30min)"))
    for gid in range(0, gpu_total):
        jid_list = gid2jid[gid]
        if jid_list:
            start_ts = min([jid2job[jid]['start_time'] for jid in jid_list])
            g_data = gpu_data['{}.gpu{}'.format(node_name, gid)]
            g_avg = MyTool.getTimeSeqAvg(g_data, start_ts, ts)
            g_avg1 = MyTool.getTimeSeqAvg(g_data, ts - 5 * 60, ts)
            g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
            g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
            print("\t{:<6}{:10}{:>20}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".
                  format(gid, str(jid_list),
                         str(datetime.timedelta(seconds=ts - start_ts)),
                         g_avg * 100, g_avg1 * 100, g_avg2 * 100,
                         g_avg3 * 100))
        else:
            print("\t{:<6}{:10}".format(gid, "IDLE"))
    def slurmPartition2point (self, ts, name, item, points):
#{'allow_accounts': 'ALL', 'deny_accounts': None, 'allow_alloc_nodes': 'ALL', 'allow_groups': ['cca'], 'allow_qos': ['gen', 'cca'], 'deny_qos': None, 'alternate': None, 'billing_weights_str': None, 'cr_type': 0, 'def_mem_per_cpu': None, 'def_mem_per_node': 'UNLIMITED', 'default_time': 604800, 'default_time_str': '7-00:00:00', 'flags': {'Default': 0, 'Hidden': 0, 'DisableRootJobs': 0, 'RootOnly': 0, 'Shared': 'EXCLUSIVE', 'LLN': 0, 'ExclusiveUser': 0}, 'grace_time': 0, 'max_cpus_per_node': 'UNLIMITED', 'max_mem_per_cpu': None, 'max_mem_per_node': 'UNLIMITED', 'max_nodes': 'UNLIMITED', 'max_share': 0, 'max_time': 604800, 'max_time_str': '7-00:00:00', 'min_nodes': 1, 'name': 'cca', 'nodes': 'worker[1000-1239,3000-3191]', 'over_time_limit': 0, 'preempt_mode': 'OFF', 'priority_job_factor': 1, 'priority_tier': 1, 'qos_char': 'cca', 'state': 'UP', 'total_cpus': 14400, 'total_nodes': 432, 'tres_fmt_str': 'cpu=14400,mem=264000G,node=432,billing=14400'}

        MyTool.remove_dict_empty(item)

        point           = {'measurement':'slurm_partition_0618', 'time': ts}
        name            = item.pop('name')
        point['tags']   = {'name':name}
        point['fields'] = item

        MyTool.dict_complex2str(point['fields'])
        points.append(point)

        return points
    def queryNodeAvg(self, node, minutes=5):
        cpu_rlt, mem_rlt = 0, 0
        if node not in self.nodes or not self.nodes[node]:  # check before touching the cache
            logger.warning("queryNodeAvg: Node {} is not in cache ({})".format(
                node, list(self.nodes.keys())))
            return 0
        start_ts = self.nodes[node]['last_ts'] - minutes * 60
        if start_ts < self.nodes[node][
                'first_ts'] - minutes * 60 * 0.2:  # 20% tolerance on period inclusion
            logger.warning(
                "queryNodeAvg: Node {} requested period {}- is not completely in cache ({}-{})"
                .format(node, start_ts, self.nodes[node]['first_ts'],
                        self.nodes[node]['last_ts']))
            #return 0      # still return some value

        for uid, user_usage in self.node_user[node].items():
            #logger.debug("\t{}:{}".format(node, self.node_user[node][uid]))
            user = MyTool.getUser(uid)
            idx = bisect_left(user_usage, (start_ts, ))
            seq = user_usage[idx:]

            try:
                cpu_rlt += mean(
                    [usage[InMemCache.CPU_IDX] for (ts, usage) in seq]
                )  #usage is evenly distributed, thus just mean, TODO: have problem when node is down and not sending data
            #mem_rlt += mean([usage[InMemCache.RSS_IDX] for (ts, usage) in seq])      #usage is evenly distributed, thus just mean
            except BaseException as e:
                print("ERROR {} uid={} usage={} start={} idx={} ".format(
                    e, uid, user_usage, start_ts, idx))
        logger.debug("\tnode={}:cpu_rlt={}, mem_rlt={}".format(
            self.nodes[node], cpu_rlt, mem_rlt))
        return cpu_rlt
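# Why bisect_left(user_usage, (start_ts, )) works above: tuples compare element
# by element, and a 1-tuple sorts before any longer tuple sharing its first
# element, so the index found is that of the first sample at or after start_ts.
from bisect import bisect_left

samples = [(100, 'a'), (200, 'b'), (300, 'c')]   # (ts, usage) pairs, sorted by ts
assert bisect_left(samples, (200, )) == 1        # lands exactly on the ts=200 sample
assert bisect_left(samples, (150, )) == 1        # first sample at or after ts=150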
    def getJobRequestHistory(self, st, et):
        query = "select * from autogen.slurm_job_mon1 where time >= " + str(
            int(st)) + "000000000 and time <= " + str(int(et)) + "000000000"
        results = self.query(query, epoch='ms')
        points = list(results.get_points())

        runJidSet = set()  #use set to remove duplicate ids
        pendJidSet = set()  #use set to remove duplicate ids
        ts2ReqNodeCnt = defaultdict(int)
        ts2ReqCPUCnt = defaultdict(int)
        ts2PendReqNodeCnt = defaultdict(int)
        ts2PendReqCPUCnt = defaultdict(int)
        for point in points:
            ts = point['time']
            jid = point['job_id']
            tres_dict = MyTool.str2dict(point.get('tres_req_str', None))

            if point['job_state'] in ['RUNNING']:  # running state
                runJidSet.add(jid)
                if tres_dict:
                    ts2ReqNodeCnt[ts] += int(tres_dict.get('node', 1))
                    ts2ReqCPUCnt[ts] += int(
                        tres_dict.get('cpu', point.get('num_cpus', 28)))
            elif point['job_state'] in ['PENDING']:  # pending state
                pendJidSet.add(jid)
                if tres_dict:
                    ts2PendReqNodeCnt[ts] += int(tres_dict.get('node', 1))
                    ts2PendReqCPUCnt[ts] += int(
                        tres_dict.get('cpu', point.get('num_cpus', 28)))

        return runJidSet, ts2ReqNodeCnt, ts2ReqCPUCnt, pendJidSet, ts2PendReqNodeCnt, ts2PendReqCPUCnt
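# MyTool.str2dict presumably parses a TRES string such as 'cpu=1,node=1,billing=1'
# into a dict; a minimal sketch under that assumption:
def str2dict(s):
    if not s:
        return {}
    return dict(kv.split('=', 1) for kv in s.split(','))

# str2dict('cpu=28,mem=500G,node=1') -> {'cpu': '28', 'mem': '500G', 'node': '1'}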
    def sum_assoc_usage_day(cluster):
        # read in one year's usage table
        fname = "{}/{}_{}".format(CSV_DIR, cluster,
                                  "assoc_usage_day_table.csv")
        df = pandas.read_csv(fname, dtype={'time_start': int})
        start = int(time.time()) - 365 * 24 * 3600  # one year of history
        start, stop, df = MyTool.getDFBetween(df, 'time_start', start, None)

        # join with user
        fname1 = "{}/{}_{}".format(CSV_DIR, cluster, "assoc_table.csv")
        userDf = pandas.read_csv(fname1,
                                 usecols=['id_assoc', 'user', 'acct'],
                                 index_col=0)
        rlt = df.join(userDf, on='id')
        rlt.to_csv("{}/{}_{}".format(
            CSV_DIR, cluster, "assoc_usage_day_1year_combine_table.csv"),
                   index=False)

        # get summary data
        rlt = rlt[['id_tres', 'user', 'alloc_secs']]
        dfg = rlt.groupby(['id_tres', 'user'])
        sum_df = dfg.sum()
        df_lst = []
        for idx in [1, 2, 4, 1001]:  #cpu, mem, node, gpu
            tres_df = sum_df.loc[idx, ]
            tres_df = tres_df.sort_values('alloc_secs', ascending=False)
            tres_df = tres_df.reset_index('user')
            tres_df['id_tres'] = idx
            tres_df['rank'] = tres_df.index + 1
            df_lst.append(tres_df)
        sum_df = pandas.concat(df_lst, ignore_index=True)
        sum_df.to_csv("{}/{}_{}".format(CSV_DIR, cluster,
                                        "assoc_usage_day_1year_sum_table.csv"),
                      index=False)
 def __init__ (self, savFile='node2pids.cache', test_mode=False):
     self.filename    = savFile
     self.test_mode   = test_mode
     sav              = MyTool.readFile(savFile)
     if sav:
        self.node2TsPids = DDict(lambda:DDict(), sav)
     else:
        self.node2TsPids = DDict(lambda: DDict())  #{node: {curr_ts: curr_pids: pre_ts: pre_pids:}}
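# DDict appears to be an alias for collections.defaultdict; the two-level
# default lets the cache be addressed without key checks, e.g.:
from collections import defaultdict as DDict

node2TsPids = DDict(lambda: DDict())
node2TsPids['worker1000']['curr_ts'] = 1541096161   # no KeyError for a new node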
def test4():
    app = InfluxQueryClient()
    start, stop = MyTool.getStartStopTS(days=1)
    rlt = app.getNodeJobProcData('worker5057', 951025, start)
    with open("tmp.out", "w") as f:  # open for writing; json.dump below writes each item
        for item in rlt:
            print("{}\n".format(item))
            json.dump(item, f)
    def slurmNode2point (self, ts, item, points):
#{'arch': 'x86_64', 'boards': 1, 'boot_time': 1560203329, 'cores': 14, 'core_spec_cnt': 0, 'cores_per_socket': 14, 'cpus': 28, 'cpu_load': 2, 'cpu_spec_list': [], 'features': 'k40', 'features_active': 'k40', 'free_mem': 373354, 'gres': ['gpu:k40c:1', 'gpu:k40c:1'], 'gres_drain': 'N/A', 'gres_used': ['gpu:k40c:0(IDX:N/A)', 'mic:0'], 'mcs_label': None, 'mem_spec_limit': 0, 'name': 'workergpu00', 'node_addr': 'workergpu00', 'node_hostname': 'workergpu00', 'os': 'Linux 3.10.0-957.10.1.el7.x86_64 #1 SMP Mon Mar 18 15:06:45 UTC 2019', 'owner': None, 'partitions': ['gpu'], 'real_memory': 384000, 'slurmd_start_time': 1560203589, 'sockets': 2, 'threads': 1, 'tmp_disk': 0, 'weight': 1, 'tres_fmt_str': 'cpu=28,mem=375G,billing=28,gres/gpu=2', 'version': '18.08', 'reason': None, 'reason_time': None, 'reason_uid': None, 'power_mgmt': {'cap_watts': None}, 'energy': {'current_watts': 0, 'base_consumed_energy': 0, 'consumed_energy': 0, 'base_watts': 0, 'previous_consumed_energy': 0}, 'alloc_cpus': 0, 'err_cpus': 0, 'state': 'IDLE', 'alloc_mem': 0}
#in REBOOT state, boot_time and slurmd_start_time are 0

        # slurm_node_mon: ts, name
        point           = {'measurement':'slurm_node_mon2', 'time': ts}  #03/19/2021, replace slurm_node_mon1, 03/23/2020, replace slurm_node_mon
        point['tags']   = {'hostname': item['node_hostname']} 
        point['fields'] = MyTool.sub_dict_nonempty (item, ['name', 'boot_time', 'slurmd_start_time', 'cpus', 'cpu_load', 'alloc_cpus', 'state', 'free_mem', 'gres', 'gres_used', 'partitions', 'reason', 'reason_time', 'reason_uid', 'err_cpus', 'alloc_mem'])
        MyTool.dict_complex2str(point['fields'])
        points.append(point)

        # slurm_jobs: slurmd_start_time
        if 'boot_time' in item and not MyTool.emptyValue(item['boot_time']):
           infopoint           = {'measurement':'slurm_node', 'time': int(item['boot_time'])}
           infopoint['tags']   = {'hostname': item['node_hostname']}
           infopoint['fields'] = MyTool.sub_dict_nonempty (item, ['name', 'slurmd_start_time', 'cpus', 'partitions', 'arch', 'boards', 'cores', 'features', 'gres', 'node_addr', 'os', 'sockets', 'threads', 'tres_fmt_str'])
           MyTool.dict_complex2str(infopoint['fields'])
      
           newValue = json.dumps(infopoint)
           name     = item['name']
           if (name not in self.sav_node_dict) or (self.sav_node_dict[name] != newValue):
              points.append(infopoint)
              self.sav_node_dict[name] = newValue

        return points
 def getJobAllocGPU(job, node_dict=None):
     if not node_dict:
         node_dict = pyslurm.node().get()
     if not job['cpus_allocated']:
         return None
     node_list = [node_dict[node] for node in job['cpus_allocated']]
     gpus_allocated = MyTool.getGPUAlloc_layout(node_list,
                                                job['gres_detail'])
     return gpus_allocated
    def getJobMonData_hc(self, jid, start_time=None, stop_time=None):
        d = self.getJobMonData(jid, start_time, stop_time)
        if not d:
            return None

        # save data suitable for highchart
        cpu_rlt, mem_rlt, ior_rlt, iow_rlt = [], [], [], []
        for node, points in d.items():
            cpu_rlt.append({
                'name': node,
                'data': [[p['time'],
                          MyTool.getDictNumValue(p, 'cpu_system_util') +
                          MyTool.getDictNumValue(p, 'cpu_user_util')]
                         for p in points]
            })
            mem_rlt.append({
                'name': node,
                'data': [[p['time'], MyTool.getDictNumValue(p, 'mem_rss_K')]
                         for p in points]
            })
            ior_rlt.append({
                'name': node,
                'data': [[p['time'], MyTool.getDictNumValue(p, 'io_read_rate')]
                         for p in points]
            })
            iow_rlt.append({
                'name': node,
                'data': [[p['time'], MyTool.getDictNumValue(p, 'io_write_rate')]
                         for p in points]
            })

        # get min, max timestamp of cpu_rlt and return them as the values for all result
        minTS = min([n['data'][0][0] for n in cpu_rlt if n['data']])
        maxTS = max([n['data'][-1][0] for n in cpu_rlt if n['data']])

        return minTS, maxTS, cpu_rlt, mem_rlt, ior_rlt, iow_rlt
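# The *_rlt lists returned above are shaped as Highcharts series: one entry per
# node, each with a 'name' and a 'data' list of [timestamp, value] pairs, e.g.
# (illustrative values only):
cpu_rlt_example = [
    {'name': 'worker1000', 'data': [[1541096161000, 1.8], [1541096281000, 2.0]]},
]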
    def sacct_getReport(
            criteria,
            days=3,
            output='JobID,JobName,AllocCPUS,State,ExitCode,User,NodeList,Start,End',
            skipJobStep=True):
        #print('sacct_getReport {} {} {}'.format(criteria, days, skipJobStep))
        if days:
            t = date.today() + timedelta(days=-days)
            startDate = '%d-%02d-%02d' % (t.year, t.month, t.day)
            criteria = ['-S', startDate] + criteria

        # Note: filtering by constraints is problematic
        field_str, sacct_rlt = SlurmCmdQuery.sacctCmd(criteria, output)
        keys = field_str.split(sep='|')
        jobs = []
        jid_idx = keys.index('JobID')
        for line in sacct_rlt:
            ff = line.split(sep='|')
            if (skipJobStep and '.' in ff[jid_idx]):
                continue  # indicates a job step --- under what circumstances should these be broken out?
            #508550_0.extern, 508550_[111-626%20], (array job) 511269+0, 511269+0.extern, 511269+0.0 (?)
            if ('.' in ff[jid_idx]):
                ff0 = ff[jid_idx].split(sep='.')[0]
            else:
                ff0 = ff[jid_idx]

            m = re.fullmatch(r'(\d+)([_\+])(.*)', ff0)
            if not m:
                jid = int(ff0)
            else:
                jid = int(m.group(1))
            if ff[3].startswith('CANCELLED by '):
                uid = ff[3].rsplit(' ', 1)[1]
                uname = MyTool.getUser(uid)
                ff[3] = '%s (%s)' % (ff[3], uname)
            job = dict(zip(keys, ff))
            jobs.append(job)

        if 'AllocTRES' in output:
            for job in jobs:
                job['AllocGPUS'] = MyTool.getTresGPUCount(job['AllocTRES'])

        return jobs
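# How the job-ID normalization above behaves on the sample forms noted in the
# comment (array tasks use '_', heterogeneous-job components use '+'); job
# steps containing '.' are already split off before the regex is applied:
import re

for ff0 in ['508550', '508550_0', '508550_[111-626%20]', '511269+0']:
    m = re.fullmatch(r'(\d+)([_\+])(.*)', ff0)
    jid = int(m.group(1)) if m else int(ff0)
    print(ff0, '->', jid)   # every form maps to the base numeric job id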
    def getNodeRunJobs(self, node, start, stop):
        df = pandas.read_csv(CSV_DIR + "slurm_cluster_job_table.csv",
                             usecols=[
                                 'id_job', 'id_user', 'nodelist',
                                 'nodes_alloc', 'state', 'time_start',
                                 'time_end', 'time_suspended'
                             ])
        start, stop, df = MyTool.getDFBetween(df, 'time_start', start, stop)
        df = df[df['nodes_alloc'] > 0]

        #jobs running on node
        if node:
            criterion = df['nodelist'].map(lambda x: node in MyTool.nl2flat(x))
            df = df[criterion].copy()
        df['user'] = df['id_user'].map(lambda x: MyTool.getUser(x))  # needed below even when node is empty

        return df[[
            'id_job', 'user', 'time_start', 'time_end', 'time_suspended'
        ]]
 def getJobByName_cluster(job_name, cluster, fields):
     start, stop, df = SlurmDBQuery.readJobTable(cluster, fld_lst=fields)
     df = df[df['job_name'] == job_name]
     df['state'] = df['state'].map(lambda x: SLURM_STATE_DICT.get(x, x))
     df['user'] = df['id_user'].map(lambda x: MyTool.getUser(x))
     df['duration'] = df['time_end'] - df['time_start']
     df['duration'] = df['duration'].map(lambda x: x if x > 0 else 0)
     df = df.fillna('Not Defined')
     lst = df.to_dict(orient='records')
     return lst
 def getGPUNodes(pyslurmNodes):
     #TODO: need to change max_gpu_cnt if no-GPU node add other gres
     gpu_nodes = [
         n_name for n_name, node in pyslurmNodes.items()
         if 'gpu' in node['features']
     ]
     max_gpu_cnt = max([
         MyTool.getNodeGresGPUCount(pyslurmNodes[n]['gres'])
         for n in gpu_nodes
     ])
     return gpu_nodes, max_gpu_cnt
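# MyTool.getNodeGresGPUCount presumably sums GPU counts from a node's gres list,
# e.g. ['gpu:k40c:1', 'gpu:k40c:1'] -> 2; a sketch under that assumption,
# tolerating the '(IDX:...)' suffix seen in gres_used strings:
def getNodeGresGPUCount(gres_list):
    cnt = 0
    for g in gres_list:
        if g.startswith('gpu:'):
            cnt += int(g.split(':')[-1].split('(')[0])
    return cnt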
    def sendJobNotice (self, ts, job):
        user   = MyTool.getUserStruct(int(job['user_id']))
        groups = MyTool.getUserGroups(user.pw_name)
        if 'scc' in groups:
           return
        
        userName     = user.pw_name
        userFullName = user.pw_gecos.split(',')[0]
        addr   ='http://*****:*****@flatironinstitute.org'.format(userName)]
        to_list=RECIPIENTS

        msg = EmailMessage()
        msg.set_content(content)
        msg['Subject'] = 'Long running job with low utilization at slurm cluster -- Job {} by {}'.format(job['job_id'], userName)
        msg['From']    = 'yliu'
        msg['To']      = ', '.join(to_list)
        
        with smtplib.SMTP('smtp-relay.gmail.com') as s:
            s.send_message(msg)
    def slurmJob2point (self, ts, item, points):
    #{'account': 'scc', 'accrue_time': 'Unknown', 'admin_comment': None, 'alloc_node': 'rusty1', 'alloc_sid': 3207927, 'array_job_id': None, 'array_task_id': None, 'array_task_str': None, 'array_max_tasks': None, 'assoc_id': 153, 'batch_flag': 0, 'batch_features': None, 'batch_host': 'worker1085', 'billable_tres': 28.0, 'bitflags': 1048576, 'boards_per_node': 0, 'burst_buffer': None, 'burst_buffer_state': None, 'command': None, 'comment': None, 'contiguous': False, 'core_spec': None, 'cores_per_socket': None, 'cpus_per_task': 1, 'cpus_per_tres': None, 'cpu_freq_gov': None, 'cpu_freq_max': None, 'cpu_freq_min': None, 'dependency': None, 'derived_ec': '0:0', 'eligible_time': 1557337982, 'end_time': 1588873982, 'exc_nodes': [], 'exit_code': '0:0', 'features': [], 'group_id': 1023, 'job_id': 240240, 'job_state': 'RUNNING', 'last_sched_eval': '2019-05-08T13:53:02', 'licenses': {}, 'max_cpus': 0, 'max_nodes': 0, 'mem_per_tres': None, 'name': 'bash', 'network': None, 'nodes': 'worker1085', 'nice': 0, 'ntasks_per_core': None, 'ntasks_per_core_str': 'UNLIMITED', 'ntasks_per_node': 0, 'ntasks_per_socket': None, 'ntasks_per_socket_str': 'UNLIMITED', 'ntasks_per_board': 0, 'num_cpus': 28, 'num_nodes': 1, 'partition': 'scc', 'mem_per_cpu': False, 'min_memory_cpu': None, 'mem_per_node': True, 'min_memory_node': 0, 'pn_min_memory': 0, 'pn_min_cpus': 1, 'pn_min_tmp_disk': 0, 'power_flags': 0, 'preempt_time': None, 'priority': 4294877910, 'profile': 0, 'qos': 'gen', 'reboot': 0, 'req_nodes': [], 'req_switch': 0, 'requeue': False, 'resize_time': 0, 'restart_cnt': 0, 'resv_name': None, 'run_time': 4308086, 'run_time_str': '49-20:41:26', 'sched_nodes': None, 'shared': '0', 'show_flags': 23, 'sockets_per_board': 0, 'sockets_per_node': None, 'start_time': 1557337982, 'state_reason': 'None', 'std_err': None, 'std_in': None, 'std_out': None, 'submit_time': 1557337982, 'suspend_time': 0, 'system_comment': None, 'time_limit': 'UNLIMITED', 'time_limit_str': 'UNLIMITED', 'time_min': 0, 'threads_per_core': None, 'tres_alloc_str': 'cpu=28,mem=500G,node=1,billing=28', 'tres_bind': None, 'tres_freq': None, 'tres_per_job': None, 'tres_per_node': None, 'tres_per_socket': None, 'tres_per_task': None, 'tres_req_str': 'cpu=1,node=1,billing=1', 'user_id': 1022, 'wait4switch': 0, 'wckey': None, 'work_dir': '/mnt/home/apataki', 'cpus_allocated': {'worker1085': 28}, 'cpus_alloc_layout': {'worker1085': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]}}
        # remove empty values
        job_id = item['job_id']
        MyTool.remove_dict_empty(item)
        for v in ['run_time_str', 'time_limit_str']: item.pop(v, None)

        # pending_job
        if item['job_state'] == 'PENDING':
           pendpoint          = {'measurement':'slurm_pending', 'time': ts} 
           pendpoint['tags']  = MyTool.sub_dict       (item, ['state_reason', 'job_id'])
           pendpoint['fields']= MyTool.sub_dict       (item, ['user_id', 'submit_time', 'account', 'qos', 'partition', 'tres_req_str', 'last_sched_eval', 'time_limit', 'start_time']) #switch from tres_per_node to tres_req_str 06/28/2019
           points.append(pendpoint)

        # slurm_job_mon: ts, job_id
        point =  {'measurement':'slurm_job_mon1', 'time': ts}  #03/23/2020 change
        point['tags']   = MyTool.sub_dict_remove       (item, ['job_id'])
        point['fields'] = MyTool.sub_dict_exist_remove (item, ['user_id', 'job_state', 'state_reason', 'run_time', 'suspend_time', 'num_cpus', 'num_nodes', 'tres_req_str']) # add tres_req_str 06/28/2019
        points.append(point)

        # slurm_job_info: submit_time, job_id, user_id
        #infopoint = {'measurement':'slurm_job', 'time': (int)(item.pop('submit_time'))}
        #infopoint['tags']   = MyTool.sub_dict (point['tags'], ['job_id', 'user_id'])
        #infopoint['fields'] = item
        #infopoint['fields'].update (MyTool.sub_dict(point['fields'], ['job_state', 'state_reason', 'num_cpus', 'num_nodes', 'tres_req_str','tres_alloc_str']))
        #MyTool.dict_complex2str(infopoint['fields'])
        #newValue = json.dumps(infopoint)
        #if (job_id not in self.sav_job_dict) or (self.sav_job_dict[job_id] != newValue):
        #   points.append(infopoint)
        #   self.sav_job_dict[job_id] = newValue
        #else:
        #   logger.debug("duplicate job info for {}".format(job_id))
        #points.append(infopoint)
        return points
    def getSlurmNodeMonData(self, node, start_time, stop_time=''):
        query = "select * from autogen.cpu_uid_mon where hostname = '{}'".format(
            node)
        query = self.extendQuery(query, start_time, stop_time)
        results = self.query(query)
        if results:
            points = list(results.get_points())
            uid2seq = {}
            for point in points:  #points are sorted by point['time']
                ts = point['time']
                uid = point['uid']
                if uid not in uid2seq: uid2seq[uid] = {}
                if 'mem_rss_K' in point:
                    mem_rss_K = MyTool.getDictNumValue(point, 'mem_rss_K')
                else:
                    mem_rss_K = int(
                        MyTool.getDictNumValue(point, 'mem_rss') / 1024)
                uid2seq[uid][ts] = [
                    MyTool.getDictNumValue(point, 'cpu_system_util') +
                    MyTool.getDictNumValue(point, 'cpu_user_util'), mem_rss_K,
                    MyTool.getDictNumValue(point, 'io_read_rate'),
                    MyTool.getDictNumValue(point, 'io_write_rate')
                ]

            if len(points) > 0:
                start_time = points[0]['time']
                stop_time = points[-1]['time']
                return uid2seq, start_time, stop_time

        return None, start_time, stop_time
    def getSlurmUidMonData_All(self, uid, start_time, stop_time=''):
        if stop_time:  # if stop_time equals today, set it to '' to retrieve up-to-date data
            if stop_time == time.mktime(date.today().timetuple()
                                        ):  # mostly from report page to today
                stop_time = ''
        node2seq = defaultdict(dict)  # initialize so node2seq exists even when only recent data is queried
        cut_ts = int(time.time()) - 3 * ONE_DAY_SECS
        if start_time < cut_ts:  # more than 3 days: fetch the older part from hourly data
            node2seq.update(
                self.getSlurmUidMonData_Hourly(uid, start_time, cut_ts - 1))
            start_time = cut_ts

        query = "select * from autogen.cpu_uid_mon where uid='{}'".format(uid)
        query = self.extendQuery(query, start_time, stop_time)
        results = self.query(query, 'ms')
        points = results.get_points()  # lists of dictionaries
        for point in points:  #points are sorted by point['time']
            ts = point['time']
            node = point['hostname']
            if 'mem_rss_K' in point:
                mem_rss_K = MyTool.getDictNumValue(point, 'mem_rss_K')
            else:
                mem_rss_K = int(
                    MyTool.getDictNumValue(point, 'mem_rss') / 1024)
            node2seq[node][ts] = [
                MyTool.getDictNumValue(point, 'cpu_system_util') +
                MyTool.getDictNumValue(point, 'cpu_user_util'), mem_rss_K,
                MyTool.getDictNumValue(point, 'io_read_rate'),
                MyTool.getDictNumValue(point, 'io_write_rate')
            ]
        return node2seq
    def sum_job_step(cluster, days=30):
        start = int(time.time()) - days * ONE_DAY_SECS
        step_df = SlurmDBQuery.readClusterTable(cluster, 'step_table', [
            'job_db_inx', 'id_step', 'user_sec', 'user_usec', 'sys_sec',
            'sys_usec', 'time_start'
        ])
        s1, s2, step_df = MyTool.getDFBetween(step_df, 'time_start', start)
        dfg = step_df.groupby('job_db_inx')
        sum_df = dfg.sum()
        sum_df.insert(
            0, 'total_cpu', sum_df.user_sec + sum_df.sys_sec +
            (sum_df.user_usec + sum_df.sys_usec) / 1000000)  # whole seconds plus microsecond remainders
        sum_df = sum_df[['total_cpu']]
        #sum_df  = sum_df.astype(int)     # would lose the int dtype after join because of missing data
        #print("sum_df={}".format(sum_df))

        job_df = SlurmDBQuery.readClusterTable(cluster, 'job_table')
        s1, s2, job_df = MyTool.getDFBetween(job_df, 'time_start', start)
        comb_df = job_df.join(sum_df, on='job_db_inx')
        comb_df.to_csv("{}/{}_{}".format(CSV_DIR, cluster,
                                         "job_step_sum_table.csv"),
                       index=False)  # job_df is more than sum_df
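# Sanity check of the CPU-time formula above (whole seconds plus the
# microsecond remainders divided by 1e6), with made-up numbers:
user_sec, user_usec, sys_sec, sys_usec = 10, 500000, 2, 250000
assert user_sec + sys_sec + (user_usec + sys_usec) / 1000000 == 12.75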
 def getJobGPUNodes(jobs, pyslurmNodes):
     gpu_nodes = reduce(lambda rlt, curr: rlt.union(curr), [
         set(job['gpus_allocated'].keys()) for job in jobs.values()
         if 'gpus_allocated' in job and job['gpus_allocated']
     ], set())
     if gpu_nodes:
         max_gpu_cnt = max([
             MyTool.getNodeGresGPUCount(pyslurmNodes[n]['gres'])
             for n in gpu_nodes
         ])
     else:
         max_gpu_cnt = 0
     return gpu_nodes, max_gpu_cnt
    def readJobTable(cluster,
                     start=None,
                     stop=None,
                     fld_lst=None,
                     index_col=None,
                     time_col='time_submit'):
        f_name = "{}/{}_{}".format(CSV_DIR, cluster, "job_table.csv")
        df = pandas.read_csv(f_name, usecols=fld_lst, index_col=index_col)
        if time_col and (start or stop):
            logger.debug("start={},stop={}".format(start, stop))
            start, stop, df = MyTool.getDFBetween(df, time_col, start, stop)

        return start, stop, df
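# Example call (cluster name, time range, and field list are illustrative):
# start, stop, df = readJobTable('slurm', start=1541096161, stop=1541182561,
#                                fld_lst=['id_job', 'id_user', 'state', 'time_submit'])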
    def createProcMonPoint (self, node, ts, proc):
        #point['fields']           = MyTool.flatten(MyTool.sub_dict(proc, ['create_time', 'jid', 'mem', 'io', 'num_fds', 'cpu'], default=0))
        #change 01/23/2020: change name from cpu_proc_mon to node_proc_mon
        #                   remove uid from tags
        #change 01/17/2021: change to one_month.node_proc_mon, hostname+jid as tag
        #change 01/19/2021: change to one_month.node_proc_mon, hostname+jid+pid as tag
        
        #point                     = {'measurement':'node_proc_mon1', 'time': ts}
        #point['tags']             = {'hostname':node, 'jid':proc['jid'], 'pid':proc['pid']}
        #point['fields']           = MyTool.flatten(MyTool.sub_dict(proc, ['uid', 'io', 'num_fds', 'cpu'], default=0))
        point                     = {'measurement':'node_proc_mon2', 'time': ts}
        point['tags']             = {'hostname':node, 'pid':proc['pid']}
        point['fields']           = MyTool.flatten(MyTool.sub_dict(proc, ['uid', 'jid', 'io', 'num_fds', 'cpu'], default=0))
        point['fields']['status'] = querySlurm.SlurmStatus.getStatusID(proc['status'])
        point['fields']['mem_data']  = round(proc['mem']['data']   / 1024)
        point['fields']['mem_rss']   = round(proc['mem']['rss']    / 1024) 
        point['fields']['mem_shared']= round(proc['mem']['shared'] / 1024)
        point['fields']['mem_text']  = round(proc['mem']['text']   / 1024)
        point['fields']['mem_vms']   = round(proc['mem']['vms']    / 1024)
        if 'cpu_affinity' in point['fields']:
           point['fields'].pop('cpu_affinity')

        return point
    def sendNotice (self, ts, jobs):
        curr_jids = set(jobs.keys())
        last_jids = set(self.last_jobNotice.keys())
        
        dup_jids  = curr_jids.intersection(last_jids)
        rmv_jids  = last_jids.difference(dup_jids)
        new_jids  = curr_jids.difference(dup_jids)

        #remove job notices that were not repeated this time; if a job's utilization fluctuates around the border line, this avoids more duplicate notices than expected
        for jid in rmv_jids:
            self.last_jobNotice.pop(jid)
        #send notice for newly appeared jobs
        for jid in new_jids:
            self.sendJobNotice (ts, jobs[jid])
            self.last_jobNotice[jid] = ts
        #send notice for duplicate jobs if a long enough interval has passed
        dup_jids_send = list(filter(lambda x: ts - self.last_jobNotice[x] > self.interval_secs, dup_jids))
        #print('{}: send notice for {} conditionally ({})'.format(ts, dup_jids_send, dup_jids))
        for jid in dup_jids_send:
            self.sendJobNotice(ts, jobs[jid])
            self.last_jobNotice[jid] = ts

        MyTool.writeFile (self.cacheFile, self.last_jobNotice)
    def getUserDoneJobReport(
        uid,
        cluster='slurm',
        days=3,
        output='JobID,JobIDRaw,JobName,AllocCPUS,AllocTRES,State,ExitCode,User,NodeList,Start,End'
    ):
        if days > 30:  # only 30 days of history is saved
            return None
        job_df = SlurmDBQuery.readClusterTable(cluster, 'job_step_sum_table')
        start = int(time.time()) - days * ONE_DAY_SECS
        s1, s2, job_df = MyTool.getDFBetween(job_df, 'time_start', start)

        user_job_df = job_df[job_df['id_user'] == uid]
        return user_job_df
 def readClusterTableBetween(cluster,
                             part_table_name,
                             fld_lst,
                             start=None,
                             stop=None,
                             index_col=None,
                             ts_col=None):
     df = SlurmDBQuery.readClusterTable(cluster, part_table_name, fld_lst,
                                        index_col)
     if ts_col:
         start, stop, df = MyTool.getDFBetween(df, ts_col, start, stop)
         return start, stop, df
     else:
         return 0, 0, df