def slurm_nodes_status():
    """
    Issue a sinfo command to get the reasons why nodes are in a down,
    drained, fail or failing state.

    Command is sinfo -R --format='%100E|%19H|%30N|%t'
    Output to parse:
    Not responding |2020-07-25T22:39:23|skylake106|down*

    Lines that do not split into exactly four "|"-separated fields, or whose
    timestamp cannot be parsed, are logged as errors and skipped.

    :return: list of dictionaries, one per parsed line, with the keys
             "date", "date_full", "reason", "status" and "node"; an empty
             dictionary when the command returned no data
    """
    cmd = ["sinfo", "-R", "--format='%100E|%19H|%30N|%t'"]
    run = " ".join(cmd)
    data, _ = ssh_wrapper(run)
    if not data:
        debug("No data received, returning empty dictionary")
        return {}

    # The headline is the only line expected to carry every column title.
    # Requiring all of them avoids dropping a data line whose free-text
    # reason merely happens to contain one of these words.
    header_fields = ("REASON", "TIMESTAMP", "NODELIST", "STATE")

    result = []
    for line in data:
        if all(field in line for field in header_fields):
            debug("Skipping headline: %s" % line)
            continue

        info = line.split("|")
        if len(info) != 4:
            error("Wrong format: %s" % line)
            continue

        reason, timestamp, node, stat = (field.strip() for field in info)
        try:
            date = dt.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")
        except ValueError:
            # One unparsable timestamp must not abort the whole listing.
            error("Wrong format: %s" % line)
            continue

        result.append({
            "date": date.strftime("%Y-%m-%d %X %Z"),
            "date_full": date.strftime("%c"),
            "reason": reason,
            "status": stat,
            "node": node
        })
    return result
def web_admin_user_info():
    """
    Executes linux w command on a remote server and parse the result to be
    returned as JSON.

    The target server is read from the "server" key of the JSON request
    body. Each output line is split on whitespace: field 0 is used as the
    login, field 2 as the originating host and fields 4 onwards as the
    running command. Lines with fewer than three fields are skipped.

    :return: JSON response whose "data" member is a list of dictionaries
             with user information like:
             {"username": login, "from": host, "process": cmd}
    :raises ValueError: if the request carries no JSON body, if no server
                        is given, or if the remote command returned nothing
    """
    data = request.get_json()
    if not data:
        raise ValueError("Expecting application/json requests")

    # A missing key or an explicit null must be reported as "not defined"
    # rather than raising KeyError or being turned into the host "None".
    server = data.get("server")
    server = "" if server is None else str(server).strip()
    if not server:
        raise ValueError("Server is not defined")

    # NOTE(review): PROCPS_USERLEN / PROCPS_FROMLEN presumably widen the
    # user and from columns so they are not truncated - confirm against
    # the procps version installed on the target hosts.
    result, err = ssh_wrapper("PROCPS_USERLEN=32 PROCPS_FROMLEN=90 w -s -h",
                              host=server)
    if not result:
        raise ValueError("Error getting user information: %s" % err)

    users = []
    for line in result:
        fields = line.split()
        if len(fields) < 3:
            # Blank or truncated line: nothing usable to report.
            continue
        users.append({
            "username": fields[0],
            "from": fields[2],
            "process": " ".join(fields[4:])
        })
    return jsonify(data=users)
def get_scratch():
    """
    Query the BeeGFS quota of the current user on the remote server.

    Runs "beegfs-ctl --getquota --csv --uid <login>" and parses the second
    line of the output, which must hold six comma-separated fields; the
    third and fourth are read as the used and total sizes.
    NOTE(review): the first line is presumably the CSV header - confirm.

    :return: dictionary with the keys "usage" (percentage string), "total"
             and "used" (raw strings from the command), "free" (float),
             "used_label" and "free_label" (results of bytes2human)
    :raises ValueError: if the command returned fewer than two lines or the
                        data line does not hold six numeric-compatible fields
    """
    cmd = "beegfs-ctl --getquota --csv --uid %s" % current_user.login
    result, _ = ssh_wrapper(cmd)
    # The data line is result[1]; a single-line answer carries no quota data.
    if not result or len(result) < 2:
        raise ValueError("No scratch space info found")

    _name, _uid, used, total, _files, _hard = result[1].split(",")
    used_size = float(used)
    total_size = float(total)

    # A zero total would otherwise raise ZeroDivisionError; report 0.0%.
    ratio = used_size / total_size if total_size else 0.0
    free = total_size - used_size

    return {
        "usage": "{0:.1%}".format(ratio),
        "total": total,
        "used": used,
        "free": free,
        "used_label": bytes2human(used),
        "free_label": bytes2human(free)
    }
def slurm_consumption_raw(name, start, finish):
    """
    Build a remote query to SLURM DB to obtain a project's CPU consumption.

    :param name: Account name, in our case it's a project's name
    :param start: starting date for accounting query
    :param finish: end date for accounting query
    :return: Tuple (raw result of the sreport command, command string); the
             first element is None when the command returned no data
    """
    run = " ".join([
        "sreport", "cluster", "AccountUtilizationByUser",
        "-t", "hours",
        "-nP", "format=Account,Login,Used",
        "Accounts=%s" % name,
        "start=%s" % start,
        "end=%s" % finish,
    ])

    raw, _ = ssh_wrapper(run)
    if raw:
        debug("Got raw consumption values for project %s: %s" % (name, raw))
        return raw, run

    debug("No data received, nothing to return")
    return None, run
def slurm_partition_info():
    """
    Run "sinfo -s" on the remote server and summarise every partition.

    Each non-header line must split into five whitespace-separated fields;
    the fourth one is split on "/" into allocated, idle, other and total
    node counts.

    :return: list of dictionaries with the keys "name", "allocated", "idle",
             "other" (strings) and "total" (int)
    :raises ValueError: if the command returned nothing, or a line does not
                        have the expected shape
    """
    output, failure = ssh_wrapper("sinfo -s")
    if not output:
        raise ValueError("Error getting partition information: %s" % failure)

    def summarise(row):
        # Only the partition name and the node counters are of interest.
        label, _avail, _limit, counters, _nodelist = row.split()
        busy, free, rest, overall = counters.split("/")
        return {
            "name": label,
            "allocated": busy,
            "idle": free,
            "other": rest,
            "total": int(overall)
        }

    return [summarise(row) for row in output if "PARTITION" not in row]
def get_server_info(server):
    """
    Collect uptime and memory figures from a remote server.

    Runs "uptime && free -m" on the given host, picks the output lines
    containing "load average", "Mem" and "Swap" (the last match of each
    wins) and hands them to parse_uptime, parse_memory and parse_swap.

    :param server: host passed to ssh_wrapper
    :return: {"server": ..., "uptime": ..., "mem": ...} where "mem" merges
             the parsed memory and swap entries (swap wins on equal keys);
             an empty dictionary if the remote command returned nothing
    """
    output, failure = ssh_wrapper("uptime && free -m", host=server)
    if not output:
        error("Error getting information from the remote server: %s" % failure)
        return {}

    picked = {"uptime": "", "memory": "", "swap": ""}
    for row in output:
        if "load average" in row:
            picked["uptime"] = row
        elif "Mem" in row:
            picked["memory"] = row
        elif "Swap" in row:
            picked["swap"] = row

    uptime = parse_uptime(picked["uptime"])
    swap = parse_swap(picked["swap"])
    memory = parse_memory(picked["memory"])

    merged = {
        key: value
        for part in (memory, swap)
        for key, value in part.items()
    }
    return {"server": server, "uptime": uptime, "mem": merged}
def get_project_conso(name, start, finish):
    """
    Return a project's consumption broken down by login.

    The raw sreport output is obtained through slurm_consumption_raw, so the
    command is built in one place only. Each line is expected to hold three
    "|"-separated fields (account, login, used); other lines are ignored.

    :param name: Account name, i.e. the project's name
    :param start: starting date for accounting query
    :param finish: end date for accounting query
    :return: dictionary mapping login to consumption as int; a line with an
             empty login is stored under the project name (presumably the
             account-level total - confirm against sreport output). None if
             the command returned no data
    :raises ValueError: if a consumption value is not an integer
    """
    data, _run = slurm_consumption_raw(name, start, finish)
    if not data:
        return None

    result = {}
    for item in data:
        fields = item.strip().split("|")
        if len(fields) != 3:
            continue
        _account, login, conso = fields
        # Lines without a login are filed under the project itself.
        result[login or name] = int(conso)

    debug("Project '%s' consumption: %s" % (name, result))
    return result
def get_jobs(start, end, last=10):
    """
    List the current user's most recent jobs in a time window.

    Runs sacct for current_user.login between start and end, pipes the
    output through "sort -n -r" and keeps the first `last` lines with head.
    Every line is expected to hold the seven "|"-separated fields requested
    in --format; shorter lines are skipped.

    :param start: value passed to sacct --start
    :param end: value passed to sacct --end
    :param last: maximum number of lines kept by head, 10 by default
    :return: list of dictionaries with the keys "id", "project", "state",
             "partition", "date", "name" and "duration"
    :raises ValueError: if the command returned nothing
    """
    # NOTE(review): start, end, last and the login are interpolated into the
    # command string unquoted - confirm that callers validate them.
    cmd = ["sacct", "-nPX",
           "--format=JobID,State,Start,Account,JobName,CPUTime,Partition",
           "--start=%s" % start, "--end=%s" % end,
           "-u", current_user.login,
           "|", "sort", "-n", "-r",
           "|", "head", "-%s" % last]
    run = " ".join(cmd)
    result, _ = ssh_wrapper(run)
    if not result:
        raise ValueError("No jobs found from %s to %s" % (start, end))

    jobs = []
    for line in result:
        fields = line.strip().split("|")
        if len(fields) < 7:
            # Blank or truncated line: indexing it would raise IndexError.
            continue
        jobs.append({
            "id": fields[0],
            "project": fields[3],
            "state": fields[1],
            "partition": fields[6],
            "date": fields[2],
            "name": fields[4],
            "duration": fields[5]
        })
    return jobs
def space_info():
    """
    Run df -h command on a remote server and return parsed information as a
    list of dictionaries. Only a fixed set of mount points is reported.
    Dictionary format is:
    {"filesystem": ..., "size": ..., "used": ..., "available": ...,
     "use": ..., "mountpoint": ...}

    Lines that do not split into exactly six whitespace-separated fields
    are skipped instead of aborting the whole listing.

    :return: List of dict
    :raises ValueError: if the command returned nothing
    """
    result, err = ssh_wrapper("df -h")
    if not result:
        raise ValueError("Error getting disk space information: %s" % err)

    # Mount points of interest; built once rather than on every iteration.
    keywords = (
        "/home", "/save", "/trinity/shared",
        "/scratch", "/scratchfast", "/scratchw"
    )

    space = []
    for record in result:
        if "Filesystem" in record:
            continue
        fields = record.split()
        if len(fields) != 6:
            # Wrapped or otherwise malformed line: cannot be unpacked.
            continue
        filesystem, size, used, avail, use, mountpoint = fields
        if mountpoint not in keywords:
            continue
        space.append({
            "filesystem": filesystem,
            "size": size,
            "used": used,
            "available": avail,
            "use": use,
            "mountpoint": mountpoint
        })
    return space