Example #1
from typing import List

from gpustat import GPUStatCollection


def check_gpus_existence(gpus: List[int]):
    # Return True only if every requested GPU index exists on this machine.
    gs = GPUStatCollection.new_query()
    for gpu in gpus:
        try:
            gs.gpus[gpu]
        except (IndexError, KeyError):
            return False
    return True
Example #2
import os

from gpustat import GPUStatCollection


def gpu_listening_scheduler():
    gs = GPUStatCollection.new_query()
    for gpu in gs.gpus:
        if gpu.index == 0:
            # Notify by email once GPU 0 is under half memory use and nearly idle.
            if float(gpu.memory_used) / float(gpu.memory_total) < 0.5 \
                    and gpu.utilization < 10:
                print('Restrictions satisfied~')
                os.system("python send_email.py")  # run `send_email.py`
                print('Email sent~')
                break
Example #3
from gpustat import GPUStatCollection


def is_single_gpu_totally_free(gpu_index: int):
    if not isinstance(gpu_index, int):
        raise ValueError(f"gpu_index: {gpu_index} is not int")

    gs = GPUStatCollection.new_query()
    if gpu_index >= len(gs.gpus) or gpu_index < 0:
        raise ValueError(f"gpu_index: {gpu_index} does not exist")

    gpu = gs.gpus[gpu_index]
    # Free means: no running processes, (almost) idle, and (almost) no memory in use.
    return (len(gpu.processes) <= 0
            and gpu.utilization <= 10
            and (float(gpu.memory_used) / float(gpu.memory_total) <= 1e-3
                 or gpu.memory_used < 50))
Example #4
from gpustat import GPUStatCollection


def get_gpu_status():
    gpus_stats = GPUStatCollection.new_query()
    info = gpus_stats.jsonify()["gpus"]
    gpu_list = []

    mem_ratio_threshold = 0.1   # at most 10% of memory in use
    util_ratio_threshold = 10   # at most 10% GPU utilization
    for idx, each in enumerate(info):
        mem_ratio = each["memory.used"] / each["memory.total"]
        util_ratio = each["utilization.gpu"]
        print(mem_ratio, util_ratio)
        if mem_ratio < mem_ratio_threshold and util_ratio < util_ratio_threshold:
            gpu_list.append(idx)
    print("Scanned GPUs and found {} free GPU(s)".format(len(gpu_list)))
    return gpu_list
Example #5
import multiprocessing
import os
import time
from typing import Any, Dict

import psutil
import torch
from gpustat import GPUStatCollection

# `STOP_MESSAGE`, `save_json` and `EmptyQueueException` are defined elsewhere
# in the module this function was taken from.


def monitor(queue: multiprocessing.Queue, info: Dict[str, Any],
            output_dir: str, logging_interval: int) -> None:
    """Monitors hardware resource use as part of a separate process.

    Populates `info` with system-specific metrics (GPU, CPU, RAM) every `logging_interval` seconds and saves the
    output in `output_dir`.

    Args:
        queue: queue from which we can push and retrieve messages sent to the child process.
        info: dictionary containing system resource usage information about the parent process.
        output_dir: directory where the contents of `info` will be saved.
        logging_interval: time interval at which we will poll the system for usage metrics.
    """
    for key in info["system"]:
        if "gpu_" in key:
            info["system"][key]["memory_used"] = []
    info["system"]["cpu_utilization"] = []
    info["system"]["ram_utilization"] = []

    while True:
        try:
            message = queue.get(block=False)
            if isinstance(message, str):
                if message == STOP_MESSAGE:
                    save_json(
                        os.path.join(output_dir, info["tag"] + "_temp.json"),
                        info)
                    return
            else:
                queue.put(message)
        except EmptyQueueException:
            pass
        if torch.cuda.is_available():
            gpu_infos = GPUStatCollection.new_query()
            for i, gpu_info in enumerate(gpu_infos):
                gpu_key = f"gpu_{i}"
                info["system"][gpu_key]["memory_used"].append(
                    gpu_info.memory_used)
        info["system"]["cpu_utilization"].append(psutil.cpu_percent())
        info["system"]["ram_utilization"].append(
            psutil.virtual_memory().percent)
        time.sleep(logging_interval)
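The `monitor` example above follows a small queue protocol: a parent process launches it as a child, the child polls the system until it receives the stop message, and then writes the collected metrics into `output_dir`. Below is a minimal sketch of what the parent side might look like; the concrete `STOP_MESSAGE` value, the output directory, and the initial layout of `info` (the "tag" and "system"/"gpu_0" keys) are assumptions made for illustration, not taken from the original source.

import multiprocessing
import time

STOP_MESSAGE = "stop"  # assumed sentinel; the real value is defined in the source module

if __name__ == "__main__":
    queue = multiprocessing.Queue()
    info = {
        "tag": "train_run",          # used by `monitor` to name the output JSON file
        "system": {"gpu_0": {}},     # one "gpu_<i>" entry per GPU on the machine
    }

    child = multiprocessing.Process(
        target=monitor,
        args=(queue, info, "/tmp/usage_logs", 1),  # poll roughly once per second
    )
    child.start()

    time.sleep(10)  # ... run the actual workload here ...

    queue.put(STOP_MESSAGE)  # ask the child to flush its metrics and exit
    child.join()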
Example #6
    def new_query(self):
        # Method of a wrapper class: refresh and cache the latest GPU stats.
        gs = GPUStatCollection.new_query()
        self.gpus = gs.gpus
        self.gs = gs
Example #7
from gpustat import GPUStatCollection


def check_req_gpu_num(req_gpu_num: int):
    gs = GPUStatCollection.new_query()
    return req_gpu_num <= len(gs.gpus)