def get_gpu_info():
    """
    获取 GPU 信息
    :return:
    """

    gpu_list = []
    GPUtil.showUtilization()  # prints a utilization table to stdout (side effect only)

    # Collect each GPU's info into a list
    for gpu in GPUtil.getGPUs():
        # print('gpu.id:', gpu.id)
        # print('total GPU memory:', gpu.memoryTotal)
        # print('GPU memory used:', gpu.memoryUsed)
        # print('GPU memory utilization (%):', gpu.memoryUtil * 100)
        # print('GPU load (%):', gpu.load * 100)
        # Append one entry per GPU
        gpu_list.append({
            "gpu_id": gpu.id,
            "gpu_memoryTotal": gpu.memoryTotal,
            "gpu_memoryUsed": gpu.memoryUsed,
            "gpu_memoryUtil": gpu.memoryUtil * 100,
            "gpu_load": gpu.load * 100
        })

    return gpu_list
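
A minimal usage sketch for the helper above, assuming GPUtil is installed (pip install gputil) and at least one GPU is visible; the field names come straight from the dict built above:

for info in get_gpu_info():
    print(f"GPU {info['gpu_id']}: "
          f"{info['gpu_memoryUsed']}/{info['gpu_memoryTotal']} MB used, "
          f"load {info['gpu_load']:.1f}%")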
Example #2
    def _create_run_config(self):
        save_checkpoints_steps = self.params[SAVE_CHECKPOINTS_STEPS]
        save_summary_steps = self.params[SAVE_SUMMARY_STEPS]
        keep_checkpoint_max = self.params[KEEP_CHECKPOINT_MAX]

        gpu_options = tf.GPUOptions(allow_growth=True)
        config = tf.ConfigProto(gpu_options=gpu_options)
        # Walk the call stack for the thread_handler.py entry point that
        # invoked us; the outermost match identifies the originating task.
        f = [
            s.function for s in inspect.stack()
            if ntpath.basename(s.filename) == "thread_handler.py"
            and s.function in [
                "run_thread", "predict_thread", "explain_thread",
                "predict_test_thread"
            ]
        ][-1]

        if f == "predict" or (f != "run" and len(GPUtil.getAvailable()) == 0):
            config.device_count.update({"GPU": 0})

        self.runConfig = tf.estimator.RunConfig(
            model_dir=self.checkpoint_dir,
            save_checkpoints_steps=save_checkpoints_steps,
            save_summary_steps=save_summary_steps,
            keep_checkpoint_max=keep_checkpoint_max,
            session_config=config,
        )
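
This example uses the TensorFlow 1.x session API. A sketch of the same session configuration under TensorFlow 2, where these classes live in the tf.compat.v1 namespace (assuming TF2 is what you have installed):

import tensorflow as tf

# Same settings as above, spelled via TF2's compatibility layer
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
config.device_count["GPU"] = 0  # hide GPUs from the session, as in the branch above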
Example #3
def get_device(cpu_force=False):
    cpu_device = torch.device('cpu')
    if cpu_force or not torch.cuda.is_available():
        print('Forcing CPU use' if cpu_force else 'No GPUs available')
        return cpu_device

    gpus = GPUtil.getAvailable(order='memory', maxMemory=0.9, maxLoad=0.9)
    if len(gpus) > 0:
        return torch.device(f'cuda:{gpus[0]}')
    return cpu_device
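
A usage sketch for get_device, assuming PyTorch is installed; the model and batch are illustrative only:

device = get_device()  # or get_device(cpu_force=True) to stay on CPU
model = torch.nn.Linear(10, 1).to(device)
batch = torch.randn(4, 10, device=device)
out = model(batch)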
Example #4
def single_exp_runner(task_fn,
                      auto_choose_gpu_flag=False,
                      gpu_id: int = 0,
                      seed=None,
                      del_if_log_path_existed=False,
                      keep_session=False,
                      **task_fn_kwargs):
    """

    :param task_fn: task function defined bu users
    :type task_fn: method
    :param auto_choose_gpu_flag: auto choose gpu, default False
    :type auto_choose_gpu_flag: bool
    :param gpu_id: gpu id, default 0
    :type gpu_id: int
    :param seed: seed generated by system time
    :type seed: int
    :param del_if_log_path_existed:delete obsolete log file path if existed, by default False
    :type del_if_log_path_existed: bool
    :param task_fn_kwargs:
    :type task_fn_kwargs:
    :param keep_session: Whether to keep default session & graph
    :type keep_session:
    :return:
    :rtype:
    """
    os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
    if auto_choose_gpu_flag is True:
        DEVICE_ID_LIST = Gpu.getFirstAvailable()
        DEVICE_ID = DEVICE_ID_LIST[0]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    if seed is None:  # treat only None as "unset" so an explicit seed of 0 is honored
        seed = int(round(time.time() * 1000)) % (2**32 - 1)
    _reset_global_seed(seed, keep_session)
    print("create log path at {}".format(GlobalConfig().DEFAULT_LOG_PATH),
          flush=True)

    file.create_path(path=GlobalConfig().DEFAULT_LOG_PATH,
                     del_if_existed=del_if_log_path_existed)
    Logger().init(config_or_config_dict=dict(),
                  log_path=GlobalConfig().DEFAULT_LOG_PATH,
                  log_level=GlobalConfig().DEFAULT_LOG_LEVEL)
    ConsoleLogger().init(
        to_file_flag=GlobalConfig().DEFAULT_WRITE_CONSOLE_LOG_TO_FILE_FLAG,
        to_file_name=os.path.join(
            GlobalConfig().DEFAULT_LOG_PATH,
            GlobalConfig().DEFAULT_CONSOLE_LOG_FILE_NAME),
        level=GlobalConfig().DEFAULT_LOG_LEVEL,
        logger_name=GlobalConfig().DEFAULT_CONSOLE_LOGGER_NAME)

    task_fn(**task_fn_kwargs)
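
A minimal usage sketch; my_task and its lr argument are hypothetical stand-ins for a user-defined task function:

def my_task(lr=0.01):
    print(f"training with lr={lr}")

# Auto-pick a free GPU and forward lr through **task_fn_kwargs
single_exp_runner(my_task, auto_choose_gpu_flag=True, lr=0.01)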
Example #5
def startup_fn():
    try:
        from GPUtil import GPUtil
        import math
        import os

        # Disable GPUs if desired
        if "CHIA_CPU_ONLY" in os.environ.keys():
            if os.environ["CHIA_CPU_ONLY"] == "1":
                print("Requested CPU only operation. "
                      "Disabling all GPUs via environment variable.")
                os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
                # Raising here routes us to the fallback branch below
                raise ValueError("Requested CPU-only operation.")

        gpus = GPUtil.getGPUs()
        if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
            cuda_filter = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
            print(f"Found CUDA device filter: {cuda_filter}")
            gpus = [gpu for gpu in gpus if str(gpu.id) in cuda_filter]

        available_gpus = []
        for gpu in gpus:
            print(f"Found GPU: {gpu.name}")
            if gpu.memoryUtil < 0.5:
                available_gpus.append(gpu)
            else:
                print("Cannot use this GPU because of its "
                      f"memory utilization @ {int(100.0 * gpu.memoryUtil)}%.")

        if len(available_gpus) > 1:
            print("Only one GPU is supported right now.")
            return False

        if len(available_gpus) < 1:
            print("Need an available GPU!")
            return False

        gpu = available_gpus[0]
        # memoryTotal is in MiB; convert to GiB, truncated to one decimal place
        configuration.set_system("gpu0_vram",
                                 math.trunc(gpu.memoryTotal / 102.4) / 10)

    except Exception as ex:
        print(
            f"Could not read available VRAM: {str(ex)}. Setting default value of 4GiB."
        )
        configuration.set_system("gpu0_vram", 4.0)

    print(f"GPU0: {configuration.get_system('gpu0_vram')} Gib VRAM")
    return True
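
A usage sketch of the startup check; the abort message is illustrative:

if not startup_fn():
    raise SystemExit("No usable GPU found; aborting startup.")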
Example #6
def get_empty_gpu() -> int:
    # Get a GPU
    gpus = None
    sleeps = 0
    while gpus is None or len(gpus) == 0:
        if gpus is not None:
            time.sleep(10)
            sleeps += 1

            if sleeps > 10:
                raise Exception("No available GPU to allocate")
        try:
            gpus = GPUtil.getFirstAvailable(order='random', maxLoad=0.05, maxMemory=0.05)
        except Exception:
            # getFirstAvailable raises when no GPU meets the thresholds
            gpus = []

    return gpus[0]
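
A usage sketch for get_empty_gpu; it assumes no CUDA context has been created yet, so that CUDA_VISIBLE_DEVICES still takes effect:

import os

gpu_id = get_empty_gpu()  # retries for up to ~100 s before giving up
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)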
Example #7
    def _create_run_config(self):
        save_checkpoints_steps = self.params[SAVE_CHECKPOINTS_STEPS]
        save_summary_steps = self.params[SAVE_SUMMARY_STEPS]
        keep_checkpoint_max = self.params[KEEP_CHECKPOINT_MAX]

        gpu_options = tf.GPUOptions(allow_growth=True)
        config = tf.ConfigProto(gpu_options=gpu_options)
        # Walk the call stack for the dfweb.py caller (skipping check_session);
        # the outermost match identifies the request handler.
        f = [
            s.function for s in inspect.stack()
            if s.filename.split('/')[-1] == 'dfweb.py'
            and s.function != 'check_session'
        ][-1]

        # Force CPU for prediction, or for any non-training request when no GPU is free
        if f == 'predict' or (f != 'run' and len(GPUtil.getAvailable()) == 0):
            config.device_count.update({'GPU': 0})

        self.runConfig = tf.estimator.RunConfig(
            model_dir=self.checkpoint_dir,
            save_checkpoints_steps=save_checkpoints_steps,
            save_summary_steps=save_summary_steps,
            keep_checkpoint_max=keep_checkpoint_max,
            session_config=config)
Example #8
def get_random_gpu():
    gpus = GPUtil.getAvailable(order='memory', maxMemory=0.9, maxLoad=0.9)
    if len(gpus) > 0:
        return torch.device(f'cuda:{gpus[0]}')
    raise RuntimeError('No GPU found')
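
A usage sketch, assuming PyTorch; note that despite its name the helper picks the GPU ranked first by free memory, not a random one:

try:
    device = get_random_gpu()
except RuntimeError:
    device = torch.device('cpu')  # fall back to CPU when no GPU qualifies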
Example #9
    def update(self):
        ram_info = psutil.virtual_memory()
        disk_info = psutil.disk_partitions()
        disk_memory_info = psutil.disk_usage(disk_info[0].mountpoint)
        gpu_info = GPUtil.getGPUs()[0]
        self.config.update({
            'processor': {
                'architecture': platform.processor(),
                'total_cores': psutil.cpu_count(logical=True),
                'max_frequency': f'{psutil.cpu_freq().max}MHz',
                'current_frequency': f'{psutil.cpu_freq().current:.2f}MHz',
                'loading': f'{psutil.cpu_percent()}%',
                'usage_per_core': self.get_cpu_per_core(),
            },
            'disk': {
                'file_system_type': disk_info[0].fstype,
                'total_memory': self.get_size(disk_memory_info.total),
                'available': self.get_size(disk_memory_info.free),
                'used': self.get_size(disk_memory_info.used),
                'used_in_percents': f'{disk_memory_info.percent}%',
            },
            'ram': {
                'total_memory': self.get_size(ram_info.total),
                'available': self.get_size(ram_info.available),
                'used': self.get_size(ram_info.used),
                'used_in_percents': f'{ram_info.percent}%',
            },
            'gpu': {
                'name': gpu_info.name,
                'temperature': f'{gpu_info.temperature}°C',
                'loading': f'{gpu_info.load * 100:.1f}%',  # load is a 0-1 fraction
                'total_memory': f'{gpu_info.memoryTotal}MB',
                'available': f'{gpu_info.memoryFree}MB',
                'used': f'{gpu_info.memoryUsed}MB',
                'used_in_percents': f'{gpu_info.memoryUsed / gpu_info.memoryTotal * 100:.2f}%'
            }
        })

        if self.system_name == OSName.LINUX.value:
            self.config['processor'].update({
                'name': Computer().processor.name,
                'architecture': platform.processor(),
                'temperature': f'{psutil.sensors_temperatures()["k10temp"][0].current}°C'
            })

        if self.system_name == OSName.WINDOWS.value:
            self.config['processor'].update({
                'name': self.get_processor_name(),
                'architecture': platform.architecture()[0],
                'temperature': 'Unavailable on Windows'
            })
Example #10
    def __init__(self, token: str):

        os_info = platform.uname()

        self.system_name = os_info.system

        ram_info = psutil.virtual_memory()
        disk_info = psutil.disk_partitions()
        disk_memory_info = psutil.disk_usage(disk_info[0].mountpoint)
        gpu_info = GPUtil.getGPUs()[0]

        self.config = {
            'token': token,
            'os': {
                'name': self.system_name,
                'version': os_info.version,
            },
            'processor': {
                'total_cores': psutil.cpu_count(logical=True),
                'max_frequency': f'{psutil.cpu_freq().max}MHz',
                'current_frequency': f'{psutil.cpu_freq().current:.2f}MHz',
                'loading': f'{psutil.cpu_percent()}%',
                'usage_per_core': self.get_cpu_per_core(),
            },
            'socket_info': {
                'host': socket.gethostname(),
                'ip_address': socket.gethostbyname(socket.gethostname()),
                'mac_address': ':'.join(re.findall('..', '%012x' % uuid.getnode())),
            },
            'disk': {
                'file_system_type': disk_info[0].fstype,
                'total_memory': self.get_size(disk_memory_info.total),
                'available': self.get_size(disk_memory_info.free),
                'used': self.get_size(disk_memory_info.used),
                'used_in_percents': f'{disk_memory_info.percent}%',
            },
            'ram': {
                'total_memory': self.get_size(ram_info.total),
                'available': self.get_size(ram_info.available),
                'used': self.get_size(ram_info.used),
                'used_in_percents': f'{ram_info.percent}%',
            },
            'gpu': {
                'name': gpu_info.name,
                'temperature': f'{gpu_info.temperature}°C',
                'loading': f'{gpu_info.load * 100:.1f}%',  # load is a 0-1 fraction
                'total_memory': f'{gpu_info.memoryTotal}MB',
                'available': f'{gpu_info.memoryFree}MB',
                'used': f'{gpu_info.memoryUsed}MB',
                'used_in_percents': f'{gpu_info.memoryUsed / gpu_info.memoryTotal * 100:.2f}%'
            }
        }

        if self.system_name == OSName.LINUX.value:
            self.config['processor'].update({
                'name': Computer().processor.name,
                'architecture': platform.processor(),
                'temperature': f'{psutil.sensors_temperatures()["k10temp"][0].current}°C'
            })

        if self.system_name == OSName.WINDOWS.value:
            self.config['processor'].update({
                'name': self.get_processor_name(),
                'architecture': platform.architecture()[0],
                'temperature': 'Unavailable on Windows'
            })
Example #11
if PROJECT.startswith("stackex"):
    reader, train_ds, val_ds, test_ds = load_stackex_data(
        DATA_ROOT, USER_ID_FILE, args.encoder, args.data_type,
        args.max_seq_len, args.toy_data)
else:
    reader, train_ds, val_ds, test_ds = load_br_data(DATA_ROOT, USER_ID_FILE,
                                                     args.encoder,
                                                     args.data_type,
                                                     args.max_seq_len,
                                                     args.toy_data)
num_authors = reader.get_author_num()
print("User Number: ", str(num_authors))

# Choose an available GPU
if args.device < 0:
    try:
        avail_gpus = GPUtil.getFirstAvailable(maxMemory=0.1, maxLoad=0.05)
        # avail_gpus = GPUtil.getFirstAvailable(maxMemory=0.9, maxLoad=0.9)
    except RuntimeError:
        # getFirstAvailable raises instead of returning an empty list
        print("No GPU available!!!")
        sys.exit()
    args.device = avail_gpus[0]
    print("### Use GPU", str(args.device), "###")
torch.cuda.set_device(args.device)

# val_size = int(len(train_ds) * args.val_ratio)
# val_ds = train_ds[-val_size:]
# train_ds = train_ds[:-val_size]

# if args.test_ratio != 1:
#     print("Testing ratio:", args.test_ratio)
#     test_size = int(len(test_ds) * args.test_ratio)