Code Example #1
def checkGPUsAvailability(n_gpus=1):
    '''
    Check that the first 'n_gpus' GPUs have free memory.
    OUT:
        True: if they do
        False: if not
    '''
    # For every gpu to check
    for i_gpu in range(n_gpus):
        
        # Access to the memory used by the i-th gpu
        try:
            nvidia_smi.nvmlInit()
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i_gpu)
            mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        except Exception:
            print('Warning: GPU could not be accessed')
            break
                
        # If more than 1GB is taken, then stop
        if (mem_res.used/(1024.**3) > 1.0):         # greater than 1GB of VRAM
            # Report it
            print('Memory used (gpu-%i): %.2f GB' % (i_gpu, mem_res.used/(1024**3)), end='')
            print(' - on total: %.2f GB' % (mem_res.total/(1024**3)))
            return False

    return True
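A minimal usage sketch for the helper above (not from the original project): guard a job launch with the check. The value n_gpus=2 is illustrative.

import sys

if not checkGPUsAvailability(n_gpus=2):
    print('GPUs are busy, aborting.')
    sys.exit(1)
# ... safe to start the GPU job here ...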
Code Example #2
 def __init__(self, device='cpu'):
     self.log = SummaryWriter()
     if nvidia_smi and device != 'cpu':
         nvidia_smi.nvmlInit()
         self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
     else:
         self.handle = None
Code Example #3
    def __init__(self, print_time=60, print_current=False, time_step=0.01):
        # Call the Thread class's init function
        super(utilizationGPU, self).__init__()
        self.print_time = print_time
        self.print_current = print_current
        self.time_step = time_step
        self.GPUs = []
        self.occAvgTot = []
        self.occAvgStep = []
        self.memAvgTot = []
        self.memAvgStep = []
        self.running = True

        try:
            nvmlInit()
            self.deviceCount = nvmlDeviceGetCount()
            # Get list of handles #
            logging.info("[GPU] Detected devices are :")
            for i in range(self.deviceCount):
                handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
                self.GPUs.append(handle)
                logging.info("[GPU] ..... Device %d : %s" %
                             (i, nvmlDeviceGetName(handle)))
                # Records #
                self.occAvgTot.append(0)
                self.occAvgStep.append(0)
                self.memAvgTot.append(0)
                self.memAvgStep.append(0)
            logging.info("[GPU] Will print usage every %d seconds" %
                         self.print_time)
        except Exception as e:
            logging.error("[GPU] *** Caught exception: %s : %s" %
                          (str(e.__class__), str(e)))
            traceback.print_exc()
Code Example #4
def get_gpu_memory():
    import nvidia_smi
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Used GPU memory: {}%".format((info.used * 100) // info.total))
    nvidia_smi.nvmlShutdown()
Code Example #5
    def __init__(self, gpus=[]):
        r"""CO2 consumption tracker for deep learning models.
        Look at https://arxiv.org/abs/1906.02243 for details.
        """
        # temporal variables
        self._start = None
        self._step = None

        # power variables
        self._cpu_power = 0
        self._gpu_power = 0
        self._ram_power = 0
        self.total_energy = 0

        # GPU-specific constants
        self._cuda = torch.cuda.is_available()
        print(gpus)
        if self._cuda:
            nvidia_smi.nvmlInit()
            self._handles = [
                nvidia_smi.nvmlDeviceGetHandleByIndex(gpu) for gpu in gpus
            ]

        # energy consumption constants
        self._pue_coeff = 1.58
        self._co2_coeff = 0.477
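A hedged sketch of how handles such as self._handles might later be read for power draw; the method name and conversion below are illustrative, not the project's actual code. nvmlDeviceGetPowerUsage reports milliwatts.

    def _read_gpu_power(self):
        # Sum instantaneous power draw over the tracked GPUs, in watts (illustrative helper).
        if not self._cuda:
            return 0.0
        return sum(
            nvidia_smi.nvmlDeviceGetPowerUsage(h) / 1000.0 for h in self._handles
        )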
Code Example #6
File: train.py Project: fedogar/Neural_RH_Inversion
    def __init__(self, batch_size, validation_split=0.2, gpu=0, smooth=0.05):
        self.cuda = torch.cuda.is_available()
        self.gpu = gpu
        self.smooth = smooth
        self.device = torch.device(f"cuda:{self.gpu}" if self.cuda else "cpu")

        if (NVIDIA_SMI):
            nvidia_smi.nvmlInit()
            self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.gpu)
            print("Computing in {0} : {1}".format(self.device, nvidia_smi.nvmlDeviceGetName(self.handle)))
        
        self.batch_size = batch_size
        self.validation_split = validation_split        
                
        kwargs = {'num_workers': 2, 'pin_memory': False} if self.cuda else {}        
        
        self.model = model.Network(95*3+1, 100, 2).to(self.device)
        
        print('N. total parameters : {0}'.format(sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

        self.dataset = Dataset()
        
        # Compute the fraction of data for training/validation
        idx = np.arange(self.dataset.n_training)

        self.train_index = idx[0:int((1-validation_split)*self.dataset.n_training)]
        self.validation_index = idx[int((1-validation_split)*self.dataset.n_training):]

        # Define samplers for the training and validation sets
        self.train_sampler = torch.utils.data.sampler.SubsetRandomSampler(self.train_index)
        self.validation_sampler = torch.utils.data.sampler.SubsetRandomSampler(self.validation_index)
                
        # Data loaders that will inject data during training
        self.train_loader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, sampler=self.train_sampler, shuffle=False, **kwargs)
        self.validation_loader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, sampler=self.validation_sampler, shuffle=False, **kwargs)
Code Example #7
File: task.py Project: koncle/TSMLDG
 def get_gpu_memory(device_idx):
     assert device_idx < NvidiaSmi.total_devices, "device index {} should be less than total devices {}"\
         .format(device_idx, NvidiaSmi.total_devices)
     handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_idx)
     res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
     M = 1024**2
     return res.free / M, res.total / M, res.used / M
Code Example #8
def check_gpu_stat():
    nvidia_smi.nvmlInit()
    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        print(f'gpu{i}: {res.gpu}%, gpu-mem: {res.memory}%')
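Note that res.memory from nvmlDeviceGetUtilizationRates is the memory-controller (bandwidth) utilization over the last sample period, not the fraction of VRAM in use; nvmlDeviceGetMemoryInfo reports allocated bytes. A small sketch that prints both side by side (the function name is made up):

def print_gpu_stats(i=0):
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
    util = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    mem = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    # util.memory: % of time the memory controller was busy; mem.used/mem.total: VRAM actually allocated
    print(f'gpu{i}: util {util.gpu}%, mem-bandwidth {util.memory}%, '
          f'vram {100 * mem.used / mem.total:.1f}%')
    nvidia_smi.nvmlShutdown()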
Code Example #9
    def __init__(self,
                 batch_size,
                 validation_split=0.2,
                 gpu=0,
                 smooth=0.05,
                 K=3,
                 model_class='conv1d'):
        self.cuda = torch.cuda.is_available()
        self.gpu = gpu
        self.smooth = smooth
        self.device = torch.device(f"cuda:{self.gpu}" if self.cuda else "cpu")
        # self.device = 'cpu'
        self.batch_size = batch_size
        self.model_class = model_class

        self.K = K

        if (NVIDIA_SMI):
            nvidia_smi.nvmlInit()
            self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.gpu)
            print("Computing in {0} : {1}".format(
                self.device, nvidia_smi.nvmlDeviceGetName(self.handle)))

        self.validation_split = validation_split

        kwargs = {'num_workers': 4, 'pin_memory': False} if self.cuda else {}

        if (model_class == 'conv1d'):
            self.model = model.Network(K=self.K,
                                       L=32,
                                       device=self.device,
                                       model_class=model_class).to(self.device)

        if (model_class == 'conv2d'):
            self.model = model.Network(K=self.K,
                                       L=32,
                                       NSIDE=16,
                                       device=self.device,
                                       model_class=model_class).to(self.device)

        print('N. total parameters : {0}'.format(
            sum(p.numel() for p in self.model.parameters()
                if p.requires_grad)))

        self.train_dataset = Dataset(n_training=20000)
        self.validation_dataset = Dataset(n_training=2000)

        # Data loaders that will inject data during training
        self.train_loader = torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            **kwargs)
        self.validation_loader = torch.utils.data.DataLoader(
            self.validation_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            **kwargs)
Code Example #10
    def __init__(self, debug_mode=False, challenge_mode=False, track=None, timeout=20.0):
        """
        Init requires scenario as input
        """
        self.scenario = None
        self.scenario_tree = None
        self.scenario_class = None
        self.ego_vehicles = None
        self.other_actors = None

        self._debug_mode = debug_mode
        self._challenge_mode = challenge_mode
        self._track = track
        self._agent = None
        self._running = False
        self._timestamp_last_run = 0.0
        self._timeout = timeout
        self._watchdog = Watchdog(float(self._timeout))

        self.scenario_duration_system = 0.0
        self.scenario_duration_game = 0.0
        self.start_system_time = None
        self.end_system_time = None
        nvidia_smi.nvmlInit()
        self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(1)

        # Register the scenario tick as callback for the CARLA world
        # Use the callback_id inside the signal handler to allow external interrupts
        signal.signal(signal.SIGINT, self._signal_handler)
Code Example #11
File: train.py Project: deepin00/unsupervisedMFBD
    def __init__(self, basis_wavefront='zernike', npix_image=128, n_modes=44, n_frames=10, gpu=0, smooth=0.05,\
        batch_size=16, arguments=None):

        self.pixel_size = 0.0303
        self.telescope_diameter = 256.0  # cm
        self.central_obscuration = 51.0  # cm
        self.wavelength = 8000.0
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.arguments = arguments

        self.basis_for_wavefront = basis_wavefront
        self.npix_image = npix_image
        self.n_modes = n_modes
        self.gpu = gpu
        self.cuda = torch.cuda.is_available()
        self.device = torch.device(f"cuda:{self.gpu}" if self.cuda else "cpu")

        # Get handles to later check memory and usage of GPUs
        if (NVIDIA_SMI):
            nvidia_smi.nvmlInit()
            self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.gpu)
            print("Computing in {0} : {1}".format(
                gpu, nvidia_smi.nvmlDeviceGetName(self.handle)))

        # Define the neural network model
        print("Defining the model...")
        self.model = model.Network(device=self.device, n_modes=self.n_modes, n_frames=self.n_frames, \
            pixel_size=self.pixel_size, telescope_diameter=self.telescope_diameter, central_obscuration=self.central_obscuration, wavelength=self.wavelength,\
            basis_for_wavefront=self.basis_for_wavefront, npix_image=self.npix_image).to(self.device)

        print('N. total parameters : {0}'.format(
            sum(p.numel() for p in self.model.parameters()
                if p.requires_grad)))

        kwargs = {'num_workers': 1, 'pin_memory': False} if self.cuda else {}
        # Data loaders that will inject data during training
        self.training_dataset = Dataset(
            filename='/scratch1/aasensio/fastcam/training_small.h5',
            n_training_per_star=1000,
            n_frames=self.n_frames)
        self.train_loader = torch.utils.data.DataLoader(
            self.training_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            **kwargs)

        self.validation_dataset = Dataset(
            filename='/scratch1/aasensio/fastcam/validation_small.h5',
            n_training_per_star=100,
            n_frames=self.n_frames,
            validation=True)
        self.validation_loader = torch.utils.data.DataLoader(
            self.validation_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            **kwargs)
Code Example #12
def get_mem_info(device_id):
    gpu_list = [device_id]
    nvidia_smi.nvmlInit()
    handle = [nvidia_smi.nvmlDeviceGetHandleByIndex(i) for i in gpu_list]
    res = [nvidia_smi.nvmlDeviceGetMemoryInfo(item) for item in handle]
    res = [100 * item.used / item.total for item in res]
    nvidia_smi.nvmlShutdown()
    return res[0]
Code Example #13
def show_memory_usage():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)  # GPU number
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    # print('=' * 50)
    # print(f'mem: {mem_res.used / (1024 ** 3)} (GiB)')  # usage in GiB
    print(f'mem usage: {100 * (mem_res.used / mem_res.total):.3f}%')  # percentage
Code Example #14
def get_gpu_temp():
    try:
        nvmlInit()
        gpu = nvmlDeviceGetHandleByIndex(0)
        gpu_temp = nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU)
        return gpu_temp
    except NVMLError:
        return None
Code Example #15
def print_gpu_info(idx=0):
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(idx)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
    nvidia_smi.nvmlShutdown()
Code Example #16
def gpu_memory_tracker():
    """returns nvidia gpu memory consumed"""
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    used = info.used
    total = info.total
    percent = used / total * 100
    return percent
Code Example #17
File: training.py Project: tboen1/MESIGAN
def memory_check():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    # card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate

    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    mbs = mem_res.used / (1024**2)
    percent = mem_res.used / mem_res.total
    return mbs, percent
Code Example #18
 def get():
     handles = []
     output = []
     for device_id in range(nvidia_smi.nvmlDeviceGetCount()):
         handles.append(nvidia_smi.nvmlDeviceGetHandleByIndex(device_id))
     for handle in handles:
         res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
         output.append({'usage': res.gpu, 'memory': res.memory})
     return output
Code Example #19
 def Available_GPUs(self):
     available = []
     for i in range(self.total_gpus):
         handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
         res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
         mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
         if res.gpu < 30 and (mem_res.used / mem_res.total * 100) < 30:
             available.append(i)
     return available
Code Example #20
def Watch_fin():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(1)
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    time.sleep(1)
    if res.used == 0:
        return 0
    else:
        return 1
Code Example #21
File: monitor.py Project: r-raymond/batchflow
 def get_usage(gpu_list=None, **kwargs):
     """ Track GPU memory usage. """
     _ = kwargs
     gpu_list = gpu_list or [0]
     nvidia_smi.nvmlInit()
     handle = [nvidia_smi.nvmlDeviceGetHandleByIndex(i) for i in gpu_list]
     res = [nvidia_smi.nvmlDeviceGetMemoryInfo(item) for item in handle]
     res = [100 * item.used / item.total for item in res]
     nvidia_smi.nvmlShutdown()
     return res
Code Example #22
File: monitor.py Project: r-raymond/batchflow
 def get_usage(gpu_list=None, **kwargs):
     """ Track GPU memory utilization. """
     _ = kwargs
     gpu_list = gpu_list or [0]
     nvidia_smi.nvmlInit()
     handle = [nvidia_smi.nvmlDeviceGetHandleByIndex(i) for i in gpu_list]
     res = [
         nvidia_smi.nvmlDeviceGetUtilizationRates(item) for item in handle
     ]
     return [item.memory for item in res]
Code Example #23
def gpu_usage():
    nvidia_smi.nvmlInit()

    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    # card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate

    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    usage = info.used

    nvidia_smi.nvmlShutdown()
    return usage
Code Example #24
def available_GPUs(total_gpus):
    available_gpus = []
    for i in range(total_gpus):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        if res.gpu < 30 and (
                mem_res.used / mem_res.total * 100
        ) < 30:  # Jon heuristically defines what it means for a GPU to be available
            available_gpus.append(i)
    return available_gpus
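A possible way to use the heuristic above (illustrative, not part of the original file): pin the current process to the first idle GPU by setting CUDA_VISIBLE_DEVICES before the framework initializes CUDA.

import os
import nvidia_smi

nvidia_smi.nvmlInit()
idle = available_GPUs(nvidia_smi.nvmlDeviceGetCount())
nvidia_smi.nvmlShutdown()
if idle:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(idle[0])  # must happen before CUDA is initialized
else:
    print('No GPU below the 30% usage threshold; falling back to CPU.')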
Code Example #25
def use_gpu():
    nvidia_smi.nvmlInit()

    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    nvidia_smi.nvmlShutdown()

    if info.used > 1000000000:  # more than ~1 GB already in use
        return True
    else:
        return False
Code Example #26
def get_max_data_group_size():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    total_memory = info.total
    if total_memory >= 12 * (10 ** 9):
        return 2 ** 12
    elif total_memory >= 6 * (10 ** 9):
        return 2 ** 11
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        return 2 ** 12
Code Example #27
File: __main__.py Project: dpanici/DESC
def get_device(gpuID=False):
    """Checks available GPUs and selects the one with the most available memory

    Parameters
    ----------
    gpuID: bool or int
        whether to use GPU, or the device ID of a specific GPU to use. If False,
        use only CPU. If True, attempts to find the GPU with most available memory.

    Returns
    -------
    device : jax.device
        handle to gpu or cpu device selected

    """

    import jax

    if gpuID is False:
        return jax.devices('cpu')[0]

    try:
        gpus = jax.devices('gpu')
        # did the user request a specific GPU?
        if isinstance(gpuID, int) and gpuID < len(gpus):
            return gpus[gpuID]
        if isinstance(gpuID, int):
            from desc.backend import TextColors
            # ID was not valid
            warnings.warn(
                TextColors.WARNING +
                'gpuID did not match any found devices, trying default gpu option'
                + TextColors.ENDC)
        # find all available options and see which has the most space
        import nvidia_smi
        nvidia_smi.nvmlInit()
        maxmem = 0
        gpu = gpus[0]
        for i in range(len(gpus)):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            if info.free > maxmem:
                maxmem = info.free
                gpu = gpus[i]

        nvidia_smi.nvmlShutdown()
        return gpu

    except Exception:
        from desc.backend import TextColors
        warnings.warn(TextColors.WARNING +
                      'No GPU found, falling back to CPU' + TextColors.ENDC)
        return jax.devices('cpu')[0]
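An illustrative way to consume get_device (not from the original __main__.py): place an array on the selected JAX device.

import jax
import jax.numpy as jnp

device = get_device(gpuID=True)                    # GPU with the most free memory, if any
x = jax.device_put(jnp.ones((1024, 1024)), device)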
Code Example #28
    def on_train_batch_begin(self, batch, logs=None):

        nvidia_smi.nvmlInit()
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
        # card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate

        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        res1 = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        #GPUs = GPU.getGPUs()
        #gpu = GPUs[0]

        print(f'gpu: {res.gpu}%, gpu-mem: {res.memory}%')
Code Example #29
def check_cuda_memory():
    nvidia_smi.nvmlInit()

    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        print("Device {}: {}, Memory : ({:.2f}% free): {}(total), {} (free), {} (used)"\
              .format(i, nvidia_smi.nvmlDeviceGetName(handle), 100*info.free/info.total, \
                      info.total, info.free, info.used))
    nvidia_smi.nvmlShutdown()
    return
Code Example #30
File: get_gpu_info.py Project: Rowing0914/planet
def get_gpu_info(gpu_id=None):
    """ Get gpu-info regarding gpu_id
    :param gpu_id: gpu bus id
    :return mem_used: used memory in MiB
    :return mem_total: total memory in MiB
    :return gpu_id: id of the gpu that was queried
    """
    if gpu_id is None:
        gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(int(gpu_id))
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    mem_used = mem_res.used / (1024**2)
    mem_total = mem_res.total / (1024**2)
    return mem_used, mem_total, gpu_id
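The snippets above repeat the same NVML lifecycle, though several of them never call nvmlShutdown. A minimal consolidated sketch (the function name query_memory is illustrative) that pairs init and shutdown so repeated calls do not leak the NVML context:

import nvidia_smi

def query_memory(device_index=0):
    """Return (used, free, total) VRAM in bytes for one device."""
    nvidia_smi.nvmlInit()
    try:
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_index)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        return info.used, info.free, info.total
    finally:
        nvidia_smi.nvmlShutdown()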