Example #1
0
def entry_func(args=None):
    """
    Parse the command-line arguments and execute run() under a GPUMonitor.

    Args:
        args: Optional arguments forwarded unchanged to the parser's
              parse_args method (default None).

    Raises:
        Whatever run() raises is re-raised unchanged after the GPUMonitor
        has been stopped.
    """
    # Get the script to execute, parse only first input
    parser = get_argparser()
    args = parser.parse_args(args)

    # The run is wrapped in a try/except block so that the GPUMonitor is
    # stopped whenever run() fails.
    # NOTE(review): on success nothing is stopped here - presumably run()
    # stops the monitor itself; confirm against its implementation.
    from mpunet.utils.system import GPUMonitor
    gpu_mon = GPUMonitor()
    try:
        run(args=args, gpu_mon=gpu_mon)
    except BaseException:
        # BaseException (rather than Exception) so that KeyboardInterrupt
        # and SystemExit also stop the monitor before propagating.
        gpu_mon.stop()
        # Bare raise re-raises the active exception with its traceback
        raise
Example #2
0
def set_gpu_vis(num_GPUs, force_GPU, logger=None):
    """
    Set the GPU visibility according to the parsed arguments.

    Args:
        num_GPUs:  Number of GPUs requested when GPUs are selected
                   automatically
        force_GPU: If truthy, forwarded to set_gpu and no automatic
                   selection takes place
        logger:    Optional logger handed to the GPUMonitor
    """
    if not force_GPU:
        # No GPU was forced - automatically determine the GPUs to use
        from mpunet.utils.system import GPUMonitor
        monitor = GPUMonitor(logger)
        monitor.await_and_set_free_GPU(num_GPUs, stop_after=True)
        return

    # An explicit GPU selection was given, apply it directly
    from mpunet.utils.system import set_gpu
    set_gpu(force_GPU)
Example #3
0
def get_free_GPU_sets(num_GPUs, ignore_gpus=None):
    """
    Query the currently free GPUs and group them into sets of 'num_GPUs'.

    Args:
        num_GPUs:    Number of GPUs per set
        ignore_gpus: Optional GPU specification, parsed by
                     _gpu_string_to_list; the listed GPUs are excluded from
                     the free GPUs. None/empty ignores nothing.

    Returns:
        The output of _get_GPU_sets(free_gpus, num_GPUs)

    Raises:
        ValueError:          If fewer usable GPUs than 'num_GPUs' remain
                             (including none at all)
        NotImplementedError: If at least 'num_GPUs' GPUs remain, but their
                             count is not a multiple of 'num_GPUs'
    """
    from mpunet.utils.system import GPUMonitor
    mon = GPUMonitor()
    try:
        ignore_gpus = _gpu_string_to_list(ignore_gpus or "", as_int=True)
        free_gpus = sorted(mon.free_GPUs, key=int)
    finally:
        # Always stop the monitor, also if parsing or the query above fails
        mon.stop()

    # NOTE(review): this membership test only excludes GPUs if the entries
    # of mon.free_GPUs and of ignore_gpus compare equal (e.g. both ints);
    # confirm the element type returned by GPUMonitor.free_GPUs.
    free_gpus = [gpu for gpu in free_gpus if gpu not in ignore_gpus]
    total_GPUs = len(free_gpus)

    # The modulo is evaluated first, as in the original condition
    if total_GPUs % num_GPUs == 0 and free_gpus:
        return _get_GPU_sets(free_gpus, num_GPUs)

    if total_GPUs < num_GPUs:
        raise ValueError(
            "Invalid number of GPUs per process '%i' for total "
            "GPU count of '%i' - must be evenly divisible." %
            (num_GPUs, total_GPUs))
    # Enough GPUs, but they cannot be split evenly into sets
    raise NotImplementedError
Example #4
0
def get_gpu_monitor(num_GPUs, logger):
    """
    Create a GPUMonitor when a non-negative number of GPUs is requested.

    Args:
        num_GPUs: Number of GPUs to train on
        logger:   A mpunet logger object that is handed to the GPUMonitor

    Returns:
        A GPUMonitor object if num_GPUs >= 0, otherwise None
    """
    if num_GPUs >= 0:
        # Create the GPUMonitor (separate fork) early, before memory builds up
        from mpunet.utils.system import GPUMonitor
        return GPUMonitor(logger)
    return None