Example #1
def check_devices():
    for i in range(device_count()):
        print("Found device {}:".format(i), get_device_name(i))
    if device_count() == 0:
        print("No GPU device found")
    else:
        print("Current cuda device is", get_device_name(current_device()))
Example #2
    def _gpu_descriptor(self, gpu):
        if gpu == 'auto':
            if cuda.device_count() == 0:
                return False
            if self.supports_multiple_gpus():
                return list(range(cuda.device_count()))
            return True
        return gpu
Example #3
    def machine_params(cls, mode="train", **kwargs) -> MachineParams:
        """Return the number of processes and gpu_ids to use with training."""
        num_gpus = cuda.device_count()
        has_gpu = num_gpus != 0

        sampler_devices = None
        if mode == "train":
            nprocesses = cls.num_train_processes() if torch.cuda.is_available() else 1
            devices = (list(range(min(nprocesses, num_gpus)))
                       if has_gpu else [torch.device("cpu")])
        elif mode == "valid":
            devices = [num_gpus - 1] if has_gpu else [torch.device("cpu")]
            nprocesses = 2 if has_gpu else 0
        else:
            nprocesses = 20 if has_gpu else 1
            devices = (list(range(min(nprocesses, num_gpus)))
                       if has_gpu else [torch.device("cpu")])

        nprocesses = split_processes_onto_devices(nprocesses=nprocesses,
                                                  ndevices=len(devices))

        return MachineParams(
            nprocesses=nprocesses,
            devices=devices,
            sampler_devices=sampler_devices,
            sensor_preprocessor_graph=cls.resnet_preprocessor_graph(
                mode=mode) if cls.USE_RESNET_CNN else None,
        )
Example #4
def main():
    os.chdir(os.path.dirname(__file__))
    args = get_arguments()
    constr_weight = get_constraint(args.weight_bits, 'weight')
    constr_activation = get_constraint(args.activation_bits, 'activation')
    if args.dataset == 'cifar10':
        network = resnet20
        dataloader = dataloader_cifar
    else:
        if args.network == 'resnet18':
            network = resnet18
        elif args.network == 'resnet50':
            network = resnet50
        else:
            print('Unsupported network type: %s' % args.network)
            return
        dataloader = dataloader_imagenet
    train_loader = dataloader(args.data_root,
                              split='train',
                              batch_size=args.batch_size)
    test_loader = dataloader(args.data_root,
                             split='test',
                             batch_size=args.batch_size)
    net = network(quan_first_last=args.quan_first_last,
                  constr_activation=constr_activation,
                  preactivation=args.preactivation)

    model_path = os.path.join(args.model_root, args.model_name + '.pth')
    name_weights_old = torch.load(model_path)
    name_weights_new = net.state_dict()
    name_weights_new.update(name_weights_old)
    net.load_state_dict(name_weights_new)
    add_lsqmodule(net, constr_weight)
    print(net)
    net = net.cuda()
    net = nn.DataParallel(net, device_ids=range(cuda.device_count()))

    quan_activation = isinstance(constr_activation, np.ndarray)
    postfix = '_w' if not quan_activation else '_a'
    new_model_name = args.prefix + args.model_name + '_lsq' + postfix
    cache_root = os.path.join('.', 'cache')
    train_loger = LogHelper(new_model_name, cache_root, quan_activation,
                            args.resume)
    optimizer, lr_scheduler = get_optimizer(net=net,
                                            optimizer=args.optimizer,
                                            lr_base=args.learning_rate,
                                            weight_decay=args.weight_decay,
                                            lr_scheduler=args.lr_scheduler,
                                            total_epoch=args.total_epoch,
                                            quan_activation=quan_activation)
    trainer = Trainer(net=net,
                      train_loader=train_loader,
                      test_loader=test_loader,
                      optimizer=optimizer,
                      lr_scheduler=lr_scheduler,
                      model_name=new_model_name,
                      train_loger=train_loger)
    trainer(total_epoch=args.total_epoch,
            save_check_point=True,
            resume=args.resume)
Example #5
    def get_gpu_mem(self):
        if cutorch.is_available():
            return sum([
                cutorch.memory_cached(i) for i in range(cutorch.device_count())
            ])
        else:
            return 0
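Note that memory_cached was deprecated in favor of memory_reserved in newer PyTorch releases; a hedged equivalent for current versions (same semantics, different name):

import torch.cuda as cutorch

def get_gpu_mem():
    # Total bytes reserved by the CUDA caching allocator across all devices.
    if cutorch.is_available():
        return sum(cutorch.memory_reserved(i) for i in range(cutorch.device_count()))
    return 0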
Example #6
def main(config_path):
    path_to_config = Path(config_path)

    if not path_to_config.exists():
        raise ValueError('{} doesn\'t exist'.format(path_to_config))
    elif path_to_config.suffix.lower() != '.json' or not path_to_config.is_file():
        raise ValueError('{} is not a .json config file'.format(path_to_config))

    model_configs = load_json(path_to_config)

    path_to_data = Path(model_configs['path_to_data'])
    train_model = model_configs['train_model']
    workers_num = model_configs['workers_num']
    batch_size = model_configs['batch_size']
    data_loaders = get_data_loaders(path_to_data, batch_size, workers_num,
                                    train_model)

    model = DeepLabV3Plus(model_configs['output_classes'])

    device = 'cpu'
    device_count = 0
    if cuda.is_available() and model_configs['cuda_usage']:
        device = 'cuda'
        device_count = cuda.device_count()

    if device != 'cpu' and device_count > 1:
        model = nn.DataParallel(model).cuda()
    elif device != 'cpu':
        model = model.cuda()

    criterion = None
    metric = None
    optimizer = optim.SGD(model.parameters(),
                          lr=model_configs['learning_rate'],
                          momentum=0.9)

    info_paths = model_configs['info_paths']

    writer = SummaryWriter(log_dir=info_paths['log_dir'])
    total_epochs = model_configs['epochs']

    for epoch in range(total_epochs):
        model.train()
        train(model,
              data_loaders['train'],
              epoch,
              optimizer,
              criterion,
              metric,
              writer,
              device=device)
        model.eval()
        val(model,
            criterion,
            metric,
            data_loaders['val'],
            epoch,
            writer,
            device=device)
Example #7
def check_for_gpu(params) -> object:
    device_id = params['cuda_device']
    if device_id is not None and device_id >= cuda.device_count():
        raise ConfigurationError(
            "Experiment specified a GPU but none is available;"
            " if you want to run on CPU use the override"
            " 'trainer.cuda_device=-1' in the json config file.")
Example #8
    def __init__(self):
        self.numpy_version = numpy.__version__
        self.platform_version = platform.platform()
        if cuda.is_available():
            self.cuda_info = cuda.device_count()
        else:
            self.cuda_info = None
Example #9
    def __init__(
        self,
        name: str,
        model: Model,
        optimizer: Optimizer,
        cuda_device: int,
        grad_norm: Optional[float] = None,
        scaler: Optional[amp.GradScaler] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None,
        momentum_scheduler: Optional[MomentumScheduler] = None
    ) -> "ComponentOptimizer":

        self.name = name
        self.model = model
        self._optimizer = optimizer

        if cuda_device is None:
            from torch import cuda

            if cuda.device_count() > 0:
                cuda_device = 0
            else:
                cuda_device = -1

        check_for_gpu(cuda_device)
        self._cuda_device = int_to_device(cuda_device)
        self._grad_norm = grad_norm
        self._scaler = scaler
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._loss = {'train': ComponentLoss(), 'validation': ComponentLoss()}
Example #10
    def system_info(self):
        uname = platform.uname()
        gpus = [cuda.get_device_name(i) for i in range(cuda.device_count())]

        self.update({
            'python': platform.python_version(),
            'machine': uname.machine,
            'processor': uname.processor,
            'os': os.name,
            'os_name': platform.system(),
            'os_ver': platform.release(),
            'memory': str(psutil.virtual_memory().total // 2**30) + ' GB',
            'storage': str(psutil.disk_usage('/').total // 2**30) + ' GB',
            'user': pwd.getpwuid(os.getuid())[0],
            'gpus': gpus,
            'timestamp': datetime.now().strftime('%f-%S-%M-%H-%d-%m-%Y')
        })
Example #11
def get_info():
    return {
        "has_cuda": cuda.is_available(),
        "devices": ([cuda.get_device_name(i) for i in range(cuda.device_count())]
                    if cuda.is_available() else []),
    }
Example #12
def check_for_gpu(device_id: Union[int, List[int]]):
    if isinstance(device_id, list):
        for did in device_id:
            check_for_gpu(did)
    elif device_id is not None and device_id >= 0:
        num_devices_available = cuda.device_count()
        if num_devices_available == 0:
            # Torch will give a more informative exception than ours, so we want to include
            # that context as well if it's available.  For example, if you try to run torch 1.5
            # on a machine with CUDA10.1 you'll get the following:
            #
            #     The NVIDIA driver on your system is too old (found version 10010).
            #
            torch_gpu_error = ""
            try:
                cuda._check_driver()
            except Exception as e:
                torch_gpu_error = "\n{0}".format(e)

            raise ConfigurationError(
                "Experiment specified a GPU but none is available;"
                " if you want to run on CPU use the override"
                " 'trainer.cuda_device=-1' in the json config file." +
                torch_gpu_error)
        elif device_id >= num_devices_available:
            raise ConfigurationError(
                f"Experiment specified GPU device {device_id}"
                f" but there are only {num_devices_available} devices "
                f" available.")
Example #13
    def __init__(self,
                 module,
                 device_ids=None,
                 output_device=None,
                 dim=0,
                 allow_dict=True,
                 allow_replication_callback=True,
                 user_scattered=False,
                 use_scatter_stream=True,
                 persistent=False,
                 copy_parameters=False,
                 copy_buffers=True):

        super(DataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])

        self.allow_dict = allow_dict
        self.allow_replication_callback = allow_replication_callback
        self.user_scattered = user_scattered
        self.use_scatter_stream = use_scatter_stream
        self.persistent = persistent
        self.copy_parameters = copy_parameters
        self.copy_buffers = copy_buffers
        self.replicas = nn.ModuleList()
Example #14
def get_num_nodes() -> int:
    """ Get the number of nodes. Note that this function assumes all nodes have the same number of processes.
    """

    if not is_distributed():
        return 1
    else:
        return get_world_size() // device_count()
Example #15
def control_gpu_count(train_on_gpu):
    # `f` is assumed to be an open log-file handle in the enclosing scope.
    multi_gpu = False
    if train_on_gpu:
        gpu_count = cuda.device_count()
        f.write(str(gpu_count) + " gpus detected.\n")
        multi_gpu = gpu_count > 1
    return multi_gpu
Example #16
def _get_stream(device):
    """Gets a background stream for copying between CPU and GPU"""
    global _streams
    if device == -1:
        return None
    if _streams is None:
        _streams = [None] * cuda.device_count()
    if _streams[device] is None:
        _streams[device] = cuda.Stream(device)
    return _streams[device]
Example #17
def _get_stream(device):
    """Gets a background stream for copying between CPU and GPU"""
    global _streams
    if device == -1:
        return None
    if _streams is None:
        _streams = [None] * cuda.device_count()
    if _streams[device] is None:
        _streams[device] = cuda.Stream(device)
    return _streams[device]
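In use, the returned stream is typically entered through the torch.cuda.stream context manager; a hedged usage sketch, assuming at least one visible GPU:

import torch

stream = _get_stream(0)
if stream is not None:
    with torch.cuda.stream(stream):
        # queue an asynchronous host-to-device copy on the background stream
        gpu_tensor = torch.ones(4).pin_memory().cuda(0, non_blocking=True)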
Example #18
    def wrapper(
        self: TorchModelBuilder[TModule, TOptimizer]
    ) -> Tuple[TModule, TOptimizer]:
        model, optimiser = construct_model_function(self)
        if cuda.is_available():
            model.cuda()
            model = DataParallel(model, device_ids=range(cuda.device_count()))
            backends.cudnn.benchmark = True
        return model, optimiser
Example #19
def get_min_used_gpu():
    import torch.cuda as cutorch
    device = 0
    min_used = 1e+10
    for i in range(cutorch.device_count()):
        if min_used > torch.cuda.memory_allocated(i):
            min_used = torch.cuda.memory_allocated(i)
            device = i
    return device
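The returned index can be fed straight to torch.cuda.set_device; a brief usage sketch:

import torch

torch.cuda.set_device(get_min_used_gpu())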
Example #20
def show_gpu_chooser(default=0, override=None):
    if override is not None:
        if override.isdecimal():
            idx = int(override)
            if cuda.is_available() and idx in range(cuda.device_count()):
                return "cuda:{}".format(idx)
        return override
    if not cuda.is_available():
        return "cpu"
    gpustat.new_query().print_formatted(no_color=True)
    idx = input("Choose GPU (default {}):".format(default))
    if idx == "cpu":
        return "cpu"
    if idx.isdecimal():
        idx = int(idx)
        if idx in range(cuda.device_count()):
            return "cuda:{}".format(idx)
    return "cuda:{}".format(default)
Example #21
    def __init__(self, gpu, config, title=None):
        self.config = config
        self.title = title

        if gpu is None:
            gpu = ([str(idx) for idx in range(cuda.device_count())]
                   if cuda.is_available() else None)

        self.gpu = gpu
Example #22
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(out_dir, include_workers="all", save_all=True)
    num_workers = device_count() or 1  # fall back to one worker without GPUs
    mode_args = list(SMDATAPARALLEL_PYTORCH_TEST_MNIST_ARGS)
    launch_smdataparallel_job(
        script_file_path=SMDATAPARALLEL_PYTORCH_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) > 25
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor(
        tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
Example #23
    def __str__(self):
        s = six.StringIO()
        s.write('''Platform: {}\n'''.format(self.platform_version))
        s.write('''NumPy: {}\n'''.format(self.numpy_version))
        if self.cuda_info is None:
            s.write('''CUDA: Not Available\n''')
        else:
            s.write('''CUDA: {}\n'''.format(cuda.device_count()))
        return s.getvalue()
Example #24
def get_pretrained_model(model_name):
    """Retrieve a pre-trained model from torchvision

    Params
    -------
        model_name (str): name of the model (currently only accepts vgg16 and resnet50)

    Return
    --------
        model (PyTorch model): cnn

    """
    n_classes = 102
    # Whether to train on gpu
    train_on_gpu = cuda.is_available()
    # Number of gpus
    if train_on_gpu:
        gpu_count = cuda.device_count()
        print(f'{gpu_count} gpus detected.')
        if gpu_count > 1:
            multi_gpu = True
        else:
            multi_gpu = False
    else:
        multi_gpu = False

    if model_name == 'vgg16':
        model = models.vgg16(pretrained=True)
        # Freeze early layers
        for param in model.parameters():
            param.requires_grad = False
        n_inputs = model.classifier[6].in_features
        # Add on classifier
        model.classifier[6] = nn.Sequential(
            nn.Linear(n_inputs, 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, n_classes), nn.LogSoftmax(dim=1))

    elif model_name == 'resnet50':
        model = models.resnet50(pretrained=True)

        for param in model.parameters():
            param.requires_grad = False

        n_inputs = model.fc.in_features
        model.fc = nn.Sequential(
            nn.Linear(n_inputs, 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, n_classes), nn.LogSoftmax(dim=1))

    # Move to gpu and parallelize
    if train_on_gpu:
        model = model.to('cuda')

    if multi_gpu:
        model = nn.DataParallel(model)

    return model
Example #25
def system_info():
    print(sys.version, "\n")
    print(f"PyTorch {torch.__version__} \n")
    print(f"Torch-vision {torchvision.__version__} \n")
    print("Available devices:")
    if cuda.is_available():
        for i in range(cuda.device_count()):
            print(f"{i}: {cuda.get_device_name(i)}")
    else:
        print("CPUs only, no GPUs found")
Example #26
def __get_min_used_gpu(k):
    import torch.cuda as cutorch
    device = []
    for i in range(cutorch.device_count()):
        device.append(torch.cuda.memory_allocated(i))
    print(device)
    _, top_device = torch.topk(torch.tensor(device), k, largest=False)
    top_device = list(top_device)
    return top_device
Example #27
def system_info():
    print(sys.version, "\n")
    print("PyTorch {}".format(torch.__version__), "\n")
    print("Torch-vision {}".format(torchvision.__version__), "\n")
    print("Available devices:")
    if cuda.is_available():
        for i in range(cuda.device_count()):
            print("{}: {}".format(i, cuda.get_device_name(i)))
    else:
        print("CPUs")
Example #28
def parallelize(model):
    """
	Wrap pytorch model in layer to run on multiple GPUs
	"""
    import torch.cuda as cuda
    import torch.nn as nn

    device_ids = [i for i in range(cuda.device_count())]
    model = nn.DataParallel(model, device_ids=device_ids)
    return model
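A short usage sketch for parallelize (the nn.Linear toy model here is purely illustrative):

import torch
import torch.cuda as cuda
import torch.nn as nn

model = nn.Linear(16, 4)  # toy model
if cuda.is_available():
    model = parallelize(model.cuda())
    out = model(torch.randn(8, 16))  # DataParallel scatters the batch over all GPUs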
Example #29
def get_free_gpus(bytes_needed=0):
    free_gpus = dict()
    gpu_stats = gpustat.new_query()
    for i in range(cuda.device_count()):
        bytes_free = 2**20 * (gpu_stats[i]["memory.total"] -
                              gpu_stats[i]["memory.used"])
        if bytes_free > bytes_needed:
            free_gpus[i] = bytes_free
    free_gpus = dict(
        sorted(free_gpus.items(), key=lambda gpu: gpu[1], reverse=True))
    return list(free_gpus.keys())
Example #30
def check_for_gpu(device_id: int):
    if device_id is not None and device_id >= 0:
        num_devices_available = cuda.device_count()
        if num_devices_available == 0:
            raise ConfigurationError("Experiment specified a GPU but none are available;"
                                     " if you want to run on CPU use the override"
                                     " 'trainer.cuda_device=-1' in the json config file.")
        elif device_id >= num_devices_available:
            raise ConfigurationError(f"Experiment specified GPU device {device_id}"
                                     f" but there are only {num_devices_available} devices "
                                     f" available.")
Example #31
def check_for_gpu(device_id: int):
    if device_id is not None and device_id >= 0:
        num_devices_available = cuda.device_count()
        if num_devices_available == 0:
            raise ConfigurationError("Experiment specified a GPU but none are available;"
                                     " if you want to run on CPU use the override"
                                     " 'trainer.cuda_device=-1' in the json config file.")
        elif device_id >= num_devices_available:
            raise ConfigurationError(f"Experiment specified GPU device {device_id}"
                                     f" but there are only {num_devices_available} devices "
                                     f" available.")
Example #32
    def set_gpu(self, args):
        # args.gpuid = ""  # TODO disable cuda
        if args.gpuid[0] == -1:
            self.use_cuda = False
        else:
            torch.cuda.set_device(args.gpuid[0])
            self.use_cuda = True
        # self.use_cuda = (len(args.gpuid) >= 1)

        print("{0} GPU(s) are available".format(cuda.device_count()))
        print("Using GPU {}".format(args.gpuid[0]))
Example #33
def check_for_gpu(device_id: int):
    if device_id is not None and device_id >= cuda.device_count():
        raise ConfigurationError("Experiment specified a GPU but none is available;"
                                 " if you want to run on CPU use the override"
                                 " 'trainer.cuda_device=-1' in the json config file.")