def test_multi_gpu():
    from renom.cuda import cuGetDeviceCount

    class NN2(rm.Model):
        def __init__(self):
            super(NN2, self).__init__()
            self.layer1 = rm.Dense(output_size=2)
            self.layer2 = rm.Dense(output_size=2)

        def forward(self, x):
            return self.layer2(rm.relu(self.layer1(x)))

        def weight_initiallize(self, input_size):
            self.layer1.weight_initiallize(input_size)
            self.layer2.weight_initiallize(input_size)

    nn = NN2()
    nn.set_gpu(0)
    nn.weight_initiallize((2,))

    nn2 = NN2()
    nn2.set_gpu(cuGetDeviceCount() - 1)

    for i in range(2):
        nn2.copy_params(nn)
        x = np.random.rand(100, 2)

        with nn.train():
            ret1 = nn(x[:50])
            with use_device(nn.device_id):
                loss1 = rm.softmax_cross_entropy(ret1, np.random.rand(50, 2))

        with nn2.train():
            ret2 = nn2(x[50:])
            with use_device(nn2.device_id):
                loss2 = rm.softmax_cross_entropy(ret2, np.random.rand(50, 2))

        nn.sync()
        nn2.sync()

        grad1 = loss1.grad()
        with use_device(nn2.device_id):
            grad2 = loss2.grad()
            grad2.get(nn2.layer1.params.w)

        org_l1_w = grad1.get(nn.layer1.params.w)
        nn.join_grads(grad1, [(nn2, grad2)])

        assert np.allclose(
            grad1.get(nn.layer1.params.w),
            org_l1_w + grad2.get(nn2.layer1.params.w).copy())

        grad1.update(models=[nn])
def __itruediv__(self, other):
    with use_device(self.device_id):
        # Compare against the other operand's shape (the original compared
        # self against itself, which was a no-op).
        assert getattr(self, "shape", (1,)) == getattr(other, "shape", (1,))
        new_shape = calc_broadcast_shape(self, other)
        ret = GPUValue(shape=new_shape)
        cudiv(self, other, ret)
        return ret
def __call__(self, x, *args, **kwargs):
    with use_device(self._device_id):
        if not self.params:
            assert len(x.shape) > 1, "Input must have at least 2 dimensions."
            self.weight_initiallize(x.shape[1:])
        return super(Parametrized, self).__call__(x, *args, **kwargs)
def __add__(self, other):
    with use_device(self.device_id):
        new_shape = calc_broadcast_shape(self, other)
        ret = GPUValue(shape=new_shape)
        # Only data type float32 is acceptable.
        cuadd(self, other, ret)
        return ret
def join_grads(self, grads, others):
    """Merge gradients of other models.

    ``others`` is a list of (model, grads) tuples to be merged.
    Models listed in ``others`` should have the same structure as ``self``.
    """
    values = {name: params for name, params, attrs in self.flatten_values()}
    for model, _grads in others:
        o = model._get_grads(_grads)
        for (name, attrname), diff in o.items():
            obj = values[name][attrname]
            curdiff = grads.get(obj, None)
            if curdiff is not None:
                if not isinstance(curdiff, Node):
                    curdiff = Node(curdiff)
                if not isinstance(diff, Node):
                    diff = Node(diff)
                with use_device(curdiff.device_id):
                    # If the two gradients live on different devices, copy the
                    # incoming one onto curdiff's device before accumulating.
                    if GPUValue is not None and diff.device_id != curdiff.device_id:
                        g = GPUValue(shape=diff.shape)
                        g.copy_from(diff.get_gpu())
                        diff = Node(g)
                    newdiff = curdiff + diff
                    grads.set(obj, newdiff)
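# Hedged usage sketch for join_grads: accumulate a replica's gradients into the
# primary model's gradients and apply a single update. The helper name, data
# split, and loss choice are illustrative assumptions; the call pattern mirrors
# the multi-GPU test above (each model is assumed to be pinned to its own GPU
# via set_gpu and to share the primary's structure).
def _example_join_grads(primary, replica, x, y):
    with primary.train():
        loss_a = rm.softmax_cross_entropy(primary(x[:50]), y[:50])
    with replica.train():
        loss_b = rm.softmax_cross_entropy(replica(x[50:]), y[50:])

    grads_a = loss_a.grad()
    with use_device(replica.device_id):
        grads_b = loss_b.grad()

    # Merge the replica's gradients into the primary's gradient object, update
    # only the primary, then refresh the replica's weights for the next step.
    primary.join_grads(grads_a, [(replica, grads_b)])
    grads_a.update(models=[primary])
    replica.copy_params(primary)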
def copy(self):
    if cuGetDevice() == self.device_id:
        ret = GPUValue(shape=self.shape)
        self._ptr.memcpyD2D(ret._ptr, self.nbytes)
    else:
        with use_device(self.device_id):
            arr = self.new_array()
        ret = GPUValue(arr)
    return ret
def _oper_pow(self, other):
    if not isinstance(self, GPUValue):
        # Delegate to the other operand; __rpow__ takes no modulo argument here
        # (the original passed an undefined name).
        return other.__rpow__(self)
    with use_device(self.device_id):
        new_shape = calc_broadcast_shape(self, other)
        ret = GPUValue(shape=new_shape)
        cupow(self, other, ret)
        return ret
def __call__(self):
    set_cuda_active(True)
    with self.gpu_resource:
        self._gpu = self.gpus.pop()
        try:
            with use_device(self._gpu):
                return self._exec()
        finally:
            self.gpus.add(self._gpu)
def __truediv__(self, other):
    if not isinstance(self, GPUValue):
        return other.__rtruediv__(self)
    with use_device(self.device_id):
        new_shape = calc_broadcast_shape(self, other)
        ret = GPUValue(shape=new_shape)
        cudiv(self, other, ret)
        return ret
def sync(self):
    if is_cuda_active():
        done = set()
        for m in self.iter_models():
            device_id = m._device_id
            if device_id not in done:
                done.add(device_id)
                with use_device(m._device_id):
                    renom.cuda.cuDeviceSynchronize()
def run(self, f, *args, **kwargs):
    with self.gpu_resource:
        self.active_gpu.id = self.gpus.pop()
        try:
            set_cuda_active(True)
            with use_device(self.active_gpu.id):
                return f(*args, **kwargs)
        finally:
            self.gpus.add(self.active_gpu.id)
            release_mem_pool()
def __getitem__(self, indexes):
    with use_device(self.device_id):
        slices, result_shapes, dest_shapes = build_shapes(self, indexes)
        dest_size = calc_int_prod(dest_shapes)
        ret = cu_get_item(self, self.size, dest_size, slices)
        ret.shape = tuple(result_shapes)
        return ret
def T(self):
    with use_device(self.device_id):
        n = len(self.shape)
        assert n < 3
        clone = self.zeros_like_me()
        if n == 2:
            new_shape = list(clone.shape)
            with cublas.cublas_handler() as cublas_handle:
                cublas.cublas_transpose(cublas_handle, self, clone)
            new_shape[0] = clone.shape[1]
            new_shape[1] = clone.shape[0]
            clone.shape = tuple(new_shape)
        return clone
def to_gpu(self, value):
    if value.dtype is not self.dtype:
        value = value.astype(self.dtype)
    assert value.shape == self.shape, "{} {}".format(value.shape, self.shape)
    if not self._ptr:
        self.nbytes = value.nbytes
        self.alloc()
    # todo: value.flatten() copies buffer
    with use_device(self.device_id):
        self._ptr.memcpyH2D(value.ravel(), value.nbytes)
def test_copy_from_another_gpu():
    set_cuda_active(True)
    src = Variable(rand((100,)))
    src.to_gpu()
    with use_device(1):
        dest = Variable(rand((100,)))
        dest.to_gpu()
        dest.copy_from(src)
    close(src, dest)
    close(src._gpu.new_array(), dest._gpu.new_array())
def __call__(self, x, *args, **kwargs):
    with use_device(self._device_id):
        if self._model_hook:
            x, args, kwargs = self._model_hook.call_enter(self, x, args, kwargs)
        if not self._model_hook:
            ret = self.forward(x, *args, **kwargs)
        else:
            ret = self._model_hook.on_forward(self, self.forward, x, args, kwargs)
        if self._model_hook:
            ret = self._model_hook.call_leave(self, ret, x, args, kwargs)
        return ret
def copy_params(self, model):
    value_list = model.flatten_values()
    with use_device(self._device_id):
        for names, values, attrs in value_list:
            layer = self
            for name in names[1:]:
                layer = getattr(layer, name)
            for k, v in values.items():
                if k in layer.params:
                    layer.params[k].copy_from(v)
                else:
                    layer.params[k] = v.copy()
                    layer.params[k]._auto_update = v._auto_update
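# Hedged sketch of copy_params in a multi-GPU setup: a freshly constructed
# replica pinned to another device pulls its weights from the primary model.
# The helper name, model class, and device indices are illustrative assumptions
# taken from the test above; the primary's parameters are assumed to already
# be initialized before copying.
def _example_copy_params(model_cls, n_devices):
    primary = model_cls()
    primary.set_gpu(0)

    replica = model_cls()
    replica.set_gpu(n_devices - 1)

    # copy_params walks flatten_values() of the source model and copies
    # (or clones) each parameter onto the replica's device.
    replica.copy_params(primary)
    return primary, replica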
def __setitem__(self, indexes, value):
    with use_device(self.device_id):
        value = get_gpu(value)
        slices, result_shapes, dest_shapes = build_shapes(self, indexes)
        if calc_int_prod(result_shapes) == 0:
            return
        dest_strides = calc_strides(dest_shapes)
        mask, broadcasted = _build_broadcast_mask(dest_shapes, value.shape)
        broadcasted_strides = calc_strides(broadcasted)
        broadcasted_strides = [m * b for m, b in zip(mask, broadcasted_strides)]
        valuesize = calc_int_prod(dest_shapes)
        cu_set_item(value, valuesize, self, slices, dest_strides, broadcasted_strides)
def __idiv__(self, other):
    with use_device(self.device_id):
        return self.__itruediv__(other)
def train(self, train_distributor, test_distributor=None):
    """Train method.

    This method executes the train loop. If ``test_distributor`` is given,
    the validation loss will also be calculated.

    Args:
        train_distributor (Distributor): Distributor yielding train data.
        test_distributor (Distributor): Distributor yielding test data.
    """
    self.epoch = 0
    self.train_distributor = train_distributor
    self.test_distributor = test_distributor
    self.on_event('start')
    self.train_loss_list = []
    self.test_loss_list = []

    models = [self.model]
    if self.num_gpu > 1:
        models.extend([self.model.__class__() for _ in range(self.num_gpu - 1)])
        for n in range(self.num_gpu):
            models[n].set_gpu(n)

    while self.epoch < self.num_epoch:
        self.on_event('start_epoch')
        self.nth = 0
        self.avg_train_loss = 0

        for iteration, (data, target) in enumerate(
                self.train_distributor.batch(self.batch_size, self.shuffle)):

            # Split the batch evenly across the model replicas.
            datalen = len(data) // len(models)
            self.data = [data[i:i + datalen]
                         for i in range(0, datalen * len(models), datalen)]
            if is_cuda_active():
                self.data = [Node(d) for d in self.data]
                for n, d in enumerate(self.data):
                    with use_device(n):
                        d.to_gpu()

            targetlen = len(target) // len(models)
            self.targets = [target[i:i + targetlen]
                            for i in range(0, targetlen * len(models), targetlen)]
            if is_cuda_active():
                self.targets = [Node(d) for d in self.targets]
                for n, d in enumerate(self.targets):
                    with use_device(n):
                        d.to_gpu()

            for gpu in range(1, self.num_gpu):
                models[gpu].copy_params(models[0])

            for gpu in range(0, self.num_gpu):
                models[gpu].set_models(inference=False)

            self.on_event('forward')
            self.outputs = []
            for gpu in range(self.num_gpu):
                model = models[gpu]
                with model.train():
                    self.outputs.append(model(self.data[gpu]))

            self.on_event('loss')
            self.losses = []
            for gpu in range(self.num_gpu):
                model = models[gpu]
                with use_device(gpu):
                    self.losses.append(
                        self.loss_func(self.outputs[gpu], self.targets[gpu]))

            self.avg_train_loss += (self.losses[0] - self.avg_train_loss) / (iteration + 1)

            self.on_event('backward')
            self.grads = []
            for gpu in range(self.num_gpu):
                model = models[gpu]
                with use_device(gpu):
                    self.grads.append(self.losses[gpu].grad())

            self.on_event('grad')

            if self.num_gpu > 1:
                models[0].join_grads(self.grads[0], zip(models[1:], self.grads[1:]))

            self.grads[0].update(self.optimizer)
            self.on_event('updated')
            self.nth += 1

        self.on_event('end_epoch')
        self.epoch += 1

    # release objects
    self.data = self.targets = None
    self.outputs = self.losses = self.grads = None
    self.avg_train_loss = None
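# Hedged usage sketch for the train loop above. The Trainer constructor is not
# shown in this section, so the keyword arguments below are assumptions based
# on the attributes the loop reads (model, loss_func, optimizer, batch_size,
# num_epoch, shuffle, num_gpu); only train()'s own signature is taken from the
# source. _example_train and its arguments are hypothetical names.
def _example_train(trainer_cls, model, train_dist, valid_dist):
    trainer = trainer_cls(
        model=model,                         # assumed keyword
        loss_func=rm.softmax_cross_entropy,  # assumed keyword
        optimizer=rm.Sgd(),                  # assumed keyword; any renom optimizer
        batch_size=64,                       # assumed keyword
        num_epoch=10,                        # assumed keyword
        shuffle=True,                        # assumed keyword
        num_gpu=2,                           # assumed keyword; >1 enables data parallelism
    )
    # train() splits each batch across num_gpu replicas, joins their gradients
    # with join_grads, and applies a single optimizer update per iteration.
    trainer.train(train_dist, test_distributor=valid_dist)
    return trainer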
def __sub__(self, other):
    with use_device(self.device_id):
        new_shape = calc_broadcast_shape(self, other)
        ret = GPUValue(shape=new_shape)
        cusub(self, other, ret)
        return ret
def __call__(self, x, *args, **kwargs):
    with use_device(self._device_id):
        if not self.params:
            self.weight_initiallize(x.shape[1:])
        return super(Parametrized, self).__call__(x, *args, **kwargs)
def __call__(self, *args, **kwargs):
    with use_device(self._device_id):
        return self.forward(*args, **kwargs)
def __rpow__(self, other, modulo=None):
    # modulo defaults to None; Python invokes __rpow__ with two arguments.
    with use_device(self.device_id):
        new_shape = calc_broadcast_shape(self, other)
        ret = GPUValue(shape=new_shape)
        curpow(self, other, ret)
        return ret
def __div__(self, other):
    if not isinstance(self, GPUValue):
        return other.__rdiv__(self)
    with use_device(self.device_id):
        return self.__truediv__(other)
def __isub__(self, other):
    with use_device(self.device_id):
        # In-place subtraction via axpy requires matching shapes; compare
        # against the other operand (the original compared self with itself).
        assert getattr(self, "shape", (1,)) == getattr(other, "shape", (1,))
        cublas.cublas_axpy(-get_gpu(other), get_gpu(self))
        return self
def __rmul__(self, other):
    with use_device(self.device_id):
        return self.__mul__(other)
def __call__(self, x, *args, **kwargs): with use_device(self._device_id): x = self.mark_enter(x) ret = self.forward(x, *args, **kwargs) return self.mark_leave(ret)
def __call__(self, x):
    with use_device(self._device_id):
        return self.forward(x)