def test_sgd_update():
    net = nn.layers.LinearLayer(100, 10)
    learning_rate = 1
    optimizer = sgd_optimizer.SGDOptimizer(net.parameters(), learning_rate)
    data = np.random.random((20, 100)).astype(np.float32) * 2 - 1
    initial_weight = net.weight.data.copy()
    initial_bias = net.bias.data.copy()

    # Mirror our parameters into an equivalent PyTorch module.
    torch_net = TorchNet()
    with torch.no_grad():
        torch_net.layer.weight[:] = utils.from_numpy(net.weight.data.T)
        torch_net.layer.bias[:] = utils.from_numpy(net.bias.data)
    torch_optimizer = torch.optim.SGD(torch_net.parameters(), learning_rate)

    # Forward and backward through our implementation.
    optimizer.zero_grad()
    out = net(data)
    loss = out.sum()
    net.backward(np.ones_like(out))

    # Forward and backward through PyTorch, checking intermediate values.
    torch_optimizer.zero_grad()
    torch_out = torch_net(utils.from_numpy(data))
    assert np.allclose(out, utils.to_numpy(torch_out.clone().detach()), atol=0.001)
    torch_loss = torch_out.sum()
    assert np.allclose(loss, torch_loss.item(), atol=0.001)
    torch_loss.backward()
    assert np.allclose(net.weight.grad.T, utils.to_numpy(torch_net.layer.weight.grad))
    assert np.allclose(net.bias.grad, utils.to_numpy(torch_net.layer.bias.grad))

    # One optimizer step should move both implementations to the same parameters,
    # and should actually change them.
    optimizer.step()
    torch_optimizer.step()
    assert np.allclose(net.weight.data.T, utils.to_numpy(torch_net.layer.weight))
    assert np.allclose(net.bias.data, utils.to_numpy(torch_net.layer.bias))
    assert not np.allclose(net.weight.data, initial_weight)
    assert not np.allclose(net.bias.data, initial_bias)
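# For reference, a minimal sketch of the update rule this test exercises:
# each parameter moves against its gradient by the learning rate,
# w <- w - lr * grad. The real sgd_optimizer.SGDOptimizer may differ in how it
# stores parameters; this assumes parameter objects expose .data and .grad arrays.
class _SketchSGD:
    def __init__(self, parameters, learning_rate):
        self.parameters = list(parameters)
        self.learning_rate = learning_rate

    def zero_grad(self):
        # Reset accumulated gradients before a new backward pass.
        for p in self.parameters:
            p.grad = np.zeros_like(p.data)

    def step(self):
        # Plain gradient descent step, no momentum or weight decay.
        for p in self.parameters:
            p.data -= self.learning_rate * p.grad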
def _test_linear_backward(input_shape, out_channels):
    in_channels = input_shape[1]
    input = np.random.random(input_shape).astype(np.float32) * 20
    layer = LinearLayer(in_channels, out_channels)

    # Copy our parameters into an equivalent PyTorch layer (our weight is stored
    # transposed relative to nn.Linear).
    torch_layer = nn.Linear(in_channels, out_channels, bias=True)
    with torch.no_grad():
        torch_layer.weight[:] = torch.from_numpy(layer.weight.data).transpose(0, 1)
        torch_layer.bias[:] = torch.from_numpy(layer.bias.data)

    output = layer.forward(input)
    out_grad = layer.backward(np.ones_like(output))

    torch_input = utils.from_numpy(input).requires_grad_(True)
    torch_out = torch_layer(torch_input)
    torch_out.sum().backward()
    torch_out_grad = utils.to_numpy(torch_input.grad)

    # Zero out tiny values so the comparison is not dominated by float noise.
    out_grad[np.abs(out_grad) < 1e-4] = 0
    torch_out_grad[np.abs(torch_out_grad) < 1e-4] = 0
    assert np.allclose(out_grad, torch_out_grad, atol=TOLERANCE)

    w_grad = layer.weight.grad
    w_grad[np.abs(w_grad) < 1e-4] = 0
    torch_w_grad = utils.to_numpy(torch_layer.weight.grad.transpose(0, 1))
    torch_w_grad[np.abs(torch_w_grad) < 1e-4] = 0
    assert np.allclose(w_grad, torch_w_grad, atol=TOLERANCE)

    b_grad = layer.bias.grad
    b_grad[np.abs(b_grad) < 1e-4] = 0
    torch_b_grad = utils.to_numpy(torch_layer.bias.grad)
    torch_b_grad[np.abs(torch_b_grad) < 1e-4] = 0
    assert np.allclose(b_grad, torch_b_grad, atol=TOLERANCE)
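# For reference, the gradients the test above compares. With the weight stored as
# (in_channels, out_channels) -- consistent with the transposes when copying into
# nn.Linear -- and forward y = x @ W + b on a 2D batch, the backward pass is
# dx = dy @ W.T, dW = x.T @ dy, db = dy.sum(axis=0). This standalone sketch is
# illustrative only; the real LinearLayer may organize its state differently.
def _sketch_linear_backward(x, W, dy):
    dx = dy @ W.T          # gradient w.r.t. the input
    dW = x.T @ dy          # gradient w.r.t. the weight
    db = dy.sum(axis=0)    # gradient w.r.t. the bias
    return dx, dW, db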
def _test_backward(input_shape, reduction, axis):
    layer = SoftmaxCrossEntropyLossLayer(reduction=reduction)
    data = np.random.random(input_shape) * 2 - 1
    labels_shape = list(data.shape)
    labels_shape.pop(axis)
    labels = np.random.randint(0, data.shape[axis], labels_shape)
    loss = layer(data, labels, axis=axis)

    # F.cross_entropy expects the class axis at dimension 1, so move it if needed.
    if axis == 1:
        torch_input = utils.from_numpy(data).requires_grad_(True)
    else:
        torch_input = utils.from_numpy(np.moveaxis(data, axis, 1)).requires_grad_(True)
    pytorch_loss = F.cross_entropy(torch_input, utils.from_numpy(labels), reduction=reduction)
    if len(pytorch_loss.shape) > 0:
        # Unreduced loss: sum so every element gets an upstream gradient of 1.
        pytorch_loss.sum().backward()
    else:
        pytorch_loss.backward()
    assert np.allclose(loss, utils.to_numpy(pytorch_loss))

    grad = layer.backward()
    torch_grad = utils.to_numpy(torch_input.grad)
    if axis != 1:
        torch_grad = np.moveaxis(torch_grad, 1, axis)
    assert np.allclose(grad, torch_grad, atol=0.001)
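# For reference, the gradient being compared above: for logits x with classes on
# `axis` and integer labels y, d(loss)/dx = softmax(x) - one_hot(y), scaled by
# 1/N when reduction == "mean" (unscaled for "sum" or "none" with a unit upstream
# gradient). A 2D, axis=1 sketch, illustrative only; the real
# SoftmaxCrossEntropyLossLayer handles arbitrary axes and reductions.
def _sketch_softmax_ce_grad(logits, labels, reduction="mean"):
    shifted = logits - logits.max(axis=1, keepdims=True)   # numerical stability
    exp = np.exp(shifted)
    probs = exp / exp.sum(axis=1, keepdims=True)
    probs[np.arange(len(labels)), labels] -= 1.0            # subtract the one-hot target
    if reduction == "mean":
        probs /= len(labels)
    return probs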
def _test_relu_forward(input_shape, out_channels):
    input = np.random.random(input_shape).astype(np.float32) * 20
    original_input = input.copy()
    layer = ReLULayer()
    torch_layer = nn.ReLU()

    output = layer.forward(input)
    torch_data = utils.from_numpy(input)
    torch_out = utils.to_numpy(torch_layer(torch_data))

    output[np.abs(output) < 1e-4] = 0
    torch_out[np.abs(torch_out) < 1e-4] = 0
    assert np.all(input == original_input)  # forward must not modify its input
    assert output.shape == torch_out.shape
    assert np.allclose(output, torch_out, atol=TOLERANCE)
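# For reference, the behaviour checked here and in _test_relu_backward below:
# ReLU passes positive values through and zeroes the rest, and its backward pass
# masks the upstream gradient the same way. Sketch only; the real ReLULayer may
# cache its input differently.
def _sketch_relu_forward(x):
    return np.maximum(x, 0)


def _sketch_relu_backward(x, dy):
    # Gradient flows only through positions where the input was positive.
    return dy * (x > 0)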
def _test_max_pool_backward(input_shape, kernel_size, stride):
    np.random.seed(0)
    torch.manual_seed(0)
    padding = (kernel_size - 1) // 2
    input = np.random.random(input_shape).astype(np.float32) * 20
    layer = MaxPoolLayer(kernel_size, stride)
    torch_layer = nn.MaxPool2d(kernel_size, stride, padding)

    output = layer.forward(input)
    # Upstream gradient of 2 * mean(output), matching the PyTorch loss below.
    out_grad = layer.backward(2 * np.ones_like(output) / output.size)

    torch_input = utils.from_numpy(input).requires_grad_(True)
    torch_out = torch_layer(torch_input)
    (2 * torch_out.mean()).backward()
    torch_out_grad = utils.to_numpy(torch_input.grad)
    utils.assert_close(out_grad, torch_out_grad, atol=TOLERANCE)
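# For reference, the rule this test verifies: max pooling's backward pass routes
# each output gradient to the input location that produced the window maximum.
# Naive NCHW sketch with no padding (illustrative only; presumably the real
# MaxPoolLayer applies the same (kernel_size - 1) // 2 padding internally, since
# only nn.MaxPool2d receives it explicitly above).
def _sketch_max_pool_backward(x, dy, kernel_size, stride):
    n, c, h, w = x.shape
    dx = np.zeros_like(x)
    for i in range(dy.shape[2]):
        for j in range(dy.shape[3]):
            window = x[:, :, i * stride:i * stride + kernel_size,
                       j * stride:j * stride + kernel_size]
            flat = window.reshape(n, c, -1)
            # Index of the winning element inside each (batch, channel) window.
            di, dj = np.unravel_index(flat.argmax(axis=2), window.shape[2:])
            for b in range(n):
                for ch in range(c):
                    dx[b, ch, i * stride + di[b, ch], j * stride + dj[b, ch]] += dy[b, ch, i, j]
    return dx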
def _test_forward_overflow(input_shape, reduction, axis):
    layer = SoftmaxCrossEntropyLossLayer(reduction=reduction)
    data = np.random.random(input_shape) * 10000 - 1
    labels_shape = list(data.shape)
    labels_shape.pop(axis)
    labels = np.random.randint(0, data.shape[axis], labels_shape)
    loss = layer(data, labels, axis=axis)

    if axis == 1:
        pytorch_loss = F.cross_entropy(
            utils.from_numpy(data), utils.from_numpy(labels), reduction=reduction)
    else:
        pytorch_loss = F.cross_entropy(
            utils.from_numpy(data.swapaxes(1, axis)), utils.from_numpy(labels),
            reduction=reduction)
    pytorch_loss = utils.to_numpy(pytorch_loss)
    assert np.allclose(loss, pytorch_loss, atol=0.001)
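# This overflow test passes only if the loss is computed from shifted logits:
# subtracting the per-sample max before exponentiating (the log-sum-exp trick)
# keeps np.exp finite even for inputs on the order of 10000. Sketch of the idea
# for axis=1 and "mean" reduction; the real layer generalizes this.
def _sketch_stable_cross_entropy(logits, labels):
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels].mean()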
def _test_max_pool_forward(input_shape, kernel_size, stride):
    np.random.seed(0)
    torch.manual_seed(0)
    padding = (kernel_size - 1) // 2
    input = np.random.random(input_shape).astype(np.float32) * 20
    original_input = input.copy()
    layer = MaxPoolLayer(kernel_size, stride)
    torch_layer = nn.MaxPool2d(kernel_size, stride, padding)

    output = layer.forward(input)
    torch_data = utils.from_numpy(input)
    torch_out = utils.to_numpy(torch_layer(torch_data))

    output[np.abs(output) < 1e-4] = 0
    torch_out[np.abs(torch_out) < 1e-4] = 0
    assert np.all(input == original_input)  # forward must not modify its input
    assert output.shape == torch_out.shape
    utils.assert_close(output, torch_out, atol=TOLERANCE)
def _test_linear_forward(input_shape, out_channels):
    in_channels = input_shape[1]
    input = np.random.random(input_shape).astype(np.float32) * 20
    original_input = input.copy()
    layer = LinearLayer(in_channels, out_channels)
    torch_layer = nn.Linear(in_channels, out_channels, bias=True)
    with torch.no_grad():
        torch_layer.weight[:] = torch.from_numpy(layer.weight.data).transpose(0, 1)
        torch_layer.bias[:] = torch.from_numpy(layer.bias.data)

    output = layer.forward(input)
    torch_data = utils.from_numpy(input)
    torch_out = utils.to_numpy(torch_layer(torch_data))

    output[np.abs(output) < 1e-4] = 0
    torch_out[np.abs(torch_out) < 1e-4] = 0
    assert np.all(input == original_input)
    assert output.shape == torch_out.shape
    assert np.allclose(output, torch_out, atol=TOLERANCE)
def _test_relu_backward(input_shape, out_channels):
    input = np.random.random(input_shape).astype(np.float32) * 20
    layer = ReLULayer()
    torch_layer = nn.ReLU()

    output = layer.forward(input)
    out_grad = layer.backward(np.ones_like(output))

    torch_input = utils.from_numpy(input).requires_grad_(True)
    torch_out = torch_layer(torch_input)
    torch_out.sum().backward()
    torch_out_grad = utils.to_numpy(torch_input.grad)

    out_grad[np.abs(out_grad) < 1e-4] = 0
    torch_out_grad[np.abs(torch_out_grad) < 1e-4] = 0
    assert np.allclose(out_grad, torch_out_grad, atol=TOLERANCE)