def _test_backward(input_shape, reduction, axis):
    """Check SoftmaxCrossEntropyLossLayer gradients against F.cross_entropy."""
    layer = SoftmaxCrossEntropyLossLayer(reduction=reduction)
    data = np.random.random(input_shape) * 2 - 1
    label_dims = list(data.shape)
    label_dims.pop(axis)
    labels = np.random.randint(0, data.shape[axis], label_dims).astype(np.int64)
    loss = layer(data, labels, axis=axis)
    # torch's cross_entropy expects the class axis at position 1.
    if axis == 1:
        torch_input = utils.from_numpy(data).requires_grad_(True)
    else:
        torch_input = utils.from_numpy(np.moveaxis(data, axis, 1)).requires_grad_(True)
    pytorch_loss = F.cross_entropy(torch_input, utils.from_numpy(labels), reduction=reduction)
    # Unreduced losses are tensors; sum them so backward() sees a scalar.
    target = pytorch_loss.sum() if len(pytorch_loss.shape) > 0 else pytorch_loss
    target.backward()
    utils.assert_close(loss, utils.to_numpy(pytorch_loss))
    grad = layer.backward()
    torch_grad = utils.to_numpy(torch_input.grad)
    if axis != 1:
        # Move the class axis back to where the layer produced it.
        torch_grad = np.moveaxis(torch_grad, 1, axis)
    utils.assert_close(grad, torch_grad, atol=0.001)
def _test_conv_forward(input_shape, out_channels, kernel_size, stride):
    """Compare ConvLayer.forward against nn.Conv2d with matched weights.

    Also verifies that forward() does not mutate its input in place.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    in_channels = input_shape[1]
    # "Same"-style padding for odd kernel sizes.
    padding = (kernel_size - 1) // 2
    # Renamed from `input` so the builtin is not shadowed.
    data = np.random.random(input_shape).astype(np.float32) * 20
    original_data = data.copy()
    layer = ConvLayer(in_channels, out_channels, kernel_size, stride)
    torch_layer = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=True)
    utils.assign_conv_layer_weights(layer, torch_layer)
    output = layer.forward(data)
    torch_data = utils.from_numpy(data)
    torch_out = torch_layer(torch_data)
    # forward() must leave its input untouched.
    assert np.all(data == original_data)
    assert output.shape == torch_out.shape
    utils.assert_close(output, torch_out, atol=TOLERANCE)
def _test_conv_backward(input_shape, out_channels, kernel_size, stride):
    """Compare ConvLayer gradients against nn.Conv2d autograd.

    Cleanup: removed commented-out debug prints; renamed `input` so the
    builtin is not shadowed.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    in_channels = input_shape[1]
    # "Same"-style padding for odd kernel sizes.
    padding = (kernel_size - 1) // 2
    data = np.random.random(input_shape).astype(np.float32) * 20
    layer = ConvLayer(in_channels, out_channels, kernel_size, stride)
    torch_layer = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=True)
    utils.assign_conv_layer_weights(layer, torch_layer)
    output = layer.forward(data)
    # Upstream gradient of 2/size everywhere == gradient of 2 * mean(out).
    out_grad = layer.backward(2 * np.ones_like(output) / output.size)
    torch_input = utils.from_numpy(data).requires_grad_(True)
    torch_out = torch_layer(torch_input)
    (2 * torch_out.mean()).backward()
    utils.assert_close(out_grad, torch_input.grad, atol=TOLERANCE)
    utils.check_conv_grad_match(layer, torch_layer)
def test_reduce_catted_sequences(data, batch_sizes, in_dim, hidden_dim, device):
    """reduce_catted_sequences must reproduce per-sequence LSTM final states."""
    sequences = [
        [
            torch.randn((token_size, in_dim), requires_grad=True, device=device)
            for token_size in data.draw(
                token_size_lists(max_token_size=TINY_TOKEN_SIZE, max_batch_size=TINY_BATCH_SIZE))
        ]
        for _ in batch_sizes
    ]
    inputs = [token for sequence in sequences for token in sequence]
    catted = [cat_sequence(sequence, device=device) for sequence in sequences]
    packed = [pack_sequence(sequence, device=device) for sequence in sequences]
    rnn = nn.LSTM(
        input_size=in_dim,
        hidden_size=hidden_dim,
        bidirectional=True,
        bias=True,
    ).to(device=device)
    # One batched run over the reduced pack ...
    _, (actual, _) = rnn(reduce_catted_sequences(catted, device=device))
    actual = rearrange(actual, 'd n x -> n (d x)')
    # ... versus one independent run per original sequence.
    finals = []
    for pack in packed:
        _, (h, _) = rnn(pack)
        finals.append(rearrange(h, 'd n x -> n (d x)'))
    expected = pack_sequence(finals).data
    assert_close(actual, expected, check_stride=False)
    assert_grad_close(actual, expected, inputs=inputs)
def test_pad_sequence(data, token_sizes, dim, batch_first, device):
    """rua.pad_sequence must match torch's pad_sequence, values and grads."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    expected = tgt.pad_sequence(inputs, batch_first=batch_first)
    actual = rua.pad_sequence(inputs, batch_first=batch_first)
    assert_close(actual, expected)
    assert_grad_close(actual, expected, inputs=inputs)
def test_tree_reduce_catted_sequence(data, token_sizes, dim, device):
    """Tree-reduce over a catted sequence equals a padded sum along dim 0."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    expected = pad_sequence(inputs, device=device).sum(dim=0)
    catted, sizes = cat_sequence(inputs, device=device)
    indices = tree_reduce_catted_indices(token_sizes=sizes)
    actual = tree_reduce_sequence(torch.add)(catted.data, indices)
    assert_close(actual, expected)
def _test_backward_approx(layer, data_shape):
    """Numerically verify layer.backward via central finite differences."""
    h = 1e-4
    data = np.random.random(data_shape) * 10 - 5
    # Keep sample points away from potential kinks near zero.
    data[np.abs(data) < h] = 1
    plus = layer.forward(data + h)
    minus = layer.forward(data - h)
    # Final forward pass so backward() uses the unperturbed input.
    output = layer.forward(data)
    upstream = np.ones_like(output)
    analytic = layer.backward(upstream)
    numeric = (plus - minus) / (2 * h)
    utils.assert_close(numeric, analytic)
def test_cat_packed_sequence(data, token_sizes, dim, device):
    """cat_packed_sequence on a packed batch equals cat_sequence on raw inputs."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    packed = tgt.pack_sequence(inputs, enforce_sorted=False)
    actual_data, actual_sizes = rua.cat_sequence(inputs, device=device)
    expected_data, expected_sizes = rua.cat_packed_sequence(packed, device=device)
    assert_close(actual_data, expected_data)
    assert_equal(actual_sizes, expected_sizes)
    assert_grad_close(actual_data, expected_data, inputs=inputs)
def test_select_last(data, token_sizes, dim, unsort, device):
    """select_last picks the final token of each sequence in a packed batch."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    packed = pack_sequence(inputs, enforce_sorted=False)
    actual = select_last(sequence=packed, unsort=unsort)
    if not unsort:
        # Restore the original batch order so rows line up with `inputs`.
        actual = actual[packed.unsorted_indices]
    expected = torch.stack([seq[-1] for seq in inputs], dim=0)
    assert_close(actual, expected)
    assert_grad_close(actual, expected, inputs=inputs)
def test_cat_padded_sequence(data, token_sizes, dim, batch_first, device):
    """cat_padded_sequence on a padded batch equals cat_sequence on raw inputs."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    padded = tgt.pad_sequence(inputs, batch_first=batch_first)
    sizes = torch.tensor(token_sizes, device=device)
    actual_data, actual_sizes = rua.cat_sequence(inputs, device=device)
    expected_data, expected_sizes = rua.cat_padded_sequence(
        padded, sizes, batch_first=batch_first, device=device)
    assert_close(actual_data, expected_data)
    assert_equal(actual_sizes, expected_sizes)
    assert_grad_close(actual_data, expected_data, inputs=inputs)
def test_pad_packed_sequence(data, token_sizes, dim, batch_first, device):
    """rua.pad_packed_sequence must match torch padding of the raw inputs."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    packed = tgt.pack_sequence(inputs, enforce_sorted=False)
    # Reference token sizes are built on CPU, matching the values under test.
    expected_sizes = torch.tensor(token_sizes, device=torch.device('cpu'))
    expected = tgt.pad_sequence(inputs, batch_first=batch_first)
    actual, actual_sizes = rua.pad_packed_sequence(packed, batch_first=batch_first)
    assert_close(actual, expected)
    assert_grad_close(actual, expected, inputs=inputs)
    assert_equal(actual_sizes, expected_sizes)
def _test_linear_backward(input_shape, out_channels):
    """Compare LinearLayer gradients against nn.Linear autograd.

    Cleanup: renamed `input` so the builtin is not shadowed.
    """
    in_channels = input_shape[1]
    data = np.random.random(input_shape).astype(np.float32) * 20
    layer = LinearLayer(in_channels, out_channels)
    torch_layer = nn.Linear(in_channels, out_channels, bias=True)
    utils.assign_linear_layer_weights(layer, torch_layer)
    output = layer.forward(data)
    # Upstream gradient of 2 everywhere == gradient of (2 * out).sum().
    out_grad = layer.backward(np.ones_like(output) * 2)
    torch_input = utils.from_numpy(data).requires_grad_(True)
    torch_out = torch_layer(torch_input)
    (2 * torch_out).sum().backward()
    utils.assert_close(out_grad, torch_input.grad, atol=TOLERANCE)
    utils.check_linear_grad_match(layer, torch_layer, tolerance=TOLERANCE)
def _test_linear_forward(input_shape, out_channels):
    """Compare LinearLayer.forward against nn.Linear with matched weights.

    Also verifies that forward() does not mutate its input in place.
    Cleanup: renamed `input` so the builtin is not shadowed.
    """
    in_channels = input_shape[1]
    data = np.random.random(input_shape).astype(np.float32) * 20
    original_data = data.copy()
    layer = LinearLayer(in_channels, out_channels)
    torch_layer = nn.Linear(in_channels, out_channels, bias=True)
    utils.assign_linear_layer_weights(layer, torch_layer)
    output = layer.forward(data)
    torch_data = utils.from_numpy(data)
    torch_out = torch_layer(torch_data)
    # forward() must leave its input untouched.
    assert np.all(data == original_data)
    assert output.shape == torch_out.shape
    utils.assert_close(output, torch_out, atol=TOLERANCE)
def test_tree_reduce_packed_sequence(data, token_sizes, dim, device):
    """Tree-reduce over a packed sequence equals a padded sum along dim 0."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    expected = pad_sequence(inputs, device=device).sum(dim=0)
    packed = pack_sequence(inputs, device=device)
    indices = tree_reduce_packed_indices(batch_sizes=packed.batch_sizes)
    reduced = tree_reduce_sequence(torch.add)(packed.data, indices)
    # Restore the original (unsorted) batch order before comparing.
    actual = reduced[packed.unsorted_indices]
    assert_close(actual, expected)
    assert_grad_close(actual, expected, inputs=inputs)
def _test_max_pool_backward(input_shape, kernel_size, stride):
    """Compare MaxPoolLayer gradients against nn.MaxPool2d autograd.

    Cleanup: renamed `input` so the builtin is not shadowed.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    # "Same"-style padding for odd kernel sizes.
    padding = (kernel_size - 1) // 2
    data = np.random.random(input_shape).astype(np.float32) * 20
    layer = MaxPoolLayer(kernel_size, stride)
    torch_layer = nn.MaxPool2d(kernel_size, stride, padding)
    output = layer.forward(data)
    # Upstream gradient of 2/size everywhere == gradient of 2 * mean(out).
    out_grad = layer.backward(2 * np.ones_like(output) / output.size)
    torch_input = utils.from_numpy(data).requires_grad_(True)
    torch_out = torch_layer(torch_input)
    (2 * torch_out.mean()).backward()
    torch_out_grad = utils.to_numpy(torch_input.grad)
    utils.assert_close(out_grad, torch_out_grad, atol=TOLERANCE)
def test_tree_reduce_padded_sequence(data, token_sizes, dim, batch_first, device):
    """Tree-reduce over a padded sequence equals a padded sum along dim 0."""
    inputs = [
        torch.randn((size, dim), device=device, requires_grad=True)
        for size in token_sizes
    ]
    expected = pad_sequence(inputs, device=device).sum(dim=0)
    padded = pad_sequence(inputs, device=device, batch_first=batch_first)
    sizes = torch.tensor(token_sizes, device=device)
    indices = tree_reduce_padded_indices(token_sizes=sizes, batch_first=batch_first)
    actual = tree_reduce_sequence(torch.add)(padded.data, indices)
    assert_close(actual, expected)
def test_chunk_packed_sequence(batch_size, token_sizes, embedding_dim, dim, batch_first, device):
    """Chunking a stacked packed sequence recovers the original sequences."""
    expected_sequences = sequences = [
        pack_sequence(
            [
                torch.randn((size, embedding_dim), device=device, requires_grad=True)
                for size in token_sizes
            ],
            enforce_sorted=False,
        )
        for _ in range(batch_size)
    ]
    stacked = stack_packed_sequences(sequences=sequences, dim=dim)
    actual_sequences = chunk_packed_sequence(
        sequence=stacked,
        chunks=len(sequences),
        dim=dim,
    )
    # Compare chunk-by-chunk in padded form.
    for actual_seq, expected_seq in zip(actual_sequences, expected_sequences):
        actual, actual_sizes = pad_packed_sequence(actual_seq, batch_first=batch_first)
        expected, expected_sizes = pad_packed_sequence(expected_seq, batch_first=batch_first)
        assert_close(actual, expected)
        assert_equal(actual_sizes, expected_sizes)
def _test_max_pool_forward(input_shape, kernel_size, stride):
    """Compare MaxPoolLayer.forward against nn.MaxPool2d.

    Near-zero values are snapped to 0 on both sides before comparing, and
    the input is checked for in-place mutation.
    Cleanup: renamed `input` so the builtin is not shadowed.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    # "Same"-style padding for odd kernel sizes.
    padding = (kernel_size - 1) // 2
    data = np.random.random(input_shape).astype(np.float32) * 20
    original_data = data.copy()
    layer = MaxPoolLayer(kernel_size, stride)
    torch_layer = nn.MaxPool2d(kernel_size, stride, padding)
    output = layer.forward(data)
    torch_data = utils.from_numpy(data)
    torch_out = utils.to_numpy(torch_layer(torch_data))
    # Zero out tiny magnitudes so the comparison ignores them.
    output[np.abs(output) < 1e-4] = 0
    torch_out[np.abs(torch_out) < 1e-4] = 0
    # forward() must leave its input untouched.
    assert np.all(data == original_data)
    assert output.shape == torch_out.shape
    utils.assert_close(output, torch_out, atol=TOLERANCE)
def _test_forward(input_shape, reduction, axis):
    """Check SoftmaxCrossEntropyLossLayer forward against F.cross_entropy.

    Bug fix: the non-axis-1 path previously used data.swapaxes(1, axis),
    which also swaps the remaining dimensions, so the labels (built by
    popping `axis` from the original shape) no longer align with the
    inputs for >=4-D data with unequal dims. Use np.moveaxis, as
    _test_backward does, so only the class axis moves to position 1.
    """
    layer = SoftmaxCrossEntropyLossLayer(reduction=reduction)
    data = np.random.random(input_shape) * 2 - 1
    labels_shape = list(data.shape)
    labels_shape.pop(axis)
    labels = np.random.randint(0, data.shape[axis], labels_shape).astype(np.int64)
    loss = layer(data, labels, axis=axis)
    if axis == 1:
        torch_data = utils.from_numpy(data)
    else:
        # moveaxis preserves the relative order of the non-class dims.
        torch_data = utils.from_numpy(np.moveaxis(data, axis, 1))
    pytorch_loss = F.cross_entropy(torch_data, utils.from_numpy(labels), reduction=reduction)
    pytorch_loss = utils.to_numpy(pytorch_loss)
    utils.assert_close(loss, pytorch_loss, atol=0.001)
def test_networks():
    """End-to-end parity test of MNISTResNetwork against its PyTorch twin.

    Weights are copied layer-by-layer, then forward output, loss, input
    gradient, and every layer's parameter gradients are compared.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    data = np.random.random((100, 1, 28, 28)).astype(np.float32) * 10 - 5
    labels = np.random.randint(0, 10, 100).astype(np.int64)
    net = MNISTResNetwork()
    torch_net = TorchMNISTResNetwork()

    # (ours, theirs) pairs, in input-to-output order.
    conv_pairs = [
        (net.layers[0], torch_net.layers[0]),
        (net.layers[3], torch_net.layers[3]),
        (net.layers[4].conv_layers[0], torch_net.layers[4].conv1),
        (net.layers[4].conv_layers[2], torch_net.layers[4].conv2),
        (net.layers[5].conv_layers[0], torch_net.layers[5].conv1),
        (net.layers[5].conv_layers[2], torch_net.layers[5].conv2),
    ]
    linear_pairs = [(net.layers[i], torch_net.layers[i]) for i in (9, 11, 13)]
    for ours, theirs in conv_pairs:
        utils.assign_conv_layer_weights(ours, theirs)
    for ours, theirs in linear_pairs:
        utils.assign_linear_layer_weights(ours, theirs)

    forward = net(data)
    data_torch = utils.from_numpy(data).requires_grad_(True)
    forward_torch = torch_net(data_torch)
    utils.assert_close(forward, forward_torch)

    loss = net.loss(forward, labels)
    torch_loss = torch_net.loss(forward_torch, utils.from_numpy(labels))
    utils.assert_close(loss, torch_loss)

    out_grad = net.backward()
    torch_loss.backward()
    utils.assert_close(out_grad, data_torch.grad, atol=0.01)

    tolerance = 1e-4
    # Parameter gradients are checked from the output side back to the input.
    for ours, theirs in reversed(linear_pairs):
        utils.check_linear_grad_match(ours, theirs, tolerance=tolerance)
    for ours, theirs in reversed(conv_pairs):
        utils.check_conv_grad_match(ours, theirs, tolerance=tolerance)