Example #1
class DistanceTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2, min_dim=1, max_dim=4, dtype=np.float32),
           **hu.gcs)
    def test_L1_distance(self, inputs, gc, dc):
        X, Y = inputs
        # avoid kinks by moving away from 0
        X += 0.02 * np.sign(X - Y)
        X[(X - Y) == 0.0] += 0.02

        self.ws.create_blob("X").feed(X)
        self.ws.create_blob("Y").feed(Y)
        op = core.CreateOperator(
            'L1Distance',
            ['X', 'Y'],
            ['l1_dist'],
        )
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs["l1_dist"].fetch(),
                                   np.linalg.norm((X - Y).flatten(), ord=1),
                                   rtol=1e-4,
                                   atol=1e-4)

        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc,
                                  op, [X, Y],
                                  0, [0],
                                  stepsize=1e-2,
                                  threshold=1e-2)
        # Gradient check wrt Y
        self.assertGradientChecks(gc,
                                  op, [X, Y],
                                  1, [0],
                                  stepsize=1e-2,
                                  threshold=1e-2)

    @given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32),
           **hu.gcs)
    def test_dot_product(self, inputs, gc, dc):
        X, Y = inputs
        op = core.CreateOperator(
            'DotProduct',
            ['X', 'Y'],
            ['DOT'],
        )

        def dot_ref(X, Y):
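            # Reference: dot product of each corresponding row pair
            # (reduces to an elementwise product for 1-D inputs)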
            return ([np.dot(x, y) for x, y in zip(X, Y)], )

        # Check against numpy dot reference
        self.assertReferenceChecks(gc, op, [X, Y], dot_ref)
        # Check over multiple devices
        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
        # Gradient check wrt Y
        self.assertGradientChecks(gc, op, [X, Y], 1, [0])
Example #2
class TestATen(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2), **hu.gcs)
    def test_add(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_pow(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"],
                                 operator="pow",
                                 exponent=2.0)

        def ref(X):
            return [np.square(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
    def test_sort(self, x, gc, dc):
        inputs = [np.random.permutation(x)]
        op = core.CreateOperator("ATen", ["S"], ["Z", "I"], operator="sort")

        def ref(X):
            return [np.sort(X), np.argsort(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_sum(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"], operator="sum")

        def ref(X):
            return [np.sum(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(**hu.gcs)
    def test_ones(self, gc, dc):
        op = core.CreateOperator("ATen", [], ["Z"],
                                 operator="ones",
                                 type="float",
                                 size={2, 4})

        def ref():
            return [np.ones([2, 4])]

        self.assertReferenceChecks(gc, op, [], ref)
Example #3
class TestSparseNormalize(hu.HypothesisTestCase):
    @staticmethod
    def ref_normalize(param_in, use_max_norm, norm):
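        # Rescale the vector to L2 norm `norm`; with use_max_norm, rescale only
        # when its current norm exceeds that cap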
        param_norm = np.linalg.norm(param_in) + 1e-12
        if (use_max_norm and param_norm > norm) or not use_max_norm:
            param_in = param_in * norm / param_norm
        return param_in

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=2, min_dim=2, max_dim=2),
           use_max_norm=st.booleans(),
           norm=st.floats(min_value=1.0, max_value=4.0),
           data_strategy=st.data(),
           **hu.gcs_cpu_only)
    def test_sparse_normalize(self, inputs, use_max_norm, norm, data_strategy,
                              gc, dc):
        param, grad = inputs
        param += 0.02 * np.sign(param)
        param[param == 0.0] += 0.02

        # Create an index array whose entries index into the rows of grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      min_dim=1,
                      max_dim=1,
                      elements=st.sampled_from(np.arange(grad.shape[0]))), )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseNormalize",
            ["param", "indices", "grad"],
            ["param"],
            use_max_norm=use_max_norm,
            norm=norm,
        )

        def ref_sparse_normalize(param, indices, grad):
            param_out = np.copy(param)
            for index in indices:
                param_out[index] = self.ref_normalize(
                    param[index],
                    use_max_norm,
                    norm,
                )
            return (param_out, )

        # self.assertDeviceChecks(dc, op, [param, indices, grad], [0])
        self.assertReferenceChecks(gc, op, [param, indices, grad],
                                   ref_sparse_normalize)
Example #4
class TestWeightScale(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1),
           ITER=st.integers(min_value=0, max_value=100),
           stepsize=st.integers(min_value=20, max_value=50),
           upper_bound_iter=st.integers(min_value=5, max_value=100),
           scale=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_weight_scale(self, inputs, ITER, stepsize, upper_bound_iter,
                          scale, gc, dc):
        ITER = np.array([ITER], dtype=np.int64)
        op = core.CreateOperator("WeightScale", ["w", "iter"], ["nw"],
                                 stepsize=stepsize,
                                 upper_bound_iter=upper_bound_iter,
                                 scale=scale)

        def ref_weight_scale(w, iter, stepsize, upper_bound_iter, scale):
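            # Multiply the weights by `scale` every `stepsize` iterations, but
            # only while the (1-based) iteration count is below `upper_bound_iter`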
            iter = iter + 1
            return [
                w * scale
                if iter % stepsize == 0 and iter < upper_bound_iter else w
            ]

        self.assertReferenceChecks(
            gc, op, [inputs[0], ITER],
            functools.partial(ref_weight_scale,
                              stepsize=stepsize,
                              upper_bound_iter=upper_bound_iter,
                              scale=scale))
Example #5
class ReluTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32),
           **hu.gcs_cpu_only)
    def test_relu(self, inputs, gc, dc):
        X = inputs[0]
        # First dimension is the batch size
        print(X.shape)
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(core.CreateOperator("Relu", ["X"], ["Y"]))
        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "ref"
        pred_net_ref.external_input.extend(["X"])
        pred_net_ref.external_output.append("Y_ref")
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "ReluFakeFp16",
                ["X"],
                ["Y_ref"],
            ))

        shape_hints = {"X": X.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hints,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        print(pred_net_onnxified)
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.SwitchWorkspace("glow_test_ws", True)
        workspace.FeedBlob("X", X)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)
        workspace.FeedBlob("X", X)
        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y_ref")

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        # Results should be identical since we are comparing with the C2 emulation
        if not np.allclose(Y_c2, Y_glow):
            diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
            print_test_debug_info("Relu", {
                "X": X,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "diff": diff
            })
            assert (0)
Example #6
class LpnormTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32),
           **hu.gcs_cpu_only)
    def test_Lp_Norm(self, inputs, gc, dc):
        X = inputs[0]
        # avoid kinks by moving away from 0
        X += 0.02 * np.sign(X)
        X[X == 0.0] += 0.02
        self.ws.create_blob("X").feed(X)
        op = core.CreateOperator(
            'LpNorm',
            ['X'],
            ['l1_norm'],
            p=1,
        )
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs["l1_norm"].fetch(),
                                   np.linalg.norm(X.flatten(), ord=1),
                                   rtol=1e-4,
                                   atol=1e-4)

        self.assertDeviceChecks(dc, op, [X], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc,
                                  op, [X],
                                  0, [0],
                                  stepsize=1e-2,
                                  threshold=1e-2)

        op = core.CreateOperator(
            'LpNorm',
            ['X'],
            ['l2_norm'],
            p=2,
        )
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs["l2_norm"].fetch(),
                                   np.linalg.norm(X.flatten(), ord=2)**2,
                                   rtol=1e-4,
                                   atol=1e-4)

        self.assertDeviceChecks(dc, op, [X], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc,
                                  op, [X],
                                  0, [0],
                                  stepsize=1e-2,
                                  threshold=1e-2)
Example #7
class DistanceTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32),
           **hu.gcs)
    def test_dot_product(self, inputs, gc, dc):
        X, Y = inputs
        op = core.CreateOperator(
            'DotProduct',
            ['X', 'Y'],
            ['DOT'],
        )

        def dot_ref(X, Y):
            return ([np.dot(x, y) for x, y in zip(X, Y)], )

        # Check against numpy dot reference
        self.assertReferenceChecks(gc, op, [X, Y], dot_ref)
        # Check over multiple devices
        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
        # Gradient check wrt Y
        self.assertGradientChecks(gc, op, [X, Y], 1, [0])
Example #8
class TestAdam(hu.HypothesisTestCase):

    @staticmethod
    def ref_adam(param, mom1, mom2, grad, LR, ITER,
                 beta1, beta2, epsilon):
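        # Plain Adam reference: exponential moving averages of the gradient and
        # its square, with the bias correction folded into the step size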
        t = ITER + 1
        corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad)
        param_out = param + corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        return param_out, mom1_out, mom2_out

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, grad, LR, ITER],
            functools.partial(
                self.ref_adam,
                beta1=beta1, beta2=beta2, epsilon=epsilon),
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                         gc, dc):
        param, mom1, mom2, grad = inputs
        mom1 = np.absolute(mom1)
        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        indices = np.arange(grad.shape[0])
        indices = indices[indices % 2 == 0]
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER):
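            # Apply the dense Adam reference only to the rows selected by
            # `indices`; all other rows of param/mom1/mom2 are left unchanged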
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_adam(param[index], mom1[index], mom2[index],
                                  grad[i], LR, ITER,
                                  beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            ref_sparse,
            input_device_options=input_device_options)
Example #9
class TestAdam(hu.HypothesisTestCase):
    @staticmethod
    def ref_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon):
        t = ITER + 1
        corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad)
        param_out = param + corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        return param_out, mom1_out, mom2_out

    @staticmethod
    def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2,
                          epsilon):
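        # Same as ref_adam, except the second moment is a single per-row scalar
        # updated with the mean of the squared gradient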
        t = ITER + 1
        corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad))
        param_out = param + corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        return (param_out, mom1_out, mom2_out)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           beta1=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           beta2=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        op = core.CreateOperator(
            "Adam", ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2"],
            beta1=beta1,
            beta2=beta2,
            epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(gc,
                                   op, [param, mom1, mom2, grad, LR, ITER],
                                   functools.partial(self.ref_adam,
                                                     beta1=beta1,
                                                     beta2=beta2,
                                                     epsilon=epsilon),
                                   input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           beta1=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           beta2=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                         data_strategy, gc, dc):
        param, mom1, mom2, grad = inputs
        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ), )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1,
            beta2=beta2,
            epsilon=epsilon)

        def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_adam(param[index], mom1[index], mom2[index],
                                  grad[i], LR, ITER,
                                  beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc,
            op, [param, mom1, mom2, indices, grad, LR, ITER],
            ref_sparse,
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=3),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           beta1=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           beta2=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs_cpu_only)
    def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2,
                                  epsilon, data_strategy, gc, dc):
        param, mom1, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create a 1D row-wise average 2nd moment tensor.
        mom2 = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        mom2 = np.absolute(mom2)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ), )

        # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # Verify that the generated indices are unique
        hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1,
            beta2=beta2,
            epsilon=epsilon)

        def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_row_wise_adam(param[index], mom1[index], mom2[index],
                                           grad[i], LR, ITER,
                                           beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc,
            op, [param, mom1, mom2, indices, grad, LR, ITER],
            ref_row_wise_sparse,
            input_device_options=input_device_options)
Example #10
class TestAdagrad(hu.HypothesisTestCase):
    @staticmethod
    def ref_adagrad(param_in, mom_in, grad, lr, epsilon):
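        # Accumulate the squared gradient and scale the step by
        # lr / (sqrt(accumulator) + epsilon)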
        mom_out = mom_in + np.square(grad)
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @staticmethod
    def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon):
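        # Like ref_adagrad, but the accumulator is a per-row scalar updated with
        # the mean of the squared gradient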
        mom_out = mom_in + np.mean(np.square(grad))
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    def test_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(self.ref_adagrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, data_strategy, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        # Create an index array whose entries index into the rows of grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))), )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[index] = self.ref_adagrad(
                    param[index], momentum[index], grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, data_strategy, gc,
                                  dc):
        param, momentum = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon, data_strategy,
                                     gc, dc):
        param, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        # Create a 1D row-wise average sum of squared gradients tensor.
        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))), )

        # Note that unlike SparseAdagrad, RowWiseSparseAdagrad uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # The indices must be unique
        hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[
                    index] = self.ref_row_wise_adagrad(param[index],
                                                       momentum[index],
                                                       grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)

    @given(inputs=hu.tensors(n=1),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon,
                                           data_strategy, gc, dc):
        param = inputs[0]
        lr = np.array([lr], dtype=np.float32)

        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)

        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)
Example #11
class TestMomentumSGD(serial.SerializedTestCase):
    @serial.given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs)
    def test_momentum_sgd(self, n, nesterov, gc, dc):
        param = np.random.rand(n).astype(np.float32)
        grad = np.random.rand(n).astype(np.float32)
        lr = np.random.rand(1).astype(np.float32)
        param_momentum = np.random.rand(n).astype(np.float32)
        momentum = 0.9

        def momentum_sgd(grad, param_momentum, lr, param=None):
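            # Reference for both MomentumSGD (no param input) and
            # MomentumSGDUpdate (with param), covering plain and Nesterov momentum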
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * param_momentum
                if param is None:
                    return [adjusted_gradient, adjusted_gradient]
                else:
                    paramup = param - adjusted_gradient
                    return [adjusted_gradient, adjusted_gradient, paramup]
            else:
                m_new = momentum * param_momentum + lr * grad
                grad_new = (1 + momentum) * m_new - momentum * param_momentum
                if param is None:
                    return [grad_new, m_new]
                else:
                    paramup = param - grad_new
                    return [grad_new, m_new, paramup]

        op = core.CreateOperator(
            "MomentumSGDUpdate",
            ["grad", "param_momentum", "lr", "param"],
            ["grad", "param_momentum", "param"],
            momentum=momentum,
            nesterov=int(nesterov),
        )

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[grad, param_momentum, lr, param],
                                   reference=momentum_sgd)

        op_noparam = core.CreateOperator(
            "MomentumSGD",
            ["grad", "param_momentum", "lr"],
            ["grad", "param_momentum"],
            momentum=momentum,
            nesterov=int(nesterov),
        )

        self.assertReferenceChecks(device_option=gc,
                                   op=op_noparam,
                                   inputs=[grad, param_momentum, lr],
                                   reference=momentum_sgd)

    @serial.given(inputs=hu.tensors(n=3),
                  momentum=st.floats(min_value=0.1, max_value=0.9),
                  nesterov=st.booleans(),
                  lr=st.floats(min_value=0.1, max_value=0.9),
                  data_strategy=st.data(),
                  **hu.gcs)
    def test_sparse_momentum_sgd(self, inputs, momentum, nesterov, lr,
                                 data_strategy, gc, dc):
        w, grad, m = inputs

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ), )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        # Make momentum >= 0
        m = np.abs(m)

        # Convert lr to a numpy array
        lr = np.asarray([lr], dtype=np.float32)

        op = core.CreateOperator("SparseMomentumSGDUpdate",
                                 ["grad", "m", "lr", "param", "indices"],
                                 ["adjusted_grad", "m", "param"],
                                 momentum=momentum,
                                 nesterov=int(nesterov),
                                 device_option=gc)

        # Reference
        def momentum_sgd(grad, m, lr):
            lr = lr[0]
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * m
                return (adjusted_gradient, adjusted_gradient)
            else:
                m_new = momentum * m + lr * grad
                return ((1 + momentum) * m_new - momentum * m, m_new)

        def sparse(grad, m, lr, param, i):
            grad_new, m_new = momentum_sgd(grad, m[i], lr)
            m[i] = m_new
            param[i] -= grad_new
            return (grad_new, m, param)

        self.assertReferenceChecks(gc, op, [grad, m, lr, w, indices], sparse)

    @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs_gpu_only)
    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
    def test_fp16momentum_sgd(self, n, nesterov, gc, dc):
        gpuvers = workspace.GetDeviceProperties(0)["major"]
        if gpuvers < 6:
            print(
                "No FP16 support because major version {} < 6".format(gpuvers))
            return

        param = np.random.rand(n).astype(np.float16)
        grad = np.random.rand(n).astype(np.float16)
        lr = np.random.rand(1).astype(np.float32)
        param_momentum = np.random.rand(n).astype(np.float16)
        momentum = 0.9
        nesterov = True

        def momentum_sgd(grad, param_momentum, lr, param=None):
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * param_momentum
                paramup = param - adjusted_gradient
                return [adjusted_gradient, adjusted_gradient, paramup]
            else:
                m_new = momentum * param_momentum + lr * grad
                grad_new = (1 + momentum) * m_new - momentum * param_momentum
                paramup = param - grad_new
                return [grad_new, m_new, paramup]

        op = core.CreateOperator(
            "FP16MomentumSGDUpdate",
            ["grad", "param_momentum", "lr", "param"],
            ["grad", "param_momentum", "param"],
            momentum=momentum,
            nesterov=int(nesterov),
            weight_decay=0.0,
        )

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[grad, param_momentum, lr, param],
                                   reference=momentum_sgd)
Example #12
class TestAdagrad(serial.SerializedTestCase):
    @staticmethod
    def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon):
        mom_out = mom_in + np.mean(np.square(grad))
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @serial.given(inputs=hu.tensors(n=3),
                  lr=st.floats(min_value=0.01,
                               max_value=0.99,
                               allow_nan=False,
                               allow_infinity=False),
                  epsilon=st.floats(min_value=0.01,
                                    max_value=0.99,
                                    allow_nan=False,
                                    allow_infinity=False),
                  **hu.gcs)
    def test_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_adagrad_output_effective_lr(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad,
                              epsilon=epsilon,
                              output_effective_lr=True))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_adagrad_output_effective_lr_and_update(self, inputs, lr, epsilon,
                                                    gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr", "update"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad,
                              epsilon=epsilon,
                              output_effective_lr_and_update=True))

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc):
        return adagrad_sparse_test_helper(self, inputs, lr, epsilon, None,
                                          ref_adagrad, gc, dc)

    @serial.given(inputs=hu.tensors(n=2),
                  lr=st.floats(min_value=0.01,
                               max_value=0.99,
                               allow_nan=False,
                               allow_infinity=False),
                  epsilon=st.floats(min_value=0.01,
                                    max_value=0.99,
                                    allow_nan=False,
                                    allow_infinity=False),
                  data_strategy=st.data(),
                  **hu.gcs)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, data_strategy, gc,
                                  dc):
        param, momentum = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        ref_using_fp16_values = [False]
        if gc == hu.gpu_do:
            ref_using_fp16_values.append(True)

        for ref_using_fp16 in ref_using_fp16_values:
            if ref_using_fp16:
                print(
                    'test_sparse_adagrad_empty with half precision embedding')
                momentum_i = momentum.astype(np.float16)
                param_i = param.astype(np.float16)
            else:
                print(
                    'test_sparse_adagrad_empty with full precision embedding')
                momentum_i = momentum.astype(np.float32)
                param_i = param.astype(np.float32)

            self.assertReferenceChecks(
                gc, op, [param_i, momentum_i, indices, grad, lr], ref_sparse)

    @unittest.skipIf("IN_CIRCLECI" in os.environ,
                     "FIXME: flaky test in CircleCI")
    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon, data_strategy,
                                     gc, dc):
        param, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        # Create a 1D row-wise average sum of squared gradients tensor.
        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))), )

        # Note that unlike SparseAdagrad, RowWiseSparseAdagrad uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # The indices must be unique
        hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[
                    index] = self.ref_row_wise_adagrad(param[index],
                                                       momentum[index],
                                                       grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)

    @serial.given(inputs=hu.tensors(n=1),
                  lr=st.floats(min_value=0.01,
                               max_value=0.99,
                               allow_nan=False,
                               allow_infinity=False),
                  epsilon=st.floats(min_value=0.01,
                                    max_value=0.99,
                                    allow_nan=False,
                                    allow_infinity=False),
                  data_strategy=st.data(),
                  **hu.gcs)
    def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon,
                                           data_strategy, gc, dc):
        param = inputs[0]
        lr = np.array([lr], dtype=np.float32)

        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)

        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)
Example #13
class TestATen(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2), **hu.gcs)
    def test_add(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=2, dtype=np.float16), **hu.gcs_gpu_only)
    def test_add_half(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_pow(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"],
                                 operator="pow",
                                 exponent=2.0)

        def ref(X):
            return [np.square(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
    def test_sort(self, x, gc, dc):
        inputs = [np.random.permutation(x)]
        op = core.CreateOperator("ATen", ["S"], ["Z", "I"], operator="sort")

        def ref(X):
            return [np.sort(X), np.argsort(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_sum(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"], operator="sum")

        def ref(X):
            return [np.sum(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(**hu.gcs)
    def test_ones(self, gc, dc):
        op = core.CreateOperator("ATen", [], ["Z"],
                                 operator="ones",
                                 type="float",
                                 size={2, 4})

        def ref():
            return [np.ones([2, 4])]

        self.assertReferenceChecks(gc, op, [], ref)

    @given(**hu.gcs)
    def test_index_put(self, gc, dc):
        op = core.CreateOperator("ATen", ['self', 'indices', 'values'], ["Z"],
                                 operator="index_put")

        def ref(self, indices, values):
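            # Boolean-mask assignment: write `values` into the positions of
            # `self` where the mask is True, mirroring ATen's index_put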
            self[indices] = values
            return (self, )

        tensor = np.random.randn(3, 3).astype(np.float32)
        mask = np.array([[True, True, True], [True, False, False],
                         [True, True, False]])
        values = np.random.randn(6).astype(np.float32)

        self.assertReferenceChecks(gc, op, [tensor, mask, values], ref)
Example #14
class TestAdagrad(serial.SerializedTestCase):
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs)
    @settings(deadline=10000)
    def test_adagrad(self, inputs, lr, epsilon, weight_decay, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            weight_decay=weight_decay,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc,
            op,
            [param, momentum, grad, lr],
            functools.partial(ref_adagrad,
                              epsilon=epsilon,
                              weight_decay=weight_decay),
        )

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs_cpu_only)
    @settings(deadline=10000)
    def test_adagrad_output_effective_lr(self, inputs, lr, epsilon,
                                         weight_decay, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr"],
            epsilon=epsilon,
            weight_decay=weight_decay,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc,
            op,
            [param, momentum, grad, lr],
            functools.partial(
                ref_adagrad,
                epsilon=epsilon,
                output_effective_lr=True,
                weight_decay=weight_decay,
            ),
        )

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs_cpu_only)
    @settings(deadline=10000)
    def test_adagrad_output_effective_lr_and_update(self, inputs, lr, epsilon,
                                                    gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr", "update"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc,
            op,
            [param, momentum, grad, lr],
            functools.partial(ref_adagrad,
                              epsilon=epsilon,
                              output_effective_lr_and_update=True),
        )

    # Suppress the filter_too_much health check.
    # Likely caused by the `assume` call rejecting too many generated examples.
    @settings(suppress_health_check=[HealthCheck.filter_too_much],
              deadline=10000)
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, weight_decay, gc, dc):
        adagrad_sparse_test_helper(
            self,
            inputs,
            lr,
            epsilon,
            None,
            ref_adagrad,
            gc,
            dc,
            weight_decay=weight_decay,
        )

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    @settings(deadline=10000)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, gc, dc):
        param, momentum = inputs
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)

        ref_using_fp16_values = [False]
        if gc == hu.gpu_do:
            ref_using_fp16_values.append(True)

        for ref_using_fp16 in ref_using_fp16_values:
            if ref_using_fp16:
                print(
                    "test_sparse_adagrad_empty with half precision embedding")
                momentum_i = momentum.astype(np.float16)
                param_i = param.astype(np.float16)
            else:
                print(
                    "test_sparse_adagrad_empty with full precision embedding")
                momentum_i = momentum.astype(np.float32)
                param_i = param.astype(np.float32)

            adagrad_sparse_test_helper(
                self,
                [param_i, momentum_i, grad],
                lr,
                epsilon,
                None,
                ref_adagrad,
                gc,
                dc,
            )

    # Suppress the filter_too_much health check.
    # Likely caused by the `assume` call rejecting too many generated examples.
    @settings(suppress_health_check=[HealthCheck.filter_too_much],
              deadline=10000)
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs)
    def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon, weight_decay,
                                     gc, dc):
        adagrad_sparse_test_helper(
            self,
            inputs,
            lr,
            epsilon,
            None,
            functools.partial(ref_adagrad, row_wise=True),
            gc,
            dc,
            row_wise=True,
            weight_decay=weight_decay,
        )

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    @settings(deadline=None)
    def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon, gc, dc):
        param, momentum = inputs
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        adagrad_sparse_test_helper(
            self,
            [param, momentum, grad],
            lr,
            epsilon,
            None,
            ref_adagrad,
            gc,
            dc,
            row_wise=True,
        )
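ref_adagrad and adagrad_sparse_test_helper are presumably imported from Caffe2's optimizer test helpers and are not shown in this example. As a rough orientation, here is a minimal NumPy sketch of the basic dense update they check, following the same sign convention as the ref_adagrad written out in Example #20 below (weight_decay, row-wise, and effective-lr variants omitted; the function name is illustrative):

import numpy as np

def adagrad_step(param, momentum, grad, lr, epsilon):
    # Accumulate squared gradients, then scale the gradient by the
    # accumulated moment before applying it to the parameter.
    momentum_out = momentum + np.square(grad)
    param_out = param + lr * grad / (np.sqrt(momentum_out) + epsilon)
    return param_out, momentum_out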
Example #15
0
class TestATen(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2), **hu.gcs)
    def test_add(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=2, dtype=np.float16), **hu.gcs_gpu_only)
    def test_add_half(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_pow(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"],
                                 operator="pow",
                                 exponent=2.0)

        def ref(X):
            return [np.square(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
    def test_sort(self, x, gc, dc):
        inputs = [np.random.permutation(x)]
        op = core.CreateOperator("ATen", ["S"], ["Z", "I"], operator="sort")

        def ref(X):
            return [np.sort(X), np.argsort(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_sum(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"], operator="sum")

        def ref(X):
            return [np.sum(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(**hu.gcs)
    def test_index_uint8(self, gc, dc):
        # Indexing with uint8 is deprecated, but we need to provide backward
        # compatibility for some old models exported through ONNX.
        op = core.CreateOperator("ATen", ['self', 'mask'], ["Z"],
                                 operator="index")

        def ref(self, mask):
            # np.bool was removed in NumPy 1.24; the builtin bool is equivalent.
            return (self[mask.astype(bool)], )

        tensor = np.random.randn(2, 3, 4).astype(np.float32)
        mask = np.array([[1, 0, 0], [1, 1, 0]]).astype(np.uint8)

        self.assertReferenceChecks(gc, op, [tensor, mask], ref)

    @given(**hu.gcs)
    def test_index_put(self, gc, dc):
        op = core.CreateOperator("ATen", ['self', 'indices', 'values'], ["Z"],
                                 operator="index_put")

        def ref(self, indices, values):
            self[indices] = values
            return (self, )

        tensor = np.random.randn(3, 3).astype(np.float32)
        mask = np.array([[True, True, True], [True, False, False],
                         [True, True, False]])
        values = np.random.randn(6).astype(np.float32)

        self.assertReferenceChecks(gc, op, [tensor, mask, values], ref)

    @given(**hu.gcs)
    def test_unique(self, gc, dc):
        op = core.CreateOperator(
            "ATen",
            ['self'],
            ["output"],
            sorted=True,
            return_inverse=True,
            # return_counts=False,
            operator="_unique")

        def ref(self):
            # np.unique returns (unique_values, inverse_indices) here; the
            # test only compares the unique values against the operator output.
            uniques, _ = np.unique(self,
                                   return_index=False,
                                   return_inverse=True,
                                   return_counts=False)
            return (uniques, )

        tensor = np.array([1, 2, 6, 4, 2, 3, 2])
        print(ref(tensor))
        self.assertReferenceChecks(gc, op, [tensor], ref)
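For reference, the np.unique call used by test_unique returns a pair when return_inverse=True; a quick standalone illustration with the same input:

import numpy as np

tensor = np.array([1, 2, 6, 4, 2, 3, 2])
uniques, inverse = np.unique(tensor, return_inverse=True)
print(uniques)   # [1 2 3 4 6]
print(inverse)   # index of each input element within `uniques`
assert (uniques[inverse] == tensor).all()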
Example #16
0
class PythonOpTest(hu.HypothesisTestCase):
    @given(x=hu.tensor())
    def test_feed(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(type(inputs[0].shape), tuple)
            self.assertEqual(type(inputs[0].data), np.ndarray)
            np.testing.assert_almost_equal(x, inputs[0].data)

        op = CreatePythonOperator(f, ["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor())
    def test_feed_with_helper_function(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(type(inputs[0].shape), tuple)
            self.assertEqual(type(inputs[0].data), np.ndarray)
            np.testing.assert_almost_equal(x, inputs[0].data)

        net = core.Net("test")
        net.Python(f)(["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunNetOnce(net)

    @given(x=hu.tensor())
    def test_feed_with_gc(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            np.testing.assert_almost_equal(x, inputs[0].data)

        op = CreatePythonOperator(f, ["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)
        del f
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor())
    def test_reshape(self, x):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(x.shape, outputs[0].shape)
            outputs[0].data[...] = inputs[0].data

        op = CreatePythonOperator(f, ["x"], ["y"])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)
        y = workspace.FetchBlob("y")
        np.testing.assert_almost_equal(x, y)

    @given(x=hu.tensor())
    def test_caught_exception_doesnt_terminate(self, x):
        def f(inputs, outputs):
            try:
                raise Exception("Exception in handler")
            except Exception:
                pass

        op = CreatePythonOperator(f, ["x"], ["y"])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor(),
           n=st.integers(min_value=1, max_value=20),
           w=st.integers(min_value=1, max_value=20))
    def test_multithreaded_evaluation(self, x, n, w):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            outputs[0].data[...] = inputs[0].data

        ops = [CreatePythonOperator(f, ["x"], [str(i)]) for i in range(n)]
        net = core.Net("net")
        net.Proto().op.extend(ops)
        net.Proto().type = "dag"
        net.Proto().num_workers = w
        iters = 100
        plan = core.Plan("plan")
        plan.AddStep(core.ExecutionStep("test-step", net, iters))
        workspace.FeedBlob("x", x)
        workspace.RunPlan(plan.Proto().SerializeToString())
        for i in range(n):
            y = workspace.FetchBlob(str(i))
            np.testing.assert_almost_equal(x, y)

    @given(x=hu.tensor(), in_place=st.booleans())
    def test_gradient(self, x, in_place):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            outputs[0].data[...] = inputs[0].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            grad_output = inputs[2]

            grad_input = outputs[0]
            grad_input.reshape(grad_output.shape)
            grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(f, ["x"], ["x" if in_place else "y"],
                                  grad_f=grad_f)
        self.assertGradientChecks(hu.cpu_do, op, [x], 0, [0])

    @given(inputs=hu.tensors(n=2))
    def test_gradient_multiple(self, inputs):
        (x1, x2) = inputs

        def f(inputs, outputs):
            for idx in [0, 1]:
                self.assertEqual(type(inputs[idx].shape), tuple)
                outputs[idx].reshape(inputs[idx].shape)
                outputs[idx].data[...] = inputs[idx].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            self.assertEqual(len(inputs), 6)
            self.assertEqual(len(outputs), 2)
            for (grad_output_idx, grad_input_idx) in [(4, 0), (5, 1)]:
                grad_output = inputs[grad_output_idx]
                grad_input = outputs[grad_input_idx]
                grad_input.reshape(grad_output.shape)
                grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(f, ["x1", "x2"], ["y1", "y2"], grad_f=grad_f)

        for idx in [0, 1]:
            self.assertGradientChecks(hu.cpu_do, op, [x1, x2], idx, [0, 1])
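A minimal standalone usage sketch of the Python-operator machinery exercised above: the function copies its input blob to its output, as in test_reshape. It assumes the same caffe2 imports this test module already relies on (workspace and CreatePythonOperator); the names copy_op, "x", and "y" are illustrative.

import numpy as np
from caffe2.python import workspace
from caffe2.python.core import CreatePythonOperator

def copy_op(inputs, outputs):
    # Shape the output like the input and copy the data across.
    outputs[0].reshape(inputs[0].shape)
    outputs[0].data[...] = inputs[0].data

op = CreatePythonOperator(copy_op, ["x"], ["y"])
workspace.FeedBlob("x", np.random.randn(2, 3).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("y"))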
Example #17
0
class TestMomentumSGD(hu.HypothesisTestCase):
    @given(n=st.integers(4, 8), **hu.gcs)
    def test_momentum_sgd(self, n, gc, dc):
        param = np.random.rand(n).astype(np.float32)
        grad = np.random.rand(n).astype(np.float32)
        lr = np.random.rand(1).astype(np.float32)
        param_momentum = np.random.rand(n).astype(np.float32)
        momentum = 0.9

        def momentum_sgd(grad, param_momentum, lr, param=None):
            adjgrad = lr * grad + momentum * param_momentum
            if param is None:
                return [adjgrad, adjgrad]
            else:
                paramup = param - adjgrad
                return [adjgrad, adjgrad, paramup]

        op = core.CreateOperator(
            "MomentumSGDUpdate",
            ["grad", "param_momentum", "lr", "param"],
            ["grad", "param_momentum", "param"],
            momentum=momentum,
            nesterov=0,
        )

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[grad, param_momentum, lr, param],
                                   reference=momentum_sgd)

        op_noparam = core.CreateOperator(
            "MomentumSGD",
            ["grad", "param_momentum", "lr"],
            ["grad", "param_momentum"],
            momentum=momentum,
            nesterov=0,
        )

        self.assertReferenceChecks(device_option=gc,
                                   op=op_noparam,
                                   inputs=[grad, param_momentum, lr],
                                   reference=momentum_sgd)

    @given(inputs=hu.tensors(n=3),
           momentum=st.floats(min_value=0.1, max_value=0.9),
           nesterov=st.booleans(),
           lr=st.floats(min_value=0.1, max_value=0.9),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_momentum_sgd(self, inputs, momentum, nesterov, lr,
                                 data_strategy, gc, dc):
        w, grad, m = inputs

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))), )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]
        # Make momentum >= 0
        m = np.abs(m)
        # Convert lr to a numpy array
        lr = np.asarray([lr], dtype=np.float32)

        op = core.CreateOperator("SparseMomentumSGDUpdate",
                                 ["grad", "m", "lr", "param", "indices"],
                                 ["adjusted_grad", "m", "param"],
                                 momentum=momentum,
                                 nesterov=int(nesterov),
                                 device_option=gc)

        # Reference
        def momentum_sgd(grad, m, lr):
            lr = lr[0]
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * m
                return (adjusted_gradient, adjusted_gradient)
            else:
                m_new = momentum * m + lr * grad
                return ((1 + momentum) * m_new - momentum * m, m_new)

        def sparse(grad, m, lr, param, i):
            grad_new, m_new = momentum_sgd(grad, m[i], lr)
            m[i] = m_new
            param[i] -= grad_new
            return (grad_new, m, param)

        self.assertReferenceChecks(gc, op, [grad, m, lr, w, indices], sparse)
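Restating the two update rules these tests check as one small standalone NumPy function (a plain rewrite of the references above, not the operator itself; the function name is illustrative):

import numpy as np

def momentum_sgd_step(param, m, grad, lr, momentum=0.9, nesterov=False):
    if not nesterov:
        # Plain momentum: the buffer itself becomes the adjusted gradient.
        adjusted_grad = lr * grad + momentum * m
        m_new = adjusted_grad
    else:
        # Nesterov momentum, as in the sparse reference above.
        m_new = momentum * m + lr * grad
        adjusted_grad = (1 + momentum) * m_new - momentum * m
    return param - adjusted_grad, m_new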
Example #18
0
class TestMaskedAdagrad(hu.HypothesisTestCase):
    @given(
        inputs=hu.tensors(n=3),
        lr=st.floats(
            min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False
        ),
        epsilon=st.floats(
            min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False
        ),
    )
    def test_masked_adagrad(self, inputs, lr, epsilon):
        param, moment, grad = inputs
        moment = np.abs(moment)
        lr = np.array([lr], dtype=np.float32)

        mask = np.random.randint(2, size=param.shape).astype(np.float32)

        workspace.FeedBlob("param", param)
        workspace.FeedBlob("moment", moment)
        workspace.FeedBlob("grad", grad)
        workspace.FeedBlob("lr", lr)
        workspace.FeedBlob("mask", mask)

        ref_op = core.CreateOperator(
            "Adagrad",
            ["param", "moment", "grad", "lr"],
            ["out_param_ref", "out_moment_ref"],
            epsilon=epsilon,
        )
        op = core.CreateOperator(
            "MaskedAdagrad",
            ["param", "moment", "grad", "lr", "mask"],
            ["out_param", "out_moment"],
            epsilon=epsilon,
        )

        workspace.RunOperatorOnce(ref_op)
        workspace.RunOperatorOnce(op)

        out_param_ref = workspace.FetchBlob("out_param_ref")
        out_moment_ref = workspace.FetchBlob("out_moment_ref")
        out_param_ref = np.multiply(mask, out_param_ref)
        out_moment_ref = np.multiply(mask, out_moment_ref)

        out_param = workspace.FetchBlob("out_param")
        out_moment = workspace.FetchBlob("out_moment")

        np.testing.assert_array_equal(out_param_ref, out_param)
        np.testing.assert_array_equal(out_moment_ref, out_moment)

    @given(
        inputs=hu.tensors(n=3),
        lr=st.floats(
            min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False
        ),
        epsilon=st.floats(
            min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False
        ),
        has_mask_input=st.booleans(),
        has_mask_out=st.booleans(),
        block_size=st.integers(1, 4),
        row_wise=st.booleans(),
    )
    def test_masked_sparse_adagrad(
        self,
        inputs,
        lr,
        epsilon,
        has_mask_input,
        has_mask_out,
        block_size,
        row_wise,
    ):
        param, moment, grad = inputs
        num_rows = param.shape[0]
        if row_wise:
            moment = np.resize(moment, num_rows)
        moment = np.abs(moment)
        lr = np.array([lr], dtype=np.float32)
        param_ref = np.copy(param)
        moment_ref = np.copy(moment)

        indices = np.random.randint(num_rows, size=grad.shape[0])

        workspace.ResetWorkspace()

        row_size = int(np.prod(param.shape[1:]))
        num_blocks_per_row = (row_size + block_size - 1) // block_size
        bitmask_bytes_per_row = (num_blocks_per_row + 7) // 8
        if has_mask_input:
            # Generate a random bit pattern
            mask = np.random.randint(
                np.iinfo(np.uint8).min,
                np.iinfo(np.uint8).max + 1,
                size=[num_rows, bitmask_bytes_per_row],
                dtype=np.uint8,
            )
            workspace.FeedBlob("mask", mask)
        else:
            delays = np.array([1, 2, 3]).astype(np.int32)
            # Make sure to use numbers that can be exactly represented in
            # float32 to avoid potentially different ways of handling floats
            # between Python and C++.
            prune_ratios = np.array([0.5, 0.75, 0.875]).astype(np.float32)

            # Feed empty mask
            workspace.FeedBlob("mask", np.array([]).astype(np.uint8))

        workspace.FeedBlob("param_ref", param_ref)
        workspace.FeedBlob("moment_ref", moment_ref)
        workspace.FeedBlob("param", param)
        workspace.FeedBlob("moment", moment)
        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("grad", grad)
        workspace.FeedBlob("lr", lr)

        net = core.Net("test_net")

        prefix = "RowWise" if row_wise else ""
        ref_op = core.CreateOperator(
            prefix + "SparseAdagrad",
            ["param_ref", "moment_ref", "indices", "grad", "lr"],
            ["param_ref", "moment_ref"],
            epsilon=epsilon,
        )

        inputs = ["param", "moment", "indices", "grad", "lr", "mask", "mask_changed"]
        outputs = ["param", "moment"]
        if not has_mask_input:
            inputs += ["iter"]
            if has_mask_out:
                outputs += ["mask_out"]

        op = core.CreateOperator(
            "Masked" + prefix + "SparseAdagrad",
            inputs,
            outputs,
            epsilon=epsilon,
            block_size=block_size,
            delays=[] if has_mask_input else delays,
            prune_ratios=[] if has_mask_input else prune_ratios,
        )
        net.Proto().op.extend([ref_op, op])

        workspace.FeedBlob("mask_changed", np.array([0]).astype(np.bool))
        workspace.FeedBlob("iter", np.array([0]).astype(np.int64))
        workspace.CreateNet(net)

        if has_mask_input:
            # Test1: if mask_changed == false, only the rows we're updating are masked
            workspace.RunNet(net)

            param_ref = workspace.FetchBlob("param_ref")
            moment_ref = workspace.FetchBlob("moment_ref")
            param = workspace.FetchBlob("param")
            moment = workspace.FetchBlob("moment")

            param_ref = param_ref.reshape(num_rows, -1)
            if not row_wise:
                moment_ref = moment_ref.reshape(num_rows, -1)

            for i in range(grad.shape[0]):
                row = indices[i]
                for j in range(row_size):
                    j_block = j // block_size
                    byte = j_block // 8
                    bit = j_block % 8
                    m = mask[row][byte] & (1 << bit)
                    if not m:
                        param_ref[row, j] = 0
                        if not row_wise:
                            moment_ref[row, j] = 0

            np.testing.assert_array_equal(param_ref, param.reshape(num_rows, -1))
            np.testing.assert_array_equal(
                moment_ref, moment if row_wise else moment.reshape(num_rows, -1)
            )

            # Test2: mask_changed == true
            workspace.FeedBlob("param_ref", param_ref)
            workspace.FeedBlob("moment_ref", moment_ref)
            workspace.FeedBlob("mask_changed", np.array([1]).astype(np.bool))
            workspace.RunNet(net)

            param_ref = workspace.FetchBlob("param_ref")
            moment_ref = workspace.FetchBlob("moment_ref")

            for i in range(num_rows):
                for j in range(row_size):
                    j_block = j // block_size
                    byte = j_block // 8
                    bit = j_block % 8
                    m = mask[i][byte] & (1 << bit)
                    if not m:
                        param_ref[i, j] = 0
                        if not row_wise:
                            moment_ref[i, j] = 0

            param = workspace.FetchBlob("param")
            moment = workspace.FetchBlob("moment")

            np.testing.assert_array_equal(param_ref, param.reshape(num_rows, -1))
            np.testing.assert_array_equal(
                moment_ref, moment if row_wise else moment.reshape(num_rows, -1)
            )
        else:
            # Test1: in the first iteration, there shouldn't be any masking
            workspace.RunNet(net)

            param_ref = workspace.FetchBlob("param_ref")
            moment_ref = workspace.FetchBlob("moment_ref")

            param = workspace.FetchBlob("param")
            moment = workspace.FetchBlob("moment")

            np.testing.assert_array_equal(param_ref, param)
            np.testing.assert_array_equal(moment_ref, moment)

            # Test2: for each pruning delay, masks should be updated accordingly
            for i in range(len(delays)):
                mask = _get_mask(param_ref, block_size, prune_ratios[i])

                workspace.FeedBlob("iter", np.array([delays[i]]).astype(np.int64))
                workspace.RunNet(net)

                param_ref = workspace.FetchBlob("param_ref")
                moment_ref = workspace.FetchBlob("moment_ref")

                param = workspace.FetchBlob("param")
                moment = workspace.FetchBlob("moment")

                param_ref = mask * param_ref.reshape(num_rows, row_size)
                if not row_wise:
                    moment_ref = mask * moment_ref.reshape(num_rows, row_size)

                np.testing.assert_array_equal(param_ref.flatten(), param.flatten())
                np.testing.assert_array_equal(moment_ref.flatten(), moment.flatten())

            # Test3: after finishing delay, mask should be fixed
            workspace.FeedBlob("iter", np.array([delays[-1] + 1]).astype(np.int64))
            workspace.RunNet(net)

            param_ref = workspace.FetchBlob("param_ref")
            moment_ref = workspace.FetchBlob("moment_ref")

            param = workspace.FetchBlob("param")
            moment = workspace.FetchBlob("moment")

            param_ref = mask * param_ref.reshape(num_rows, row_size)
            if not row_wise:
                moment_ref = mask * moment_ref.reshape(num_rows, row_size)

            np.testing.assert_array_equal(param_ref.flatten(), param.flatten())
            np.testing.assert_array_equal(moment_ref.flatten(), moment.flatten())
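The verification loops above decode a packed bitmask: one bit per block of block_size consecutive elements, eight blocks per byte, least-significant bit first. A small standalone helper mirroring that addressing (illustrative only, not part of the test file):

import numpy as np

def element_is_kept(mask_row, j, block_size):
    # Which block does element j belong to, and which byte/bit encodes it?
    j_block = j // block_size
    byte, bit = divmod(j_block, 8)
    return bool(mask_row[byte] & (1 << bit))

mask_row = np.array([0b00000101], dtype=np.uint8)   # blocks 0 and 2 kept
print([element_is_kept(mask_row, j, block_size=2) for j in range(6)])
# -> [True, True, False, False, True, True]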
Example #19
0
class DistanceTest(serial.SerializedTestCase):
    @serial.given(n=st.integers(1, 3),
                  dim=st.integers(4, 16),
                  **hu.gcs)
    def test_cosine_similarity(self, n, dim, gc, dc):
        X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32)
        Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32)
        self.ws.create_blob("X").feed(X)
        self.ws.create_blob("Y").feed(Y)
        kEps = 1e-12
        cos_op = core.CreateOperator("CosineSimilarity", ["X", "Y"], ["cos"])
        self.ws.run(cos_op)
        cos = np.divide(np.multiply(X, Y).sum(axis=1),
                        np.multiply(np.linalg.norm(X, axis=1) + kEps,
                                    np.linalg.norm(Y, axis=1) + kEps))
        np.testing.assert_allclose(self.ws.blobs[("cos")].fetch(), cos,
                                   rtol=1e-4, atol=1e-4)
        self.assertGradientChecks(gc, cos_op, [X, Y], 0, [0],
                                  stepsize=1e-2, threshold=1e-2)
        self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0],
                                  stepsize=1e-2, threshold=1e-2)

    @serial.given(inputs=hu.tensors(n=2,
                                    min_dim=1,
                                    max_dim=2,
                                    dtype=np.float32),
                  **hu.gcs)
    def test_dot_product(self, inputs, gc, dc):
        X, Y = inputs
        op = core.CreateOperator(
            'DotProduct',
            ['X', 'Y'],
            ['DOT'],
        )

        def dot_ref(X, Y):
            return ([np.dot(x, y) for x, y in zip(X, Y)],)

        # Check against numpy dot reference
        self.assertReferenceChecks(gc, op, [X, Y], dot_ref)
        # Check over multiple devices
        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
        # Gradient check wrt Y
        self.assertGradientChecks(gc, op, [X, Y], 1, [0])

    @serial.given(n=st.integers(1, 3),
                  dim=st.integers(4, 16),
                  **hu.gcs)
    def test_L1_distance(self, n, dim, gc, dc):
        X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32)
        Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32)
        # avoid kinks by moving away from 0
        X += 0.02 * np.sign(X - Y)
        X[(X - Y) == 0.0] += 0.02

        self.ws.create_blob("X").feed(X)
        self.ws.create_blob("Y").feed(Y)
        op = core.CreateOperator(
            'L1Distance',
            ['X', 'Y'],
            ['l1_dist'],
        )
        self.ws.run(op)
        np.testing.assert_allclose(self.ws.blobs[("l1_dist")].fetch(),
                                    [np.linalg.norm(x - y, ord=1)
                                        for x, y in zip(X, Y)],
                                    rtol=1e-4, atol=1e-4)

        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X, Y], 0, [0],
                                  stepsize=1e-2, threshold=1e-2)
        # Gradient check wrt Y
        self.assertGradientChecks(gc, op, [X, Y], 1, [0],
                                  stepsize=1e-2, threshold=1e-2)

    @serial.given(n=st.integers(1, 3),
                  dim=st.integers(4, 16),
                  **hu.gcs)
    def test_L2_distance(self, n, dim, gc, dc):
        X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32)
        Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32)
        self.ws.create_blob("X").feed(X)
        self.ws.create_blob("Y").feed(Y)
        l2_op = core.CreateOperator("SquaredL2Distance",
                                    ["X", "Y"], ["l2_dist"])
        self.ws.run(l2_op)
        np.testing.assert_allclose(self.ws.blobs[("l2_dist")].fetch(),
                                   np.square(X - Y).sum(axis=1) * 0.5,
                                   rtol=1e-4, atol=1e-4)
        self.assertGradientChecks(gc, l2_op, [X, Y], 0, [0],
                                  stepsize=1e-2, threshold=1e-2)
        self.assertGradientChecks(gc, l2_op, [X, Y], 1, [0],
                                  stepsize=1e-2, threshold=1e-2)
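For quick reference, plain-NumPy versions of the three row-wise quantities these tests compare against, for X and Y of shape (n, dim) (function names are illustrative):

import numpy as np

def cosine_similarity(X, Y, eps=1e-12):
    # Matches the test's kEps-stabilized norms.
    return (X * Y).sum(axis=1) / (
        (np.linalg.norm(X, axis=1) + eps) * (np.linalg.norm(Y, axis=1) + eps))

def l1_distance(X, Y):
    return np.abs(X - Y).sum(axis=1)

def squared_l2_distance(X, Y):
    # SquaredL2Distance includes the 0.5 factor, as in the test.
    return 0.5 * np.square(X - Y).sum(axis=1)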
Example #20
0
class TestAdagrad(hu.HypothesisTestCase):
    @staticmethod
    def ref_adagrad(param_in, mom_in, grad, lr, epsilon):
        mom_out = mom_in + np.square(grad)
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    def test_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(self.ref_adagrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        indices = np.arange(grad.shape[0])
        indices = indices[indices % 2 == 0]
        grad = grad[indices]
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[index] = self.ref_adagrad(
                    param[index], momentum[index], grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)
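A tiny numeric sanity check of the ref_adagrad formula above, assuming the TestAdagrad class defined here is in scope and using illustrative values: with param=1, momentum=0, grad=2, lr=0.1 and epsilon=0.01, the accumulated moment becomes 4 and the parameter moves by 0.1 * 2 / (2 + 0.01) ≈ 0.0995.

import numpy as np

param_out, mom_out = TestAdagrad.ref_adagrad(
    np.float32(1.0), np.float32(0.0), np.float32(2.0), np.float32(0.1), 0.01)
print(mom_out)    # 4.0
print(param_out)  # ~1.0995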
Example #21
0
class TestLayerNormOp(serial.SerializedTestCase):
    @serial.given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_grad_op(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        epsilon = 1e-4
        op = core.CreateOperator(
            "LayerNormGradient",
            ["gout", "out", "mean", "stdev", "in"],
            ["gin"],
            axis=axis,
            epsilon=epsilon,
        )

        norm, mean, stdev = _layer_norm_ref(axis, epsilon, X)
        gout = norm

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[gout, norm, mean, stdev, X],
                                   reference=partial(_layer_norm_grad_ref,
                                                     axis))
        self.assertDeviceChecks(
            device_options=dc,
            op=op,
            inputs=[gout, norm, mean, stdev, X],
            outputs_to_check=[0],
        )

    @given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_op(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        epsilon = 1e-4
        op = core.CreateOperator(
            "LayerNorm",
            ["input"],
            ["output", "mean", "stdev"],
            axis=axis,
            epsilon=epsilon,
        )

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X],
                                   reference=partial(_layer_norm_ref, axis,
                                                     epsilon))
        self.assertDeviceChecks(
            device_options=dc,
            op=op,
            inputs=[X],
            outputs_to_check=[0, 1, 2],
        )

    @given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_op_pytorch(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        epsilon = 1e-4

        expected_norm, expected_mean, expected_stdev = _layer_norm_ref(
            axis, epsilon, X)
        actual_norm, actual_mean, actual_stdev = torch.ops.caffe2.layer_norm_dont_use_this_op_yet(
            torch.tensor(X), axis, epsilon)

        torch.testing.assert_allclose(expected_norm, actual_norm)
        torch.testing.assert_allclose(expected_mean, actual_mean)
        torch.testing.assert_allclose(expected_stdev, actual_stdev)

    @given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_brew_wrapper(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        scale_dim = [1] * np.ndim(X)
        scale_dim[axis] = X.shape[axis]

        self.ws.create_blob('input').feed(X)

        model = ModelHelper(name='test_layer_norm_brew_wrapper')
        brew.layer_norm(
            model,
            'input',
            'output',
            dim_in=X.shape[axis],
            axis=axis,
            epsilon=1e-4,
        )

        self.ws.create_net(model.param_init_net).run()
        self.ws.create_net(model.net).run()
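A standalone NumPy restatement of the layer-norm reference these tests rely on (_layer_norm_ref here; the same formula is written out inline in the next example): flatten everything after axis into rows, then normalize each row by its mean and a biased standard deviation stabilized with epsilon. The function name is illustrative.

import numpy as np

def layer_norm(X, axis, epsilon=1e-4):
    left = int(np.prod(X.shape[:axis]))
    reshaped = X.reshape(left, -1)
    mean = reshaped.mean(axis=1, keepdims=True)
    stdev = np.sqrt(np.square(reshaped).mean(axis=1, keepdims=True)
                    - np.square(mean) + epsilon)
    norm = ((reshaped - mean) / stdev).reshape(X.shape)
    return (norm,
            mean.reshape(X.shape[:axis] + (1,)),
            stdev.reshape(X.shape[:axis] + (1,)))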
Example #22
0
class TestLayerNormOp(hu.HypothesisTestCase):
    @given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_grad_op(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        epsilon = 1e-4
        op = core.CreateOperator(
            "LayerNormGradient",
            ["gout", "out", "mean", "stdev", "in"],
            ["gin"],
            axis=axis,
            epsilon=epsilon,
        )

        def layer_norm_ref(X):
            left = int(np.prod(X.shape[:axis]))
            reshaped = np.reshape(X, [left, -1])
            mean = np.mean(reshaped, axis=1).reshape([left, 1])
            stdev = np.sqrt(
                np.mean(np.square(reshaped), axis=1).reshape([left, 1]) -
                np.power(mean, 2) + epsilon)
            norm = (reshaped - mean) / (stdev)
            norm = np.reshape(norm, X.shape)
            mean = np.reshape(mean, X.shape[:axis] + (1, ))
            stdev = np.reshape(stdev, X.shape[:axis] + (1, ))
            return [norm, mean, stdev]

        norm, mean, stdev = layer_norm_ref(X)
        gout = norm

        def layer_norm_grad_ref(gout_full, norm, mean_full, stdev_full,
                                X_full):
            left = int(np.prod(X_full.shape[:axis]))
            right = int(np.prod(X_full.shape[axis:]))
            X = np.reshape(X_full, [left, right])
            stdev = np.reshape(stdev_full, [left, 1])
            mean = np.reshape(mean_full, [left, 1])
            gout = np.reshape(gout_full, [left, right])
            dstdev_end = (-1.0) / np.power(stdev, 2.0) \
                    * np.sum((X - mean) * gout, axis=1).reshape([left, 1])
            dmean_end = np.sum(-1.0 / stdev * gout, axis=1).reshape([left, 1])
            dx_end = 1.0 / stdev * gout

            # stdev block
            dmean_stdev = -1.0 * mean / stdev * dstdev_end
            dx_stdev = X / (right * stdev) * dstdev_end

            # mean block
            dmean = dmean_end + dmean_stdev
            dxmean = (1.0 / right) * dmean

            # final outputs
            dx = dx_end + dx_stdev + dxmean
            dx = dx.reshape(X_full.shape)

            return [dx]

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[gout, norm, mean, stdev, X],
                                   reference=layer_norm_grad_ref)
        self.assertDeviceChecks(
            device_options=dc,
            op=op,
            inputs=[gout, norm, mean, stdev, X],
            outputs_to_check=[0],
        )

    @given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_op(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        epsilon = 1e-4
        op = core.CreateOperator(
            "LayerNorm",
            ["input"],
            ["output", "mean", "stdev"],
            axis=axis,
            epsilon=epsilon,
        )

        def layer_norm_ref(X):
            left = int(np.prod(X.shape[:axis]))
            reshaped = np.reshape(X, [left, -1])
            mean = np.mean(reshaped, axis=1).reshape([left, 1])
            stdev = np.sqrt(
                np.mean(np.power(reshaped, 2), axis=1).reshape([left, 1]) -
                np.power(mean, 2) + epsilon)
            norm = (reshaped - mean) / (stdev)
            norm = np.reshape(norm, X.shape)
            mean = np.reshape(mean, X.shape[:axis] + (1, ))
            stdev = np.reshape(stdev, X.shape[:axis] + (1, ))
            return [norm, mean, stdev]

        self.assertReferenceChecks(device_option=gc,
                                   op=op,
                                   inputs=[X],
                                   reference=layer_norm_ref)
        self.assertDeviceChecks(
            device_options=dc,
            op=op,
            inputs=[X],
            outputs_to_check=[0, 1, 2],
        )

    @given(X=hu.tensors(n=1), **hu.gcs)
    def test_layer_norm_brew_wrapper(self, X, gc, dc):
        X = X[0]
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=0)
        axis = np.random.randint(0, len(X.shape))
        scale_dim = [1] * np.ndim(X)
        scale_dim[axis] = X.shape[axis]

        self.ws.create_blob('input').feed(X)

        model = ModelHelper(name='test_layer_norm_brew_wrapper')
        brew.layer_norm(
            model,
            'input',
            'output',
            dim_in=X.shape[axis],
            axis=axis,
            epsilon=1e-4,
        )

        self.ws.create_net(model.param_init_net).run()
        self.ws.create_net(model.net).run()
Example #23
0
class TestAdam(hu.HypothesisTestCase):

    @staticmethod
    def ref_adam(param, mom1, mom2, grad, LR, ITER,
                 beta1, beta2, epsilon, output_grad=False):
        t = ITER + 1
        corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad)
        grad_out = corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        param_out = param + LR * grad_out
        if output_grad:
            return param_out, mom1_out, mom2_out, grad_out
        else:
            return param_out, mom1_out, mom2_out

    @staticmethod
    def ref_smart_decay_adam(param, mom1, mom2, last_seen, grad, LR, ITER,
                             beta1, beta2, epsilon):
        t = ITER + 1

        k = int(np.array(t - last_seen).flatten()[0])
        last_seen_out = t

        if beta1 == 0.0:
            mom1_out = grad
            mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad)
            grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon)
            param_out = param + LR * grad_out
            return param_out, mom1_out, mom2_out, last_seen_out

        # Make up for lost minibatches.
        else:
            mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad)
            p_out = param
            m = mom1
            # For catchup
            for i in range(k - 1):
                m *= beta1
                update = m / (np.sqrt(mom2_out) + epsilon)
                p_out += LR * update
            # For the single step update
            mom1_out = m * beta1 + grad * (1 - beta1)
            grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon)
            param_out = p_out + LR * grad_out
            return param_out, mom1_out, mom2_out, last_seen_out

    @staticmethod
    def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER,
                          beta1, beta2, epsilon, output_grad=False):
        t = ITER + 1
        corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad))
        grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon)
        param_out = param + LR * grad_out
        if output_grad:
            return param_out, mom1_out, mom2_out, grad_out
        else:
            return param_out, mom1_out, mom2_out

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        mom2 = np.abs(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, grad, LR, ITER],
            functools.partial(
                self.ref_adam,
                beta1=beta1, beta2=beta2, epsilon=epsilon),
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        mom2 = np.abs(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2", "output_grad"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, grad, LR, ITER],
            functools.partial(
                self.ref_adam,
                beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True),
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                         data_strategy, gc, dc):
        param, mom1, mom2, grad = inputs
        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(
                np.unique(indices.flatten()),
                np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)

            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_adam(param[index], mom1[index], mom2[index],
                                  grad[i], LR, ITER,
                                  beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            ref_sparse,
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_smart_decay_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                                     data_strategy, gc, dc):
        param, mom1, mom2, grad = inputs

        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        # Here we will define the last_seen tensor as being randomly from 0 to ITER
        # (the value of t to be tested will be ITER+1)
        last_seen = np.random.randint(low=0, high=ITER + 1, size=param.shape, dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(
                np.unique(indices.flatten()),
                np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SmartDecaySparseAdam",
            ["param", "mom1", "mom2", "last_seen", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2", "last_seen"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_sparse(param, mom1, mom2, last_seen, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            last_seen_out = np.copy(last_seen)

            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index], last_seen_out[index] = \
                    self.ref_smart_decay_adam(param[index], mom1[index], mom2[index], last_seen[index],
                                              grad[i], LR, ITER,
                                              beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out, last_seen_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, last_seen, indices, grad, LR, ITER],
            ref_sparse,
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon,
                                     data_strategy, gc, dc):
        param, mom1, mom2, grad = inputs
        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(
                np.unique(indices.flatten()),
                np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2", "output_grad"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER,
                                   beta1, beta2, epsilon, output_grad):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            grad_out = np.copy(grad)

            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \
                    self.ref_adam(param[index], mom1[index], mom2[index],
                                  grad[i], LR, ITER,
                                  beta1, beta2, epsilon, output_grad)
            return (param_out, mom1_out, mom2_out, grad_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            functools.partial(
                ref_sparse_output_grad,
                beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True),
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=3),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                                  data_strategy, gc, dc):
        param, mom1, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create a 1D row-wise average 2nd moment tensor.
        mom2 = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32))
        )
        mom2 = np.absolute(mom2)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # Keep only draws in which the generated indices are unique
        hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_row_wise_adam(param[index], mom1[index], mom2[index],
                                           grad[i], LR, ITER,
                                           beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertDeviceChecks(
            dc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            [0, 1, 2],
            input_device_options=input_device_options)

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            ref_row_wise_sparse,
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=3),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2,
                                              epsilon, data_strategy, gc, dc):
        param, mom1, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create a 1D row-wise average 2nd moment tensor.
        mom2 = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32))
        )
        mom2 = np.absolute(mom2)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # Keep only draws in which the generated indices are unique
        hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2", "output_grad"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER,
                                            beta1, beta2, epsilon, output_grad):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            grad_out = np.copy(grad)

            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \
                    self.ref_row_wise_adam(param[index], mom1[index], mom2[index],
                                           grad[i], LR, ITER,
                                           beta1, beta2, epsilon, output_grad)
            return (param_out, mom1_out, mom2_out, grad_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertDeviceChecks(
            dc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            [0, 1, 2, 3],
            input_device_options=input_device_options)

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            functools.partial(
                ref_row_wise_sparse_output_grad,
                beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True),
            input_device_options=input_device_options)
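
The sparse and row-wise tests above all defer to ref_adam / ref_row_wise_adam helpers defined earlier in the file. As a minimal sketch, assuming the standard bias-corrected Adam update and the additive sign convention used by the other optimizer references in this document, the per-element step looks like the following (the row-wise variant keeps one second moment per row, averaged over that row's squared gradient):

import numpy as np

def adam_step_sketch(param, mom1, mom2, grad, lr, it, beta1, beta2, epsilon):
    # Illustrative only -- not the operator's implementation.
    t = it + 1
    corrected_lr = lr * np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    mom1_out = beta1 * mom1 + (1.0 - beta1) * grad
    mom2_out = beta2 * mom2 + (1.0 - beta2) * np.square(grad)
    step = corrected_lr * mom1_out / (np.sqrt(mom2_out) + epsilon)
    return param + step, mom1_out, mom2_out
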
Beispiel #24
0
class LpnormTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32),
           **hu.gcs)
    @settings(deadline=1000)
    def test_Lp_Norm(self, inputs, gc, dc):
        X = inputs[0]
        # avoid kinks by moving away from 0
        X += 0.02 * np.sign(X)
        X[X == 0.0] += 0.02
        self.ws.create_blob("X").feed(X)
        op = core.CreateOperator(
            'LpNorm',
            ['X'],
            ['l1_norm'],
            p=1,
        )
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs[("l1_norm")].fetch(),
                                   np.linalg.norm((X).flatten(), ord=1),
                                   rtol=1e-4,
                                   atol=1e-4)

        self.assertDeviceChecks(dc, op, [X], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc,
                                  op, [X],
                                  0, [0],
                                  stepsize=1e-2,
                                  threshold=1e-2)

        op = core.CreateOperator(
            'LpNorm',
            ['X'],
            ['l2_norm'],
            p=2,
        )
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs[("l2_norm")].fetch(),
                                   np.linalg.norm((X).flatten(), ord=2)**2,
                                   rtol=1e-4,
                                   atol=1e-4)

        self.assertDeviceChecks(dc, op, [X], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc,
                                  op, [X],
                                  0, [0],
                                  stepsize=1e-2,
                                  threshold=1e-2)

        op = core.CreateOperator('LpNorm', ['X'], ['l2_averaged_norm'],
                                 p=2,
                                 average=True)
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs[("l2_averaged_norm")].fetch(),
                                   np.linalg.norm(
                                       (X).flatten(), ord=2)**2 / X.size,
                                   rtol=1e-4,
                                   atol=1e-4)

    @given(x=hu.tensor(min_dim=1,
                       max_dim=10,
                       dtype=np.float32,
                       elements=st.integers(min_value=-100, max_value=100)),
           p=st.integers(1, 2),
           average=st.integers(0, 1))
    def test_lpnorm_shape_inference(self, x, p, average):
        workspace.FeedBlob('x', x)

        net = core.Net("lpnorm_test")
        result = net.LpNorm(['x'], p=p, average=bool(average))
        (shapes, types) = workspace.InferShapesAndTypes([net])
        workspace.RunNetOnce(net)

        self.assertEqual(shapes[result], list(workspace.blobs[result].shape))
        self.assertEqual(types[result], core.DataType.FLOAT)
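
The expected values in the LpNorm assertions above reduce to two NumPy one-liners; a small self-contained check (note that for p=2 the operator output is compared against the sum of squares, i.e. the squared L2 norm):

import numpy as np

def expected_lp_norm(X, p, average=False):
    # Reference values mirroring the assertions above:
    # p=1 -> sum of absolute values, p=2 -> sum of squares.
    flat = X.flatten()
    val = np.abs(flat).sum() if p == 1 else np.square(flat).sum()
    return val / X.size if average else val

X = np.array([[1.0, -2.0], [3.0, -4.0]], dtype=np.float32)
assert np.isclose(expected_lp_norm(X, p=1), 10.0)
assert np.isclose(expected_lp_norm(X, p=2), 30.0)
assert np.isclose(expected_lp_norm(X, p=2, average=True), 7.5)
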
Beispiel #25
0
class TestAdagrad(hu.HypothesisTestCase):
    @staticmethod
    def ref_adagrad(param_in, mom_in, grad, lr, epsilon):
        mom_out = mom_in + np.square(grad)
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           **hu.gcs)
    def test_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(self.ref_adagrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, data_strategy, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))), )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[index] = self.ref_adagrad(
                    param[index], momentum[index], grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, data_strategy, gc,
                                  dc):
        param, momentum = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)
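
For a concrete feel of the ref_adagrad rule defined at the top of this class, the same arithmetic on scalar inputs (numbers are illustrative only):

import numpy as np

param = np.array([1.0], dtype=np.float32)
mom = np.array([0.5], dtype=np.float32)
grad = np.array([2.0], dtype=np.float32)
lr, epsilon = 0.1, 0.01

mom_out = mom + np.square(grad)                      # 0.5 + 4.0 = 4.5
grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)  # 0.2 / (2.1213 + 0.01)
param_out = param + grad_adj                         # ~1.0938
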
Beispiel #26
0
class TestLearningRateAdaption(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           lr_alpha=st.floats(min_value=0.01,
                              max_value=0.99,
                              allow_nan=False,
                              allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_learning_rate_adaption_op_normalization(self, inputs, lr,
                                                     lr_alpha, gc, dc):
        grad, effgrad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator('LearningRateAdaption',
                                 ['lr', 'grad', 'effgrad'], ['output_lr'],
                                 lr_alpha=lr_alpha)

        def ref(lr, grad, effgrad):
            flattened_grad = grad.flatten()
            flattened_effgrad = effgrad.flatten()
            x = np.dot(flattened_grad, flattened_effgrad)
            kEps = 1e-12
            y = np.linalg.norm(flattened_grad, ord=2)
            y = np.maximum(y, kEps)
            z = np.linalg.norm(flattened_effgrad, ord=2)
            z = np.maximum(z, kEps)
            # Copy lr so the reference does not mutate its input in place
            output_lr = np.copy(lr)
            output_lr[0] -= lr[0] * lr_alpha * float(x / (y * z))
            return output_lr,

        self.assertReferenceChecks(gc, op, [lr, grad, effgrad], ref)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           lr_alpha=st.floats(min_value=0.01,
                              max_value=0.99,
                              allow_nan=False,
                              allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_learning_rate_adaption_op_without_normalization(
            self, inputs, lr, lr_alpha, gc, dc):
        grad, effgrad = inputs
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator('LearningRateAdaption',
                                 ['lr', 'grad', 'effgrad'], ['output_lr'],
                                 lr_alpha=lr_alpha,
                                 normalized_lr_adaption=False)

        def ref(lr, grad, effgrad):
            flattened_grad = grad.flatten()
            flattened_effgrad = effgrad.flatten()
            x = np.dot(flattened_grad, flattened_effgrad)
            # Copy lr so the reference does not mutate its input in place
            output_lr = np.copy(lr)
            output_lr[0] -= lr_alpha * x
            return output_lr,

        self.assertReferenceChecks(gc, op, [lr, grad, effgrad], ref)
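
Both reference functions above implement the same rule: the learning rate is decreased in proportion to the (optionally cosine-normalized) dot product between the gradient and the effective gradient. A condensed standalone sketch that avoids mutating its inputs:

import numpy as np

def adapted_lr(lr, grad, effgrad, lr_alpha, normalized=True):
    # Mirrors the ref() functions above; illustrative, not the operator itself.
    g, e = grad.flatten(), effgrad.flatten()
    x = np.dot(g, e)
    if normalized:
        kEps = 1e-12
        x /= max(np.linalg.norm(g), kEps) * max(np.linalg.norm(e), kEps)
        return lr - lr * lr_alpha * x
    return lr - lr_alpha * x
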
Beispiel #27
0
class PythonOpTest(hu.HypothesisTestCase):
    @given(x=hu.tensor())
    def test_feed(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(type(inputs[0].shape), tuple)
            self.assertEqual(type(inputs[0].data), np.ndarray)
            np.testing.assert_almost_equal(x, inputs[0].data)
        op = CreatePythonOperator(f, ["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    def test_exception(self):
        op = CreatePythonOperator(MainOpFunctionThatThrowsRuntimeError, [], [])
        with self.assertRaisesRegex(
            RuntimeError, "This is an intentional exception."
        ):
            workspace.RunOperatorOnce(op)

    @given(x=hu.tensor())
    def test_feed_with_helper_function(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(type(inputs[0].shape), tuple)
            self.assertEqual(type(inputs[0].data), np.ndarray)
            np.testing.assert_almost_equal(x, inputs[0].data)
        net = core.Net("test")
        net.Python(f)(["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunNetOnce(net)

    def test_builder_tuple(self):
        net = core.Net("builder_template")
        iter_blob = 'iter'
        net.Python((op_builder, ['name', 5], {'extra': 4.2}))([iter_blob], [])
        net.Python((op_builder, ['name', 5], {'extra': 4.2}))([iter_blob], [])
        for repeat in range(2):
            # check that the builder will be called exactly once for each
            # PythonOp constructor. Cloning the net will also trigger a call
            # to the builder when the net is created.
            cloned_net = net.Clone('builder_%d' % repeat)
            workspace.FeedBlob(iter_blob, np.array([0]))
            # Builder gets called once per python op in the line below
            workspace.CreateNet(cloned_net)
            for i in range(10):
                workspace.FeedBlob(iter_blob, np.array([i]))
                workspace.RunNet(cloned_net)

    @given(x=hu.tensor())
    def test_feed_with_gc(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            np.testing.assert_almost_equal(x, inputs[0].data)
        op = CreatePythonOperator(f, ["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)
        del f
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor())
    def test_reshape(self, x):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(x.shape, outputs[0].shape)
            outputs[0].data[...] = inputs[0].data

        op = CreatePythonOperator(f, ["x"], ["y"])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)
        y = workspace.FetchBlob("y")
        np.testing.assert_almost_equal(x, y)

    @given(x=hu.tensor())
    def test_workspace_manipulation(self, x):
        """
        Verify that python op can manipulate workspace directly
        """
        def f(inputs, outputs, ws):
            fetched = ws.blobs['internal'].fetch()
            np.testing.assert_almost_equal(fetched, x)

        ws = workspace.C.Workspace()
        net = core.Net("test")
        net.GivenTensorFill([], ['internal'], values=x, shape=x.shape)
        net.Python(f, pass_workspace=True)([], [])
        ws.run(net)

    @given(x=hu.tensor())
    def test_caught_exception_doesnt_terminate(self, x):
        def f(inputs, outputs):
            try:
                raise Exception("Exception in handler")
            except Exception:
                pass

        op = CreatePythonOperator(f, ["x"], ["y"])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor(),
           n=st.integers(min_value=1, max_value=20),
           w=st.integers(min_value=1, max_value=20))
    def test_multithreaded_evaluation(self, x, n, w):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            outputs[0].data[...] = inputs[0].data
        ops = [CreatePythonOperator(f, ["x"], [str(i)]) for i in range(n)]
        net = core.Net("net")
        net.Proto().op.extend(ops)
        net.Proto().type = "dag"
        net.Proto().num_workers = w
        iters = 100
        plan = core.Plan("plan")
        plan.AddStep(core.ExecutionStep("test-step", net, iters))
        workspace.FeedBlob("x", x)
        workspace.RunPlan(plan.Proto().SerializeToString())
        for i in range(n):
            y = workspace.FetchBlob(str(i))
            np.testing.assert_almost_equal(x, y)

    @given(x=hu.tensor(), in_place=st.booleans(), **hu.gcs)
    def test_gradient(self, x, in_place, gc, dc):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            outputs[0].data[...] = inputs[0].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            grad_output = inputs[2]

            grad_input = outputs[0]
            grad_input.reshape(grad_output.shape)
            grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(
            f, ["x"], ["x" if in_place else "y"], grad_f=grad_f)
        self.assertGradientChecks(gc, op, [x], 0, [0])
        self.assertDeviceChecks(dc, op, [x], [0])

    @given(inputs=hu.tensors(n=2), **hu.gcs)
    def test_gradient_multiple(self, inputs, gc, dc):
        (x1, x2) = inputs

        def f(inputs, outputs):
            for idx in [0, 1]:
                self.assertEqual(type(inputs[idx].shape), tuple)
                outputs[idx].reshape(inputs[idx].shape)
                outputs[idx].data[...] = inputs[idx].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            self.assertEqual(len(inputs), 6)
            self.assertEqual(len(outputs), 2)
            for (grad_output_idx, grad_input_idx) in [(4, 0), (5, 1)]:
                grad_output = inputs[grad_output_idx]
                grad_input = outputs[grad_input_idx]
                grad_input.reshape(grad_output.shape)
                grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(f, ["x1", "x2"], ["y1", "y2"], grad_f=grad_f)

        for idx in [0, 1]:
            self.assertGradientChecks(gc, op, [x1, x2], idx, [0, 1])
        self.assertDeviceChecks(dc, op, [x1, x2], [0, 1])

    @given(inputs=hu.tensors(n=3), **hu.gcs)
    def test_gradient_multiple_with_indices(self, inputs, gc, dc):
        (x1, x2, x3) = inputs

        def f(inputs, outputs):
            for idx in [0, 1, 2]:
                self.assertEqual(type(inputs[idx].shape), tuple)
                outputs[idx].reshape(inputs[idx].shape)
                outputs[idx].data[...] = inputs[idx].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            self.assertEqual(len(inputs), 8)
            self.assertEqual(len(outputs), 1)
            for (grad_output_idx, grad_input_idx) in [(6, 0)]:
                grad_output = inputs[grad_output_idx]
                grad_input = outputs[grad_input_idx]
                grad_input.reshape(grad_output.shape)
                grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(
            f, ["x1", "x2", "x3"], ["y1", "y2", "y3"],
            grad_f=grad_f,
            grad_output_indices=[0, 2],  # Receive grad outputs for y1 and y3
            grad_input_indices=[0]       # Produce grad inputs for x1
        )

        self.assertGradientChecks(gc, op, [x1, x2, x3], 0, [0, 2])
        self.assertDeviceChecks(dc, op, [x1, x2, x3], [0, 1, 2])
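
Condensed from the tests above, a minimal end-to-end use of a Python operator outside the test harness (the import location of CreatePythonOperator is an assumption based on how these tests reference it):

import numpy as np
from caffe2.python import workspace
from caffe2.python.core import CreatePythonOperator  # assumed import path

def double(inputs, outputs):
    # Forward function: write the doubled input into the output blob.
    outputs[0].reshape(inputs[0].shape)
    outputs[0].data[...] = inputs[0].data * 2

op = CreatePythonOperator(double, ["x"], ["y"])
workspace.FeedBlob("x", np.ones((2, 3), dtype=np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("y"))  # array of twos
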
class TestAdadelta(serial.SerializedTestCase):
    @staticmethod
    def ref_adadelta(param_in,
                     mom_in,
                     mom_delta_in,
                     grad,
                     lr,
                     epsilon,
                     decay,
                     using_fp16=False):
        param_in_f32 = param_in
        mom_in_f32 = mom_in
        mom_delta_in_f32 = mom_delta_in
        if (using_fp16):
            param_in_f32 = param_in.astype(np.float32)
            mom_in_f32 = mom_in.astype(np.float32)
            mom_delta_in_f32 = mom_delta_in.astype(np.float32)

        mom_out = decay * mom_in_f32 + (1.0 - decay) * grad * grad
        new_grad = (np.sqrt(mom_delta_in_f32 + epsilon) /
                    np.sqrt(mom_out + epsilon)) * grad
        param_out = param_in_f32 + lr * new_grad
        mom_delta_out = decay * mom_delta_in_f32 + (
            1.0 - decay) * new_grad * new_grad
        if (using_fp16):
            return (param_out.astype(np.float16), mom_out.astype(np.float16),
                    mom_delta_out.astype(np.float16))
        else:
            return (param_out.astype(np.float32), mom_out.astype(np.float32),
                    mom_delta_out.astype(np.float32))

    @given(inputs=hu.tensors(n=4),
           lr=hu.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=hu.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           decay=hu.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           **hu.gcs)
    @settings(deadline=1000)
    def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc):
        param, moment, moment_delta, grad = inputs
        moment = np.abs(moment)
        moment_delta = np.abs(moment_delta)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Adadelta",
            ["param", "moment", "moment_delta", "grad", "lr"],
            ["param", "moment", "moment_delta"],
            epsilon=epsilon,
            decay=decay,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op, [param, moment, moment_delta, grad, lr],
            functools.partial(self.ref_adadelta, epsilon=epsilon, decay=decay))

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much],
              deadline=10000)
    @given(inputs=hu.tensors(n=4),
           lr=hu.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=hu.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           decay=hu.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           **hu.gcs)
    def test_sparse_adadelta(self, inputs, lr, epsilon, decay, gc, dc):
        param, moment, moment_delta, grad = inputs
        moment = np.abs(moment)
        moment_delta = np.abs(moment_delta)
        lr = np.array([lr], dtype=np.float32)

        # Create an index array whose values index into the rows of grad
        indices = np.random.choice(np.arange(grad.shape[0]),
                                   size=np.random.randint(grad.shape[0]),
                                   replace=False)

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdadelta",
            ["param", "moment", "moment_delta", "indices", "grad", "lr"],
            ["param", "moment", "moment_delta"],
            epsilon=epsilon,
            decay=decay,
            device_option=gc)

        def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay,
                       ref_using_fp16):
            param_out = np.copy(param)
            moment_out = np.copy(moment)
            moment_delta_out = np.copy(moment_delta)
            for i, index in enumerate(indices):
                param_out[index], moment_out[index], moment_delta_out[
                    index] = self.ref_adadelta(param[index], moment[index],
                                               moment_delta[index], grad[i],
                                               lr, epsilon, decay,
                                               ref_using_fp16)
            return (param_out, moment_out, moment_delta_out)

        ref_using_fp16_values = [False]
        if gc == hu.gpu_do:
            ref_using_fp16_values.append(True)

        for ref_using_fp16 in ref_using_fp16_values:
            moment_i = None
            moment_delta_i = None
            param_i = None
            if (ref_using_fp16):
                moment_i = moment.astype(np.float16)
                moment_delta_i = moment_delta.astype(np.float16)
                param_i = param.astype(np.float16)
            else:
                moment_i = moment.astype(np.float32)
                moment_delta_i = moment_delta.astype(np.float32)
                param_i = param.astype(np.float32)

            self.assertReferenceChecks(gc, op, [
                param_i, moment_i, moment_delta_i, indices, grad, lr, decay,
                ref_using_fp16
            ], ref_sparse)

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01,
                        max_value=0.99,
                        allow_nan=False,
                        allow_infinity=False),
           epsilon=st.floats(min_value=0.01,
                             max_value=0.99,
                             allow_nan=False,
                             allow_infinity=False),
           decay=st.floats(min_value=0.01,
                           max_value=0.99,
                           allow_nan=False,
                           allow_infinity=False),
           **hu.gcs)
    @settings(deadline=1000)
    def test_sparse_adadelta_empty(self, inputs, lr, epsilon, decay, gc, dc):
        param, moment, moment_delta = inputs
        moment = np.abs(moment)
        lr = np.array([lr], dtype=np.float32)

        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseAdadelta",
            ["param", "moment", "moment_delta", "indices", "grad", "lr"],
            ["param", "moment", "moment_delta"],
            epsilon=epsilon,
            decay=decay,
            device_option=gc)

        def ref_sparse_empty(param, moment, moment_delta, indices, grad, lr,
                             decay):
            param_out = np.copy(param)
            moment_out = np.copy(moment)
            moment_delta_out = np.copy(moment_delta)
            return (param_out, moment_out, moment_delta_out)

        ref_using_fp16_values = [False]
        if gc == hu.gpu_do:
            ref_using_fp16_values.append(True)

        for ref_using_fp16 in ref_using_fp16_values:
            moment_i = None
            moment_delta_i = None
            param_i = None
            if (ref_using_fp16):
                moment_i = moment.astype(np.float16)
                moment_delta_i = moment_delta.astype(np.float16)
                param_i = param.astype(np.float16)
            else:
                moment_i = moment.astype(np.float32)
                moment_delta_i = moment_delta.astype(np.float32)
                param_i = param.astype(np.float32)

            self.assertReferenceChecks(
                gc, op,
                [param_i, moment_i, moment_delta_i, indices, grad, lr, decay],
                ref_sparse_empty)
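
A scalar walk-through of the ref_adadelta rule above (illustrative numbers; the fp16 path is omitted):

import numpy as np

param, mom, mom_delta = 1.0, 0.5, 0.25
grad, lr, epsilon, decay = 2.0, 0.1, 0.01, 0.9

mom_out = decay * mom + (1.0 - decay) * grad * grad                # 0.85
new_grad = np.sqrt(mom_delta + epsilon) / np.sqrt(mom_out + epsilon) * grad
param_out = param + lr * new_grad                                  # ~1.11
mom_delta_out = decay * mom_delta + (1.0 - decay) * new_grad * new_grad
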
class TestAdamOps(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **mu.gcs)
    def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)
        mom2 = np.absolute(mom2)
        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)
        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do, 'lr': hu.cpu_do}

        self.assertDeviceChecks(
            dc, op,
            [param, mom1, mom2, grad, LR, ITER],
            [0],
            input_device_options=input_device_options,
            threshold=0.001)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **mu.gcs)
    def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)
        mom2 = np.absolute(mom2)

        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2", "output_grad"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do, 'lr': hu.cpu_do}

        self.assertDeviceChecks(
            dc, op,
            [param, mom1, mom2, grad, LR, ITER],
            [0],
            input_device_options=input_device_options,
            threshold=0.001)
class TestWngrad(serial.SerializedTestCase):
    @serial.given(inputs=hu.tensors(n=2),
           seq_b=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_wngrad_dense_base(self, inputs, seq_b, lr, epsilon, gc, dc):
        param, grad = inputs
        seq_b = np.array([seq_b, ], dtype=np.float32)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Wngrad",
            ["param", "seq_b", "grad", "lr"],
            ["param", "seq_b"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op,
            [param, seq_b, grad, lr],
            functools.partial(ref_wngrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=2),
           seq_b=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_wngrad_dense_output_effective_lr(self, inputs, seq_b,
                                              lr, epsilon, gc, dc):
        param, grad = inputs
        seq_b = np.array([seq_b, ], dtype=np.float32)
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Wngrad",
            ["param", "seq_b", "grad", "lr"],
            ["param", "seq_b", "effective_lr"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op,
            [param, seq_b, grad, lr],
            functools.partial(ref_wngrad, epsilon=epsilon,
                              output_effective_lr=True))

    @given(inputs=hu.tensors(n=2),
           seq_b=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_wngrad_dense_output_effective_lr_and_update(
            self, inputs, seq_b, lr, epsilon, gc, dc):
        param, grad = inputs
        seq_b = np.abs(np.array([seq_b, ], dtype=np.float32))
        lr = np.array([lr], dtype=np.float32)

        op = core.CreateOperator(
            "Wngrad",
            ["param", "seq_b", "grad", "lr"],
            ["param", "seq_b", "effective_lr", "update"],
            epsilon=epsilon,
            device_option=gc,
        )

        self.assertReferenceChecks(
            gc, op,
            [param, seq_b, grad, lr],
            functools.partial(ref_wngrad, epsilon=epsilon,
                              output_effective_lr_and_update=True))

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=2),
           seq_b=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_sparse_wngrad(self, inputs, seq_b, lr, epsilon, gc, dc):
        return wngrad_sparse_test_helper(self, inputs, seq_b, lr, epsilon,
            None, gc, dc)

    @serial.given(inputs=hu.tensors(n=1),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           seq_b=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs_cpu_only)
    def test_sparse_wngrad_empty(self, inputs, seq_b, lr, epsilon,
                                  data_strategy, gc, dc):
        param = inputs[0]
        seq_b = np.array([seq_b, ], dtype=np.float32)
        lr = np.array([lr], dtype=np.float32)

        grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0,), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseWngrad",
            ["param", "seq_b", "indices", "grad", "lr"],
            ["param", "seq_b"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, seq_b, indices, grad, lr):
            param_out = np.copy(param)
            seq_b_out = np.copy(seq_b)
            return (param_out, seq_b_out)

        print('test_sparse_wngrad_empty with full precision embedding')
        seq_b_i = seq_b.astype(np.float32)
        param_i = param.astype(np.float32)

        self.assertReferenceChecks(
            gc, op, [param_i, seq_b_i, indices, grad, lr], ref_sparse
        )
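
The ref_wngrad helper used throughout this class is defined elsewhere in the file. As a rough sketch only, assuming a WNGrad-style update in which the bookkeeping scalar grows with the squared gradient norm and shrinks the effective step (the exact placement of epsilon and the sign convention in ref_wngrad may differ):

import numpy as np

def wngrad_step_sketch(param, seq_b, grad, lr, epsilon):
    # Illustrative only -- not necessarily identical to ref_wngrad.
    seq_b_out = seq_b + np.sum(grad * grad) / seq_b
    effective_lr = lr / (seq_b_out + epsilon)
    return param + effective_lr * grad, seq_b_out
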