Beispiel #1
0
    def test_add_scalar_with_different_scalar_type(self, device):
        # int tensor with float scalar
        # should go 'slow' route
        scalar = 1.1
        tensors = [torch.tensor([1], dtype=torch.int, device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([2.1], device=device)])

        # float tensor with int scalar
        # should go 'fast' route
        scalar = 1
        tensors = [torch.tensor([1.1], device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([2.1], device=device)])

        # bool tensor with int scalar
        # should go 'slow' route
        scalar = 1
        tensors = [torch.tensor([False], device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([1], device=device)])

        # bool tensor with float scalar
        # should go 'slow' route
        scalar = 1.1
        tensors = [torch.tensor([False], device=device)]
        res = torch._foreach_add(tensors, scalar)
        self.assertEqual(res, [torch.tensor([1.1], device=device)])
Beispiel #2
0
def adadelta(params: List[Tensor], grads: List[Tensor],
             square_avgs: List[Tensor], acc_deltas: List[Tensor], *, lr: float,
             weight_decay: float, rho: float, eps: float):
    r"""Functional API that performs Adadelta algorithm computation.

    See :class:`~torch.optim.Adadelta` for details.

    Args:
        params: parameters, updated in place.
        grads: gradients matching ``params``; left untouched.
        square_avgs: running averages of squared gradients, updated in place.
        acc_deltas: running averages of squared updates, updated in place.
        lr: factor applied to the computed delta before it is subtracted.
        weight_decay: coefficient of ``params`` added to the gradients.
        rho: decay factor of both running averages.
        eps: term added inside both square roots.
    """

    # Nothing to do; also keeps foreach kernels away from empty lists.
    if len(params) == 0:
        return

    if weight_decay != 0:
        # Out of place on purpose: ``grads`` normally aliases ``p.grad`` and
        # an in-place add would leak the decay term into the caller's grads.
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    # square_avg = rho * square_avg + (1 - rho) * grad^2
    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    # std = sqrt(square_avg + eps)
    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    # delta = sqrt(acc_delta + eps) / std * grad
    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    # acc_delta = rho * acc_delta + (1 - rho) * delta^2
    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
Beispiel #3
0
def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
Beispiel #4
0
    def test_complex_scalar(self, device, dtype):
        tensors = [
            torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)
        ]
        complex_scalar = 3 + 5j

        # bool tensor + 1 will result in int64 tensor
        expected = [
            torch.add(complex_scalar,
                      torch.zeros(10, 10, device=device, dtype=dtype))
            for _ in range(10)
        ]

        if dtype in [
                torch.float16, torch.float32, torch.float64, torch.bfloat16
        ] and device == 'cuda:0':
            # value cannot be converted to dtype without overflow:
            self.assertRaises(
                RuntimeError,
                lambda: torch._foreach_add_(tensors, complex_scalar))
            self.assertRaises(
                RuntimeError,
                lambda: torch._foreach_add(tensors, complex_scalar))
            return

        res = torch._foreach_add(tensors, complex_scalar)
        self.assertEqual(res, expected)

        if dtype not in [torch.complex64, torch.complex128]:
            self.assertRaises(
                RuntimeError,
                lambda: torch._foreach_add_(tensors, complex_scalar))
        else:
            torch._foreach_add_(tensors, complex_scalar)
            self.assertEqual(res, tensors)
Beispiel #5
0
def _multi_tensor_adam(params: List[Tensor],
                       grads: List[Tensor],
                       exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor],
                       max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       amsgrad: bool,
                       beta1: float,
                       beta2: float,
                       lr: float,
                       weight_decay: float,
                       eps: float,
                       maximize: bool):

    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if amsgrad:
        # Maintains the maximum of all 2nd moment running avg. till now
        max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)  # type: ignore[assignment]

        # Use the max. for normalizing running avg. of gradient
        max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
    else:
        exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size = [(lr / bc) * -1 for bc in bias_correction1]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
Beispiel #6
0
    def test_int_scalar(self, device, dtype):
        tensors = [
            torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)
        ]
        int_scalar = 1

        # bool tensor + 1 will result in int64 tensor
        if dtype == torch.bool:
            expected = [
                torch.ones(10, 10, device=device, dtype=torch.int64)
                for _ in range(10)
            ]
        else:
            expected = [
                torch.ones(10, 10, device=device, dtype=dtype)
                for _ in range(10)
            ]

        res = torch._foreach_add(tensors, int_scalar)
        self.assertEqual(res, expected)

        if dtype in [torch.bool]:
            with self.assertRaisesRegex(
                    RuntimeError,
                    "result type Long can't be cast to the desired output type Bool"
            ):
                torch._foreach_add_(tensors, int_scalar)
        else:
            torch._foreach_add_(tensors, int_scalar)
            self.assertEqual(res, tensors)
Beispiel #7
0
    def test_float_scalar(self, device, dtype):
        tensors = [
            torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)
        ]
        float_scalar = 1.

        # float scalar + integral tensor will result in float tensor
        if dtype in [
                torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64,
                torch.bool
        ]:
            expected = [
                torch.ones(10, 10, device=device, dtype=torch.float32)
                for _ in range(10)
            ]
        else:
            expected = [
                torch.ones(10, 10, device=device, dtype=dtype)
                for _ in range(10)
            ]

        res = torch._foreach_add(tensors, float_scalar)
        self.assertEqual(res, expected)

        if dtype in [
                torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64,
                torch.bool
        ]:
            self.assertRaises(
                RuntimeError,
                lambda: torch._foreach_add_(tensors, float_scalar))
        else:
            torch._foreach_add_(tensors, float_scalar)
            self.assertEqual(res, tensors)
Beispiel #8
0
    def test_add_scalar_with_different_size_tensors(self, device, dtype):
        N = 20
        H = 20
        W = 20

        tensors = []
        size_change = 0
        for _ in range(N):
            tensors.append(
                torch.zeros(H + size_change,
                            W + size_change,
                            device=device,
                            dtype=dtype))
            size_change += 1

        res = torch._foreach_add(tensors, 1)

        size_change = 0
        for t in res:
            if dtype == torch.bool:
                dtype = torch.int64
            self.assertEqual(
                t,
                torch.ones(H + size_change,
                           W + size_change,
                           device=device,
                           dtype=dtype))
            size_change += 1
Beispiel #9
0
def _multi_tensor_sgd(params: List[Tensor],
                      grads: List[Tensor],
                      momentum_buffer_list: List[Optional[Tensor]],
                      *,
                      weight_decay: float,
                      momentum: float,
                      lr: float,
                      dampening: float,
                      nesterov: bool,
                      maximize: bool,
                      has_sparse_grad: bool):

    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    if weight_decay != 0:
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    if momentum != 0:
        bufs = []

        all_states_with_momentum_buffer = True
        for i in range(len(momentum_buffer_list)):
            if momentum_buffer_list[i] is None:
                all_states_with_momentum_buffer = False
                break
            else:
                bufs.append(momentum_buffer_list[i])

        if all_states_with_momentum_buffer:
            torch._foreach_mul_(bufs, momentum)
            torch._foreach_add_(bufs, grads, alpha=1 - dampening)
        else:
            bufs = []
            for i in range(len(momentum_buffer_list)):
                if momentum_buffer_list[i] is None:
                    buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
                else:
                    buf = momentum_buffer_list[i]
                    buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                bufs.append(buf)

        if nesterov:
            torch._foreach_add_(grads, bufs, alpha=momentum)
        else:
            grads = bufs

    if not has_sparse_grad:
        torch._foreach_add_(params, grads, alpha=-lr)
    else:
        # foreach APIs dont support sparse
        for i in range(len(params)):
            params[i].add_(grads[i], alpha=-lr)
Beispiel #10
0
    def test_add_list_slow_path(self, device, dtype):
        # different strides
        tensor1 = torch.zeros(10, 10, device=device, dtype=dtype)
        tensor2 = torch.ones(10, 10, device=device, dtype=dtype)
        res = torch._foreach_add([tensor1], [tensor2.t()])
        torch._foreach_add_([tensor1], [tensor2])
        self.assertEqual(res, [tensor1])

        # non contiguous
        tensor1 = torch.randn(5, 2, 1, 3, device=device)[:, 0]
        tensor2 = torch.randn(5, 2, 1, 3, device=device)[:, 0]
        self.assertFalse(tensor1.is_contiguous())
        self.assertFalse(tensor2.is_contiguous())
        res = torch._foreach_add([tensor1], [tensor2])
        torch._foreach_add_([tensor1], [tensor2])
        self.assertEqual(res, [tensor1])
Beispiel #11
0
    def test_add_list_different_sizes(self, device, dtype):
        tensors1 = [torch.zeros(10 + n, 10 + n, device=device, dtype=dtype) for n in range(10)]
        tensors2 = [torch.ones(10 + n, 10 + n, device=device, dtype=dtype) for n in range(10)]

        res = torch._foreach_add(tensors1, tensors2)
        torch._foreach_add_(tensors1, tensors2)
        self.assertEqual(res, tensors1)
        self.assertEqual(res, [torch.ones(10 + n, 10 + n, device=device, dtype=dtype) for n in range(10)])
Beispiel #12
0
    def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
        # TODO: enable empty list case
        for tensors in [[torch.randn([0])]]:
            res = torch._foreach_add(tensors, 1)
            self.assertEqual(res, tensors)

            torch._foreach_add_(tensors, 1)
            self.assertEqual(res, tensors)
Beispiel #13
0
def adagrad(params: List[Tensor], grads: List[Tensor],
            state_sums: List[Tensor], state_steps: List[int],
            has_sparse_grad: bool, *, lr: float, weight_decay: float,
            lr_decay: float, eps: float):
    r"""Functional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.

    Args:
        params: parameters, updated in place.
        grads: gradients matching ``params``; left untouched.
        state_sums: running sums of squared gradients, updated in place.
        state_steps: step count of each parameter (read only).
        has_sparse_grad: selects the per-tensor sparse code path.
        lr: base learning rate.
        weight_decay: coefficient of ``params`` added to the gradients.
        lr_decay: decay applied to ``lr`` as the step count grows.
        eps: term added to the square root of the running sum.

    Raises:
        RuntimeError: if ``weight_decay`` is non-zero together with
            ``has_sparse_grad``.
    """

    # Nothing to do; also keeps foreach kernels away from empty lists.
    if len(params) == 0:
        return

    if weight_decay != 0:
        if has_sparse_grad:
            raise RuntimeError(
                "weight_decay option is not compatible with sparse gradients")
        # Out of place on purpose: ``grads`` normally aliases ``p.grad`` and
        # an in-place add would leak the decay term into the caller's grads.
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    # negated per-parameter learning rate after decay
    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    if has_sparse_grad:
        # sparse is not supported by multi_tensor. Fall back to optim.adagrad
        # implementation for sparse gradients
        for i, (param, grad, state_sum,
                _step) in enumerate(zip(params, grads, state_sums,
                                        state_steps)):
            grad = grad.coalesce(
            )  # the update is non-linear so indices must be unique
            grad_indices = grad._indices()
            grad_values = grad._values()

            state_sum.add_(_make_sparse(grad, grad_indices,
                                        grad_values.pow(2)))
            std_sparse = state_sum.sparse_mask(grad)
            std_sparse_values = std_sparse._values().sqrt_().add_(eps)
            param.add_(
                _make_sparse(grad, grad_indices,
                             grad_values / std_sparse_values),
                alpha=minus_clr[i],
            )
    else:
        # Complex tensors are processed through their real views.  The views
        # share storage with the originals, so the in-place accumulation
        # below already updates the caller's ``state_sums``.
        grads = [
            torch.view_as_real(x) if torch.is_complex(x) else x for x in grads
        ]
        state_sums = [
            torch.view_as_real(x) if torch.is_complex(x) else x
            for x in state_sums
        ]
        torch._foreach_addcmul_(state_sums, grads, grads, value=1)
        std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
        to_add = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
        # convert the update back to complex where the parameter is complex
        to_add = [
            torch.view_as_complex(x) if torch.is_complex(params[i]) else x
            for i, x in enumerate(to_add)
        ]
        torch._foreach_add_(params, to_add)
Beispiel #14
0
    def test_add_scalar_with_overlapping_tensors(self, device, dtype):
        tensors = [torch.ones(1, 1, device=device, dtype=dtype).expand(2, 1, 3)]
        expected = [torch.tensor([[[2, 2, 2]], [[2, 2, 2]]], dtype=dtype, device=device)]

        # bool tensor + 1 will result in int64 tensor
        if dtype == torch.bool:
            expected[0] = expected[0].to(torch.int64).add(1)

        res = torch._foreach_add(tensors, 1)
        self.assertEqual(res, expected)
Beispiel #15
0
    def test_bool_scalar(self, device, dtype):
        tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)]
        bool_scalar = True

        expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)]

        res = torch._foreach_add(tensors, bool_scalar)
        self.assertEqual(res, expected)

        torch._foreach_add_(tensors, bool_scalar)
        self.assertEqual(res, tensors)
Beispiel #16
0
    def test_add_list_same_size(self, device, dtype):
        tensors1 = [
            torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)
        ]
        tensors2 = [
            torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)
        ]

        res = torch._foreach_add(tensors1, tensors2)
        torch._foreach_add_(tensors1, tensors2)
        self.assertEqual(res, tensors1)
        self.assertEqual(res[0], torch.ones(10, 10, device=device,
                                            dtype=dtype))
Beispiel #17
0
    def test_add_scalar_with_different_tensor_dtypes(self, device):
        tensors = [
            torch.tensor([1], dtype=torch.float, device=device),
            torch.tensor([1], dtype=torch.int, device=device)
        ]

        expected = [
            torch.tensor([2], dtype=torch.float, device=device),
            torch.tensor([2], dtype=torch.int, device=device)
        ]

        res = torch._foreach_add(tensors, 1)
        self.assertEqual(res, expected)
Beispiel #18
0
    def test_add_scalar_with_same_size_tensors(self, device, dtype):
        N = 20
        H = 20
        W = 20
        tensors = []
        for _ in range(N):
            tensors.append(torch.zeros(H, W, device=device, dtype=dtype))

        res = torch._foreach_add(tensors, 1)
        for t in res:
            if dtype == torch.bool:
                dtype = torch.int64
            self.assertEqual(t, torch.ones(H, W, device=device, dtype=dtype))
Beispiel #19
0
def _multi_tensor_nadam(params: List[Tensor], grads: List[Tensor],
                        exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor],
                        mu_products: List[Tensor], state_steps: List[Tensor],
                        *, beta1: float, beta2: float, lr: float,
                        weight_decay: float, momentum_decay: float,
                        eps: float):

    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    bias_correction1 = [1 - beta1**step.item() for step in state_steps]
    bias_correction2 = [1 - beta2**step.item() for step in state_steps]
    mus = [
        beta1 * (1. - 0.5 * (0.96**(step.item() * momentum_decay)))
        for step in state_steps
    ]
    mu_nexts = [
        beta1 * (1. - 0.5 * (0.96**((step.item() + 1) * momentum_decay)))
        for step in state_steps
    ]

    # update mu_products
    torch._foreach_mul_(mu_products, mus)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
    denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size_grads = [(lr * (1. - mu) / (1. - mu_product.item())) * -1
                       for mu_product, mu in zip(mu_products, mus)]
    step_size_expavg = [
        (lr * mu_next / (1. - mu_product.item() * mu_next)) * -1
        for mu_product, mu_next in zip(mu_products, mu_nexts)
    ]
    torch._foreach_addcdiv_(params, grads, denom, step_size_grads)
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size_expavg)
Beispiel #20
0
def _multi_tensor_adagrad(params: List[Tensor], grads: List[Tensor],
                          state_sums: List[Tensor], state_steps: List[Tensor],
                          *, lr: float, weight_decay: float, lr_decay: float,
                          eps: float, has_sparse_grad: bool):

    # Foreach functions will throw errors if given empty lists
    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any([grad.is_sparse for grad in grads])

    if has_sparse_grad:
        return _single_tensor_adagrad(params,
                                      grads,
                                      state_sums,
                                      state_steps,
                                      lr=lr,
                                      weight_decay=weight_decay,
                                      lr_decay=lr_decay,
                                      eps=eps,
                                      has_sparse_grad=has_sparse_grad)

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]

    grads = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in grads
    ]
    state_sums = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums
    ]
    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
    toAdd = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
    toAdd = [
        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
        for i, x in enumerate(toAdd)
    ]
    torch._foreach_add_(params, toAdd)
    state_sums = [
        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
        for i, x in enumerate(state_sums)
    ]
Beispiel #21
0
def nadam(params: List[Tensor],
          grads: List[Tensor],
          exp_avg: List[Tensor],
          exp_avg_sq: List[Tensor],
          mu_products: List[Tensor],
          states: List[Dict],
          *,
          beta1: float,
          beta2: float,
          lr: float,
          weight_decay: float,
          momentum_decay: float,
          eps: float):
    r"""Functional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.

    Args:
        params: parameters, updated in place.
        grads: gradients matching ``params``; left untouched.
        exp_avg: first-moment running averages, updated in place.
        exp_avg_sq: second-moment running averages, updated in place.
        mu_products: running products of the momentum coefficients; only
            read here.
        states: per-parameter state dicts; only ``state['step']`` is read.
        beta1: decay factor of the first moment.
        beta2: decay factor of the second moment.
        lr: learning rate.
        weight_decay: coefficient of ``params`` added to the gradients.
        momentum_decay: scales the step count in the momentum schedule.
        eps: term added to the denominator.
    """

    # Nothing to do; also keeps foreach kernels away from empty lists.
    if len(params) == 0:
        return

    bias_correction2 = [1 - beta2 ** state['step'] for state in states]
    mus = [beta1 * (1. - 0.5 * (0.96 ** (state['step'] * momentum_decay))) for state in states]
    mu_nexts = [beta1 * (1. - 0.5 * (0.96 ** ((state['step'] + 1) * momentum_decay)))
                for state in states]

    if weight_decay != 0:
        # Out of place on purpose: ``grads`` normally aliases ``p.grad`` and
        # an in-place add would leak the decay term into the caller's grads.
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avg, beta1)
    torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sq, beta2)
    torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

    exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
    bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
    torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
    denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    # both step sizes are negative so that addcdiv_ subtracts the update
    step_size_grads = [(lr * (1. - mu) / (1. - mu_product)) * -1
                       for mu_product, mu in zip(mu_products, mus)]
    step_size_expavg = [(lr * mu_next / (1. - mu_product * mu_next)) * -1
                        for mu_product, mu_next in zip(mu_products, mu_nexts)]
    torch._foreach_addcdiv_(params, grads, denom, step_size_grads)
    torch._foreach_addcdiv_(params, exp_avg, denom, step_size_expavg)
Beispiel #22
0
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.

        Raises:
            RuntimeError: if any parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            amsgrad = group['amsgrad']

            # Collect the parameters of this group that received a gradient.
            params_with_grad = []
            grads = []
            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Adam does not support sparse gradients, please consider SparseAdam instead'
                    )
                params_with_grad.append(p)
                grads.append(p.grad)

            # Nothing to update; also keeps foreach kernels away from
            # empty lists.
            if not params_with_grad:
                continue

            steps = []
            exp_avg = []
            exp_avg_sq = []
            max_exp_avg_sq = []

            for p in params_with_grad:
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)

                exp_avg.append(state['exp_avg'])
                exp_avg_sq.append(state['exp_avg_sq'])

                if amsgrad:
                    max_exp_avg_sq.append(state['max_exp_avg_sq'])

                state['step'] += 1
                steps.append(state['step'])

            beta1, beta2 = group['betas']

            bias_correction1 = [1 - beta1**step for step in steps]
            bias_correction2 = [1 - beta2**step for step in steps]
            if group['weight_decay'] != 0:
                # out of place, so ``p.grad`` itself is left untouched
                grads = torch._foreach_add(grads,
                                           params_with_grad,
                                           alpha=group['weight_decay'])

            #
            # Decay the first and second moment running average coefficient
            #
            torch._foreach_mul_(exp_avg, beta1)
            torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

            torch._foreach_mul_(exp_avg_sq, beta2)
            torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till
                # now.  The result must be written back into the state
                # tensors; merely rebinding the local list would leave
                # state['max_exp_avg_sq'] at zero forever.
                running_max = torch._foreach_maximum(max_exp_avg_sq,
                                                     exp_avg_sq)
                for stored, updated in zip(max_exp_avg_sq, running_max):
                    stored.copy_(updated)

                # Use the max. for normalizing running avg. of gradient
                second_moment_sqrt = torch._foreach_sqrt(running_max)
            else:
                second_moment_sqrt = torch._foreach_sqrt(exp_avg_sq)

            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            torch._foreach_div_(second_moment_sqrt, bias_correction_sqrt)
            denom = torch._foreach_add(second_moment_sqrt, group['eps'])

            # negative so that addcdiv_ subtracts the update
            step_size = [(group['lr'] / bc) * -1 for bc in bias_correction1]
            torch._foreach_addcdiv_(params_with_grad, exp_avg, denom,
                                    step_size)

        return loss
Beispiel #23
0
    def test_add_list_error_cases(self, device):
        """Check that list + list addition rejects malformed inputs.

        Each scenario is run for both ``torch._foreach_add`` and
        ``torch._foreach_add_``.  ``tensors1`` and ``tensors2`` are built up
        step by step, so the order of the scenarios matters.
        """
        tensors1 = []
        tensors2 = []

        # Empty lists
        with self.assertRaises(RuntimeError):
            torch._foreach_add(tensors1, tensors2)
        with self.assertRaises(RuntimeError):
            torch._foreach_add_(tensors1, tensors2)

        # One empty list (tensors1 now has one tensor, tensors2 still none)
        tensors1.append(torch.tensor([1], device=device))
        with self.assertRaisesRegex(
                RuntimeError, "Tensor list must have at least one tensor."):
            torch._foreach_add(tensors1, tensors2)
        with self.assertRaisesRegex(
                RuntimeError, "Tensor list must have at least one tensor."):
            torch._foreach_add_(tensors1, tensors2)

        # Lists have different amount of tensors (one vs. two)
        tensors2.append(torch.tensor([1], device=device))
        tensors2.append(torch.tensor([1], device=device))
        with self.assertRaisesRegex(
                RuntimeError,
                "Tensor lists must have the same number of tensors, got 1 and 2"
        ):
            torch._foreach_add(tensors1, tensors2)
        with self.assertRaisesRegex(
                RuntimeError,
                "Tensor lists must have the same number of tensors, got 1 and 2"
        ):
            torch._foreach_add_(tensors1, tensors2)

        # Different dtypes (float list vs. int list, same shapes)
        tensors1 = [
            torch.zeros(10, 10, device=device, dtype=torch.float)
            for _ in range(10)
        ]
        tensors2 = [
            torch.ones(10, 10, device=device, dtype=torch.int)
            for _ in range(10)
        ]

        with self.assertRaisesRegex(
                RuntimeError,
                "All tensors in the tensor list must have the same dtype."):
            torch._foreach_add(tensors1, tensors2)
        with self.assertRaisesRegex(
                RuntimeError,
                "All tensors in the tensor list must have the same dtype."):
            torch._foreach_add_(tensors1, tensors2)

        # different devices (only checked when at least two CUDA devices exist)
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            tensor1 = torch.zeros(10, 10, device="cuda:0")
            tensor2 = torch.ones(10, 10, device="cuda:1")
            with self.assertRaisesRegex(
                    RuntimeError,
                    "Expected all tensors to be on the same device"):
                torch._foreach_add([tensor1], [tensor2])
            with self.assertRaisesRegex(
                    RuntimeError,
                    "Expected all tensors to be on the same device"):
                torch._foreach_add_([tensor1], [tensor2])

        # Corresponding tensors with different sizes
        tensors1 = [torch.zeros(10, 10, device=device) for _ in range(10)]
        tensors2 = [torch.ones(11, 11, device=device) for _ in range(10)]
        with self.assertRaisesRegex(
                RuntimeError,
                "Corresponding tensors in lists must have the same size"):
            torch._foreach_add(tensors1, tensors2)
        # the in-place check matches on the size part of the message instead
        with self.assertRaisesRegex(RuntimeError,
                                    r", got \[10, 10\] and \[11, 11\]"):
            torch._foreach_add_(tensors1, tensors2)
Beispiel #24
0
 def test_bin_op_scalar_with_different_tensor_dtypes(self, device):
     tensors = [
         torch.tensor([1.1], dtype=torch.float, device=device),
         torch.tensor([1], dtype=torch.long, device=device)
     ]
     self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, 1))
Beispiel #25
0
    def step(self, closure=None):
        """Performs a single optimization step.

        For every parameter group, each parameter that has a gradient is
        first decayed in place by ``1 - lr * weight_decay`` and then updated
        from its first/second moment estimates with the ``torch._foreach_*``
        list kernels.  Groups in which no parameter has a gradient are
        skipped.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            is given.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            amsgrad = group["amsgrad"]

            grads = []
            params_with_grad = []

            for p in group["params"]:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError(
                        "AdamW does not support sparse gradients")

                # Perform stepweight decay
                p.mul_(1 - group["lr"] * group["weight_decay"])

                params_with_grad.append(p)
                grads.append(p.grad)

            # Nothing to update in this group.  Skipping also avoids handing
            # empty tensor lists to the foreach kernels, which may reject them.
            if not params_with_grad:
                continue

            states = []
            exp_avg = []
            exp_avg_sq = []
            max_exp_avg_sq = []

            for p in params_with_grad:
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values.
                    # Initialised to ones here (torch initialises to zeros).
                    state["exp_avg_sq"] = torch.ones_like(
                        p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state["max_exp_avg_sq"] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)

                exp_avg.append(state["exp_avg"])
                exp_avg_sq.append(state["exp_avg_sq"])
                if amsgrad:
                    max_exp_avg_sq.append(state["max_exp_avg_sq"])

                state["step"] += 1
                states.append(state)

            beta1, beta2 = group["betas"]

            bias_correction1 = [1 - beta1**state["step"] for state in states]
            bias_correction2 = [1 - beta2**state["step"] for state in states]

            #
            # Decay the first and second moment running average coefficient
            #
            torch._foreach_mul_(exp_avg, beta1)
            torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

            torch._foreach_mul_(exp_avg_sq, beta2)
            torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2)

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now.
                # The foreach call is out of place, so the result is copied
                # back into the tensors held in ``self.state``; otherwise the
                # running maximum would be lost after this step.
                running_max = torch._foreach_maximum(max_exp_avg_sq,
                                                     exp_avg_sq)
                for stored, updated in zip(max_exp_avg_sq, running_max):
                    stored.copy_(updated)
                # Use the max. for normalizing running avg. of gradient
                second_moment = max_exp_avg_sq
            else:
                second_moment = exp_avg_sq

            # eps is added before the square root in this implementation.
            second_moment_sqrt = torch._foreach_sqrt(
                torch._foreach_add(second_moment, group["eps"]))
            bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
            denom = torch._foreach_div(second_moment_sqrt,
                                       bias_correction_sqrt)

            # Negative per-parameter step so addcdiv_ subtracts the update.
            step_size = [-1 * (group["lr"] / bc) for bc in bias_correction1]
            torch._foreach_addcdiv_(params_with_grad, exp_avg, denom,
                                    step_size)

        return loss
Beispiel #26
0
    def step(self, closure=None):
        """Performs a single optimization step.

        Each parameter group is processed independently: gradients are
        gathered, optionally combined with weight decay and momentum, and
        applied to the parameters scaled by ``-lr``.  A group in which no
        parameter has a gradient is skipped without affecting the groups
        that follow it.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            is given.

        Raises:
            RuntimeError: If momentum is non-zero and a parameter has a
                sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            grads = []
            params_with_grad = []
            states = []
            has_sparse_grad = False

            for p in group['params']:
                if p.grad is None:
                    continue

                grads.append(p.grad)
                params_with_grad.append(p)
                states.append(self.state[p])

                if p.grad.is_sparse:
                    has_sparse_grad = True

                    if momentum != 0:
                        raise RuntimeError(
                            'SGD does not support momentum for sparse gradients'
                        )

            # Skip only this group; returning here would leave every later
            # parameter group without an update.
            if not grads:
                continue

            if weight_decay != 0:
                grads = torch._foreach_add(grads,
                                           params_with_grad,
                                           alpha=weight_decay)

            if momentum != 0:
                if all('momentum_buffer' in state for state in states):
                    # Every parameter already has a buffer: update them all
                    # with the list kernels.
                    bufs = [state['momentum_buffer'] for state in states]
                    torch._foreach_mul_(bufs, momentum)
                    torch._foreach_add_(bufs, grads, alpha=1 - dampening)
                else:
                    # At least one buffer is missing: create it from the
                    # current gradient and update the rest one by one.
                    bufs = []
                    for state, grad in zip(states, grads):
                        if 'momentum_buffer' not in state:
                            buf = state['momentum_buffer'] = torch.clone(
                                grad).detach()
                        else:
                            buf = state['momentum_buffer']
                            buf.mul_(momentum).add_(grad,
                                                    alpha=1 - dampening)

                        bufs.append(buf)

                if nesterov:
                    torch._foreach_add_(grads, bufs, alpha=momentum)
                else:
                    grads = bufs

            if not has_sparse_grad:
                torch._foreach_add_(params_with_grad,
                                    grads,
                                    alpha=-group['lr'])
            else:
                # foreach APIs dont support sparse
                for param, grad in zip(params_with_grad, grads):
                    param.add_(grad, alpha=-group['lr'])

        return loss
Beispiel #27
0
    def step(self, closure=None):
        """Performs a single optimization step.

        For each parameter group the running averages of squared gradients
        (``square_avg``) and squared updates (``acc_delta``) are refreshed
        with the ``torch._foreach_*`` list kernels and the resulting deltas
        are applied to the parameters scaled by ``-lr``.  Groups in which no
        parameter has a gradient are skipped.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            is given.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            grads = []
            params_with_grad = []
            square_avgs = []
            acc_deltas = []

            rho, eps = group['rho'], group['eps']

            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Adadelta does not support sparse gradients')

                grads.append(p.grad)
                params_with_grad.append(p)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)
                    state['acc_delta'] = torch.zeros_like(
                        p, memory_format=torch.preserve_format)

                square_avgs.append(state['square_avg'])
                acc_deltas.append(state['acc_delta'])

                state['step'] += 1

            # Nothing to update in this group.  Skipping also avoids handing
            # empty tensor lists to the foreach kernels, which may reject them.
            if not params_with_grad:
                continue

            if group['weight_decay'] != 0:
                # In place: the weight-decay term is added to the gradient
                # tensors themselves.
                torch._foreach_add_(grads,
                                    params_with_grad,
                                    alpha=group['weight_decay'])

            # square_avg = rho * square_avg + (1 - rho) * grad^2
            torch._foreach_mul_(square_avgs, rho)
            torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

            std = torch._foreach_add(square_avgs, eps)
            torch._foreach_sqrt_(std)

            # delta = sqrt(acc_delta + eps) / sqrt(square_avg + eps) * grad
            deltas = torch._foreach_add(acc_deltas, eps)
            torch._foreach_sqrt_(deltas)
            torch._foreach_div_(deltas, std)
            torch._foreach_mul_(deltas, grads)

            torch._foreach_add_(params_with_grad, deltas, alpha=-group['lr'])

            # acc_delta = rho * acc_delta + (1 - rho) * delta^2
            torch._foreach_mul_(acc_deltas, rho)
            torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)

        return loss
Beispiel #28
0
 def test_add_scalar_with_empty_list(self, device, dtype):
     tensors = []
     with self.assertRaises(RuntimeError):
         torch._foreach_add(tensors, 1)
Beispiel #29
0
def _multi_tensor_adamw(params: List[Tensor], grads: List[Tensor],
                        exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor],
                        max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor], *, amsgrad: bool,
                        beta1: float, beta2: float, lr: float,
                        weight_decay: float, eps: float, maximize: bool,
                        capturable: bool):
    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    grads = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in grads
    ]
    exp_avgs = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs
    ]
    exp_avg_sqs = [
        torch.view_as_real(x) if torch.is_complex(x) else x
        for x in exp_avg_sqs
    ]
    params = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in params
    ]

    # update steps
    torch._foreach_add_(state_steps, 1)

    # Perform stepweight decay
    torch._foreach_mul_(params, 1 - lr * weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if capturable:
        # TODO: use foreach_pow if/when foreach_pow is added
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
            # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
            torch._foreach_div_(
                max_exp_avg_sq_sqrt,
                torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(
                exp_avg_sq_sqrt,
                torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1**step.item() for step in state_steps]
        bias_correction2 = [1 - beta2**step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]

        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)