Beispiel #1
0
    def op(self, startEdge, fromVertex, toVertex):
        """Define the operator function that counts triangles per edge.

        One worker is mapped to each entry of toVertex, i.e. to each edge (fromVertex[i], toVertex[i]). The worker
        walks the neighbor list of the edge's from-vertex and the neighbor list of its to-vertex in lock step and
        counts the neighbors that occur in both lists. Only common neighbors smaller than the edge's to-vertex are
        counted, because the walk stops as soon as the from-side neighbor reaches the to-vertex. The lock-step walk
        can only find every common neighbor if each neighbor list is sorted in ascending order.

        The array toVertex is a flattened list of lists structure, where startEdge encodes the start indices of the
        separate lists: the neighbors of vertex k are toVertex[startEdge[k]:startEdge[k + 1]].

        :param startEdge: Indices into toVertex where edges start.
        :type startEdge: list.
        :param fromVertex: The from-vertex of each edge.
        :type fromVertex: list.
        :param toVertex: The to-vertex of each edge.
        :type toVertex: list.
        :return: Counts of triangles per edge.
        """
        edge_idx = ops.position_in(toVertex.shape)[0]
        triangle_counts = ops.output(toVertex.shape, ops.uint64)
        n_found = ops.variable(0, ops.uint64)

        # Cursor over the neighbor list of the edge's from-vertex.
        src_vertex = ops.variable(fromVertex[edge_idx], fromVertex.dtype)
        src_cursor = ops.variable(startEdge[src_vertex], startEdge.dtype)
        src_end = ops.variable(startEdge[src_vertex+1], startEdge.dtype)
        src_neighbor = ops.variable(toVertex[src_cursor], toVertex.dtype)

        # Cursor over the neighbor list of the edge's to-vertex.
        dst_vertex = ops.variable(toVertex[edge_idx], toVertex.dtype)
        dst_cursor = ops.variable(startEdge[dst_vertex], startEdge.dtype)
        dst_end = ops.variable(startEdge[dst_vertex+1], startEdge.dtype)
        dst_neighbor = ops.variable(toVertex[dst_cursor], toVertex.dtype)

        # Upper bound on the merge steps: the combined length of both neighbor lists.
        max_steps = dst_end-dst_cursor + src_end-src_cursor

        # A bounded for loop guarded by a condition stands in for a while loop.
        #TODO([email protected]): Replace this construct by a while loop once it is available in ovl.
        for _ in ops.arange(max_steps):
            # Continue only while both lists have entries left and the from-side neighbor is below the to-vertex.
            keep_merging = ops.logical_and(
                ops.logical_and(src_cursor < src_end, dst_cursor < dst_end),
                src_neighbor < dst_vertex)

            with ops.if_(keep_merging):

                # NOTE(review): each advance below re-reads toVertex at the new cursor without checking it against
                # the list end; confirm that reading one position past the final list is harmless.
                with ops.if_(src_neighbor < dst_neighbor):
                    src_cursor <<= src_cursor+1
                    src_neighbor <<= toVertex[src_cursor]

                with ops.elif_(src_neighbor > dst_neighbor):
                    dst_cursor <<= dst_cursor+1
                    dst_neighbor <<= toVertex[dst_cursor]

                with ops.else_():
                    # Same neighbor on both sides: one more triangle, then advance both cursors.
                    n_found <<= n_found+1
                    src_cursor <<= src_cursor+1
                    src_neighbor <<= toVertex[src_cursor]
                    dst_cursor <<= dst_cursor+1
                    dst_neighbor <<= toVertex[dst_cursor]


        #TODO([email protected]): Use a reduction function that computes a partial or complete sum.
        triangle_counts[edge_idx] = n_found # One count per edge; no summation happens here.

        return triangle_counts
Beispiel #2
0
def expm1(x):
    """
    Define the expm1 operator, an element-wise exp(x) - 1 that avoids the cancellation error of evaluating the
    formula directly for inputs close to zero.

    .. math::
      out_{i} = exp(x_{i}) - 1.0

    Each worker handles one element of ``x`` and picks one of four cases: positive infinity is passed through,
    inputs whose exponential rounds to exactly 1 return the input itself, inputs whose exponential is so small
    that ``exp(x) - 1`` rounds to -1 return -1, and everything else uses ``(exp(x) - 1) * x / log(exp(x))``.

    :param x: The input tensor
    :return: Element-wise exp(x) - 1

    :Examples:

    .. doctest::

        >>> import numpy as np
        >>> from opveclib import evaluate
        >>> from opveclib.examples import expm1
        >>> a = np.array([1e-10, -1e-10])
        >>> evaluate(expm1(a))
        array([  1.00000000e-10,  -1.00000000e-10])
        >>> np.expm1(a)
        array([  1.00000000e-10,  -1.00000000e-10])
        >>> ones = np.ones_like(a)
        >>> np.exp(a) - ones
        array([  1.00000008e-10,  -1.00000008e-10])
    """
    result = ovl.output_like(x)
    idx = ovl.position_in(x.shape)
    x_i = x[idx]
    exp_x = ovl.exp(x_i)

    # The branches below double as a demonstration of OVL's if_/elif_/else_ conditional operators.
    with ovl.if_(ovl.logical_and(ovl.isinf(x_i), x_i > 0.0)):
        # +inf maps to +inf.
        result[idx] = x_i
    with ovl.elif_(exp_x == 1.0):
        # exp(x) rounded to 1, so x itself is the best available answer.
        result[idx] = x_i
    with ovl.elif_((exp_x - 1.0) == -1.0):
        # exp(x) is negligible next to 1; the result saturates at -1.
        result[idx] = -1.0
    with ovl.else_():
        # Rescale by x / log(exp(x)) to compensate for the rounding error in exp(x).
        result[idx] = (exp_x - 1.0) * x_i / ovl.log(exp_x)
    return result
def expm1(x):
    """
    Define the expm1 operator, an element-wise exp(x) - 1 that avoids the cancellation error of evaluating the
    formula directly for inputs close to zero.

    .. math::
      out_{i} = exp(x_{i}) - 1.0

    Each worker handles one element of ``x`` and picks one of four cases: positive infinity is passed through,
    inputs whose exponential rounds to exactly 1 return the input itself, inputs whose exponential is so small
    that ``exp(x) - 1`` rounds to -1 return -1, and everything else uses ``(exp(x) - 1) * x / log(exp(x))``.

    :param x: The input tensor
    :return: Element-wise exp(x) - 1

    :Examples:

    .. doctest::

        >>> import numpy as np
        >>> from opveclib import evaluate
        >>> from opveclib.examples import expm1
        >>> a = np.array([1e-10, -1e-10])
        >>> evaluate(expm1(a))
        array([  1.00000000e-10,  -1.00000000e-10])
        >>> np.expm1(a)
        array([  1.00000000e-10,  -1.00000000e-10])
        >>> ones = np.ones_like(a)
        >>> np.exp(a) - ones
        array([  1.00000008e-10,  -1.00000008e-10])
    """
    result = ovl.output_like(x)
    idx = ovl.position_in(x.shape)
    x_i = x[idx]
    exp_x = ovl.exp(x_i)

    # The branches below double as a demonstration of OVL's if_/elif_/else_ conditional operators.
    with ovl.if_(ovl.logical_and(ovl.isinf(x_i), x_i > 0.0)):
        # +inf maps to +inf.
        result[idx] = x_i
    with ovl.elif_(exp_x == 1.0):
        # exp(x) rounded to 1, so x itself is the best available answer.
        result[idx] = x_i
    with ovl.elif_((exp_x - 1.0) == -1.0):
        # exp(x) is negligible next to 1; the result saturates at -1.
        result[idx] = -1.0
    with ovl.else_():
        # Rescale by x / log(exp(x)) to compensate for the rounding error in exp(x).
        result[idx] = (exp_x - 1.0) * x_i / ovl.log(exp_x)
    return result
Beispiel #4
0
def log1p(x):
    """
    Define the log1p operator, an element-wise log(1 + x) that stays accurate when ``x`` is so small that
    ``1 + x`` loses most of its digits.

    .. math::
      out_{i} = log(1.0 + x_{i})

    Each worker handles one element of ``x``: positive infinity is passed through, inputs for which
    ``(1 + x) - 1`` rounds to zero return the input itself, and everything else uses
    ``log(1 + x) * x / ((1 + x) - 1)``.

    :param x: The input tensor
    :return: Element-wise log(1 + x)

    :Examples:

    .. doctest::

        >>> import numpy as np
        >>> from opveclib import evaluate
        >>> from opveclib.examples import log1p
        >>> a = np.array([1e-99, -1e-99])
        >>> evaluate(log1p(a))
        array([  1.00000000e-99,  -1.00000000e-99])
        >>> np.log1p(a)
        array([  1.00000000e-99,  -1.00000000e-99])
        >>> ones = np.ones_like(a)
        >>> np.log(ones + a)
        array([ 0.,  0.])
    """
    result = ovl.output_like(x)
    idx = ovl.position_in(x.shape)
    x_i = x[idx]
    one_plus_x = 1.0 + x_i
    # The part of x that survived the rounding of 1 + x.
    rounded_x = one_plus_x - 1.0

    # The branches below double as a demonstration of OVL's if_/elif_/else_ conditional operators.
    with ovl.if_(ovl.logical_and(ovl.isinf(x_i), x_i > 0.0)):
        # +inf maps to +inf.
        result[idx] = x_i
    with ovl.elif_(rounded_x == 0):
        # 1 + x rounded to exactly 1, so x itself is the best available answer.
        result[idx] = x_i
    with ovl.else_():
        # Rescale by x / rounded_x to compensate for the rounding error in 1 + x.
        result[idx] = ovl.log(one_plus_x) * x_i / rounded_x
    return result
Beispiel #5
0
def conv_1d(x,
            v,
            kernel_orientation='as-is',
            stride=1,
            mode='same',
            data_format='NCE'):
    """
    Define the operator function of a 1-D convolution / cross-correlation over a batch of multi-channel signals.

    The axis order of both ``x`` and ``v`` is taken from ``data_format``: the position of 'N' selects the batch
    axis of ``x`` and the filter axis of ``v``, 'C' selects the channel axis and 'E' the element axis. Products
    are accumulated in float64 over all channels and cast to the dtype of ``x`` when written to the output.

    :param x: A rank-3 input tensor, [num_batches, num_channels, num_elements] for the default 'NCE' layout.
    :param v: A rank-3 filter/kernel, [num_filters, num_channels, kernel_size] for the default 'NCE' layout.
    :param kernel_orientation: The orientation of the kernel to use: 'as-is' or 'flipped'. This language is used
        rather than 'convolution' or 'cross-correlation' since the terms have become overloaded and ambiguous across
        some fields. As defined in https://en.wikipedia.org/wiki/Cross-correlation#Properties, 'as-is' yields the
        cross-correlation and 'flipped' yields the convolution.
    :param stride: kernel stride to use. Must be an int between 1 and num_elements. Apart from this validation
        the value is not referenced anywhere else in this function.
    :param mode: border mode: 'same', 'valid', or 'full'. The number of output elements is num_elements for
        'same', num_elements - kernel_size + 1 for 'valid' and num_elements + kernel_size - 1 for 'full'.
        Input positions that fail the in-bounds check contribute 0.
    :param data_format: order of the dimensions in the input: 'NCE', 'NEC' etc.
    :return: an output tensor with the dtype of ``x``, laid out according to ``data_format`` with num_batches on
        the 'N' axis, num_filters on the 'C' axis and the mode-dependent number of output elements on the 'E' axis.
    :raises ValueError: if kernel_orientation or mode is not one of the accepted strings, if the channel sizes of
        ``x`` and ``v`` differ, if stride is not an int in [1, num_elements], or if mode is 'same' or 'valid' and
        the kernel is larger than num_elements.
    :raises AssertionError: if ``x`` or ``v`` is not rank 3, or data_format is not a permutation of 'N', 'C', 'E'.
    """

    if kernel_orientation != 'as-is' and kernel_orientation != 'flipped':
        raise ValueError("kernel_orientation must be 'as-is' or 'flipped'")

    # resolve data layout based on data_format input
    assert x.rank == 3
    assert len(data_format) == 3
    assert data_format.count('N') == 1
    assert data_format.count('C') == 1
    assert data_format.count('E') == 1

    n_axis = data_format.find('N')
    c_axis = data_format.find('C')
    e_axis = data_format.find('E')

    num_elements = x.shape[e_axis]
    num_channels = x.shape[c_axis]
    num_batches = x.shape[n_axis]

    # The filter tensor is indexed with the same axis positions as the input.
    assert v.rank == 3
    if num_channels != v.shape[c_axis]:
        raise ValueError(
            'Channel axis size of input must match that of the filter.')

    num_filters = v.shape[n_axis]
    filter_size = v.shape[e_axis]
    # Number of kernel taps to the left / right of the kernel's center tap.
    left_apron = filter_size // 2
    right_apron = filter_size - left_apron - 1

    # NOTE(review): this is the only place stride is used; the message also fires when stride > num_elements.
    if not isinstance(stride, int) or stride < 1 or stride > num_elements:
        raise ValueError('Stride must be a positive integer')

    # starting_element is the input position aligned with kernel tap 0 for output element 0;
    # ending_element - starting_element is the number of output elements.
    if mode == 'same':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ',  cannot be larger than number of elements, ' +
                             str(num_elements))

        starting_element = -left_apron
        ending_element = num_elements - left_apron
    elif mode == 'valid':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ',  cannot be larger than number of elements, ' +
                             str(num_elements))

        starting_element = 0
        ending_element = num_elements - (left_apron + right_apron)
    elif mode == 'full':
        starting_element = -(filter_size - 1)
        ending_element = num_elements
    else:
        raise ValueError("mode must be 'same', 'valid', or 'full'.")

    output_elements = (ending_element - starting_element)

    output_shape = [0, 0, 0]
    output_shape[n_axis] = num_batches
    output_shape[c_axis] = num_filters
    output_shape[e_axis] = output_elements
    output = ovl.output(output_shape, x.dtype)

    # Split the work into blocks: each worker handles up to filters_per_worker filters, batches_per_worker
    # batches and elements_per_worker output elements. The worker counts are rounded up, so the last worker
    # along an axis may get a smaller block (the remainder).
    filters_per_worker = 1
    filter_workers, filter_remainder = divmod(num_filters, filters_per_worker)
    if filter_remainder > 0:
        filter_workers += 1

    batches_per_worker = 1
    batch_workers, batch_remainder = divmod(num_batches, batches_per_worker)
    if batch_remainder > 0:
        batch_workers += 1

    elements_per_worker = 10
    element_workers, element_remainder = divmod(output_elements,
                                                elements_per_worker)
    if element_remainder > 0:
        element_workers += 1

    workgroup_shape = [batch_workers, filter_workers, element_workers]
    ovl.logger.debug(u'    workgroup_shape: ' + str(workgroup_shape))
    pos = ovl.position_in(workgroup_shape)
    cur_batch_block = pos[0]
    cur_filter_block = pos[1]
    cur_element_block = pos[2]

    # Actual block extents for this worker: the full block size, except for the last worker along an axis
    # when the axis length is not a multiple of the block size.
    num_block_batches = ovl.variable(batches_per_worker, ovl.uint32)
    if batch_remainder > 0:
        with ovl.if_(cur_batch_block == batch_workers - 1):
            num_block_batches <<= batch_remainder

    num_block_filters = ovl.variable(filters_per_worker, ovl.uint32)
    if filter_remainder > 0:
        with ovl.if_(cur_filter_block == filter_workers - 1):
            num_block_filters <<= filter_remainder

    num_block_elements = ovl.variable(elements_per_worker, ovl.uint32)
    if element_remainder > 0:
        with ovl.if_(cur_element_block == element_workers - 1):
            num_block_elements <<= element_remainder

    # float64 accumulator for every output of this worker's block; it is summed over all channels below.
    accum = ovl.zeros(
        (batches_per_worker, filters_per_worker, elements_per_worker),
        ovl.float64)  #4*4

    # Per-channel working copies: the kernel taps, and a window of filter_size consecutive inputs per batch.
    filter_block = ovl.zeros((filters_per_worker, filter_size), v.dtype)  #4*10
    input_block = ovl.zeros((batches_per_worker, filter_size), x.dtype)  #4*10
    for cur_channel in ovl.arange(num_channels):

        # load all filters for this channel
        # NOTE(review): this loop runs over filters_per_worker rather than num_block_filters; with
        # filters_per_worker = 1 the two are always equal.
        for intra_block_filter in ovl.arange(filters_per_worker):
            for f_pos in ovl.arange(filter_size):
                filter_index = [None, None, None]
                filter_index[c_axis] = cur_channel
                filter_index[n_axis] = ovl.cast(
                    intra_block_filter,
                    ovl.uint32) + cur_filter_block * filters_per_worker
                if kernel_orientation == 'as-is':
                    filter_index[e_axis] = f_pos
                elif kernel_orientation == 'flipped':
                    # Store the taps in reverse so the accumulation loop below is the same for both orientations.
                    filter_index[e_axis] = filter_size - f_pos - 1
                else:
                    # Not reachable: kernel_orientation was already validated at the top of the function.
                    raise ValueError(
                        "kernel_orientation must be 'as-is' or 'flipped'")
                filter_block[intra_block_filter, f_pos] = v[filter_index]

        # load initial inputs for this channel
        # buffer_head marks the slot of input_block that holds the oldest input of the window; input_block is
        # used as a circular buffer and the head restarts at 0 for every channel.
        buffer_head = ovl.variable(0, ovl.uint32)
        for intra_block_batch in ovl.arange(num_block_batches):
            cur_batch = intra_block_batch + cur_batch_block * batches_per_worker
            for f_pos in ovl.arange(filter_size):
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch

                # NOTE(review): starting_element can be negative while the other terms are cast to uint64;
                # confirm the sum is treated as signed so that the `>= 0` test below is meaningful.
                x_elem_index = starting_element + ovl.cast(
                    cur_element_block * elements_per_worker,
                    ovl.uint64) + ovl.cast(f_pos, ovl.uint64)
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0,
                                                  x_elem_index < num_elements)
                # Positions outside the input are filled with zeros.
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, f_pos] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, f_pos] = 0

        for intra_block_element in ovl.arange(num_block_elements):
            cur_elem = intra_block_element + cur_element_block * elements_per_worker
            for intra_block_batch in ovl.arange(num_block_batches):
                cur_batch = intra_block_batch + cur_batch_block * batches_per_worker
                for intra_block_filter in ovl.arange(num_block_filters):
                    for f_pos in ovl.arange(filter_size):
                        # Tap f_pos pairs with the window entry f_pos slots after the head, wrapping around.
                        x_pos = (buffer_head +
                                 ovl.cast(f_pos, ovl.uint32)) % filter_size
                        cur_x = ovl.cast(input_block[intra_block_batch, x_pos],
                                         ovl.float64)
                        cur_v = ovl.cast(
                            filter_block[intra_block_filter, f_pos],
                            ovl.float64)
                        accum[intra_block_batch, intra_block_filter, intra_block_element] = \
                            accum[intra_block_batch, intra_block_filter, intra_block_element] + cur_x * cur_v

                # load new element
                # The input just past the current window overwrites the oldest slot (the head).
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch
                x_elem_index = starting_element + cur_elem + filter_size
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0,
                                                  x_elem_index < num_elements)
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, buffer_head] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, buffer_head] = 0

            # Advance the head once per output element, after every batch row has received its new input.
            buffer_head <<= (buffer_head + 1) % filter_size

    # Write this worker's accumulated block to the output, cast back to the output dtype.
    for intra_block_batch in ovl.arange(num_block_batches):
        cur_batch = intra_block_batch + cur_batch_block * batches_per_worker
        for intra_block_filter in ovl.arange(num_block_filters):
            cur_filter = intra_block_filter + cur_filter_block * filters_per_worker
            for intra_block_element in ovl.arange(num_block_elements):
                cur_elem = intra_block_element + cur_element_block * elements_per_worker

                output_index = [None, None, None]
                output_index[n_axis] = cur_batch
                output_index[e_axis] = cur_elem
                output_index[c_axis] = cur_filter
                output[output_index] = ovl.cast(
                    accum[intra_block_batch, intra_block_filter,
                          intra_block_element], output.dtype)

    return output
Beispiel #6
0
def graph_triangle_count(startEdge, fromVertex, toVertex):
    """Counts the triangles in an undirected graph.

    Notice that this method assumes that the graph is given as an adjacency list where all lists with vertex neighbors
    are sorted.

    The parallel algorithm uses the following strategy. We map one thread per edge. This is also called the edge-based
    iterator strategy.

    The idea behind the algorithm is:
        1. Go over all edges (u, v).
        2. The neighboring indices for vertex u are N(u) and for vertex v are N(v).
        3. Increment the triangle counter by | N(u) intersect N(v) |, the number of vertices that appear in both
           neighbor lists.

    Only common neighbors smaller than v are counted, because the merge stops as soon as the neighbor of u under
    inspection reaches v. This ordering is meant to avoid counting the same triangle several times; whether each
    triangle ends up counted exactly once also depends on which directed edges the caller supplies, which cannot be
    verified from this function.

    Attributes: None.

    The array toVertex is a flattened list of lists structure, where startEdge encodes the start indices of the
    separate lists: the neighbors of vertex k are toVertex[startEdge[k]:startEdge[k + 1]].

    :param startEdge: Indices into toVertex where edges start.
    :type startEdge: list.
    :param fromVertex: The from-vertex of each edge.
    :type fromVertex: list.
    :param toVertex: The to-vertex of each edge.
    :type toVertex: list.
    :return: Counts of triangles per edge.
    """
    edge_idx = ovl.position_in(toVertex.shape)[0]
    triangle_counts = ovl.output(toVertex.shape, ovl.uint64)
    n_found = ovl.variable(0, ovl.uint64)

    # Cursor over the neighbor list of the edge's from-vertex.
    src_vertex = ovl.variable(fromVertex[edge_idx], fromVertex.dtype)
    src_cursor = ovl.variable(startEdge[src_vertex], startEdge.dtype)
    src_end = ovl.variable(startEdge[src_vertex + 1], startEdge.dtype)
    src_neighbor = ovl.variable(toVertex[src_cursor], toVertex.dtype)

    # Cursor over the neighbor list of the edge's to-vertex.
    dst_vertex = ovl.variable(toVertex[edge_idx], toVertex.dtype)
    dst_cursor = ovl.variable(startEdge[dst_vertex], startEdge.dtype)
    dst_end = ovl.variable(startEdge[dst_vertex + 1], startEdge.dtype)
    dst_neighbor = ovl.variable(toVertex[dst_cursor], toVertex.dtype)

    # Upper bound on the merge steps: the combined length of both neighbor lists.
    max_steps = dst_end - dst_cursor + src_end - src_cursor

    # A bounded for loop guarded by a condition stands in for a while loop.
    #TODO([email protected]): Replace this construct by a while loop once it is available in ovl.
    for _ in ovl.arange(max_steps):
        # Continue only while both lists have entries left and the from-side neighbor is below the to-vertex.
        keep_merging = ovl.logical_and(
            ovl.logical_and(src_cursor < src_end, dst_cursor < dst_end),
            src_neighbor < dst_vertex)

        with ovl.if_(keep_merging):

            # NOTE(review): each advance below re-reads toVertex at the new cursor without checking it against
            # the list end; confirm that reading one position past the final list is harmless.
            with ovl.if_(src_neighbor < dst_neighbor):
                src_cursor <<= src_cursor + 1
                src_neighbor <<= toVertex[src_cursor]

            with ovl.elif_(src_neighbor > dst_neighbor):
                dst_cursor <<= dst_cursor + 1
                dst_neighbor <<= toVertex[dst_cursor]

            with ovl.else_():
                # Same neighbor on both sides: one more triangle, then advance both cursors.
                n_found <<= n_found + 1
                src_cursor <<= src_cursor + 1
                dst_cursor <<= dst_cursor + 1
                src_neighbor <<= toVertex[src_cursor]
                dst_neighbor <<= toVertex[dst_cursor]

    #TODO([email protected]): Use a reduction function that computes a partial or complete sum.
    triangle_counts[edge_idx] = n_found  # One count per edge; no summation happens here.

    return triangle_counts
Beispiel #7
0
def conv_1d(x, v, kernel_orientation='as-is', stride=1, mode='same', data_format='NCE'):
    """
    Define the operator function of a 1-D convolution / cross-correlation over a batch of multi-channel signals.

    The axis order of both ``x`` and ``v`` is taken from ``data_format``: the position of 'N' selects the batch
    axis of ``x`` and the filter axis of ``v``, 'C' selects the channel axis and 'E' the element axis. Products
    are accumulated in float64 over all channels and cast to the dtype of ``x`` when written to the output.

    :param x: A rank-3 input tensor, [num_batches, num_channels, num_elements] for the default 'NCE' layout.
    :param v: A rank-3 filter/kernel, [num_filters, num_channels, kernel_size] for the default 'NCE' layout.
    :param kernel_orientation: The orientation of the kernel to use: 'as-is' or 'flipped'. This language is used
        rather than 'convolution' or 'cross-correlation' since the terms have become overloaded and ambiguous across
        some fields. As defined in https://en.wikipedia.org/wiki/Cross-correlation#Properties, 'as-is' yields the
        cross-correlation and 'flipped' yields the convolution.
    :param stride: kernel stride to use. Must be an int between 1 and num_elements. Apart from this validation
        the value is not referenced anywhere else in this function.
    :param mode: border mode: 'same', 'valid', or 'full'. The number of output elements is num_elements for
        'same', num_elements - kernel_size + 1 for 'valid' and num_elements + kernel_size - 1 for 'full'.
        Input positions that fail the in-bounds check contribute 0.
    :param data_format: order of the dimensions in the input: 'NCE', 'NEC' etc.
    :return: an output tensor with the dtype of ``x``, laid out according to ``data_format`` with num_batches on
        the 'N' axis, num_filters on the 'C' axis and the mode-dependent number of output elements on the 'E' axis.
    :raises ValueError: if kernel_orientation or mode is not one of the accepted strings, if the channel sizes of
        ``x`` and ``v`` differ, if stride is not an int in [1, num_elements], or if mode is 'same' or 'valid' and
        the kernel is larger than num_elements.
    :raises AssertionError: if ``x`` or ``v`` is not rank 3, or data_format is not a permutation of 'N', 'C', 'E'.
    """

    if kernel_orientation != 'as-is' and kernel_orientation != 'flipped':
        raise ValueError("kernel_orientation must be 'as-is' or 'flipped'")

    # resolve data layout based on data_format input
    assert x.rank == 3
    assert len(data_format) == 3
    assert data_format.count('N') == 1
    assert data_format.count('C') == 1
    assert data_format.count('E') == 1

    n_axis = data_format.find('N')
    c_axis = data_format.find('C')
    e_axis = data_format.find('E')

    num_elements = x.shape[e_axis]
    num_channels = x.shape[c_axis]
    num_batches = x.shape[n_axis]

    # The filter tensor is indexed with the same axis positions as the input.
    assert v.rank == 3
    if num_channels != v.shape[c_axis]:
        raise ValueError('Channel axis size of input must match that of the filter.')

    num_filters = v.shape[n_axis]
    filter_size = v.shape[e_axis]
    # Number of kernel taps to the left / right of the kernel's center tap.
    left_apron = filter_size // 2
    right_apron = filter_size - left_apron - 1

    # NOTE(review): this is the only place stride is used; the message also fires when stride > num_elements.
    if not isinstance(stride, int) or stride < 1 or stride > num_elements:
        raise ValueError('Stride must be a positive integer')

    # starting_element is the input position aligned with kernel tap 0 for output element 0;
    # ending_element - starting_element is the number of output elements.
    if mode == 'same':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ',  cannot be larger than number of elements, ' + str(num_elements))

        starting_element = -left_apron
        ending_element = num_elements - left_apron
    elif mode == 'valid':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ',  cannot be larger than number of elements, ' + str(num_elements))

        starting_element = 0
        ending_element = num_elements - (left_apron + right_apron)
    elif mode == 'full':
        starting_element = -(filter_size - 1)
        ending_element = num_elements
    else:
        raise ValueError("mode must be 'same', 'valid', or 'full'.")

    output_elements = (ending_element - starting_element)

    output_shape = [0, 0, 0]
    output_shape[n_axis] = num_batches
    output_shape[c_axis] = num_filters
    output_shape[e_axis] = output_elements
    output = ovl.output(output_shape, x.dtype)

    # Split the work into blocks: each worker handles up to filters_per_worker filters, batches_per_worker
    # batches and elements_per_worker output elements. The worker counts are rounded up, so the last worker
    # along an axis may get a smaller block (the remainder).
    filters_per_worker = 1
    filter_workers, filter_remainder = divmod(num_filters, filters_per_worker)
    if filter_remainder > 0:
        filter_workers += 1

    batches_per_worker = 1
    batch_workers, batch_remainder = divmod(num_batches, batches_per_worker)
    if batch_remainder > 0:
        batch_workers += 1

    elements_per_worker = 10
    element_workers, element_remainder = divmod(output_elements, elements_per_worker)
    if element_remainder > 0:
        element_workers += 1

    workgroup_shape = [batch_workers, filter_workers, element_workers]
    ovl.logger.debug(u'    workgroup_shape: ' + str(workgroup_shape))
    pos = ovl.position_in(workgroup_shape)
    cur_batch_block = pos[0]
    cur_filter_block = pos[1]
    cur_element_block = pos[2]

    # Actual block extents for this worker: the full block size, except for the last worker along an axis
    # when the axis length is not a multiple of the block size.
    num_block_batches = ovl.variable(batches_per_worker, ovl.uint32)
    if batch_remainder > 0:
        with ovl.if_(cur_batch_block == batch_workers-1):
            num_block_batches <<= batch_remainder

    num_block_filters = ovl.variable(filters_per_worker, ovl.uint32)
    if filter_remainder > 0:
        with ovl.if_(cur_filter_block == filter_workers-1):
            num_block_filters <<= filter_remainder

    num_block_elements = ovl.variable(elements_per_worker, ovl.uint32)
    if element_remainder > 0:
        with ovl.if_(cur_element_block == element_workers-1):
            num_block_elements <<= element_remainder

    # float64 accumulator for every output of this worker's block; it is summed over all channels below.
    accum = ovl.zeros((batches_per_worker, filters_per_worker, elements_per_worker), ovl.float64) #4*4

    # Per-channel working copies: the kernel taps, and a window of filter_size consecutive inputs per batch.
    filter_block = ovl.zeros((filters_per_worker, filter_size), v.dtype)  #4*10
    input_block = ovl.zeros((batches_per_worker, filter_size), x.dtype)  #4*10
    for cur_channel in ovl.arange(num_channels):

        # load all filters for this channel
        # NOTE(review): this loop runs over filters_per_worker rather than num_block_filters; with
        # filters_per_worker = 1 the two are always equal.
        for intra_block_filter in ovl.arange(filters_per_worker):
            for f_pos in ovl.arange(filter_size):
                filter_index = [None, None, None]
                filter_index[c_axis] = cur_channel
                filter_index[n_axis] = ovl.cast(intra_block_filter, ovl.uint32) + cur_filter_block * filters_per_worker
                if kernel_orientation == 'as-is':
                    filter_index[e_axis] = f_pos
                elif kernel_orientation == 'flipped':
                    # Store the taps in reverse so the accumulation loop below is the same for both orientations.
                    filter_index[e_axis] = filter_size - f_pos - 1
                else:
                    # Not reachable: kernel_orientation was already validated at the top of the function.
                    raise ValueError("kernel_orientation must be 'as-is' or 'flipped'")
                filter_block[intra_block_filter, f_pos] = v[filter_index]

        # load initial inputs for this channel
        # buffer_head marks the slot of input_block that holds the oldest input of the window; input_block is
        # used as a circular buffer and the head restarts at 0 for every channel.
        buffer_head = ovl.variable(0, ovl.uint32)
        for intra_block_batch in ovl.arange(num_block_batches):
            cur_batch = intra_block_batch + cur_batch_block*batches_per_worker
            for f_pos in ovl.arange(filter_size):
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch

                # NOTE(review): starting_element can be negative while the other terms are cast to uint64;
                # confirm the sum is treated as signed so that the `>= 0` test below is meaningful.
                x_elem_index = starting_element + ovl.cast(cur_element_block * elements_per_worker, ovl.uint64) + ovl.cast(f_pos, ovl.uint64)
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0, x_elem_index < num_elements)
                # Positions outside the input are filled with zeros.
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, f_pos] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, f_pos] = 0

        for intra_block_element in ovl.arange(num_block_elements):
            cur_elem = intra_block_element + cur_element_block*elements_per_worker
            for intra_block_batch in ovl.arange(num_block_batches):
                cur_batch = intra_block_batch + cur_batch_block*batches_per_worker
                for intra_block_filter in ovl.arange(num_block_filters):
                    for f_pos in ovl.arange(filter_size):
                        # Tap f_pos pairs with the window entry f_pos slots after the head, wrapping around.
                        x_pos = (buffer_head + ovl.cast(f_pos, ovl.uint32)) % filter_size
                        cur_x = ovl.cast(input_block[intra_block_batch, x_pos], ovl.float64)
                        cur_v = ovl.cast(filter_block[intra_block_filter, f_pos], ovl.float64)
                        accum[intra_block_batch, intra_block_filter, intra_block_element] = \
                            accum[intra_block_batch, intra_block_filter, intra_block_element] + cur_x * cur_v

                # load new element
                # The input just past the current window overwrites the oldest slot (the head).
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch
                x_elem_index = starting_element + cur_elem + filter_size
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0, x_elem_index < num_elements)
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, buffer_head] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, buffer_head] = 0

            # Advance the head once per output element, after every batch row has received its new input.
            buffer_head <<= (buffer_head + 1) % filter_size

    # Write this worker's accumulated block to the output, cast back to the output dtype.
    for intra_block_batch in ovl.arange(num_block_batches):
        cur_batch = intra_block_batch + cur_batch_block*batches_per_worker
        for intra_block_filter in ovl.arange(num_block_filters):
            cur_filter = intra_block_filter + cur_filter_block*filters_per_worker
            for intra_block_element in ovl.arange(num_block_elements):
                cur_elem = intra_block_element + cur_element_block*elements_per_worker

                output_index = [None, None, None]
                output_index[n_axis] = cur_batch
                output_index[e_axis] = cur_elem
                output_index[c_axis] = cur_filter
                output[output_index] = ovl.cast(accum[intra_block_batch, intra_block_filter, intra_block_element],
                                                output.dtype)

    return output
def triangles_op(startEdge, fromVertex, toVertex):
    """Counts the triangles in an undirected graph.

    Notice that this method assumes that the graph is given as an adjacency list where all lists with vertex neighbors
    are sorted.

    The parallel algorithm uses the following strategy. We map one thread per edge. This is also called the edge-based
    iterator strategy.

    The idea behind the algorithm is:
        1. Go over all edges (u, v).
        2. The neighboring indices for vertex u are N(u) and for vertex v are N(v).
        3. Increment the triangle counter by | N(u) intersect N(v) |, the number of vertices that appear in both
           neighbor lists.

    Only common neighbors smaller than v are counted, because the merge stops as soon as the neighbor of u under
    inspection reaches v. This ordering is meant to avoid counting the same triangle several times; whether each
    triangle ends up counted exactly once also depends on which directed edges the caller supplies, which cannot be
    verified from this function.

    Attributes: None.

    The array toVertex is a flattened list of lists structure, where startEdge encodes the start indices of the
    separate lists: the neighbors of vertex k are toVertex[startEdge[k]:startEdge[k + 1]].

    :param startEdge: Indices into toVertex where edges start.
    :type startEdge: list.
    :param fromVertex: The from-vertex of each edge.
    :type fromVertex: list.
    :param toVertex: The to-vertex of each edge.
    :type toVertex: list.
    :return: Counts of triangles per edge.
    """
    edge_idx = ovl.position_in(toVertex.shape)[0]
    triangle_counts = ovl.output(toVertex.shape, ovl.uint64)
    n_found = ovl.variable(0, ovl.uint64)

    # Cursor over the neighbor list of the edge's from-vertex.
    src_vertex = ovl.variable(fromVertex[edge_idx], fromVertex.dtype)
    src_cursor = ovl.variable(startEdge[src_vertex], startEdge.dtype)
    src_end = ovl.variable(startEdge[src_vertex + 1], startEdge.dtype)
    src_neighbor = ovl.variable(toVertex[src_cursor], toVertex.dtype)

    # Cursor over the neighbor list of the edge's to-vertex.
    dst_vertex = ovl.variable(toVertex[edge_idx], toVertex.dtype)
    dst_cursor = ovl.variable(startEdge[dst_vertex], startEdge.dtype)
    dst_end = ovl.variable(startEdge[dst_vertex + 1], startEdge.dtype)
    dst_neighbor = ovl.variable(toVertex[dst_cursor], toVertex.dtype)

    # Upper bound on the merge steps: the combined length of both neighbor lists.
    max_steps = dst_end-dst_cursor + src_end-src_cursor

    # A bounded for loop guarded by a condition stands in for a while loop.
    #TODO([email protected]): Replace this construct by a while loop once it is available in ovl.
    for _ in ovl.arange(max_steps):
        # Continue only while both lists have entries left and the from-side neighbor is below the to-vertex.
        keep_merging = ovl.logical_and(
            ovl.logical_and(src_cursor < src_end, dst_cursor < dst_end),
            src_neighbor < dst_vertex)

        with ovl.if_(keep_merging):

            # NOTE(review): each advance below re-reads toVertex at the new cursor without checking it against
            # the list end; confirm that reading one position past the final list is harmless.
            with ovl.if_(src_neighbor < dst_neighbor):
                src_cursor <<= src_cursor+1
                src_neighbor <<= toVertex[src_cursor]

            with ovl.elif_(src_neighbor > dst_neighbor):
                dst_cursor <<= dst_cursor+1
                dst_neighbor <<= toVertex[dst_cursor]

            with ovl.else_():
                # Same neighbor on both sides: one more triangle, then advance both cursors.
                n_found <<= n_found+1
                src_cursor <<= src_cursor+1
                dst_cursor <<= dst_cursor+1
                src_neighbor <<= toVertex[src_cursor]
                dst_neighbor <<= toVertex[dst_cursor]


    #TODO([email protected]): Use a reduction function that computes a partial or complete sum.
    triangle_counts[edge_idx] = n_found # One count per edge; no summation happens here.

    return triangle_counts