def op(self, data, center):
    """Assign every data point to its nearest cluster center.

    Thread pool over nData: one worker handles one data point. This may
    generate too many threads for some input data. Each worker walks over all
    centers, sums the squared per-dimension differences between its data point
    and the center, and remembers the index of the smallest sum. The
    comparison is strict, so on ties the first center encountered wins.

    :param data: 2D matrix as data input with dimensions: nDim x nData.
    :type data: numpy array.
    :param center: 2D matrix of initial cluster centers with dimensions:
        nDim x nCenter.
    :type center: numpy array.
    :return minIndex: 1D matrix of assignments of data points to cluster
        centers: nData x 1. The indices are stored as float64.
    :raises AssertionError: if data and center differ in their first dimension.
    """
    nDimData = data.shape[0]
    nDimCenter = center.shape[0]
    nData = data.shape[1]
    nCenter = center.shape[1]

    # The original message used "% dimensions", which Python reads as the
    # "% d" conversion and prints as "<space><number>imensions".
    assert nDimData == nDimCenter, \
        "Data has %d dimensions and centers have %d dimensions, but these must match!" \
        % (nDimData, nDimCenter)

    nDim = nDimData
    iSample = ops.position_in(nData)[0]

    # Start from the largest Python float so that the first center always wins.
    # NOTE(review): sys.float_info.max is a double-precision limit; confirm how
    # ops.variable handles it when data.dtype is narrower.
    minDist = ops.variable(sys.float_info.max, data.dtype)
    iMin = ops.variable(0, ops.int64)

    for iCenter in ops.arange(nCenter):
        dist = ops.variable(0, center.dtype)
        for iDim in ops.arange(nDim):
            dist <<= dist + (data[iDim, iSample] - center[iDim, iCenter]) * (data[iDim, iSample] - center[iDim, iCenter])
        with ops.if_(dist < minDist):
            iMin <<= iCenter
            minDist <<= dist

    #TODO([email protected]): Change this to uint64 whenever ovl supports non-floating point types for tensors.
    minIndex = ops.output(nData, ops.float64)  # Use float64 because tensorflow does not support uint64 as type yet.
    minIndex[iSample] = ops.cast(iMin, ops.float64)
    return minIndex
def op(self, startEdge, fromVertex, toVertex):
    """The definition of the operator function.

    One worker is created per entry of toVertex, i.e. per edge. The worker
    walks two neighbor lists in lock-step, the list of the edge's from-vertex
    and the list of its to-vertex, and counts the entries that occur in both.

    The array toVertex is a flattened list of lists structure, where startEdge
    encodes the start indices of the separate lists.

    :param startEdge: Indices into toVertex where edges start.
    :type startEdge: list.
    :param fromVertex: The from-vertex of each edge.
    :type fromVertex: list.
    :param toVertex: The to-vertex of each edge.
    :type toVertex: list.
    :return: Counts of triangles per edge.
    """
    edge = ops.position_in(toVertex.shape)[0]
    count = ops.output(toVertex.shape, ops.uint64)
    triangles = ops.variable(0, ops.uint64)

    # Cursor, end marker and current entry of the from-vertex neighbor list.
    src_vertex = ops.variable(fromVertex[edge], fromVertex.dtype)
    src_cursor = ops.variable(startEdge[src_vertex], startEdge.dtype)
    src_stop = ops.variable(startEdge[src_vertex + 1], startEdge.dtype)
    src_neighbor = ops.variable(toVertex[src_cursor], toVertex.dtype)

    # Cursor, end marker and current entry of the to-vertex neighbor list.
    dst_vertex = ops.variable(toVertex[edge], toVertex.dtype)
    dst_cursor = ops.variable(startEdge[dst_vertex], startEdge.dtype)
    dst_stop = ops.variable(startEdge[dst_vertex + 1], startEdge.dtype)
    dst_neighbor = ops.variable(toVertex[dst_cursor], toVertex.dtype)

    # Maximum number of merges: the combined length of both lists.
    max_steps = dst_stop - dst_cursor + src_stop - src_cursor

    # This construction is a work-around for simulating the function of a while loop.
    #TODO([email protected]): Replace this construct by a while loop once it is available in ovl.
    for _step in ops.arange(max_steps):
        # Continue only while both lists have entries left and the current
        # from-list entry is still below the edge's to-vertex.
        keep_merging = ops.logical_and(
            ops.logical_and(src_cursor < src_stop, dst_cursor < dst_stop),
            src_neighbor < dst_vertex)
        with ops.if_(keep_merging):
            with ops.if_(src_neighbor < dst_neighbor):
                src_cursor <<= src_cursor + 1
                src_neighbor <<= toVertex[src_cursor]
            with ops.elif_(src_neighbor > dst_neighbor):
                dst_cursor <<= dst_cursor + 1
                dst_neighbor <<= toVertex[dst_cursor]
            with ops.else_():
                # Same vertex in both lists: one more triangle for this edge.
                triangles <<= triangles + 1
                src_cursor <<= src_cursor + 1
                dst_cursor <<= dst_cursor + 1
                src_neighbor <<= toVertex[src_cursor]
                dst_neighbor <<= toVertex[dst_cursor]

    #TODO([email protected]): Use a reduction function that computes a partial or complete sum.
    count[edge] = triangles  # Save the triangles for each edge.
    return count
def op(self, data, minIndex, nCenter):
    """Recompute each cluster center as the mean of its assigned data points.

    Thread pool over nCenter: one worker handles one center. That should be
    fine for most cases. Each worker scans all data points, sums those whose
    entry in minIndex equals the worker's center index, and divides the sum
    by the number of matching points.

    :param data: 2D matrix as data input with dimensions: nDim x nData.
    :type data: numpy array.
    :param minIndex: 1D matrix of assignments of data points to cluster
        centers: nData x 1.
    :type minIndex: numpy array.
    :param nCenter: Number of cluster centers; sets the number of workers and
        the second dimension of the result.
    :return: 2D matrix with computed cluster centers with dimensions:
        nDim x nCenter.
    :raises AssertionError: if minIndex does not hold one entry per data point.
    """
    nDim = data.shape[0]
    nData = data.shape[1]

    # The original message called the second argument "minDist"; the value
    # being checked is minIndex.
    assert nData == minIndex.shape[0], \
        "Data has %d values and minIndex has %d values, but these must match!" \
        % (nData, minIndex.shape[0])

    iCenter = ops.position_in(nCenter)[0]
    center = ops.zeros([nDim, nCenter], data.dtype)
    count = ops.variable(0, data.dtype)

    for iSample in ops.arange(nData):
        # minIndex is cast to an unsigned integer before the comparison.
        with ops.if_(iCenter == ops.cast(minIndex[iSample], ops.uint32)):
            count <<= count + 1
            for iDim in ops.arange(nDim):
                center[iDim, iCenter] = center[iDim, iCenter] + data[iDim, iSample]

    newCenter = ops.output([nDim, nCenter], data.dtype)
    for iDim in ops.arange(nDim):
        # NOTE(review): count stays 0 for a center with no assigned points, so
        # this divides by zero in that case -- confirm the intended result.
        newCenter[iDim, iCenter] = center[iDim, iCenter] / count
    return newCenter
def sig_grad(arg):
    """Evaluate exp(-arg) / (1 + exp(-arg))**2 for arguments inside (-50, 50).

    The expression is the derivative of the logistic sigmoid. Arguments at or
    outside the open interval (-50, 50) skip the exponential and yield 0.

    :param arg: The value at which the expression is evaluated.
    :return: A variable of arg's dtype holding the result, or 0 when arg is
        out of range.
    """
    in_range = ovl.logical_and(arg > -50, arg < 50)
    grad = ovl.variable(0, arg.dtype)
    with ovl.if_(in_range):
        exp_neg = ovl.exp(-arg)
        grad <<= exp_neg / ((1 + exp_neg) * (1 + exp_neg))
    return grad
def accumulate(x, inner_fcn=None, axis=None):
    """
    Define the operator function.

    A running accumulation is computed along one axis of x: the first output
    element along that axis equals the first input element, and every further
    output element is inner_fcn(previous result, current input element).

    :param x: The input tensor
    :param inner_fcn: a lambda function to be applied for accumulation; it is
        called as inner_fcn(accumulator, element). The default of None is not
        callable, so a function has to be supplied.
    :param axis: The axis across which accumulation will be applied; an int
        with 0 <= axis < x.rank
    :return: The accumulated result
    :raises AssertionError: if axis is not an int or is outside [0, x.rank)
    """
    # assert that the axis parameter makes sense
    assert isinstance(axis, int)
    assert axis >= 0
    assert axis < x.rank

    # Define the workgroup shape. Here we use a single worker to perform the accumulation across the
    # accumulation axis. The workgroup shape is then the size of all other axes with accumulation axis removed.
    # The rank is compared by value: "is 1" tested object identity against an int literal.
    if x.rank == 1:
        workgroup_shape = [1]
    else:
        workgroup_shape = [num_elements
                           for cur_dim, num_elements in enumerate(x.shape)
                           if cur_dim != axis]

    pos = ovl.position_in(workgroup_shape)

    # Define the accumulated output to be the same type as the input
    out = ovl.output_like(x)

    # Define a function for determining the index of the input tensor as a function of accumulation axis position
    # and the current worker position. This is equal to the worker position with the accumulation axis position
    # inserted where it should be in the indexing order.
    def resolve_position(axis_n):
        cur_pos = []
        offset = 0
        for cur_dim in range(x.rank):
            if cur_dim == axis:
                cur_pos.append(axis_n)
                # every later dimension sits one slot earlier in pos
                offset = 1
            else:
                cur_pos.append(pos[cur_dim - offset])
        return cur_pos

    # initialize accumulator to be the first element along the accumulation axis
    initial_value = x[resolve_position(0)]
    accum = ovl.variable(initial_value, x.dtype)
    out[resolve_position(0)] = accum

    # use this worker to iterate over and accumulate the rest of the elements in the accumulation axis
    for i in ovl.arange(1, x.shape[axis]):
        accum <<= inner_fcn(accum, x[resolve_position(i)])
        out[resolve_position(i)] = accum

    return out
def accumulate(x, inner_fcn=None, axis=None):
    """
    Define the operator function.

    A running accumulation is computed along one axis of x: the first output
    element along that axis equals the first input element, and every further
    output element is inner_fcn(previous result, current input element).

    :param x: The input tensor
    :param inner_fcn: a lambda function to be applied for accumulation; it is
        called as inner_fcn(accumulator, element). The default of None is not
        callable, so a function has to be supplied.
    :param axis: The axis across which accumulation will be applied; an int
        with 0 <= axis < x.rank
    :return: The accumulated result
    :raises AssertionError: if axis is not an int or is outside [0, x.rank)
    """
    # assert that the axis parameter makes sense
    assert isinstance(axis, int)
    assert axis >= 0
    assert axis < x.rank

    # Define the workgroup shape. Here we use a single worker to perform the accumulation across the
    # accumulation axis. The workgroup shape is then the size of all other axes with accumulation axis removed.
    # The rank is compared by value: "is 1" tested object identity against an int literal.
    if x.rank == 1:
        workgroup_shape = [1]
    else:
        workgroup_shape = [num_elements
                           for cur_dim, num_elements in enumerate(x.shape)
                           if cur_dim != axis]

    pos = ovl.position_in(workgroup_shape)

    # Define the accumulated output to be the same type as the input
    out = ovl.output_like(x)

    # Define a function for determining the index of the input tensor as a function of accumulation axis position
    # and the current worker position. This is equal to the worker position with the accumulation axis position
    # inserted where it should be in the indexing order.
    def resolve_position(axis_n):
        cur_pos = []
        offset = 0
        for cur_dim in range(x.rank):
            if cur_dim == axis:
                cur_pos.append(axis_n)
                # every later dimension sits one slot earlier in pos
                offset = 1
            else:
                cur_pos.append(pos[cur_dim - offset])
        return cur_pos

    # initialize accumulator to be the first element along the accumulation axis
    initial_value = x[resolve_position(0)]
    accum = ovl.variable(initial_value, x.dtype)
    out[resolve_position(0)] = accum

    # use this worker to iterate over and accumulate the rest of the elements in the accumulation axis
    for i in ovl.arange(1, x.shape[axis]):
        accum <<= inner_fcn(accum, x[resolve_position(i)])
        out[resolve_position(i)] = accum

    return out
def conv_1d(x, v, kernel_orientation='as-is', stride=1, mode='same', data_format='NCE'):
    """
    Define the operator function.

    Each worker handles a block of one batch entry, one filter and up to ten
    consecutive output elements. For every channel it loads the filter taps
    and the first filter_size inputs into local buffers, then slides over its
    output elements, accumulating input * tap products in float64.

    :param x: An input tensor of rank 3. With the default data_format its shape
        is [num_batches, num_channels, num_elements].
    :param v: A filter/kernel of rank 3. It is indexed with the same axis
        positions as x, so with the default data_format its shape is
        [num_filters, num_channels, kernel_size].
    :param kernel_orientation: The orientation of the kernel to use: 'as-is' or 'flipped'.
        This language is used rather than 'convolution' or 'cross-correlation' since the terms have become overloaded
        and ambiguous across some fields. As defined in https://en.wikipedia.org/wiki/Cross-correlation#Properties,
        'as-is' yields the cross-correlation and 'flipped' yields the convolution.
    :param stride: kernel stride to use. NOTE(review): the value is validated
        below but not used anywhere else in this function.
    :param mode: border mode: 'same', 'valid', or 'full'
    :param data_format: order of the dimensions in the input: 'NCE', 'NEC' etc.
    :return: an output tensor of x's dtype. Its element axis has length
        num_elements for 'same', num_elements - kernel_size + 1 for 'valid' and
        num_elements + kernel_size - 1 for 'full'; the channel axis holds
        num_filters entries.
    :raises ValueError: for an unknown kernel_orientation or mode, a channel
        count mismatch between x and v, a stride that is not an int in
        [1, num_elements], or (modes 'same' and 'valid') a filter longer than
        the input.
    :raises AssertionError: if x or v is not rank 3 or data_format is not a
        permutation of 'N', 'C' and 'E'.
    """
    if kernel_orientation != 'as-is' and kernel_orientation != 'flipped':
        raise ValueError("kernel_orientation must be 'as-is' or 'flipped'")

    # resolve data layout based on data_format input
    assert x.rank == 3
    assert len(data_format) == 3
    assert data_format.count('N') == 1
    assert data_format.count('C') == 1
    assert data_format.count('E') == 1
    n_axis = data_format.find('N')
    c_axis = data_format.find('C')
    e_axis = data_format.find('E')
    num_elements = x.shape[e_axis]
    num_channels = x.shape[c_axis]
    num_batches = x.shape[n_axis]

    # the filter is read with the same axis positions as the input
    assert v.rank == 3
    if num_channels != v.shape[c_axis]:
        raise ValueError(
            'Channel axis size of input must match that of the filter.')
    num_filters = v.shape[n_axis]
    filter_size = v.shape[e_axis]

    # number of filter taps to the left and right of the center tap
    left_apron = filter_size // 2
    right_apron = filter_size - left_apron - 1

    if not isinstance(stride, int) or stride < 1 or stride > num_elements:
        raise ValueError('Stride must be a positive integer')

    # starting_element is the input index aligned with the first filter tap for
    # output element 0; ending_element - starting_element is the output length
    if mode == 'same':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ', cannot be larger than number of elements, ' +
                             str(num_elements))
        starting_element = -left_apron
        ending_element = num_elements - left_apron
    elif mode == 'valid':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ', cannot be larger than number of elements, ' +
                             str(num_elements))
        starting_element = 0
        ending_element = num_elements - (left_apron + right_apron)
    elif mode == 'full':
        starting_element = -(filter_size - 1)
        ending_element = num_elements
    else:
        raise ValueError("mode must be 'same', 'valid', or 'full'.")

    output_elements = (ending_element - starting_element)
    output_shape = [0, 0, 0]
    output_shape[n_axis] = num_batches
    output_shape[c_axis] = num_filters
    output_shape[e_axis] = output_elements
    output = ovl.output(output_shape, x.dtype)

    # split the work into blocks; each *_workers count is rounded up so that a
    # partial last block is still covered
    filters_per_worker = 1
    filter_workers, filter_remainder = divmod(num_filters, filters_per_worker)
    if filter_remainder > 0:
        filter_workers += 1
    batches_per_worker = 1
    batch_workers, batch_remainder = divmod(num_batches, batches_per_worker)
    if batch_remainder > 0:
        batch_workers += 1
    elements_per_worker = 10
    element_workers, element_remainder = divmod(output_elements, elements_per_worker)
    if element_remainder > 0:
        element_workers += 1
    workgroup_shape = [batch_workers, filter_workers, element_workers]
    ovl.logger.debug(u' workgroup_shape: ' + str(workgroup_shape))

    pos = ovl.position_in(workgroup_shape)
    cur_batch_block = pos[0]
    cur_filter_block = pos[1]
    cur_element_block = pos[2]

    # the last worker along each axis only processes the remainder, if any
    num_block_batches = ovl.variable(batches_per_worker, ovl.uint32)
    if batch_remainder > 0:
        with ovl.if_(cur_batch_block == batch_workers - 1):
            num_block_batches <<= batch_remainder
    num_block_filters = ovl.variable(filters_per_worker, ovl.uint32)
    if filter_remainder > 0:
        with ovl.if_(cur_filter_block == filter_workers - 1):
            num_block_filters <<= filter_remainder
    num_block_elements = ovl.variable(elements_per_worker, ovl.uint32)
    if element_remainder > 0:
        with ovl.if_(cur_element_block == element_workers - 1):
            num_block_elements <<= element_remainder

    # per-worker buffers. The original size notes (4*4, 4*10, 4*10) do not
    # match the block sizes set above (1, 1 and 10) -- presumably stale.
    accum = ovl.zeros(
        (batches_per_worker, filters_per_worker, elements_per_worker),
        ovl.float64)  #4*4
    filter_block = ovl.zeros((filters_per_worker, filter_size), v.dtype)  #4*10
    input_block = ovl.zeros((batches_per_worker, filter_size), x.dtype)  #4*10

    for cur_channel in ovl.arange(num_channels):

        # load all filters for this channel
        for intra_block_filter in ovl.arange(filters_per_worker):
            for f_pos in ovl.arange(filter_size):
                filter_index = [None, None, None]
                filter_index[c_axis] = cur_channel
                filter_index[n_axis] = ovl.cast(
                    intra_block_filter,
                    ovl.uint32) + cur_filter_block * filters_per_worker
                if kernel_orientation == 'as-is':
                    filter_index[e_axis] = f_pos
                elif kernel_orientation == 'flipped':
                    # read the taps back to front
                    filter_index[e_axis] = filter_size - f_pos - 1
                else:
                    # not reachable: the orientation was validated on entry
                    raise ValueError(
                        "kernel_orientation must be 'as-is' or 'flipped'")
                filter_block[intra_block_filter, f_pos] = v[filter_index]

        # load initial inputs for this channel
        # buffer_head marks the oldest slot of input_block, which is used as a
        # circular buffer of filter_size inputs
        buffer_head = ovl.variable(0, ovl.uint32)
        for intra_block_batch in ovl.arange(num_block_batches):
            cur_batch = intra_block_batch + cur_batch_block * batches_per_worker
            for f_pos in ovl.arange(filter_size):
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch
                # NOTE(review): starting_element can be negative while the other
                # terms are cast to uint64; whether the ">= 0" test below works
                # as intended depends on ovl's type promotion -- confirm.
                x_elem_index = starting_element + ovl.cast(
                    cur_element_block * elements_per_worker,
                    ovl.uint64) + ovl.cast(f_pos, ovl.uint64)
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0,
                                                  x_elem_index < num_elements)
                # positions outside the input are treated as zeros
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, f_pos] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, f_pos] = 0

        for intra_block_element in ovl.arange(num_block_elements):
            cur_elem = intra_block_element + cur_element_block * elements_per_worker
            for intra_block_batch in ovl.arange(num_block_batches):
                cur_batch = intra_block_batch + cur_batch_block * batches_per_worker
                for intra_block_filter in ovl.arange(num_block_filters):
                    for f_pos in ovl.arange(filter_size):
                        # tap f_pos pairs with the f_pos-th oldest buffered input
                        x_pos = (buffer_head + ovl.cast(f_pos, ovl.uint32)) % filter_size
                        cur_x = ovl.cast(input_block[intra_block_batch, x_pos], ovl.float64)
                        cur_v = ovl.cast(
                            filter_block[intra_block_filter, f_pos], ovl.float64)
                        accum[intra_block_batch, intra_block_filter, intra_block_element] = \
                            accum[intra_block_batch, intra_block_filter, intra_block_element] + cur_x * cur_v

                # load new element
                # the next input overwrites the oldest slot of the buffer
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch
                x_elem_index = starting_element + cur_elem + filter_size
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0,
                                                  x_elem_index < num_elements)
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, buffer_head] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, buffer_head] = 0

            # advance the circular buffer by one input
            buffer_head <<= (buffer_head + 1) % filter_size

    # write the accumulated sums, cast back to the output dtype
    for intra_block_batch in ovl.arange(num_block_batches):
        cur_batch = intra_block_batch + cur_batch_block * batches_per_worker
        for intra_block_filter in ovl.arange(num_block_filters):
            cur_filter = intra_block_filter + cur_filter_block * filters_per_worker
            for intra_block_element in ovl.arange(num_block_elements):
                cur_elem = intra_block_element + cur_element_block * elements_per_worker
                output_index = [None, None, None]
                output_index[n_axis] = cur_batch
                output_index[e_axis] = cur_elem
                output_index[c_axis] = cur_filter
                output[output_index] = ovl.cast(
                    accum[intra_block_batch, intra_block_filter,
                          intra_block_element], output.dtype)

    return output
def graph_triangle_count(startEdge, fromVertex, toVertex):
    r"""Counts the triangles in an undirected graph.

    Notice that this method assumes that the graph is given as an adjacency list where all lists with vertex
    neighbors are sorted.

    The parallel algorithm uses the following strategy. We map one thread per edge, This is also called the
    edge-based iterator strategy.

    The idea behind the algorithm is:
        1. Go over all edges (u, v).
        2. The neighboring indices for vertex u are N(u) and for vertex v are N(v).
        3. Increment the triangle counter by | N(u) /\ N(v) | where /\ is the set intersection operator.

    We enforce an order on the vertices that avoids counting the same triangle three times, instead each triangle is
    counted once.

    The array toVertex is a flattened list of lists structure, where startEdge encodes the start indices of the
    separate lists.

    The docstring is a raw string because the "/\" notation above would
    otherwise form an invalid escape sequence.

    :param startEdge: Indices into toVertex where edges start.
    :type startEdge: list.
    :param fromVertex: The from-vertex of each edge.
    :type fromVertex: list.
    :param toVertex: The to-vertex of each edge.
    :type toVertex: list.
    :return: Counts of triangles per edge.
    """
    iEdge = ovl.position_in(toVertex.shape)[0]
    count = ovl.output(toVertex.shape, ovl.uint64)
    nTriangle = ovl.variable(0, ovl.uint64)

    # Cursor, end marker and current entry of the from-vertex neighbor list.
    # NOTE(review): the first entry is read even if the list is empty.
    iFromVertex = ovl.variable(fromVertex[iEdge], fromVertex.dtype)
    iFromEdge = ovl.variable(startEdge[iFromVertex], startEdge.dtype)
    iFromEdgeEnd = ovl.variable(startEdge[iFromVertex + 1], startEdge.dtype)
    iiFromVertex = ovl.variable(toVertex[iFromEdge], toVertex.dtype)

    # Cursor, end marker and current entry of the to-vertex neighbor list.
    iToVertex = ovl.variable(toVertex[iEdge], toVertex.dtype)
    iToEdge = ovl.variable(startEdge[iToVertex], startEdge.dtype)
    iToEdgeEnd = ovl.variable(startEdge[iToVertex + 1], startEdge.dtype)
    iiToVertex = ovl.variable(toVertex[iToEdge], toVertex.dtype)

    nMerge = iToEdgeEnd - iToEdge + iFromEdgeEnd - iFromEdge  # Maximum number of merges.

    # This construction is a work-around for simulating the function of a while loop.
    #TODO([email protected]): Replace this construct by a while loop once it is available in ovl.
    for _ in ovl.arange(nMerge):
        doMerge = ovl.logical_and(iFromEdge < iFromEdgeEnd, iToEdge < iToEdgeEnd)
        # Only from-list entries below the edge's to-vertex are considered;
        # this is the vertex order that keeps a triangle from being counted
        # more than once.
        doMerge = ovl.logical_and(doMerge, iiFromVertex < iToVertex)
        with ovl.if_(doMerge):
            # NOTE(review): after a cursor is advanced the next entry is read
            # unconditionally, which can be one past the end of that vertex's
            # list -- confirm how ovl treats such reads at the end of toVertex.
            with ovl.if_(iiFromVertex < iiToVertex):
                iFromEdge <<= iFromEdge + 1
                iiFromVertex <<= toVertex[iFromEdge]
            with ovl.elif_(iiFromVertex > iiToVertex):
                iToEdge <<= iToEdge + 1
                iiToVertex <<= toVertex[iToEdge]
            with ovl.else_():
                # Same vertex in both lists: one more triangle for this edge.
                nTriangle <<= nTriangle + 1
                iFromEdge <<= iFromEdge + 1
                iToEdge <<= iToEdge + 1
                iiFromVertex <<= toVertex[iFromEdge]
                iiToVertex <<= toVertex[iToEdge]

    #TODO([email protected]): Use a reduction function that computes a partial or complete sum.
    count[iEdge] = nTriangle  # Save the triangles for each edge.
    return count
def conv_1d(x, v, kernel_orientation='as-is', stride=1, mode='same', data_format='NCE'):
    """
    Define the operator function.

    Each worker handles a block of one batch entry, one filter and up to ten
    consecutive output elements. For every channel it loads the filter taps
    and the first filter_size inputs into local buffers, then slides over its
    output elements, accumulating input * tap products in float64.

    :param x: An input tensor of rank 3. With the default data_format its shape
        is [num_batches, num_channels, num_elements].
    :param v: A filter/kernel of rank 3. It is indexed with the same axis
        positions as x, so with the default data_format its shape is
        [num_filters, num_channels, kernel_size].
    :param kernel_orientation: The orientation of the kernel to use: 'as-is' or 'flipped'.
        This language is used rather than 'convolution' or 'cross-correlation' since the terms have become overloaded
        and ambiguous across some fields. As defined in https://en.wikipedia.org/wiki/Cross-correlation#Properties,
        'as-is' yields the cross-correlation and 'flipped' yields the convolution.
    :param stride: kernel stride to use. NOTE(review): the value is validated
        below but not used anywhere else in this function.
    :param mode: border mode: 'same', 'valid', or 'full'
    :param data_format: order of the dimensions in the input: 'NCE', 'NEC' etc.
    :return: an output tensor of x's dtype. Its element axis has length
        num_elements for 'same', num_elements - kernel_size + 1 for 'valid' and
        num_elements + kernel_size - 1 for 'full'; the channel axis holds
        num_filters entries.
    :raises ValueError: for an unknown kernel_orientation or mode, a channel
        count mismatch between x and v, a stride that is not an int in
        [1, num_elements], or (modes 'same' and 'valid') a filter longer than
        the input.
    :raises AssertionError: if x or v is not rank 3 or data_format is not a
        permutation of 'N', 'C' and 'E'.
    """
    if kernel_orientation != 'as-is' and kernel_orientation != 'flipped':
        raise ValueError("kernel_orientation must be 'as-is' or 'flipped'")

    # resolve data layout based on data_format input
    assert x.rank == 3
    assert len(data_format) == 3
    assert data_format.count('N') == 1
    assert data_format.count('C') == 1
    assert data_format.count('E') == 1
    n_axis = data_format.find('N')
    c_axis = data_format.find('C')
    e_axis = data_format.find('E')
    num_elements = x.shape[e_axis]
    num_channels = x.shape[c_axis]
    num_batches = x.shape[n_axis]

    # the filter is read with the same axis positions as the input
    assert v.rank == 3
    if num_channels != v.shape[c_axis]:
        raise ValueError('Channel axis size of input must match that of the filter.')
    num_filters = v.shape[n_axis]
    filter_size = v.shape[e_axis]

    # number of filter taps to the left and right of the center tap
    left_apron = filter_size // 2
    right_apron = filter_size - left_apron - 1

    if not isinstance(stride, int) or stride < 1 or stride > num_elements:
        raise ValueError('Stride must be a positive integer')

    # starting_element is the input index aligned with the first filter tap for
    # output element 0; ending_element - starting_element is the output length
    if mode == 'same':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ', cannot be larger than number of elements, ' + str(num_elements))
        starting_element = -left_apron
        ending_element = num_elements - left_apron
    elif mode == 'valid':
        if filter_size > num_elements:
            raise ValueError('filter size, ' + str(filter_size) +
                             ', cannot be larger than number of elements, ' + str(num_elements))
        starting_element = 0
        ending_element = num_elements - (left_apron + right_apron)
    elif mode == 'full':
        starting_element = -(filter_size - 1)
        ending_element = num_elements
    else:
        raise ValueError("mode must be 'same', 'valid', or 'full'.")

    output_elements = (ending_element - starting_element)
    output_shape = [0, 0, 0]
    output_shape[n_axis] = num_batches
    output_shape[c_axis] = num_filters
    output_shape[e_axis] = output_elements
    output = ovl.output(output_shape, x.dtype)

    # split the work into blocks; each *_workers count is rounded up so that a
    # partial last block is still covered
    filters_per_worker = 1
    filter_workers, filter_remainder = divmod(num_filters, filters_per_worker)
    if filter_remainder > 0:
        filter_workers += 1
    batches_per_worker = 1
    batch_workers, batch_remainder = divmod(num_batches, batches_per_worker)
    if batch_remainder > 0:
        batch_workers += 1
    elements_per_worker = 10
    element_workers, element_remainder = divmod(output_elements, elements_per_worker)
    if element_remainder > 0:
        element_workers += 1
    workgroup_shape = [batch_workers, filter_workers, element_workers]
    ovl.logger.debug(u' workgroup_shape: ' + str(workgroup_shape))

    pos = ovl.position_in(workgroup_shape)
    cur_batch_block = pos[0]
    cur_filter_block = pos[1]
    cur_element_block = pos[2]

    # the last worker along each axis only processes the remainder, if any
    num_block_batches = ovl.variable(batches_per_worker, ovl.uint32)
    if batch_remainder > 0:
        with ovl.if_(cur_batch_block == batch_workers-1):
            num_block_batches <<= batch_remainder
    num_block_filters = ovl.variable(filters_per_worker, ovl.uint32)
    if filter_remainder > 0:
        with ovl.if_(cur_filter_block == filter_workers-1):
            num_block_filters <<= filter_remainder
    num_block_elements = ovl.variable(elements_per_worker, ovl.uint32)
    if element_remainder > 0:
        with ovl.if_(cur_element_block == element_workers-1):
            num_block_elements <<= element_remainder

    # per-worker buffers. The original size notes (4*4, 4*10, 4*10) do not
    # match the block sizes set above (1, 1 and 10) -- presumably stale.
    accum = ovl.zeros((batches_per_worker, filters_per_worker, elements_per_worker), ovl.float64)  #4*4
    filter_block = ovl.zeros((filters_per_worker, filter_size), v.dtype)  #4*10
    input_block = ovl.zeros((batches_per_worker, filter_size), x.dtype)  #4*10

    for cur_channel in ovl.arange(num_channels):

        # load all filters for this channel
        for intra_block_filter in ovl.arange(filters_per_worker):
            for f_pos in ovl.arange(filter_size):
                filter_index = [None, None, None]
                filter_index[c_axis] = cur_channel
                filter_index[n_axis] = ovl.cast(intra_block_filter, ovl.uint32) + cur_filter_block * filters_per_worker
                if kernel_orientation == 'as-is':
                    filter_index[e_axis] = f_pos
                elif kernel_orientation == 'flipped':
                    # read the taps back to front
                    filter_index[e_axis] = filter_size - f_pos - 1
                else:
                    # not reachable: the orientation was validated on entry
                    raise ValueError("kernel_orientation must be 'as-is' or 'flipped'")
                filter_block[intra_block_filter, f_pos] = v[filter_index]

        # load initial inputs for this channel
        # buffer_head marks the oldest slot of input_block, which is used as a
        # circular buffer of filter_size inputs
        buffer_head = ovl.variable(0, ovl.uint32)
        for intra_block_batch in ovl.arange(num_block_batches):
            cur_batch = intra_block_batch + cur_batch_block*batches_per_worker
            for f_pos in ovl.arange(filter_size):
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch
                # NOTE(review): starting_element can be negative while the other
                # terms are cast to uint64; whether the ">= 0" test below works
                # as intended depends on ovl's type promotion -- confirm.
                x_elem_index = starting_element + ovl.cast(cur_element_block * elements_per_worker, ovl.uint64) + ovl.cast(f_pos, ovl.uint64)
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0, x_elem_index < num_elements)
                # positions outside the input are treated as zeros
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, f_pos] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, f_pos] = 0

        for intra_block_element in ovl.arange(num_block_elements):
            cur_elem = intra_block_element + cur_element_block*elements_per_worker
            for intra_block_batch in ovl.arange(num_block_batches):
                cur_batch = intra_block_batch + cur_batch_block*batches_per_worker
                for intra_block_filter in ovl.arange(num_block_filters):
                    for f_pos in ovl.arange(filter_size):
                        # tap f_pos pairs with the f_pos-th oldest buffered input
                        x_pos = (buffer_head + ovl.cast(f_pos, ovl.uint32)) % filter_size
                        cur_x = ovl.cast(input_block[intra_block_batch, x_pos], ovl.float64)
                        cur_v = ovl.cast(filter_block[intra_block_filter, f_pos], ovl.float64)
                        accum[intra_block_batch, intra_block_filter, intra_block_element] = \
                            accum[intra_block_batch, intra_block_filter, intra_block_element] + cur_x * cur_v

                # load new element
                # the next input overwrites the oldest slot of the buffer
                x_index = [None, None, None]
                x_index[c_axis] = cur_channel
                x_index[n_axis] = cur_batch
                x_elem_index = starting_element + cur_elem + filter_size
                x_index[e_axis] = x_elem_index
                index_in_bounds = ovl.logical_and(x_elem_index >= 0, x_elem_index < num_elements)
                with ovl.if_(index_in_bounds):
                    input_block[intra_block_batch, buffer_head] = x[x_index]
                with ovl.else_():
                    input_block[intra_block_batch, buffer_head] = 0

            # advance the circular buffer by one input
            buffer_head <<= (buffer_head + 1) % filter_size

    # write the accumulated sums, cast back to the output dtype
    for intra_block_batch in ovl.arange(num_block_batches):
        cur_batch = intra_block_batch + cur_batch_block*batches_per_worker
        for intra_block_filter in ovl.arange(num_block_filters):
            cur_filter = intra_block_filter + cur_filter_block*filters_per_worker
            for intra_block_element in ovl.arange(num_block_elements):
                cur_elem = intra_block_element + cur_element_block*elements_per_worker
                output_index = [None, None, None]
                output_index[n_axis] = cur_batch
                output_index[e_axis] = cur_elem
                output_index[c_axis] = cur_filter
                output[output_index] = ovl.cast(accum[intra_block_batch, intra_block_filter, intra_block_element], output.dtype)

    return output
def triangles_op(startEdge, fromVertex, toVertex):
    r"""Counts the triangles in an undirected graph.

    Notice that this method assumes that the graph is given as an adjacency list where all lists with vertex
    neighbors are sorted.

    The parallel algorithm uses the following strategy. We map one thread per edge, This is also called the
    edge-based iterator strategy.

    The idea behind the algorithm is:
        1. Go over all edges (u, v).
        2. The neighboring indices for vertex u are N(u) and for vertex v are N(v).
        3. Increment the triangle counter by | N(u) /\ N(v) | where /\ is the set intersection operator.

    We enforce an order on the vertices that avoids counting the same triangle three times, instead each triangle is
    counted once.

    The array toVertex is a flattened list of lists structure, where startEdge encodes the start indices of the
    separate lists.

    The docstring is a raw string because the "/\" notation above would
    otherwise form an invalid escape sequence.

    :param startEdge: Indices into toVertex where edges start.
    :type startEdge: list.
    :param fromVertex: The from-vertex of each edge.
    :type fromVertex: list.
    :param toVertex: The to-vertex of each edge.
    :type toVertex: list.
    :return: Counts of triangles per edge.
    """
    iEdge = ovl.position_in(toVertex.shape)[0]
    count = ovl.output(toVertex.shape, ovl.uint64)
    nTriangle = ovl.variable(0, ovl.uint64)

    # Cursor, end marker and current entry of the from-vertex neighbor list.
    # NOTE(review): the first entry is read even if the list is empty.
    iFromVertex = ovl.variable(fromVertex[iEdge], fromVertex.dtype)
    iFromEdge = ovl.variable(startEdge[iFromVertex], startEdge.dtype)
    iFromEdgeEnd = ovl.variable(startEdge[iFromVertex + 1], startEdge.dtype)
    iiFromVertex = ovl.variable(toVertex[iFromEdge], toVertex.dtype)

    # Cursor, end marker and current entry of the to-vertex neighbor list.
    iToVertex = ovl.variable(toVertex[iEdge], toVertex.dtype)
    iToEdge = ovl.variable(startEdge[iToVertex], startEdge.dtype)
    iToEdgeEnd = ovl.variable(startEdge[iToVertex + 1], startEdge.dtype)
    iiToVertex = ovl.variable(toVertex[iToEdge], toVertex.dtype)

    nMerge = iToEdgeEnd - iToEdge + iFromEdgeEnd - iFromEdge  # Maximum number of merges.

    # This construction is a work-around for simulating the function of a while loop.
    #TODO([email protected]): Replace this construct by a while loop once it is available in ovl.
    for _ in ovl.arange(nMerge):
        doMerge = ovl.logical_and(iFromEdge < iFromEdgeEnd, iToEdge < iToEdgeEnd)
        # Only from-list entries below the edge's to-vertex are considered;
        # this is the vertex order that keeps a triangle from being counted
        # more than once.
        doMerge = ovl.logical_and(doMerge, iiFromVertex < iToVertex)
        with ovl.if_(doMerge):
            # NOTE(review): after a cursor is advanced the next entry is read
            # unconditionally, which can be one past the end of that vertex's
            # list -- confirm how ovl treats such reads at the end of toVertex.
            with ovl.if_(iiFromVertex < iiToVertex):
                iFromEdge <<= iFromEdge + 1
                iiFromVertex <<= toVertex[iFromEdge]
            with ovl.elif_(iiFromVertex > iiToVertex):
                iToEdge <<= iToEdge + 1
                iiToVertex <<= toVertex[iToEdge]
            with ovl.else_():
                # Same vertex in both lists: one more triangle for this edge.
                nTriangle <<= nTriangle + 1
                iFromEdge <<= iFromEdge + 1
                iToEdge <<= iToEdge + 1
                iiFromVertex <<= toVertex[iFromEdge]
                iiToVertex <<= toVertex[iToEdge]

    #TODO([email protected]): Use a reduction function that computes a partial or complete sum.
    count[iEdge] = nTriangle  # Save the triangles for each edge.
    return count