def __init__(self, graph):
    """Convert ``graph`` into node records, node degrees and 8x8 edge octiles.

    Reads ``graph.nodes`` and ``graph.edges`` (tables exposing ``columns``,
    ``assign`` and, for edges, ``iterrows``; the edge table carries an
    ``'!ij'`` column of index pairs and optionally a ``'!w'`` weight column).

    Sets ``n_node``, ``node_type``, ``node``, ``degree``, ``weighted``,
    ``edge_type``, ``octile_list``, ``n_octile`` and ``octile_hdr``.
    """
    nodes, edges = graph.nodes, graph.edges
    self.n_node = len(nodes)

    # C++ interop expects at least one label field, so attach a dummy
    # boolean column when the table carries no label at all.
    if len(nodes.columns) == 0:
        nodes = nodes.assign(labeled=lambda _: False)
    if len(edges.columns) == 1:
        assert(edges.columns[0] == '!ij')
        edges = edges.assign(labeled=lambda _: False)

    # Node record type and the packed node array.
    node_type = rowtype(nodes)
    self.node_type = node_type
    node_columns = nodes[list(node_type.names)]
    node_records = node_columns.to_records(index=False).astype(node_type)
    self.node = umlike(node_records)

    # Edge record type and node degrees; a '!w' column marks a weighted
    # graph, whose edge record is (weight, label) instead of the bare label.
    degree = umzeros(self.padded_size, dtype=np.float32)
    self.degree = degree
    label_type = rowtype(edges, exclude=['!ij', '!w'])
    weighted = '!w' in edges.columns
    self.weighted = weighted
    if weighted:
        edge_type = np.dtype(
            [('weight', np.float32), ('label', label_type)], align=True)
        self.edge_type = edge_type
        for (src, dst), weight in zip(edges['!ij'], edges['!w']):
            degree[src] += weight
            degree[dst] += weight
    else:
        edge_type = label_type
        self.edge_type = edge_type
        for src, dst in edges['!ij']:
            degree[src] += 1.0
            degree[dst] += 1.0

    # Upper-left corners of every 8x8 tile that holds at least one edge,
    # together with the mirrored (transposed) corners.
    corners = np.unique(
        [(src - src % 8, dst - dst % 8) for src, dst in edges['!ij']],
        axis=0)
    corners = np.unique(np.vstack((corners, corners[:, ::-1])), axis=0)

    # Each tile is a [non-zero bitmask, 64-slot edge buffer] pair.
    tiles = {}
    for upper, left in corners:
        tiles[(upper, left)] = [np.uint64(), umzeros(64, edge_type)]

    for _, row in edges.iterrows():
        src, dst = row['!ij']
        if weighted:
            edge = (row['!w'],
                    tuple(row[key] for key in edge_type['label'].names))
        else:
            edge = tuple(row[key] for key in edge_type.names)
        r, c = src % 8, dst % 8
        upper, left = src - r, dst - c

        # Store the edge in its own tile ...
        tile = tiles[(upper, left)]
        tile[0] |= np.uint64(1 << (r * 8 + c))
        tile[1][r + c * 8] = edge
        # ... and again, transposed, in the mirrored tile.
        mirror = tiles[(left, upper)]
        mirror[0] |= np.uint64(1 << (c * 8 + r))
        mirror[1][c + r * 8] = edge

    # Create the edge octiles on the GPU.
    octiles = []
    for (upper, left), (nzmask, elements) in tiles.items():
        octiles.append(Octile(upper, left, nzmask, elements))
    self.octile_list = octiles
    self.n_octile = len(self.octile_list)

    # Gather the octile headers into one contiguous buffer.
    headers = [octile.state for octile in self.octile_list]
    self.octile_hdr = umlike(np.array(headers, Octile.dtype))
def __init__(self, graph):
    """Convert ``graph`` into array-of-struct node/edge buffers and octiles.

    Works on shallow copies of ``graph.nodes`` and ``graph.edges``. The node
    table must carry an ``'!i'`` column; the edge table must carry ``'!i'``
    and ``'!j'`` and may carry a ``'!w'`` weight column.

    Sets ``nodes``, ``edges``, ``n_node``, ``node_t``, ``nodes_aos``,
    ``degree``, ``weighted``, ``edge_t``, ``edges_aos``, ``n_octile`` and
    ``octiles``.

    Raises:
        TypeError: if a non-scalar attribute column is not list-like
            (list, tuple or ndarray), or if its elements are not scalars.
    """
    self.nodes = nodes = graph.nodes.copy(deep=False)
    self.edges = edges = graph.edges.copy(deep=False)
    self.n_node = len(nodes)

    # Substitute columns corresponding to object-type node/edge attributes
    # with their GPU counterparts.
    # NOTE(review): np.issctype was removed in NumPy 2.0; it is kept here
    # because its exact semantics on this table's dtypes cannot be
    # confirmed from this block alone.
    for df in [nodes, edges]:
        for key in list(df.columns):
            if np.issctype(df[key].dtype):
                continue
            if not issubclass(df[key].concrete_type,
                              (list, tuple, np.ndarray)):
                raise TypeError(
                    f'Unsupported non-scalar attribute {key} '
                    f'of type {df[key].concrete_type}')

            inner_type = common_min_type.of_types([
                x.dtype if isinstance(x, np.ndarray)
                else common_min_type.of_values(x)
                for x in df[key]
            ])
            if not np.issctype(inner_type):
                raise TypeError(
                    f'Expect scalar elements in tuple or list '
                    f'attributes, got {inner_type} for attribute {key}.')

            # Concatenate all rows into one flat buffer, then hand each row
            # a view onto its own [head, head + size) slice of it.
            buffer = memoryview(
                umlike(
                    np.fromiter(it.chain.from_iterable(df[key]),
                                dtype=inner_type)))
            # `int`/`object` are the builtins that the removed aliases
            # `np.int`/`np.object` referred to.
            size = np.fromiter(map(len, df[key]), dtype=int)
            head = np.cumsum(size) - size
            # mangle key with type information
            tag = '${key}::frozen_array::{dtype}'.format(
                key=key, dtype=inner_type.str)
            data = np.empty_like(df[key], dtype=object)
            for row, (h, s) in enumerate(zip(head, size)):
                data[row] = np.frombuffer(
                    buffer[h:h + s], dtype=inner_type).view(
                        self.CustomType.FrozenArray)
            df[tag] = data
            df.drop([key], inplace=True)

    # Add a phantom label if none exists to facilitate C++ interop.
    assert (len(nodes.columns) >= 1)
    if len(nodes.columns) == 1:
        nodes['labeled'] = np.zeros(len(nodes), np.bool_)

    assert (len(edges.columns) >= 2)
    if len(edges.columns) == 2:
        assert ('!i' in edges and '!j' in edges)
        edges['labeled'] = np.zeros(len(edges), np.bool_)

    # Determine the node type and scatter node records by their '!i' index.
    node_index = nodes['!i']
    nodes.drop(['!i'], inplace=True)
    self.node_t = node_t = nodes.rowtype()
    self.nodes_aos = umempty(len(nodes), dtype=node_t)
    self.nodes_aos[node_index] = list(nodes.iterstates())

    # Determine whether the graph is weighted, determine the edge type, and
    # compute node degrees.
    self.degree = degree = umzeros(self.n_node, dtype=np.float32)
    edge_t = edges.drop(['!i', '!j', '!w']).rowtype()
    self_loops = edges[edges['!i'] == edges['!j']]
    nnz = len(edges)
    if '!w' in edges:  # weighted graph
        self.weighted = True
        np.add.at(degree, edges['!i'], edges['!w'])
        np.add.at(degree, edges['!j'], edges['!w'])
        # a self loop was added at both endpoints above; count it once
        np.subtract.at(degree, self_loops['!i'], self_loops['!w'])

        if edge_t.itemsize != 0:
            labels = list(edges[edge_t.names].iterstates())
        else:
            labels = [None] * len(edges)
        edge_t = np.dtype([('weight', np.float32), ('label', edge_t)],
                          align=True)
        edges_aos = np.fromiter(zip(edges['!w'], labels), dtype=edge_t,
                                count=nnz)
    else:
        self.weighted = False
        np.add.at(degree, edges['!i'], 1.0)
        np.add.at(degree, edges['!j'], 1.0)
        # a self loop was added at both endpoints above; count it once
        np.subtract.at(degree, self_loops['!i'], 1.0)
        edges_aos = np.fromiter(edges[edge_t.names].iterstates(),
                                dtype=edge_t, count=nnz)
    self.edge_t = edge_t
    # isolated nodes get a degree of 1.0 instead of 0
    degree[degree == 0] = 1.0

    # Collect non-zero edge octiles.
    indices = np.empty((4, nnz * 2), dtype=np.uint32, order='C')
    i, j, up, lf = indices
    i[:nnz] = edges['!i']
    j[:nnz] = edges['!j']
    # replicate & swap i and j for the lower triangular part
    i[nnz:], j[nnz:] = j[:nnz], i[:nnz]
    # get upper left corner of owner octiles
    up[:] = i - i % 8
    lf[:] = j - j % 8

    # np.unique implies lexical sort
    (lf, up, j, i), perm = np.unique(indices[-1::-1, :], axis=1,
                                     return_index=True)

    self.edges_aos = umempty(len(i), edge_t)
    self.edges_aos[:] = edges_aos[perm % nnz]  # mod nnz due to symmetry

    # An octile starts wherever the (up, lf) corner changes in sorted order.
    diff = np.empty_like(up)
    diff[1:] = (up[:-1] != up[1:]) | (lf[:-1] != lf[1:])
    diff[:1] = True
    oct_offset = np.flatnonzero(diff)
    self.n_octile = len(oct_offset)

    # Per-octile bitmask of occupied slots, plus the transposed layout.
    nzmasks = np.bitwise_or.reduceat(
        1 << (i - up + (j - lf) * 8).astype(np.uint64), oct_offset)
    nzmasks_r = np.bitwise_or.reduceat(
        1 << (j - lf + (i - up) * 8).astype(np.uint64), oct_offset)

    self.octiles = octiles = umempty(self.n_octile, self.Octile.dtype)
    # First field: int(edges_aos.base) plus the byte offset of the octile's
    # first edge -- presumably the address of that edge; confirm against
    # the Octile dtype.
    octiles[:] = list(
        zip(
            int(self.edges_aos.base) + oct_offset * edge_t.itemsize,
            nzmasks, nzmasks_r, up[oct_offset], lf[oct_offset]))
def __call__(self, graphs, diags, node_kernel, edge_kernel, p, q, eps, ftol,
             gtol, jobs, starts, gramian, active, gradient, nX, nY, nJ,
             traits, timer):
    """Generate, configure and launch the 'graph_maximin_distance' kernel.

    Stages, each bracketed by ``timer.tic``/``timer.toc``: register the
    graphs, allocate the global job counter, render the kernel source,
    fetch the compiled function, compute the launch configuration and copy
    micro-kernel parameters, then run the kernel and synchronize.

    Args:
        graphs: sized iterable of graphs; each is registered through
            ``self._register_graph`` and must expose ``nodes``.
        diags, jobs, starts, gramian, active: forwarded to the kernel
            unchanged (``len(jobs)`` is also passed as a uint32).
        node_kernel, edge_kernel: micro kernels; ``edge_kernel`` is wrapped
            in a ``TensorProduct`` with a ``Product`` weight kernel when the
            graphs are weighted.
        p: starting probability; its ``state``/``dtype`` are copied to the
            'p_start' global.
        q: passed to the kernel twice as float32 (the second is a
            placeholder for q0).
        eps, ftol, gtol: passed to the kernel as float32; ``eps`` is also
            used as ``diff_eps`` when packing kernel states.
        gradient: output buffer, or ``None`` to pass a null pointer.
        nX, nY, nJ: passed to the kernel as uint32.
        traits: rendering traits; ``traits.eval_gradient is True`` enables
            the theta-grid parameter upload.
        timer: object providing ``tic(label)`` and ``toc(label)``.

    Returns:
        None. ``self.source`` is set to the rendered kernel source.
    """
    # ---- transfer graphs to GPU ----
    timer.tic('transferring graphs to GPU')
    previous = None
    graph_states = umempty(len(graphs), dtype=OctileGraph.dtype)
    for idx, graph in enumerate(graphs):
        current, state = self._register_graph(graph)
        if idx > 0:
            # every graph must agree with the one registered before it
            self._assert_homogeneous(previous, current)
        previous = current
        graph_states[idx] = state
    weighted, node_t, edge_t = (previous.weighted, previous.node_t,
                                previous.edge_t)
    timer.toc('transferring graphs to GPU')

    # ---- allocate global job counter ----
    timer.tic('allocate global job counter')
    job_counter = umzeros(1, np.uint32)
    timer.toc('allocate global job counter')

    # ---- code generation ----
    timer.tic('code generation')
    if weighted:
        edge_kernel = TensorProduct(weight=Product(), label=edge_kernel)
    use_theta_grid = traits.eval_gradient is True
    node_src = self.gencode_kernel(node_kernel, 'node_kernel')
    edge_src = self.gencode_kernel(edge_kernel, 'edge_kernel')
    p_src = self.gencode_probability(p, 'p_start')
    with self.template.context(traits=traits) as template:
        render_args = dict(
            node_kernel=node_src,
            edge_kernel=edge_src,
            p_start=p_src,
            node_t=decltype(node_t),
            edge_t=decltype(edge_t),
        )
        self.source = template.render(**render_args)
    timer.toc('code generation')

    # ---- JIT ----
    timer.tic('JIT')
    kernel = self.module.get_function('graph_maximin_distance')
    timer.toc('JIT')

    # ---- launch configuration ----
    timer.tic('calculating launch configuration')
    n_blocks = self.device.MULTIPROCESSOR_COUNT * self.block_per_sm
    # second element of the get_global() result is used as the per-warp
    # shared memory byte count
    shmem_bytes_per_warp = self.module.get_global('shmem_bytes_per_warp')[1]
    shmem_bytes_per_block = (
        shmem_bytes_per_warp * self.block_size // self.device.WARP_SIZE
    )
    max_graph_size = np.max([len(g.nodes) for g in graphs])
    scratch_pcg = self.allocate_pcg_scratch(n_blocks, max_graph_size)

    # ---- copy micro kernel parameters to GPU ----
    kernel_names = ('node_kernel', 'edge_kernel')
    for name, uker in zip(kernel_names, (node_kernel, edge_kernel)):
        packed = self.pack_state(uker, diff_grid=use_theta_grid,
                                 diff_eps=eps)
        states = np.array(packed, dtype=uker.dtype)

        p_uker, _ = self.module.get_global(name)
        cuda.memcpy_htod(p_uker, states[:1])

        if use_theta_grid:
            p_diff_grid, _ = self.module.get_global(f'{name}_diff_grid')
            p_flat_theta, _ = self.module.get_global(f'{name}_flat_theta')
            cuda.memcpy_htod(p_diff_grid, states[1:])
            flat_theta = np.fromiter(flatten(uker.theta), dtype=np.float32)
            cuda.memcpy_htod(p_flat_theta, flat_theta)

    p_p_start, _ = self.module.get_global('p_start')
    cuda.memcpy_htod(p_p_start, np.array([p.state], dtype=p.dtype))
    timer.toc('calculating launch configuration')

    # ---- GPU kernel execution ----
    timer.tic('GPU kernel execution')
    if gradient is not None:
        gradient_arg = gradient
    else:
        gradient_arg = np.uintp(0)  # null pointer when no gradient buffer
    kernel_args = (
        graph_states,
        diags,
        scratch_pcg,
        jobs,
        starts,
        gramian,
        active,
        gradient_arg,
        job_counter,
        np.uint32(len(jobs)),
        np.uint32(nX),
        np.uint32(nY),
        np.uint32(nJ),
        np.float32(q),
        np.float32(q),  # placeholder for q0
        np.float32(eps),
        np.float32(ftol),
        np.float32(gtol),
    )
    kernel(
        *kernel_args,
        grid=(n_blocks, 1, 1),
        block=(self.block_size, 1, 1),
        shared=shmem_bytes_per_block,
    )
    self.ctx.synchronize()
    timer.toc('GPU kernel execution')
def zeros(size, dtype=np.float32):
    """Forward to ``umzeros`` with ``size`` and ``dtype`` (float32 by default).

    Returns whatever ``umzeros`` returns.
    """
    buffer = umzeros(size, dtype=dtype)
    return buffer