def add_cpu(node: Node, alloc_map: Dict[str, np.ndarray], config: Config) -> Callable[[], None]:
    """Add kernel (CPU version).

    Builds a kernel that performs element-wise vector addition on CPU:
    Z = X + Y.

    Args:
        node: A source node with operator ``Add``; inputs "A" and "B",
            output "C".
        alloc_map: Dictionary mapping names to their allocations.
        config: Runtime configuration (unused here, kept for a uniform
            kernel-builder signature).

    Returns:
        A zero-argument kernel that computes Z = X + Y in place.

    Raises:
        ValueError: If the node's operator is not ``Add``.
    """
    if node.get_operator() != "Add":
        # Message must name the operator we actually check for ("Add").
        raise ValueError(
            "Node operator should be Add, not {}".format(node.get_operator()))

    x_io = node.inputs["A"]
    y_io = node.inputs["B"]
    z_io = node.outputs["C"]

    # Resolve the backing buffers once at build time; the returned kernel
    # closes over them so no lookups happen per invocation.
    x = x_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)
    z = z_io.get_data(alloc_map)

    def fn():
        # Write directly into z; avoids the temporary array that
        # np.copyto(z, x + y) would allocate on every call.
        np.add(x, y, out=z)

    return fn
def build_kernel(node: Node, alloc_map: Dict[str, np.ndarray], config: Config) -> Callable[[], None]:
    """Build the execution function for a graph node on its assigned device.

    Selects the kernel builder matching the node's operator and
    ``node.device_type`` ("cpu" or otherwise, treated as gpu), replacing a
    long if/elif chain with dispatch tables.

    Args:
        node: Graph node whose operator and device type select the kernel.
        alloc_map: Dictionary mapping names to their allocations.
        config: Runtime configuration forwarded to the kernel builder.

    Returns:
        The built kernel callable, or ``None`` for graph-head nodes.

    Raises:
        NotImplementedError: If the operator has no GPU kernel (e.g. PAD)
            but the node is placed on a non-cpu device.
        ValueError: If the operator is not supported at all.
    """
    oper = node.get_operator()

    # Graph-head nodes carry no computation.
    if oper == ops.O2P_GRAPH_HEAD:
        return None

    # Operators that always run on CPU, regardless of node.device_type.
    cpu_only = {
        ops.O2P_LOAD: kernels.load_cpu,
        ops.O2P_STORE: kernels.store_cpu,
        ops.O2P_COPY: kernels.copy,
    }
    if oper in cpu_only:
        return cpu_only[oper](node, alloc_map, config)

    # (cpu_builder, gpu_builder) pairs; a gpu entry of None means the
    # operator has no GPU implementation.
    device_kernels = {
        ops.ADD: (kernels.add_cpu, kernels.add_gpu),
        ops.CONV: (kernels.conv_cpu, kernels.conv_gpu),
        ops.BATCH_NORM: (kernels.batchnorm_cpu, kernels.batchnorm_gpu),
        ops.RELU: (kernels.relu_cpu, kernels.relu_gpu),
        ops.MAXPOOL: (kernels.maxpool_cpu, kernels.maxpool_gpu),
        ops.GLOBALAVERAGEPOOL: (kernels.globalAveragePool_cpu,
                                kernels.globalAveragePool_gpu),
        ops.AVERAGE_POOL: (kernels.average_pool_cpu, kernels.average_pool_gpu),
        ops.PAD: (kernels.pad_cpu, None),
        ops.FLATTEN: (kernels.flatten_cpu, kernels.flatten_gpu),
        ops.RESHAPE: (kernels.reshape_cpu, kernels.reshape_gpu),
        ops.GEMM: (kernels.gemm_cpu, kernels.gemm_gpu),
        ops.DROPOUT: (kernels.dropout_cpu, kernels.dropout_gpu),
        ops.CLIP: (kernels.clip_v6_cpu, kernels.clip_v6_gpu),
        ops.REDUCE_MEAN: (kernels.reduce_mean_cpu, kernels.reduce_mean_gpu),
    }
    if oper not in device_kernels:
        raise ValueError(f"Operator {oper} not supported")

    cpu_builder, gpu_builder = device_kernels[oper]
    if node.device_type == "cpu":
        return cpu_builder(node, alloc_map, config)
    if gpu_builder is None:
        # Matches the original behavior: PAD on a non-cpu device fails.
        raise NotImplementedError()
    return gpu_builder(node, alloc_map, config)