def maxpool_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU max-pooling kernel: Y = MAXPOOL(X).

    The maximum of every kernel window is propagated to the output. The
    pooling itself is delegated to ``cupy.cudnn.pooling_forward``, which
    receives the output buffer directly.

    Args:
        node: Node providing input ``X``, output ``Y`` and the attributes
            ``kernel_shape`` (required), ``strides`` and ``pads`` (optional).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the pooling on ``node.device_id``.
    """
    x = node.inputs["X"].get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    # Same fallbacks as maxpool_cpu, so a node that omits the optional
    # attributes no longer fails while the kernel is being built.
    # Only the first entry is used: stride and padding are assumed to be
    # the same in all directions.
    stride = node.get_attr("strides", [1])[0]
    padding = node.get_attr("pads", [0])[0]
    kernel_shape = node.get_attr("kernel_shape")

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.cudnn.pooling_forward(
                x,
                y,
                (kernel_shape[0], kernel_shape[1]),
                (stride, stride),
                (padding, padding),
                cupy.cuda.cudnn.CUDNN_POOLING_MAX,
            )

    return fn
def average_pool_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU average-pooling kernel: Y = AVERAGE_POOL(X).

    Pooling is done with ``chainer.functions.average_pooling_2d`` and the
    result is copied into the output buffer.

    Args:
        node: Node providing input ``X``, output ``Y`` and the attributes
            ``kernel_shape``, ``pads`` (default ``[0]``) and ``strides``
            (default ``[1]``). Only the first pad/stride entry is used.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the pooling on ``node.device_id``.
    """
    source = node.inputs["X"].get_data(alloc_map)
    target = node.outputs["Y"].get_data(alloc_map)

    window = node.get_attr("kernel_shape")
    pad = node.get_attr("pads", [0])[0]
    step = node.get_attr("strides", [1])[0]

    def fn():
        with cupy.cuda.Device(node.device_id):
            pooled = chainer.functions.average_pooling_2d(
                source, window, stride=step, pad=pad)
            cupy.copyto(target, pooled.array)

    return fn
def gemm_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU GEMM kernel: Y = alpha * (A' @ B') + beta * C.

    ``A'`` is ``A`` transposed when ``transA`` is 1, and ``B'`` is ``B``
    transposed when ``transB`` is 1.

    Args:
        node: Node providing inputs ``A``, ``B``, ``C``, output ``Y`` and the
            optional attributes ``alpha``, ``beta`` (default 1.0) and
            ``transA``, ``transB`` (default 0).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the result into ``Y``.
    """
    a = node.inputs["A"].get_data(alloc_map)
    w = node.inputs["B"].get_data(alloc_map)
    c = node.inputs["C"].get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    alpha = node.get_attr("alpha", 1.0)
    beta = node.get_attr("beta", 1.0)
    transpose_a = node.get_attr("transA", 0) == 1
    transpose_b = node.get_attr("transB", 0) == 1

    def fn():
        lhs = chainer.functions.transpose(a) if transpose_a else a
        # linear() applies its weight argument transposed (y = x W^T + b in
        # Chainer's documentation), so B goes in unchanged when transB is
        # set and is pre-transposed otherwise.
        weight = w if transpose_b else chainer.functions.transpose(w)
        result = chainer.functions.linear(alpha * lhs, weight, b=(beta * c))
        np.copyto(y, result.array)

    return fn
def dropout_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU dropout kernel.

    Applies ``chainer.functions.dropout`` to ``data`` and copies the result
    into ``output``. When a buffer is available for the optional ``mask``
    output, the dropout mask is copied into it as well.

    Args:
        node: Node providing input ``data``, output ``output``, the optional
            output ``mask`` and the optional attribute ``ratio`` (default 0.5).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the dropout on ``node.device_id``.
    """
    data = node.inputs["data"].get_data(alloc_map)
    output = node.outputs["output"].get_data(alloc_map)
    # NOTE(review): assumes get_data() yields None when the optional mask
    # output is absent, matching how the clip_v11 kernels treat their
    # optional inputs -- confirm against InOut.get_data.
    opt_mask = node.get_output("mask").get_data(alloc_map)
    ratio = node.get_attr("ratio", 0.5)

    def fn():
        with cupy.cuda.Device(node.device_id):
            # Test for presence explicitly: the truth value of an array
            # with more than one element is ambiguous and raises.
            if opt_mask is not None:
                dropped, mask = chainer.functions.dropout(
                    data, ratio=ratio, return_mask=True)
                cupy.copyto(output, dropped.array)
                cupy.copyto(opt_mask, mask.array)
            else:
                cupy.copyto(
                    output, chainer.functions.dropout(data, ratio=ratio).array)

    return fn
def add_cpu(node: Node, alloc_map: Dict[str, np.ndarray],
            config: Config) -> Callable[[], None]:
    """Add Kernel (CPU version)

    Creates a kernel computing the element-wise sum Z = X + Y on the CPU.

    Args:
        node (node): A source node with operator `add`; its inputs ``A`` and
            ``B`` are summed into its output ``C``
        alloc_map (dict): The dictionary of names->allocations
        config (Config): Runtime configuration (not used by this kernel)

    Returns:
        fn: A new kernel Z = X + Y

    Raises:
        ValueError: If the node's operator is not "Add"
    """
    operator = node.get_operator()
    if operator != "Add":
        raise ValueError(
            "Node operator should be add, not {}".format(node.get_operator()))

    lhs = node.inputs["A"].get_data(alloc_map)
    rhs = node.inputs["B"].get_data(alloc_map)
    total = node.outputs["C"].get_data(alloc_map)

    def fn():
        np.copyto(total, lhs + rhs)

    return fn
def batchnorm_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU batch-normalization kernel.

    Function: Y = gamma * x_hat + beta
    where x_hat = (x - mean) / sqrt(var + epsilon), with ``mean`` and ``var``
    taken from the node inputs. The computation is delegated to
    ``cupy.cudnn.batch_normalization_forward_inference``.

    Args:
        node: Node providing inputs ``X``, ``scale``, ``B``, ``mean``, ``var``,
            output ``Y`` and the optional attribute ``epsilon``.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the normalization on
        ``node.device_id``.
    """
    x = node.inputs["X"].get_data(alloc_map)
    gamma = node.inputs["scale"].get_data(alloc_map)
    beta = node.inputs["B"].get_data(alloc_map)
    mean = node.inputs["mean"].get_data(alloc_map)
    var = node.inputs["var"].get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    # Same default as batchnorm_cpu, so a node without an explicit epsilon
    # no longer fails on the comparison below. The lower bound of 1e-05 is
    # modeled off of the chainer support.
    epsilon = max(node.get_attr("epsilon", 1e-05), 1e-05)
    # The "momentum" and "spatial" attributes are not read: this kernel
    # never used their values.

    def fn():
        with cupy.cuda.Device(node.device_id):
            normalized = cupy.cudnn.batch_normalization_forward_inference(
                x, gamma, beta, mean, var, epsilon, True,
                cupy.cuda.cudnn.CUDNN_BATCHNORM_SPATIAL)
            # HACK: adding zero with out=y instead of copying avoids the
            # device to device copy getting stuck on the default stream.
            cupy.add(normalized, 0, out=y)

    return fn
def conv_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU convolution kernel: Y = X CONV W (+ B).

    The convolution is delegated to ``cupy.cudnn.convolution_forward``,
    which receives the output buffer directly.

    Args:
        node: Node providing inputs ``X``, ``W``, optional input ``B``,
            output ``Y`` and the optional attributes ``strides``, ``pads``,
            ``dilations`` and ``group``.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the convolution on
        ``node.device_id``.
    """
    x = node.inputs["X"].get_data(alloc_map)
    w = node.inputs["W"].get_data(alloc_map)
    b = node.get_input("B").get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    # Same fallbacks as conv_cpu, so a node that omits the optional
    # attributes no longer fails while the kernel is being built.
    # Only the first entry of each attribute is used: stride, padding and
    # dilation are assumed to be the same in all directions.
    stride = (node.get_attr("strides", [1])[0],) * 2
    padding = (node.get_attr("pads", [0])[0],) * 2
    dilations = (node.get_attr("dilations", [1])[0],) * 2
    groups = node.get_attr("group", 1)

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.cudnn.convolution_forward(
                x, w, b, y, padding, stride, dilations, groups,
                auto_tune=False, tensor_core='auto')

    return fn
def clip_v6_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU clip kernel whose bounds come from node attributes.

    Args:
        node: Node providing input ``input``, output ``output`` and the
            optional attributes ``min`` (default -3.402823e38) and ``max``
            (default 3.402823e38).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the clipped input into
        ``output``.
    """
    source_io = node.inputs["input"]
    lower = node.get_attr("min", -3.402823e38)
    upper = node.get_attr("max", 3.402823e38)
    target_io = node.outputs["output"]

    source = source_io.get_data(alloc_map)
    target = target_io.get_data(alloc_map)

    def fn():
        clipped = chainer.functions.clip(source, lower, upper)
        np.copyto(target, clipped.array)

    return fn
def batchnorm_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU batch-normalization kernel.

    Function: Y = gamma * x_hat + beta
    where x_hat = (x - mean) / sqrt(var + epsilon), with ``mean`` and ``var``
    taken from the node inputs. The computation is delegated to
    ``chainer.functions.fixed_batch_normalization``.

    Args:
        node: Node providing inputs ``X``, ``scale``, ``B``, ``mean``, ``var``,
            output ``Y`` and the optional attribute ``epsilon``
            (default 1e-05).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the normalized input into ``Y``.
    """
    x = node.inputs["X"].get_data(alloc_map)
    gamma = node.inputs["scale"].get_data(alloc_map)
    beta = node.inputs["B"].get_data(alloc_map)
    mean = node.inputs["mean"].get_data(alloc_map)
    var = node.inputs["var"].get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    # Values below 1e-05 are raised to 1e-05. The "momentum" and "spatial"
    # attributes are not read: this kernel never used their values.
    epsilon = max(node.get_attr("epsilon", 1e-05), 1e-05)

    def fn():
        normalized = chainer.functions.fixed_batch_normalization(
            x, gamma, beta, mean, var, eps=epsilon)
        np.copyto(y, normalized.array)

    return fn
def reduce_mean_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU reduce-mean kernel.

    Computes the mean of ``data`` over the configured axes with ``np.mean``
    and stores it directly in the ``reduced`` buffer.

    Args:
        node: Node providing input ``data``, output ``reduced`` and the
            attributes ``axes`` and ``keepdims`` (default 1; reduced
            dimensions are kept only when it equals 1).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the mean into ``reduced``.
    """
    data = node.inputs["data"].get_data(alloc_map)
    reduced = node.outputs["reduced"].get_data(alloc_map)

    axes = node.get_attr("axes")
    # np.mean only accepts None, an int or a tuple of ints for `axis`; a
    # list raises TypeError, so any sequence of axes becomes a tuple.
    if axes is not None and np.ndim(axes) > 0:
        axes = tuple(int(axis) for axis in axes)
    keep_dims = node.get_attr("keepdims", 1) == 1

    def fn():
        np.mean(data, axis=axes, out=reduced, keepdims=keep_dims)

    return fn
def build_node(onnx_node, io_map, usage_map, node_id):
    """
    Convert an onnx node to an internal node.

    Inputs and outputs are labeled with the names reported by onnx_convert
    for the node's op type (matched by position), and every onnx attribute
    is converted with onnx_convert.convert_attr.

    IO usage is registered in the usage map: each output is recorded under
    "def", and each input whose IO kind is "pointer" is recorded under "use".
    """
    op_type = onnx_node.op_type
    input_labels = onnx_convert.get_op_input_info(op_type)
    output_labels = onnx_convert.get_op_output_info(op_type)

    inputs = {}
    for position, tensor_name in enumerate(onnx_node.input):
        tensor_io = io_map[tensor_name]
        inputs[input_labels[position]] = tensor_io
        # static allocations are deliberately kept out of the usage map
        if tensor_io.kind == "pointer":
            usage_map[tensor_name]["use"].append(node_id)

    outputs = {}
    for position, tensor_name in enumerate(onnx_node.output):
        outputs[output_labels[position]] = io_map[tensor_name]
        usage_map[tensor_name]["def"].append(node_id)

    attrs = {
        attribute.name: onnx_convert.convert_attr(attribute)
        for attribute in onnx_node.attribute
    }

    return Node(node_id, op_type, inputs, outputs, attrs, 0)
def test_relu(self):
    """relu_cpu leaves positive values untouched and zeroes negative ones."""
    cfg = Config(None, None, 4, 4)
    source = InOut("in", "static", np.array([1, 2, 3, 4]), 4)
    sink = InOut("out", "dynamic", None, 4)
    allocations = {"out": np.ndarray(4)}

    relu_node = Node(0, ops.RELU, {"X": source}, {"Y": sink}, {}, 0)
    kernel = kernels.relu_cpu(relu_node, allocations, cfg)

    # first run uses the values the static input was created with
    kernel()
    np.testing.assert_array_equal(sink.get_data(allocations), [1, 2, 3, 4])

    # overwrite the static input in place and run the same kernel again
    follow_ups = (
        ([-2, 2, -1, 1], [0, 2, 0, 1]),
        ([-2, -2, -1, -100000], [0, 0, 0, 0]),
    )
    for new_values, expected in follow_ups:
        np.copyto(source.data, new_values)
        kernel()
        np.testing.assert_array_equal(sink.get_data(allocations), expected)
def build_copy_node(in_io, out_io, node_id):
    """
    Build a copy node with in_io as its input "X" and out_io as its output "Z"
    """
    return Node(node_id, ops.O2P_COPY, {"X": in_io}, {"Z": out_io}, {}, 0)
def clip_v11_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU clip kernel whose bounds come from optional inputs.

    The first element of the ``min`` / ``max`` inputs is read on every
    invocation. A bound whose data is ``None`` falls back to -inf / +inf.

    Args:
        node: Node providing input ``input``, optional inputs ``min`` and
            ``max``, and output ``output``.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the clip on ``node.device_id``.
    """
    input_io = node.inputs["input"]
    min_io = node.get_input("min")
    max_io = node.get_input("max")
    output_io = node.outputs["output"]

    inp = input_io.get_data(alloc_map)

    # Fallback bounds are plain Python floats, as in clip_v11_cpu. They need
    # no device memory, so nothing is allocated on whichever device happens
    # to be current while the kernel is built (which need not be
    # node.device_id).
    min_data = min_io.get_data(alloc_map)
    if min_data is None:
        min_data = [float("-inf")]
    max_data = max_io.get_data(alloc_map)
    if max_data is None:
        max_data = [float("inf")]

    output = output_io.get_data(alloc_map)

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.copyto(
                output,
                chainer.functions.clip(inp, min_data[0], max_data[0]).array)

    return fn
def clip_v6_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU clip kernel whose bounds come from node attributes.

    Args:
        node: Node providing input ``input``, output ``output`` and the
            optional attributes ``min`` (default -3.402823e38) and ``max``
            (default 3.402823e38).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that clips on ``node.device_id``, with
        ``cupy.clip`` writing straight into ``output``.
    """
    source_io = node.inputs["input"]
    lower = node.get_attr("min", -3.402823e38)
    upper = node.get_attr("max", 3.402823e38)
    target_io = node.outputs["output"]

    source = source_io.get_data(alloc_map)
    target = target_io.get_data(alloc_map)

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.clip(source, a_min=lower, a_max=upper, out=target)

    return fn
def opt_simple_model_para(graph: nx.DiGraph, alloc_map, config: Config) -> None:
    """Assign GPU nodes to devices by slicing the node id range.

    A head node (``PNO_GRAPH_HEAD_ID``) is added and connected to node 0.
    Every node whose ``device_type`` is "gpu" then gets
    ``device_id = node_id // split_len``, capped at the last valid CUDA
    device index. The graph is modified in place.

    Args:
        graph: Graph whose nodes carry their ``Node`` under the "node" key.
        alloc_map: Mapping of names to allocations (not used by this pass).
        config: Runtime configuration (not used by this pass).
    """
    graph.add_node(PNO_GRAPH_HEAD_ID)
    graph.add_edge(PNO_GRAPH_HEAD_ID, 0)
    graph.nodes[PNO_GRAPH_HEAD_ID]["node"] = Node(-1, ops.O2P_GRAPH_HEAD, {},
                                                  {}, {}, 0)

    num_cuda = len(get_valid_cuda_devices())

    # HACK: the number of slices is fixed rather than derived from the
    # number of devices.
    num_slices = 5
    # At least one node per slice: graphs with fewer nodes than slices
    # previously produced split_len == 0 and a ZeroDivisionError below.
    split_len = max(1, graph.number_of_nodes() // num_slices)

    last_device = num_cuda - 1
    for gnode in graph.nodes:
        node = graph.nodes[gnode]["node"]
        if node.device_type == "gpu":
            # HACKY SAFETY: never go past the last available device
            node.device_id = min(node.node_id // split_len, last_device)
def maxpool_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU max-pooling kernel: Y = MAXPOOL(X).

    The maximum of every kernel window is propagated to the output. The
    input is indexed as (batch, channel, height, width).

    Args:
        node: Node providing input ``X``, output ``Y`` and the attributes
            ``kernel_shape`` (required), ``strides`` (default ``[1]``) and
            ``pads`` (default ``[0]``). Only the first stride/pad entry is
            used: both are assumed to be the same in all directions.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the pooled result into ``Y``.
    """
    x = node.inputs["X"].get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    stride = node.get_attr("strides", [1])[0]
    padding = node.get_attr("pads", [0])[0]
    kernel_shape = node.get_attr("kernel_shape")

    def fn():
        kernel_h, kernel_w = kernel_shape[0], kernel_shape[1]

        if padding > 0:
            # Pad with the smallest representable value, not 0: a padded
            # cell must never win the max, otherwise windows that touch
            # the border report 0 for all-negative inputs.
            if np.issubdtype(x.dtype, np.integer):
                fill = np.iinfo(x.dtype).min
            else:
                fill = -np.inf
            x_pad = np.pad(
                x,
                ((0, 0), (0, 0), (padding, padding), (padding, padding)),
                mode="constant",
                constant_values=fill,
            )
        else:
            x_pad = x

        batches, channels, h, w = x.shape
        out_h = (h + 2 * padding - kernel_h) // stride + 1
        out_w = (w + 2 * padding - kernel_w) // stride + 1
        out = np.empty((batches, channels, out_h, out_w), dtype=x.dtype)

        # Only the output positions are looped over in Python; batch and
        # channel are reduced together by a single numpy call per window.
        for p in range(out_h):
            p0 = p * stride
            for q in range(out_w):
                q0 = q * stride
                window = x_pad[:, :, p0:p0 + kernel_h, q0:q0 + kernel_w]
                out[:, :, p, q] = window.max(axis=(2, 3))

        np.copyto(y, out)

    return fn
def build_store_node(target, io_map, usage_map, node_id):
    """
    Build a new store node reading io_map[target] as its input "X" and
    record node_id as a "use" of target in the usage map
    """
    store_node = Node(
        node_id,
        ops.O2P_STORE,
        {"X": io_map[target]},
        {},
        {"store_id": 0},
        0,
    )
    usage_map[target]["use"].append(node_id)
    return store_node
def build_load_node(target, io_map, usage_map, node_id):
    """
    Build a new load node writing io_map[target] as its output "Z" and
    record node_id as a "def" of target in the usage map
    """
    load_node = Node(
        node_id,
        ops.O2P_LOAD,
        {},
        {"Z": io_map[target]},
        {"batch_id": 0},
        0,
    )
    usage_map[target]["def"].append(node_id)
    return load_node
def gemm_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a GPU GEMM kernel: Y = alpha * (A' @ B') + beta * C.

    ``A'`` is ``A`` transposed when ``transA`` is 1, and ``B'`` is ``B``
    transposed when ``transB`` is 1.

    Args:
        node: Node providing inputs ``A``, ``B``, ``C``, output ``Y`` and the
            optional attributes ``alpha``, ``beta`` (default 1.0) and
            ``transA``, ``transB`` (default 0).
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable running the product on ``node.device_id``.
    """
    a = node.inputs["A"].get_data(alloc_map)
    w = node.inputs["B"].get_data(alloc_map)
    c = node.inputs["C"].get_data(alloc_map)
    y = node.outputs["Y"].get_data(alloc_map)

    alpha = node.get_attr("alpha", 1.0)
    beta = node.get_attr("beta", 1.0)
    transpose_a = node.get_attr("transA", 0) == 1
    transpose_b = node.get_attr("transB", 0) == 1

    def fn():
        with cupy.cuda.Device(node.device_id):
            lhs = cupy.transpose(a) if transpose_a else a
            rhs = cupy.transpose(w) if transpose_b else w
            product = cupy.dot(alpha * lhs, rhs)
            # the bias add stores its result straight into the output buffer
            cupy.add(product, beta * c, out=y)

    return fn
def conv_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU convolution kernel: Y = X CONV W (+ B).

    The convolution is computed with ``chainer.functions.convolution_2d``
    and copied into the output buffer.

    Args:
        node: Node providing inputs ``X``, ``W``, optional input ``B``,
            output ``Y`` and the optional attributes ``strides``
            (default ``[1]``), ``pads`` (default ``[0]``), ``dilations``
            (default ``[1]``) and ``group`` (default 1). Only the first
            entry of strides/pads/dilations is used: each is assumed to be
            the same in all directions.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the convolution into ``Y``.
    """
    image = node.inputs["X"].get_data(alloc_map)
    weights = node.inputs["W"].get_data(alloc_map)
    bias = node.get_input("B").get_data(alloc_map)
    result = node.outputs["Y"].get_data(alloc_map)

    step = node.get_attr("strides", [1])[0]
    pad = node.get_attr("pads", [0])[0]
    dilation = node.get_attr("dilations", [1])[0]
    groups = node.get_attr("group", 1)

    def fn():
        convolved = chainer.functions.convolution_2d(
            image,
            weights,
            b=bias,
            stride=step,
            pad=pad,
            dilate=dilation,
            groups=groups,
        )
        np.copyto(result, convolved.array)

    return fn
def build_replicated_node(other, node_id, instance_id, suffix):
    """
    Build a replica of `other` with the given node id and instance id.

    Every input and output IO is passed through build_replicated_io with
    `suffix`; the operator is reused, and the attributes are copied into a
    new dict holding the same values.
    """
    inputs = {
        label: build_replicated_io(io, suffix)
        for label, io in other.inputs.items()
    }
    outputs = {
        label: build_replicated_io(io, suffix)
        for label, io in other.outputs.items()
    }
    attrs = {key: value for key, value in other.attrs.items()}

    return Node(node_id, other.operator, inputs, outputs, attrs, instance_id)
def test_conv_stride(self):
    """conv_cpu with a stride of 2 must match chainer's convolution_2d."""
    cfg = Config(None, None, 4, 4)

    in_shape = (4, 3, 22, 22)
    kern_shape = (1, 3, 3, 3)
    out_shape = (4, 1, 10, 10)

    io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
    io_kern = InOut("kern", "static", np.ndarray(kern_shape), kern_shape)
    io_bias = InOut("bias", "static", np.ndarray(1), 1)
    io_out = InOut("out", "dynamic", None, out_shape)

    # random image, kernel and bias, copied into the static buffers
    image = np.random.random(np.shape(io_in.data))
    weights = np.random.random(np.shape(io_kern.data))
    bias = np.random.random(np.shape(io_bias.data))
    for buffer, values in (
        (io_in.data, image),
        (io_kern.data, weights),
        (io_bias.data, bias),
    ):
        np.copyto(buffer, values)

    # X, W and B with a stride of 2, no padding and no dilation
    allocations = {"out": np.ndarray(out_shape)}
    attrs = {
        "dilations": (1, 1),
        "group": 1,
        "kernel_shape": (3, 3),
        "pads": (0, 0, 0, 0),
        "strides": (2, 2, 2, 2),
    }
    conv_node = Node(
        0,
        ops.CONV,
        {"X": io_in, "W": io_kern, "B": io_bias},
        {"Y": io_out},
        attrs,
        0,
    )
    kernel = kernels.conv_cpu(conv_node, allocations, cfg)

    # reference: chainer called directly with the same configuration
    expected = chainer.functions.convolution_2d(
        image, weights, b=bias, stride=(2, 2), pad=(0, 0), dilate=(1, 1),
        groups=1,
    ).array

    kernel()
    np.testing.assert_array_almost_equal(expected, io_out.get_data(allocations))
def test_maxpool_big_stride(self):
    """maxpool_cpu with a 3x3 window and stride 3 must match torch's MaxPool2d."""
    batch, channels, height, width = 4, 4, 22, 22
    window = (3, 3)
    in_shape = (batch, channels, height, width)
    out_shape = (batch, channels, 7, 7)

    cfg = Config(None, None, batch, batch)
    io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
    io_out = InOut("out", "dynamic", None, out_shape)

    samples = np.random.random(np.shape(io_in.data))
    np.copyto(io_in.data, samples)

    # reference result from torch
    reference_pool = torch.nn.MaxPool2d(
        window, stride=3, dilation=1, padding=0, ceil_mode=False
    )
    expected = reference_pool(torch.tensor(samples)).numpy()

    allocations = {"out": np.ndarray(out_shape)}
    pool_node = Node(
        0,
        ops.MAXPOOL,
        {"X": io_in},
        {"Y": io_out},
        {"kernel_shape": window, "strides": (3, 3, 3, 3)},
        0,
    )
    kernel = kernels.maxpool_cpu(pool_node, allocations, cfg)
    kernel()

    np.testing.assert_array_almost_equal(io_out.get_data(allocations), expected)
def test_copy(self):
    """Round trip through the copy kernel: host -> GPU, add one, GPU -> host."""
    cfg = Config(None, None, 4, 4)
    shape = (4, 3, 224, 224)

    io_host = InOut("in", "static", np.random.random(shape), shape)
    io_gpu = InOut("gpu", "dynamic", None, shape)
    io_return = InOut("return", "dynamic", None, shape)

    with cupy.cuda.Device(0):
        gpu_buffer = cupy.ndarray(shape)
    allocations = {"gpu": gpu_buffer, "return": np.ndarray(shape)}

    upload_node = Node(0, ops.O2P_COPY, {"X": io_host}, {"Z": io_gpu}, {}, 0)
    upload_node.device_id = 0
    download_node = Node(0, ops.O2P_COPY, {"X": io_gpu}, {"Z": io_return}, {}, 0)
    download_node.device_id = 0

    upload = kernels.copy(upload_node, allocations, cfg)
    download = kernels.copy(download_node, allocations, cfg)

    # host -> gpu
    upload()
    # modify the data while it lives on the gpu
    cupy.copyto(gpu_buffer, gpu_buffer + 1)
    # gpu -> host
    download()

    expected = io_host.get_data(allocations) + 1
    cupy.testing.assert_array_equal(io_gpu.get_data(allocations), expected)
    np.testing.assert_equal(io_return.get_data(allocations), expected)
def clip_v11_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """Build a CPU clip kernel whose bounds come from optional inputs.

    The first element of the ``min`` / ``max`` inputs is read on every
    invocation. A bound whose data is ``None`` falls back to -inf / +inf.

    Args:
        node: Node providing input ``input``, optional inputs ``min`` and
            ``max``, and output ``output``.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that writes the clipped input into
        ``output``.
    """
    source_io = node.inputs["input"]
    lower_io = node.get_input("min")
    upper_io = node.get_input("max")
    target_io = node.outputs["output"]

    source = source_io.get_data(alloc_map)

    lower = lower_io.get_data(alloc_map)
    if lower is None:
        lower = [-np.inf]

    upper = upper_io.get_data(alloc_map)
    if upper is None:
        upper = [np.inf]

    target = target_io.get_data(alloc_map)

    def fn():
        clipped = chainer.functions.clip(source, lower[0], upper[0])
        np.copyto(target, clipped.array)

    return fn
def build_kernel(node: Node, alloc_map: Dict[str, np.ndarray],
                 config: Config) -> Callable[[], None]:
    """
    Build the execution function for a node on the correct device.

    Operators with separate CPU and GPU kernels pick the CPU kernel when
    node.device_type is "cpu" and the GPU kernel otherwise. Load, store and
    copy use a single kernel regardless of device. PAD has no GPU kernel
    (NotImplementedError), the graph head has no kernel (None is returned),
    and any other operator raises ValueError.
    """
    oper = node.get_operator()
    on_cpu = node.device_type == "cpu"

    # ADD is tested first, ahead of the device-independent operators
    if oper == ops.ADD:
        builder = kernels.add_cpu if on_cpu else kernels.add_gpu
        return builder(node, alloc_map, config)

    # operator -> kernel used on every device
    single_kernel_ops = (
        (ops.O2P_LOAD, "load_cpu"),
        (ops.O2P_STORE, "store_cpu"),
        (ops.O2P_COPY, "copy"),
    )
    for candidate, kernel_name in single_kernel_ops:
        if oper == candidate:
            return getattr(kernels, kernel_name)(node, alloc_map, config)

    # operator -> kernel name stem, completed with "_cpu" or "_gpu".
    # Kernels are looked up by name only once an operator matched, so a
    # kernel is never resolved for an operator that is not requested.
    device_suffix = "_cpu" if on_cpu else "_gpu"
    ops_before_pad = (
        (ops.CONV, "conv"),
        (ops.BATCH_NORM, "batchnorm"),
        (ops.RELU, "relu"),
        (ops.MAXPOOL, "maxpool"),
        (ops.GLOBALAVERAGEPOOL, "globalAveragePool"),
        (ops.AVERAGE_POOL, "average_pool"),
    )
    ops_after_pad = (
        (ops.FLATTEN, "flatten"),
        (ops.RESHAPE, "reshape"),
        (ops.GEMM, "gemm"),
        (ops.DROPOUT, "dropout"),
        (ops.CLIP, "clip_v6"),
        (ops.REDUCE_MEAN, "reduce_mean"),
    )

    for candidate, stem in ops_before_pad:
        if oper == candidate:
            return getattr(kernels, stem + device_suffix)(node, alloc_map, config)

    if oper == ops.PAD:
        if on_cpu:
            return kernels.pad_cpu(node, alloc_map, config)
        raise NotImplementedError()

    for candidate, stem in ops_after_pad:
        if oper == candidate:
            return getattr(kernels, stem + device_suffix)(node, alloc_map, config)

    if oper == ops.O2P_GRAPH_HEAD:
        return None

    raise ValueError(f"Operator {oper} not supported")
def test_batchnorm_defaults(self):
    """batchnorm_cpu must match chainer's fixed_batch_normalization."""
    batch, channels, height, width = 4, 4, 22, 22
    in_shape = (batch, channels, height, width)
    out_shape = (batch, channels, height, width)

    cfg = Config(None, None, batch, batch)
    io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
    io_scale = InOut("scale", "static", np.ndarray(channels), channels)
    io_B = InOut("B", "static", np.ndarray(channels), channels)
    io_mean = InOut("mean", "static", np.ndarray(channels), channels)
    io_var = InOut("var", "static", np.ndarray(channels), channels)
    io_out = InOut("out", "dynamic", None, out_shape)

    # seeded draws, in a fixed order, so the data is reproducible
    np.random.seed(123)
    x = np.random.random(np.shape(io_in.data))
    scale = np.random.random(np.shape(io_scale.data))
    shift = np.random.random(np.shape(io_B.data))
    mean = np.random.random(np.shape(io_mean.data))
    var = np.random.random(np.shape(io_var.data))
    for buffer, values in (
        (io_in.data, x),
        (io_scale.data, scale),
        (io_B.data, shift),
        (io_mean.data, mean),
        (io_var.data, var),
    ):
        np.copyto(buffer, values)

    eps = 1e-05
    momentum_torch = 0.5
    momentum_test = 0.4

    # torch reference: computed, but its comparison against chainer is
    # currently disabled (see the commented assertion below)
    ref = torch.nn.functional.batch_norm(
        torch.tensor(x),
        torch.tensor(mean),
        torch.tensor(var),
        weight=torch.tensor(scale),
        bias=torch.tensor(shift),
        training=False,
        momentum=momentum_torch,
        eps=eps,
    ).numpy()
    ref_chainer = chainer.functions.fixed_batch_normalization(
        x, scale, shift, mean, var, eps=eps,
    ).array

    allocations = {"out": np.ndarray(out_shape)}
    bn_node = Node(
        0,
        ops.BATCH_NORM,
        {"X": io_in, "scale": io_scale, "B": io_B, "mean": io_mean,
         "var": io_var},
        {"Y": io_out},
        {"epsilon": eps, "momentum": momentum_test},
        0,
    )
    kernel = kernels.batchnorm_cpu(bn_node, allocations, cfg)
    kernel()

    # np.testing.assert_array_almost_equal(ref, ref_chainer)
    np.testing.assert_array_almost_equal(io_out.get_data(allocations), ref_chainer)
def opt_graph_split(graph: nx.DiGraph, alloc_map: Dict[str, np.ndarray], config: Config) -> None:
    """Replicate the graph once per valid CUDA device, in place.

    A head node (PNO_GRAPH_HEAD_ID) is added and connected to node 0. For
    every other node, one replica per CUDA device is created with
    build_replicated_node (IO names suffixed with "_g<device_id>") and wired
    to the replicas of the original node's parents on the same device; the
    head node is shared by all devices. The original nodes are then removed.

    config.computed_batch_size is set to num_cuda * config.user_width.
    When no CUDA device is available only the head node is added and
    nothing is replicated.

    Args:
        graph: Graph whose nodes carry their Node under the "node" key.
        alloc_map: Mapping of names to allocations (not used by this pass).
        config: Configuration; computed_batch_size is written, user_width
            is read.
    """
    # add the generic head node to the graph
    # connect it to the initial root node generated by frontend
    graph.add_node(PNO_GRAPH_HEAD_ID)
    graph.add_edge(PNO_GRAPH_HEAD_ID, 0)
    graph.nodes[PNO_GRAPH_HEAD_ID]["node"] = Node(-1, ops.O2P_GRAPH_HEAD, {}, {}, {}, 0)
    # need to rename and assign to the correct device
    cuda_devices = get_valid_cuda_devices()
    num_cuda = len(cuda_devices)
    config.computed_batch_size = num_cuda * config.user_width
    # there is now +1 node in the graph because of the -1 head, so this is
    # the original node count and is used as the first id for replicas
    # NOTE(review): presumably the original node ids are 0..n-1, otherwise
    # the new ids could collide with existing ones -- confirm with the frontend
    new_gnode = graph.number_of_nodes() - 1
    if num_cuda > 0:
        # compute the correct split
        # one independent dict per device ([{}] * num_cuda would share a
        # single dict between all devices)
        gpu_name_maps = [{} for i in range(num_cuda)]  # source_gnode -> local_gnode
        # add a mapping from og graph head to graph head for all devices
        for i in range(num_cuda):
            gpu_name_maps[i][PNO_GRAPH_HEAD_ID] = PNO_GRAPH_HEAD_ID
        # start at the initial node of the non-replicated graph; the order
        # is frozen in a list because the graph is mutated while iterating,
        # and topological order means parents are mapped before children
        fixed_list = list(nx.topological_sort(graph))
        # skip the HEAD node
        # NOTE(review): assumes the head sorts first, i.e. it is the only
        # node without predecessors -- confirm
        for source_gnode in fixed_list[1:]:
            source_node = graph.nodes[source_gnode]["node"]
            gparents = list(graph.predecessors(source_gnode))
            for gpu_idx, device_id in enumerate(cuda_devices):
                device_node = build_replicated_node(source_node, new_gnode, gpu_idx, f"_g{device_id}")
                # configure device settings for the new node: gpu nodes go
                # to this replica's device, everything else stays on cpu 0
                if source_node.device_type == "gpu":
                    device_node.device_type = "gpu"
                    device_node.device_id = device_id
                else:
                    device_node.device_type = "cpu"
                    device_node.device_id = 0
                graph.add_node(new_gnode)
                graph.nodes[new_gnode]["node"] = device_node
                # look up source node parent in gpu_name_maps
                for gparent in gparents:
                    edge_source = gpu_name_maps[gpu_idx][gparent]
                    graph.add_edge(edge_source, new_gnode)
                # add ourself to the gpu name map
                gpu_name_maps[gpu_idx][source_gnode] = new_gnode
                new_gnode += 1
        # remove the og graph (everything except the head)
        for gnode in fixed_list[1:]:
            graph.remove_node(gnode)
    return
def copy(node: Node, alloc_map, config: Config):
    """Build a kernel that copies input ``X`` into output ``Z``.

    The direction is chosen from the buffer types seen when the kernel is
    built:

    * ``Z`` is a NumPy array: copy to host memory.
    * ``Z`` is a CuPy array and ``X`` is not: upload on ``node.device_id``.
    * ``X`` and ``Z`` are both CuPy arrays: device to device, staged
      through host memory.

    Any other combination leaves ``Z`` untouched.

    Args:
        node: Node providing input ``X``, output ``Z`` and the optional
            attributes ``source_device`` / ``target_device``; element 1 of
            each is used as a CUDA device id.
        alloc_map: Mapping of names to allocations, used to resolve buffers.
        config: Runtime configuration (not used by this kernel).

    Returns:
        A zero-argument callable that performs the copy.
    """
    x = node.inputs["X"].get_data(alloc_map)
    z = node.outputs["Z"].get_data(alloc_map)

    # Only the device-to-device branch needs these, and copy nodes may be
    # created without them (test_copy builds its nodes with no attributes),
    # so they are not subscripted while the kernel is being built.
    source_device = node.get_attr("source_device")
    target_device = node.get_attr("target_device")

    # np.ndarray: the rest of this function uses the `np` alias.
    # cupy.ndarray: the public name of the array type, instead of the
    # private cupy.core.core path. isinstance also accepts subclasses.
    dst_on_host = isinstance(z, np.ndarray)
    dst_on_gpu = isinstance(z, cupy.ndarray)
    src_on_gpu = isinstance(x, cupy.ndarray)

    def fn():
        if dst_on_host:
            # to cpu
            np.copyto(z, cupy.asnumpy(x))
        elif dst_on_gpu and not src_on_gpu:
            # host to gpu
            with cupy.cuda.Device(node.device_id):
                # Add zero with out=z rather than copying.
                # NOTE(review): presumably the same workaround as in
                # batchnorm_gpu for copies getting stuck on the default
                # stream -- confirm.
                cupy.add(cupy.asarray(x), 0, out=z)
        elif dst_on_gpu:
            # gpu to gpu: download on the source device, then upload on
            # the target device. A missing attribute falls back to
            # node.device_id.
            if source_device is None:
                source_device_id = node.device_id
            else:
                source_device_id = source_device[1]
            if target_device is None:
                target_device_id = node.device_id
            else:
                target_device_id = target_device[1]

            with cupy.cuda.Device(source_device_id):
                staged = cupy.asnumpy(x)
            with cupy.cuda.Device(target_device_id):
                cupy.copyto(z, cupy.asarray(staged))

    return fn