Example #1
def maxpool_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
                Y = MAXPOOL(X) (Using padding, stride and pool kernel size)
                --> Propagate maximum value in the kernel window
    """

    x_io = node.inputs["X"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    # Assuming the same stride in all directions
    stride = node.get_attr("strides")[0]
    # Assuming the same padding in all directions
    padding = node.get_attr("pads")[0]
    kernel_shape = node.get_attr("kernel_shape")

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.cudnn.pooling_forward(
                x,
                y,
                (kernel_shape[0], kernel_shape[1]),
                (stride, stride),
                (padding, padding),
                cupy.cuda.cudnn.CUDNN_POOLING_MAX,
            )

    return fn
Example #2
def average_pool_gpu(node: Node, alloc_map,
                     config: Config) -> Callable[[], None]:
    """
        Function:
                Y = AVERAGE_POOL(X)
            --> Convert NCHW to NC11 (average over the H and W dimensions)
    """

    x_io = node.inputs["X"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    kernel_size = node.get_attr("kernel_shape")
    padding = node.get_attr("pads", [0])[0]
    stride = node.get_attr("strides", [1])[0]

    def fn():
        with cupy.cuda.Device(node.device_id):
            out = chainer.functions.average_pooling_2d(x,
                                                       kernel_size,
                                                       stride=stride,
                                                       pad=padding).array
            cupy.copyto(y, out)

    return fn
Example #3
def gemm_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
                Y = alpha*(X @ W) + beta*b
    """

    x_io = node.inputs["A"]
    w_io = node.inputs["B"]
    b_io = node.inputs["C"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    w = w_io.get_data(alloc_map)
    b = b_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    alpha = node.get_attr("alpha", 1.0)
    beta = node.get_attr("beta", 1.0)
    transX = node.get_attr("transA", 0)
    transW = node.get_attr("transB", 0)

    def fn():
        if transX == 1:
            xt = chainer.functions.transpose(x)
        else:
            xt = x
        if transW == 1:
            wt = w
        else:
            wt = chainer.functions.transpose(w)

        np.copyto(y,
                  chainer.functions.linear(alpha * xt, wt, b=(beta * b)).array)

    return fn
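The Gemm formula in the docstring can be checked directly with NumPy. The sketch below is illustrative only (the array names and shapes are made up) and uses the default transA/transB of 0, which is the case where gemm_cpu leaves X as-is and transposes W before calling chainer.functions.linear.

# Illustrative NumPy check of Y = alpha*(X @ W) + beta*b (transA = transB = 0);
# the array names and shapes here are not part of the kernel API.
import numpy as np

alpha, beta = 2.0, 0.5
x = np.arange(6, dtype=np.float64).reshape(2, 3)   # A: (2, 3)
w = np.ones((3, 4))                                # B: (3, 4)
b = np.full((4,), 10.0)                            # C: broadcast over rows

y = alpha * (x @ w) + beta * b
assert y.shape == (2, 4)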
Example #4
def dropout_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:

    data_io = node.inputs["data"]
    output_io = node.outputs["output"]
    opt_mask_io = node.get_output("mask")

    data = data_io.get_data(alloc_map)
    output = output_io.get_data(alloc_map)
    opt_mask = opt_mask_io.get_data(alloc_map)

    ratio = node.get_attr("ratio", 0.5)

    def fn():
        with cupy.cuda.Device(node.device_id):
            if opt_mask is not None:
                o, m = chainer.functions.dropout(data,
                                                 ratio=ratio,
                                                 return_mask=True)
                cupy.copyto(output, o.array)
                cupy.copyto(opt_mask, m.array)
            else:
                cupy.copyto(output,
                            chainer.functions.dropout(data, ratio=ratio).array)

    return fn
Example #5
def add_cpu(node: Node, alloc_map: Dict[str, np.ndarray],
            config: Config) -> Callable[[], None]:
    """Add Kernel (CPU version)

    This function creates a kernel which adds two vectors on CPU

    Z = X + Y

    Args:
        node (node): A source node with operator `add`
        alloc_map (dict): The dictionary of names->allocations

    Returns:
        fn: A new kernel Z = X + Y
    """

    if node.get_operator() != "Add":
        raise ValueError("Node operator should be add, not {}".format(
            node.get_operator()))

    x_io = node.inputs["A"]
    y_io = node.inputs["B"]
    z_io = node.outputs["C"]

    x = x_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)
    z = z_io.get_data(alloc_map)

    def fn():
        np.copyto(z, x + y)

    return fn
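A usage sketch for add_cpu, following the same calling convention as the test_relu test shown further below (Example #12). The Config arguments, InOut shapes, and input values here are illustrative assumptions, and ops.ADD is assumed to resolve to the string "Add" checked by the kernel.

# Illustrative only: mirrors the test_relu pattern from Example #12.
c = Config(None, None, 4, 4)

io_a = InOut("a", "static", np.array([1.0, 2.0, 3.0, 4.0]), (4))
io_b = InOut("b", "static", np.array([10.0, 20.0, 30.0, 40.0]), (4))
io_c = InOut("c", "dynamic", None, (4))

am = {"c": np.ndarray((4))}
inp = {"A": io_a, "B": io_b}
oup = {"C": io_c}

n = Node(0, ops.ADD, inp, oup, {}, 0)

fn = kernels.add_cpu(n, am, c)
fn()
np.testing.assert_array_equal(io_c.get_data(am), [11.0, 22.0, 33.0, 44.0])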
Example #6
def batchnorm_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
                Y = gamma * x_hat + beta
                        where:
                                x_hat = (x - r_mean)/sqrt(r_variance + epsilon)
                        & r_mean and r_variance are running mean & variance

                                r_mean = momentum * training_mean + (1 - momentum) * calculated mean
                                r_variance = momentum * training_variance + (1 - momentum) * calculated variance
    """

    x_io = node.inputs["X"]
    gamma_io = node.inputs["scale"]
    beta_io = node.inputs["B"]
    mean_io = node.inputs["mean"]
    var_io = node.inputs["var"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    gamma = gamma_io.get_data(alloc_map)
    beta = beta_io.get_data(alloc_map)
    mean = mean_io.get_data(alloc_map)
    var = var_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    epsilon = node.get_attr("epsilon")
    momentum = node.get_attr("momentum")
    spatial = node.get_attr("spatial")
    if epsilon < 1e-05:
        epsilon = 1e-05

    # modeled on the Chainer support

    def fn():
        with cupy.cuda.Device(node.device_id):

            # Adding 0 with out=y is a hack to avoid a device-to-device copy
            # getting stuck on the default stream
            cupy.add(cupy.cudnn.batch_normalization_forward_inference(
                x, gamma, beta, mean, var, epsilon, True,
                cupy.cuda.cudnn.CUDNN_BATCHNORM_SPATIAL),
                     0,
                     out=y)

            #cupy.copyto(
            #    y,
            #    chainer.functions.fixed_batch_normalization(
            #        x, gamma, beta, mean, var, eps=epsilon
            #    ).array,
            #)

    return fn
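In inference mode the running statistics are fixed, so the momentum terms in the docstring drop out and the kernel reduces to Y = gamma * (x - mean) / sqrt(var + eps) + beta. The NumPy check below is illustrative; the shapes are arbitrary and the per-channel reshape assumes NCHW layout.

# Illustrative NumPy check of the fixed (inference) batch-norm formula.
import numpy as np

N, C, H, W = 2, 3, 4, 4
eps = 1e-05

x = np.random.random((N, C, H, W))
gamma, beta = np.random.random(C), np.random.random(C)
mean, var = np.random.random(C), np.random.random(C) + 0.1

# per-channel parameters broadcast over the (N, H, W) dimensions
x_hat = (x - mean.reshape(1, C, 1, 1)) / np.sqrt(var.reshape(1, C, 1, 1) + eps)
y = gamma.reshape(1, C, 1, 1) * x_hat + beta.reshape(1, C, 1, 1)
assert y.shape == x.shape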
Example #7
def conv_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        GPU Function:
                Y = X CONV W (using the padding, stride and dilation attributes)
    """
    x_io = node.inputs["X"]
    w_io = node.inputs["W"]
    b_io = node.get_input("B")
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    w = w_io.get_data(alloc_map)
    b = b_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    stride = node.get_attr("strides")[
        0]  # Assuming same stride in all directions
    padding = node.get_attr("pads")[
        0]  # Assuming same padding in all directions
    dilations = node.get_attr("dilations")[
        0]  # Assuming same padding in all directions
    groups = node.get_attr("group", 1)

    stride = (stride, stride)
    padding = (padding, padding)
    dilations = (dilations, dilations)

    def fn():
        # time_st = datetime.datetime.now()
        # logging.log(logging.INFO, f"CONVOP got -->  {x[-1]} CONVOP")

        with cupy.cuda.Device(node.device_id):

            cupy.cudnn.convolution_forward(x,
                                           w,
                                           b,
                                           y,
                                           padding,
                                           stride,
                                           dilations,
                                           groups,
                                           auto_tune=False,
                                           tensor_core='auto')

        # time_end = datetime.datetime.now()
        # logging.log(logging.INFO, f"TIMER: <{node.operator},{node.node_id}> {time_st} -> {time_end}")
        # logging.log(logging.INFO, f"CONV sent -->  {y[-1]} CONV")

    return fn
Example #8
def clip_v6_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:

    input_io = node.inputs["input"]
    min_v = node.get_attr("min", -3.402823e38)
    max_v = node.get_attr("max", 3.402823e38)

    output_io = node.outputs["output"]

    inp = input_io.get_data(alloc_map)
    output = output_io.get_data(alloc_map)

    def fn():
        np.copyto(output, chainer.functions.clip(inp, min_v, max_v).array)

    return fn
Example #9
def batchnorm_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
        Y = gamma * x_hat + beta
        where:
            x_hat = (x - r_mean)/sqrt(r_variance + epsilon)
        & r_mean and r_variance are running mean & variance

            r_mean = momentum * training_mean
                     + (1 - momentum) * calculated mean
            r_variance = momentum * training_variance
                         + (1 - momentum) * calculated variance
    """

    x_io = node.inputs["X"]
    gamma_io = node.inputs["scale"]
    beta_io = node.inputs["B"]
    mean_io = node.inputs["mean"]
    var_io = node.inputs["var"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    gamma = gamma_io.get_data(alloc_map)
    beta = beta_io.get_data(alloc_map)
    mean = mean_io.get_data(alloc_map)
    var = var_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    epsilon = node.get_attr("epsilon", 1e-05)
    momentum = node.get_attr("momentum", 0.9)
    spatial = node.get_attr("spatial")
    if epsilon < 1e-05:
        epsilon = 1e-05

    def fn():

        # logging.log(logging.INFO, f"BATCHNORM got -->  {x[-1]} BATCHNORM")
        np.copyto(
            y,
            chainer.functions.fixed_batch_normalization(x,
                                                        gamma,
                                                        beta,
                                                        mean,
                                                        var,
                                                        eps=epsilon).array,
        )

    return fn
Example #10
def reduce_mean_cpu(node: Node, alloc_map,
                    config: Config) -> Callable[[], None]:

    data_io = node.inputs["data"]
    reduced_io = node.outputs["reduced"]

    data = data_io.get_data(alloc_map)
    reduced = reduced_io.get_data(alloc_map)

    axes = node.get_attr("axes")
    keep_dims = node.get_attr("keepdims", 1) == 1

    def fn():
        np.mean(data, axis=axes, out=reduced, keepdims=keep_dims)

    return fn
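Note that np.mean with an out= argument requires the output buffer to already have the reduced shape; the shapes below are made up purely to illustrate how axes and keepdims interact.

# Illustrative only: the reduced buffer must match the post-reduction shape.
import numpy as np

data = np.arange(24, dtype=np.float64).reshape(2, 3, 4)
reduced = np.zeros((2, 1, 4))                     # keepdims=True keeps axis 1
np.mean(data, axis=(1,), out=reduced, keepdims=True)
assert reduced.shape == (2, 1, 4)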
Example #11
def build_node(onnx_node, io_map, usage_map, node_id):

    """
    Convert an onnx node to an internal node
    with correctly labeled inputs and outputs as well as
    the full set of attributes

    Registers IO usage in the usage map
    """

    input_names = onnx_convert.get_op_input_info(onnx_node.op_type)
    output_names = onnx_convert.get_op_output_info(onnx_node.op_type)

    inputs = {}
    for i, inp in enumerate(onnx_node.input):
        inputs[input_names[i]] = io_map[inp]

        # don't add static allocations to the usage map
        if io_map[inp].kind == "pointer":
            usage_map[inp]["use"].append(node_id)

    outputs = {}
    for i, out in enumerate(onnx_node.output):
        outputs[output_names[i]] = io_map[out]
        usage_map[out]["def"].append(node_id)

    attrs = {}
    for attr in onnx_node.attribute:
        attrs[attr.name] = onnx_convert.convert_attr(attr)

    new_node = Node(node_id, onnx_node.op_type, inputs, outputs, attrs, 0)

    return new_node
Example #12
    def test_relu(self):

        c = Config(None, None, 4, 4)

        io_in = InOut("in", "static", np.array([1, 2, 3, 4]), (4))

        io_out = InOut("out", "dynamic", None, (4))

        am = {"out": np.ndarray((4))}
        inp = {"X": io_in}
        oup = {"Y": io_out}

        n = Node(0, ops.RELU, inp, oup, {}, 0)

        fn = kernels.relu_cpu(n, am, c)

        # eval
        fn()
        np.testing.assert_array_equal(io_out.get_data(am), [1, 2, 3, 4])

        # copy new static input in
        np.copyto(io_in.data, [-2, 2, -1, 1])

        fn()
        np.testing.assert_array_equal(io_out.get_data(am), [0, 2, 0, 1])

        np.copyto(io_in.data, [-2, -2, -1, -100000])

        fn()
        np.testing.assert_array_equal(io_out.get_data(am), [0, 0, 0, 0])
Example #13
def build_copy_node(in_io, out_io, node_id):
    inputs = {"X": in_io}
    outputs = {"Z": out_io}
    attrs = {}
    new_node = Node(node_id, ops.O2P_COPY, inputs, outputs, attrs, 0)

    return new_node
Example #14
def clip_v11_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:

    input_io = node.inputs["input"]
    min_io = node.get_input("min")
    max_io = node.get_input("max")

    output_io = node.outputs["output"]

    inp = input_io.get_data(alloc_map)
    min_data = min_io.get_data(alloc_map)
    if min_data is None:
        min_data = cupy.array([float("-inf")])

    max_data = max_io.get_data(alloc_map)
    if max_data is None:
        max_data = cupy.array([float("inf")])

    output = output_io.get_data(alloc_map)

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.copyto(
                output,
                chainer.functions.clip(inp, min_data[0], max_data[0]).array)

    return fn
Example #15
def clip_v6_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:

    input_io = node.inputs["input"]
    min_v = node.get_attr("min", -3.402823e38)
    max_v = node.get_attr("max", 3.402823e38)

    output_io = node.outputs["output"]

    inp = input_io.get_data(alloc_map)
    output = output_io.get_data(alloc_map)

    def fn():
        with cupy.cuda.Device(node.device_id):
            cupy.clip(inp, a_min=min_v, a_max=max_v, out=output)

    return fn
Example #16
def opt_simple_model_para(graph: nx.DiGraph, alloc_map,
                          config: Config) -> None:

    graph.add_node(PNO_GRAPH_HEAD_ID)
    graph.add_edge(PNO_GRAPH_HEAD_ID, 0)

    graph.nodes[PNO_GRAPH_HEAD_ID]["node"] = Node(-1, ops.O2P_GRAPH_HEAD, {},
                                                  {}, {}, 0)

    cuda_devices = get_valid_cuda_devices()
    num_cuda = len(cuda_devices)

    total_nodes = graph.number_of_nodes()

    # HACK: split the graph into five roughly equal segments by node id
    split_len = total_nodes // 5

    for gnode in graph.nodes:
        node = graph.nodes[gnode]["node"]
        if node.device_type == "gpu":
            node.device_id = node.node_id // split_len
            # Safety clamp: never assign past the last valid CUDA device
            if node.device_id > num_cuda - 1:
                node.device_id = num_cuda - 1

    return
Example #17
def maxpool_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
                Y = MAXPOOL(X) (Using padding, stride and pool kernel size)
            --> Propagate maximum value in the kernel window
    """

    x_io = node.inputs["X"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    # Assume same stride in all directions
    stride = (node.get_attr("strides", [1]))[0]
    # Assume same padding in all directions
    padding = (node.get_attr("pads", [0]))[0]
    kernel_shape = node.get_attr("kernel_shape")

    def fn():
        # time_st = datetime.datetime.now()
        x_pad = np.pad(
            x,
            ((0, 0), (0, 0), (padding, padding), (padding, padding)),
            mode="constant",
            constant_values=0,
        )
        batches, c, h, w = x.shape
        out_h = np.floor(((h - kernel_shape[0] + 2 * padding) / stride) +
                         1).astype(int)
        out_w = np.floor(((w - kernel_shape[1] + 2 * padding) / stride) +
                         1).astype(int)
        out = np.zeros((batches, c, out_h, out_w))
        for i in range(batches):
            for j in range(c):
                for p in range(out_h):
                    for q in range(out_w):
                        p0, p1 = p * stride, (p * stride) + kernel_shape[0]
                        q0, q1 = q * stride, (q * stride) + kernel_shape[1]
                        out[i, j, p, q] = np.max(x_pad[i, j, p0:p1, q0:q1])
        np.copyto(y, out)
        # time_end = datetime.datetime.now()
        # logging.log(logging.INFO, f"TIMER: <{node.operator},{node.node_id}> {time_st} -> {time_end}")

    return fn
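The output-size arithmetic used above, out = floor((H - k + 2p) / s) + 1, can be checked against the shapes in the test_maxpool_big_stride test further below (Example #24): a 22x22 input with a 3x3 kernel, stride 3 and no padding yields a 7x7 output.

# Worked example of the pooling output-size formula used in maxpool_cpu,
# with the shapes from test_maxpool_big_stride (Example #24).
import numpy as np

h, w = 22, 22
kernel_shape = (3, 3)
stride, padding = 3, 0

out_h = np.floor(((h - kernel_shape[0] + 2 * padding) / stride) + 1).astype(int)
out_w = np.floor(((w - kernel_shape[1] + 2 * padding) / stride) + 1).astype(int)
assert int(out_h) == 7 and int(out_w) == 7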
Example #18
def build_store_node(target, io_map, usage_map, node_id):

    """
    Build a new store node and log usage in the map
    """

    inputs = {"X": io_map[target]}
    outputs = {}
    attrs = {"store_id": 0}
    new_node = Node(node_id, ops.O2P_STORE, inputs, outputs, attrs, 0)
    usage_map[target]["use"].append(node_id)

    return new_node
Example #19
def build_load_node(target, io_map, usage_map, node_id):

    """
    Build a new load node and log usage in the map
    """

    inputs = {}
    outputs = {"Z": io_map[target]}
    attrs = {"batch_id": 0}
    new_node = Node(node_id, ops.O2P_LOAD, inputs, outputs, attrs, 0)
    usage_map[target]["def"].append(node_id)

    return new_node
Example #20
def gemm_gpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
                Y = alpha*(X @ W) + beta*b
    """

    x_io = node.inputs["A"]
    w_io = node.inputs["B"]
    b_io = node.inputs["C"]
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    w = w_io.get_data(alloc_map)
    b = b_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    alpha = node.get_attr("alpha", 1.0)
    beta = node.get_attr("beta", 1.0)
    transX = node.get_attr("transA", 0)
    transW = node.get_attr("transB", 0)

    def fn():
        with cupy.cuda.Device(node.device_id):
            if transX == 1:
                #xt = chainer.functions.transpose(x)
                xt = cupy.transpose(x)
            else:
                xt = x
            if transW == 1:
                #wt = chainer.functions.transpose(w)
                wt = cupy.transpose(w)
            else:
                wt = w

            z = cupy.dot(alpha * xt, wt)
            cupy.add(z, beta * b, out=y)

    return fn
Example #21
def conv_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:
    """
        Function:
            Y = X CONV W (using the padding, stride and dilation attributes)
    """
    x_io = node.inputs["X"]
    w_io = node.inputs["W"]
    b_io = node.get_input("B")
    y_io = node.outputs["Y"]

    x = x_io.get_data(alloc_map)
    w = w_io.get_data(alloc_map)
    b = b_io.get_data(alloc_map)
    y = y_io.get_data(alloc_map)

    # Assuming same stride in all directions
    stride = node.get_attr("strides", [1])[0]
    # Assuming same padding in all directions
    padding = node.get_attr("pads", [0])[0]
    # Assuming same dilation in all directions
    dilations = node.get_attr("dilations", [1])[0]
    groups = node.get_attr("group", 1)

    def fn():
        np.copyto(
            y,
            (chainer.functions.convolution_2d(
                x,
                w,
                b=b,
                stride=stride,
                pad=padding,
                dilate=dilations,
                groups=groups,
            )).array,
        )

    return fn
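For reference, the standard convolution output-size formula is out = floor((H + 2p - d*(k - 1) - 1) / s) + 1. Plugging in the shapes from test_conv_stride (Example #23) gives the 10x10 spatial output that test expects.

# Worked example of the convolution output-size formula, using the shapes
# from test_conv_stride (Example #23): 22x22 input, 3x3 kernel, stride 2,
# no padding, dilation 1  ->  10x10 output.
h, w = 22, 22
k_h, k_w = 3, 3
stride, padding, dilation = 2, 0, 1

out_h = (h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
out_w = (w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
assert (out_h, out_w) == (10, 10)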
Example #22
def build_replicated_node(other, node_id, instance_id, suffix):

    inputs = {}
    for inp_name, inp_io in other.inputs.items():
        inputs[inp_name] = build_replicated_io(inp_io, suffix)

    outputs = {}
    for outp_name, outp_io in other.outputs.items():
        outputs[outp_name] = build_replicated_io(outp_io, suffix)

    attrs = {}
    for attr_name, attr_v in other.attrs.items():
        attrs[attr_name] = attr_v

    new_node = Node(node_id, other.operator, inputs, outputs, attrs,
                    instance_id)

    return new_node
Example #23
    def test_conv_stride(self):

        c = Config(None, None, 4, 4)

        io_in = InOut("in", "static", np.ndarray((4, 3, 22, 22)), (4, 3, 22, 22))

        io_kern = InOut("kern", "static", np.ndarray((1, 3, 3, 3)), (1, 3, 3, 3))

        io_bias = InOut("bias", "static", np.ndarray((1)), (1))

        io_out = InOut("out", "dynamic", None, (4, 1, 10, 10))

        i = np.random.random(np.shape(io_in.data))
        w = np.random.random(np.shape(io_kern.data))
        b = np.random.random(np.shape(io_bias.data))

        np.copyto(io_in.data, i)
        np.copyto(io_kern.data, w)
        np.copyto(io_bias.data, b)

        # ---TEST 3: X,W,B default attrs
        am = {"out": np.ndarray((4, 1, 10, 10))}
        inp = {"X": io_in, "W": io_kern, "B": io_bias}
        oup = {"Y": io_out}
        attrs = {
            "dilations": (1, 1),
            "group": (1),
            "kernel_shape": (3, 3),
            "pads": (0, 0, 0, 0),
            "strides": (2, 2, 2, 2),
        }

        n = Node(0, ops.CONV, inp, oup, attrs, 0)
        fn = kernels.conv_cpu(n, am, c)

        # chainer with previous config
        o = chainer.functions.convolution_2d(
            i, w, b=b, stride=(2, 2), pad=(0, 0), dilate=(1, 1), groups=1
        ).array
        fn()

        np.testing.assert_array_almost_equal(o, io_out.get_data(am))
Example #24
    def test_maxpool_big_stride(self):

        B = 4
        C = 4
        H = 22
        W = 22

        K_size = (3, 3)

        in_shape = (B, C, H, W)
        out_shape = (B, C, 7, 7)

        c = Config(None, None, B, B)

        io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
        io_out = InOut("out", "dynamic", None, out_shape)

        i = np.random.random(np.shape(io_in.data))

        np.copyto(io_in.data, i)

        ref_mod = torch.nn.MaxPool2d(
            K_size, stride=3, dilation=1, padding=0, ceil_mode=False
        )

        torch_i = torch.tensor(i)
        ref = ref_mod(torch_i).numpy()

        am = {"out": np.ndarray(out_shape)}
        inp = {"X": io_in}
        oup = {"Y": io_out}
        attrs = {"kernel_shape": K_size, "strides": (3, 3, 3, 3)}

        n = Node(0, ops.MAXPOOL, inp, oup, attrs, 0)

        test_fn = kernels.maxpool_cpu(n, am, c)

        test_fn()

        np.testing.assert_array_almost_equal(io_out.get_data(am), ref)
Example #25
    def test_copy(self):

        c = Config(None, None, 4, 4)

        size = (4, 3, 224, 224)

        io_in = InOut("in", "static", np.random.random(size), size)

        io_gpu = InOut("gpu", "dynamic", None, size)

        io_return = InOut("return", "dynamic", None, size)

        with cupy.cuda.Device(0):
            gpu_buffer = cupy.ndarray((size))

        am = {"gpu": gpu_buffer,
              "return": np.ndarray((size))}

        inp_c0 = {"X": io_in}
        oup_c0 = {"Z": io_gpu}


        inp_c1 = {"X": io_gpu}
        oup_c1 = {"Z": io_return}

        c0 = Node(0, ops.O2P_COPY, inp_c0, oup_c0, {}, 0)
        c0.device_id = 0
        c1 = Node(0, ops.O2P_COPY, inp_c1, oup_c1, {}, 0)
        c1.device_id = 0

        fn_c0 = kernels.copy(c0, am, c)
        fn_c1 = kernels.copy(c1, am, c)

        # copy to GPU
        fn_c0()

        # execute +1 on the device buffer
        cupy.copyto(gpu_buffer, gpu_buffer + 1)

        # copy back to host
        fn_c1()

        ref_plus_one = io_in.get_data(am) + 1

        cupy.testing.assert_array_equal(io_gpu.get_data(am), ref_plus_one)
        np.testing.assert_equal(io_return.get_data(am), ref_plus_one)
Example #26
def clip_v11_cpu(node: Node, alloc_map, config: Config) -> Callable[[], None]:

    input_io = node.inputs["input"]
    min_io = node.get_input("min")
    max_io = node.get_input("max")

    output_io = node.outputs["output"]

    inp = input_io.get_data(alloc_map)
    min_data = min_io.get_data(alloc_map)
    if min_data is None:
        min_data = [-np.inf]
    max_data = max_io.get_data(alloc_map)
    if max_data is None:
        max_data = [np.inf]

    output = output_io.get_data(alloc_map)

    def fn():
        np.copyto(output,
                  chainer.functions.clip(inp, min_data[0], max_data[0]).array)

    return fn
Example #27
0
def build_kernel(node: Node, alloc_map: Dict[str, np.ndarray],
                 config: Config) -> Callable[[], None]:
    """
    For each node in graph build a function for execution on the correct device
    """

    oper = node.get_operator()
    if oper == ops.ADD:
        if node.device_type == "cpu":
            return kernels.add_cpu(node, alloc_map, config)
        else:
            return kernels.add_gpu(node, alloc_map, config)
    if oper == ops.O2P_LOAD:
        return kernels.load_cpu(node, alloc_map, config)
    if oper == ops.O2P_STORE:
        return kernels.store_cpu(node, alloc_map, config)
    if oper == ops.O2P_COPY:
        return kernels.copy(node, alloc_map, config)

    if oper == ops.CONV:
        if node.device_type == "cpu":
            return kernels.conv_cpu(node, alloc_map, config)
        else:
            return kernels.conv_gpu(node, alloc_map, config)
    if oper == ops.BATCH_NORM:
        if node.device_type == "cpu":
            return kernels.batchnorm_cpu(node, alloc_map, config)
        else:
            return kernels.batchnorm_gpu(node, alloc_map, config)
    if oper == ops.RELU:
        if node.device_type == "cpu":
            return kernels.relu_cpu(node, alloc_map, config)
        else:
            return kernels.relu_gpu(node, alloc_map, config)
    if oper == ops.MAXPOOL:
        if node.device_type == "cpu":
            return kernels.maxpool_cpu(node, alloc_map, config)
        else:
            return kernels.maxpool_gpu(node, alloc_map, config)
    if oper == ops.GLOBALAVERAGEPOOL:
        if node.device_type == "cpu":
            return kernels.globalAveragePool_cpu(node, alloc_map, config)
        else:
            return kernels.globalAveragePool_gpu(node, alloc_map, config)

    if oper == ops.AVERAGE_POOL:
        if node.device_type == "cpu":
            return kernels.average_pool_cpu(node, alloc_map, config)
        else:
            return kernels.average_pool_gpu(node, alloc_map, config)

    if oper == ops.PAD:
        if node.device_type == "cpu":
            return kernels.pad_cpu(node, alloc_map, config)
        else:
            raise NotImplementedError()

    if oper == ops.FLATTEN:
        if node.device_type == "cpu":
            return kernels.flatten_cpu(node, alloc_map, config)
        else:
            return kernels.flatten_gpu(node, alloc_map, config)

    if oper == ops.RESHAPE:
        if node.device_type == "cpu":
            return kernels.reshape_cpu(node, alloc_map, config)
        else:
            return kernels.reshape_gpu(node, alloc_map, config)

    if oper == ops.GEMM:
        if node.device_type == "cpu":
            return kernels.gemm_cpu(node, alloc_map, config)
        else:
            return kernels.gemm_gpu(node, alloc_map, config)

    if oper == ops.DROPOUT:
        if node.device_type == "cpu":
            return kernels.dropout_cpu(node, alloc_map, config)
        else:
            return kernels.dropout_gpu(node, alloc_map, config)

    if oper == ops.CLIP:
        if node.device_type == "cpu":
            return kernels.clip_v6_cpu(node, alloc_map, config)
        else:
            return kernels.clip_v6_gpu(node, alloc_map, config)

    if oper == ops.REDUCE_MEAN:
        if node.device_type == "cpu":
            return kernels.reduce_mean_cpu(node, alloc_map, config)
        else:
            return kernels.reduce_mean_gpu(node, alloc_map, config)

    if oper == ops.O2P_GRAPH_HEAD:
        return None

    raise ValueError(f"Operator {oper} not supported")
Example #28
    def test_batchnorm_defaults(self):

        B = 4
        C = 4
        H = 22
        W = 22

        K_size = (3, 3)

        in_shape = (B, C, H, W)
        out_shape = (B, C, H, W)

        c = Config(None, None, B, B)

        io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
        io_scale = InOut("scale", "static", np.ndarray((C)), (C))
        io_B = InOut("B", "static", np.ndarray((C)), (C))
        io_mean = InOut("mean", "static", np.ndarray((C)), (C))
        io_var = InOut("var", "static", np.ndarray((C)), (C))
        io_out = InOut("out", "dynamic", None, out_shape)

        np.random.seed(123)

        i = np.random.random(np.shape(io_in.data))
        s = np.random.random(np.shape(io_scale.data))
        b = np.random.random(np.shape(io_B.data))
        mean = np.random.random(np.shape(io_mean.data))
        var = np.random.random(np.shape(io_var.data))

        np.copyto(io_in.data, i)
        np.copyto(io_scale.data, s)
        np.copyto(io_B.data, b)
        np.copyto(io_mean.data, mean)
        np.copyto(io_var.data, var)

        eps = 1e-05
        momentum_torch = 0.5
        momentum_test = 0.4

        torch_i = torch.tensor(i)
        torch_w = torch.tensor(s)
        torch_b = torch.tensor(b)
        torch_mean = torch.tensor(mean)
        torch_var = torch.tensor(var)

        ref = torch.nn.functional.batch_norm(
            torch_i,
            torch_mean,
            torch_var,
            weight=torch_w,
            bias=torch_b,
            training=False,
            momentum=momentum_torch,
            eps=eps,
        ).numpy()

        ref_chainer = chainer.functions.fixed_batch_normalization(
            i,
            s,
            b,
            mean,
            var,
            eps=eps,
        ).array

        am = {"out": np.ndarray(out_shape)}
        inp = {"X": io_in, "scale": io_scale, "B": io_B, "mean": io_mean, "var": io_var}
        oup = {"Y": io_out}
        attrs = {"epsilon": eps, "momentum": momentum_test}

        n = Node(0, ops.BATCH_NORM, inp, oup, attrs, 0)

        test_fn = kernels.batchnorm_cpu(n, am, c)

        test_fn()

        #np.testing.assert_array_almost_equal(ref, ref_chainer)
        np.testing.assert_array_almost_equal(io_out.get_data(am), ref_chainer)
Example #29
def opt_graph_split(graph: nx.DiGraph, alloc_map: Dict[str, np.ndarray],
                    config: Config) -> None:

    # add the generic head node to the graph
    # connect it to the initial root node generated by frontend
    graph.add_node(PNO_GRAPH_HEAD_ID)
    graph.add_edge(PNO_GRAPH_HEAD_ID, 0)

    graph.nodes[PNO_GRAPH_HEAD_ID]["node"] = Node(-1, ops.O2P_GRAPH_HEAD, {},
                                                  {}, {}, 0)

    # need to rename and assign to the correct device
    cuda_devices = get_valid_cuda_devices()
    num_cuda = len(cuda_devices)

    config.computed_batch_size = num_cuda * config.user_width
    # there is now +1 node in the graph because of the -1 head
    new_gnode = graph.number_of_nodes() - 1

    if num_cuda > 0:

        # compute the correct split

        # gpu_name_maps = [{}] * num_cuda
        gpu_name_maps = [{} for i in range(num_cuda)]

        # source_gnode -> local_gnode

        # add a mapping from og graph head to graph head for all devices
        for i in range(num_cuda):
            gpu_name_maps[i][PNO_GRAPH_HEAD_ID] = PNO_GRAPH_HEAD_ID

        # start at the initial node of the non-replicated graph
        fixed_list = list(nx.topological_sort(graph))

        # skip the HEAD node
        for source_gnode in fixed_list[1:]:
            source_node = graph.nodes[source_gnode]["node"]

            gparents = list(graph.predecessors(source_gnode))

            for gpu_idx, device_id in enumerate(cuda_devices):

                device_node = build_replicated_node(source_node, new_gnode,
                                                    gpu_idx, f"_g{device_id}")

                # configure device settings for the new node
                if source_node.device_type == "gpu":
                    device_node.device_type = "gpu"
                    device_node.device_id = device_id
                else:
                    device_node.device_type = "cpu"
                    device_node.device_id = 0

                graph.add_node(new_gnode)
                graph.nodes[new_gnode]["node"] = device_node

                # look up source node parent in gpu_name_maps
                for gparent in gparents:
                    edge_source = gpu_name_maps[gpu_idx][gparent]
                    graph.add_edge(edge_source, new_gnode)

                # add ourself to the gpu name map
                gpu_name_maps[gpu_idx][source_gnode] = new_gnode

                new_gnode += 1

        # remove the og graph
        for gnode in fixed_list[1:]:
            graph.remove_node(gnode)

    return
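The commented-out `[{}] * num_cuda` line above hints at a real pitfall: that expression repeats one dict object, so every GPU would share the same name map. The short standalone illustration below shows why the list comprehension is used instead.

# Why "[{}] * num_cuda" would be wrong: it repeats a single dict object, so a
# write through any index is visible through all of them.
aliased = [{}] * 3
aliased[0]["head"] = -1
assert aliased[2] == {"head": -1}        # same dict, shared by every slot

independent = [{} for _ in range(3)]     # what opt_graph_split actually uses
independent[0]["head"] = -1
assert independent[2] == {}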
Example #30
def copy(node: Node, alloc_map, config: Config):
    x_io = node.inputs["X"]
    z_io = node.outputs["Z"]

    x = x_io.get_data(alloc_map)
    z = z_io.get_data(alloc_map)

    source_device_id = node.get_attr("source_device")[1]
    target_device_id = node.get_attr("target_device")[1]

    tz = type(z)
    tx = type(x)

    def fn():
        # time_st = datetime.datetime.now()

        if tz == numpy.ndarray:  # to cpu
            np.copyto(z, cupy.asnumpy(x))
            # assert cupy.testing.assert_array_equal(z,x)

        if tz == cupy.core.core.ndarray and tx != cupy.core.core.ndarray:  # cpu to gpu
            with cupy.cuda.Device(node.device_id):
                cupy.add(cupy.asarray(x), 0, out=z)

        if tz == cupy.core.core.ndarray and tx == cupy.core.core.ndarray:  # gpu to gpu
            tmp = None
            with cupy.cuda.Device(source_device_id):
                tmp = cupy.asnumpy(x)
            with cupy.cuda.Device(target_device_id):
                cupy.copyto(z, cupy.asarray(tmp))

            # assert cupy.testing.assert_array_equal(z,x)

            # assert z.shape == x.shape
            # cupy.cuda.get_current_stream().synchronize()
            # tmp = cupy.asarray(x)
            # cupy.cuda.get_current_stream().synchronize()

            # neq = cupy.count_nonzero(cupy.logical_not(z==tmp))
            # print(neq)
            # assert cupy.testing.assert_array_equal(z,tmp)
            # to gpu:

        # og_shape = x.shape

        # if tz == numpy.ndarray:  # to cpu
        #    with cupy.cuda.Device(device=node.device_id):
        #        arr_flat = x.reshape((-1))
        #        z_flat = np.ndarray(arr_flat.shape)

        #        for i, v in enumerate(arr_flat):
        #            z_flat[i] = v

        #        z_flat = z_flat.reshape(og_shape)

        #        np.copyto(z,z_flat)

        # if tz == cupy.core.core.ndarray:

        #    arr_flat = x.reshape((-1))

        #    with cupy.cuda.Device(device=node.device_id):
        #        z_flat = cupy.ndarray(arr_flat.shape)

        #        for i, v in enumerate(arr_flat):
        #            z_flat[i] = v

        #        z_flat = z_flat.reshape(og_shape)

        #        cupy.copyto(z,z_flat)

        # time_end = datetime.datetime.now()
        # logging.log(logging.INFO, f"done copy {z}, {tz}")
        # logging.log(logging.INFO, f"TIMER: <{node.operator},{node.node_id} {time_st} -> {time_end}")

    return fn
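The host-to-device and device-to-host branches above reduce to cupy.asarray and cupy.asnumpy. A minimal round-trip sketch, assuming a CUDA device 0 is available (the array contents are arbitrary):

# Minimal host <-> device round trip with CuPy.
import numpy as np
import cupy

host = np.arange(12, dtype=np.float32).reshape(3, 4)

with cupy.cuda.Device(0):
    dev = cupy.asarray(host)          # host -> device (the "cpu to gpu" branch)
    dev += 1

back = cupy.asnumpy(dev)              # device -> host (the "to cpu" branch)
np.testing.assert_array_equal(back, host + 1)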