Example #1
0
def setup_env():
    """Synchronise ORT stream usage with the DaCe CUDA stream configuration.

    Reads ``compiler.cuda.max_concurrent_streams`` from ``Config`` and the
    ``ORT_USE_STREAMS`` environment variable, then sets
    ``ONNXRuntimeCUDA.use_streams`` and
    ``ONNXRuntimeCUDA.max_concurrent_streams`` as follows:

    * ``ORT_USE_STREAMS`` not set: the config entry is forced to ``-1``
      (if it is not already) and ``use_streams`` is set to ``False``.
    * ``ORT_USE_STREAMS`` set: ``use_streams`` takes the value returned by
      ``_env2bool``. If that value is truthy, a config value of ``0`` is
      replaced by ``8``, while a config value of ``-1`` switches
      ``use_streams`` back to ``False``. If it is falsy, the config entry
      is left untouched.

    In every case ``ONNXRuntimeCUDA.max_concurrent_streams`` ends up holding
    the (possibly updated) config value.
    """
    # NOTE(review): presumably 0 means "use the default number of streams"
    # and -1 means "no concurrent streams" -- confirm against the DaCe
    # configuration schema.
    streams_key = ("compiler", "cuda", "max_concurrent_streams")
    configured_streams = Config.get(*streams_key)

    if 'ORT_USE_STREAMS' not in os.environ:
        # No explicit request for streams: turn them off on both sides.
        if configured_streams != -1:
            log.info("Setting compiler.cuda.max_concurrent_streams to -1")
            Config.set(*streams_key, value=-1)
        ONNXRuntimeCUDA.use_streams = False
    else:
        ONNXRuntimeCUDA.use_streams = _env2bool(os.environ["ORT_USE_STREAMS"])
        if ONNXRuntimeCUDA.use_streams:
            log.info("Using streams with ORT (experimental)")
            if configured_streams == -1:
                # The config value of -1 overrides the environment request.
                ONNXRuntimeCUDA.use_streams = False
            elif configured_streams == 0:
                log.info("Setting compiler.cuda.max_concurrent_streams to 8")
                Config.set(*streams_key, value=8)

    # Re-read the entry so the attribute reflects any update made above.
    ONNXRuntimeCUDA.max_concurrent_streams = Config.get(*streams_key)
Example #2
0
    sdfg.add_edge(state2, copy_out_state,
                         dace.sdfg.sdfg.InterstateEdge())
    sdfg.validate()

    return sdfg


if __name__ == "__main__":
    # Command line: one optional positional integer N (defaults to 32),
    # used as the length of the input vectors.
    cli = argparse.ArgumentParser()
    cli.add_argument("N", type=int, nargs="?", default=32)
    size_n = cli.parse_args().N

    from dace.config import Config
    # Set unique_functions to False so that both SDFGs are generated.
    Config.set("compiler", "unique_functions", value=False)
    sdfg = make_nested_sdfg_fpga()

    X = np.random.rand(size_n).astype(np.float32)
    Y = np.random.rand(size_n).astype(np.float32)
    sdfg(X=X, Y=Y, N=size_n)

    # After the run, Y is compared against X + 2; the norm of the difference
    # is scaled by the vector length.
    expected = X + 2
    error = np.linalg.norm(expected - Y) / size_n
    # Keep the "<=" comparison so that a NaN error is reported as a failure.
    if not (error <= 1e-5):
        raise Exception("==== Program Error! ====")
    print("==== Program end ====")

Example #3
0
    @dace.program
    def matmul_np(A: dace.float64[128, 64], B: dace.float64[64, 32],
                  C: dace.float64[128, 32]):
        C[:] = A @ B

    A = np.random.rand(128, 64).astype(np.float64)
    B = np.random.rand(64, 32).astype(np.float64)
    C = np.random.rand(128, 32).astype(np.float64)

    sdfg = matmul_np.to_sdfg()
    sdfg.apply_transformations([FPGATransformSDFG])
    from dace.libraries.blas import Gemm
    Gemm.default_implementation = "FPGA1DSystolic"
    # We have to Inline
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([InlineSDFG])
    C_regression = A @ B
    p = Process(target=evaluate, args=(sdfg, A, B, C, C_regression))
    p.start()
    p.join()
    del sdfg


if __name__ == "__main__":
    # These tests are executed in separate processes, so the DaCe
    # configuration option optimizer.transform_on_call must be set to False.
    Config.set('optimizer', 'transform_on_call', value=False)

    # Run the tests one after another, in the listed order.
    for run_test in (
            test_gemm_vectorized,
            test_gemm_size_not_multiples_of,
            test_matmul_np,
    ):
        run_test()