Example #1
0
    async def f():
        """Benchmark a serial sparse mat-vec product against the distributed one.

        Builds a random test matrix and vector, times the plain ``mat.dot(vec)``,
        then distributes the matrix over a gang of workers and times the
        distributed ``dot`` (once as a warm-up, then under the profiler).
        Finally prints and compares the sums of both results.

        Raises:
            AssertionError: if the sum of the distributed result is not close
                to the sum of the serial reference.
        """
        nrows = int(1e7)
        nnz = nrows * 10
        n_repeats = 1
        mat = random_test_matrix(nrows, nnz)
        vec = np.random.rand(nrows) - 0.5
        t = tsk.Timer()
        for _ in range(n_repeats):
            correct = mat.dot(vec)
        t.report("simple dot")

        gang = await tsk.ctx().wait_for_workers(cfg.n_workers)
        t.report("wait for workers")

        # NOTE(review): nothing runs between the previous report and this one;
        # the label is kept so the timer output stays unchanged.
        t.report("launch profiler")
        tsk_vec = TskArray(vals=vec)
        t.report("shmem v")

        tsk_mat = distribute(mat, gang)
        t.report("distribute mat")

        # Warm-up run outside the profiler, reported separately.
        result = await tsk_mat.dot(tsk_vec)
        t.report("first dot")

        async with tsk.Profiler(gang):
            t.restart()
            for _ in range(n_repeats):
                result = await tsk_mat.dot(tsk_vec)
            t.report("parallel dot")

        expected_sum = np.sum(correct)
        actual_sum = np.sum(result)
        print(expected_sum)
        print(actual_sum)
        # Exact float equality is fragile for large reductions computed by two
        # different code paths; compare within a tolerance instead.
        assert np.isclose(actual_sum, expected_sum), (
            f"distributed dot sum {actual_sum!r} differs from "
            f"serial sum {expected_sum!r}"
        )
Example #2
0
 async def f():
     """Spread ``n_jobs`` tasks round-robin over a two-worker gang and time them.

     The elapsed wall-clock time for submitting and awaiting all tasks is
     printed while still inside the profiler context.
     """
     gang = await tsk.ctx().wait_for_workers(2)
     fnc_dref = tsk.put(long_fnc)
     async with tsk.Profiler(gang):
         began = time.time()
         # Task i is sent to worker i modulo the gang size.
         pending = [
             tsk.task(fnc_dref, to=gang[job % len(gang)])
             for job in range(n_jobs)
         ]
         await wait_all(pending)
         elapsed = time.time() - began
         print("inside: ", elapsed)
Example #3
0
async def submit(w):
    """Fill a large allocated buffer locally and check a remote sum of it.

    Allocates room for 4e8 float64 values via ``tsk.alloc``, fills it with
    random numbers, and then, twice under the profiler, runs a task with
    ``to=1`` that sums the same buffer and asserts that the result equals the
    locally computed sum.
    """
    n_elems = int(4e8)
    # 8 bytes per float64 element.
    ref = tsk.alloc(w, n_elems * 8)
    local_view = np.frombuffer(await ref.get(), dtype=np.float64)
    local_view[:] = np.random.rand(n_elems)
    rhs = np.sum(local_view)

    for _ in range(2):
        async with tsk.Profiler(w, range(2)):

            async def remote(w):
                # Re-fetch the buffer through the reference captured from the
                # enclosing scope and reduce it.
                remote_view = np.frombuffer(await ref.get(), dtype=np.float64)
                return np.sum(remote_view)

            lhs = await tsk.task(w, remote, to=1)
            assert lhs == rhs
Example #4
0
async def submit():
    """Fill a large allocated buffer locally and check a remote sum of it.

    Waits for a gang of two workers, allocates room for 4e8 float64 values,
    fills it with random numbers, and then, twice under the profiler, runs a
    task on ``gang[1]`` that sums the same buffer and asserts that the result
    equals the locally computed sum.
    """
    gang = await tsk.ctx().wait_for_workers(2)
    n_elems = int(4e8)
    # 8 bytes per float64 element.
    ref = tsk.alloc(n_elems * 8)
    local_view = np.frombuffer(await ref.get(), dtype=np.float64)
    local_view[:] = np.random.rand(n_elems)
    rhs = np.sum(local_view)

    for _ in range(2):
        async with tsk.Profiler(gang):

            async def remote():
                # Re-fetch the buffer through the reference captured from the
                # enclosing scope and reduce it.
                remote_view = np.frombuffer(await ref.get(), dtype=np.float64)
                return np.sum(remote_view)

            lhs = await tsk.task(remote, to=gang[1])
            assert lhs == rhs
Example #5
0
async def submit(w):
    """Time a serial ``csrmv`` loop, then a chunked one dispatched via ``map``.

    Steps, each followed by a timer report:
      1. Build a test matrix and a random vector.
      2. Run ``_sparse.csrmv`` 100 times in-process.
      3. Put the matrix arrays and the vector into the store as byte buffers.
      4. Register ``dot_chunk`` and allocate an output buffer.
      5. Under the profiler, run the chunked product 50 times.

    Args:
        w: worker/context object handed to the ``tsk`` helpers; it exposes
            ``w.memory`` and, inside ``dot_chunk``, ``w.addr``.
    """
    t = tsk.Timer()
    nrows = int(1e8)
    # nnz = 5 * nrows

    # NOTE(review): the timer label suggests this returns a CSR matrix; the
    # code only relies on its .indptr, .indices and .data attributes.
    A = make_test_matrix(nrows, 1)
    t.report('build csr')
    v = np.random.rand(nrows)
    t.report('gen v')
    correct = v.copy()  #np.empty(A.shape[0])
    for i in range(100):
        # Feed the previous contents of `correct` back in as the input vector.
        v[:] = correct
        t.report('copy')
        # Presumably writes the product into `correct`; the meaning of the
        # trailing boolean is not visible here -- TODO confirm.
        _sparse.csrmv(A.indptr, A.indices, A.data, v, correct, True)
        t.report('csrmv')

    # Store each array as a flat unsigned-byte view ('B' cast of its buffer).
    data_dref = tsk.put(w, value=A.data.data.cast('B'), eager_alloc=1)
    indptr_dref = tsk.put(w, value=A.indptr.data.cast('B'), eager_alloc=1)
    indices_dref = tsk.put(w, value=A.indices.data.cast('B'), eager_alloc=1)
    v_dref = tsk.put(w, value=v.data.cast('B'), eager_alloc=1)
    t.report('put matrix')

    # v.nbytes is counted twice -- presumably input vector plus output vector.
    print('total bytes',
          A.data.nbytes + A.indptr.nbytes + A.indices.nbytes + v.nbytes * 2)

    async def dot_chunk(w, args):
        """Run ``csrmv`` for rows ``[start_row, end_row)`` into the shared output.

        Args:
            w: worker/context object executing this chunk.
            args: tuple ``(start_row, end_row, out_dref)``.
        """
        t = tsk.Timer(None)

        start_row, end_row, out_dref = args
        # print(start_row, end_row)

        # Fetch every buffer through the references captured from `submit`.
        v_buf = await tsk.remote_get(w, v_dref)
        out_buf = await tsk.remote_get(w, out_dref)
        data_buf = await tsk.remote_get(w, data_dref)
        indptr_buf = await tsk.remote_get(w, indptr_dref)
        indices_buf = await tsk.remote_get(w, indices_dref)

        # Reinterpret the raw bytes; index arrays are read as int32 and value
        # arrays as float64. Only indptr is narrowed to this chunk's rows.
        v = np.frombuffer(v_buf, dtype=np.float64)
        indptr = np.frombuffer(indptr_buf,
                               dtype=np.int32)[start_row:end_row + 1]
        indices = np.frombuffer(indices_buf, dtype=np.int32)
        data = np.frombuffer(data_buf, dtype=np.float64)
        out = np.frombuffer(out_buf, dtype=np.float64)

        t.report(str(w.addr) + ' setup')

        # Larger than nrows (1e8), so with these sizes the loop below
        # presumably runs a single iteration per chunk.
        inner_chunk_size = int(1e9)
        for i in range(0, indptr.shape[0], inner_chunk_size):
            # print(start_row, i, i+inner_chunk_size)
            # indptr gets one extra trailing entry; the output slice is
            # offset by start_row because `out` spans the whole vector.
            _sparse.csrmv(
                indptr[i:(i + inner_chunk_size + 1)], indices, data, v,
                out[(start_row + i):(start_row + i + inner_chunk_size)], False)
            # Yield to the event loop between inner chunks.
            await asyncio.sleep(0)
        t.report(str(w.addr) + ' dot')

    dot_chunk_dref = put_fnc(w, dot_chunk)
    t.report('put fnc')

    out_dref = tsk.alloc(w, nrows * 8)
    # NOTE(review): `out` is not read again in this function.
    out = np.frombuffer(w.memory.get_local(out_dref), dtype=np.float64)
    t.report('alloc out')

    # n_cores is not defined in this function; it must come from module scope.
    n_super_chunks = int(np.floor(np.sqrt(n_cores)))
    async with tsk.Profiler(w, range(1)):
        for i in range(50):
            # `map` is awaited and takes keyword arguments, so it is a project
            # helper rather than the builtin.
            await map(w,
                      dot_chunk_dref,
                      nrows,
                      out_dref,
                      n_super_chunks=n_super_chunks)
            t.report('dot')
Example #6
0
async def submit2(w):
    """Build per-chunk test matrices via ``map``, then time chunk-local products.

    Unlike the commented-out variant below, which sliced one big matrix, each
    chunk here builds its own test matrix and keeps a private copy of the
    vector. The product is then run four times under the profiler.

    Args:
        w: worker/context object handed to the ``tsk`` helpers; it exposes
            ``w.memory`` and, inside ``dot``, ``w.addr``.
    """
    t = tsk.Timer()
    nrows = int(5e7)
    # n_cores is not defined in this function; it must come from module scope.
    n_super_chunks = int(np.floor(np.sqrt(n_cores)))

    # NOTE(review): everything commented out down to the `rand_vec` helper
    # appears to be an earlier variant that built one matrix and sliced it
    # per chunk; it is kept verbatim.
    # A = make_test_matrix(nrows, 1)
    # t.report('build csr')
    # v = np.random.rand(nrows)
    # t.report('gen v')
    # correct = np.empty(A.shape[0])
    # _sparse.csrmv(A.indptr, A.indices, A.data, v, correct, True)
    # t.report('serial1')

    # v_dref = w.memory.put(value = v.data.cast('B'), eager_alloc = 1)
    # data_dref = w.memory.put(value = A.data.data.cast('B'), eager_alloc = 1)
    # indptr_dref = w.memory.put(value = A.indptr.data.cast('B'), eager_alloc = 1)
    # indices_dref = w.memory.put(value = A.indices.data.cast('B'), eager_alloc = 1)
    # t.report('put matrix')

    # async def build_local_matrix(w, args):
    #     t = tsk.Timer(output_fnc = lambda x: None)
    #     start_row, end_row = args

    #     data_buf = await tsk.remote_get(w, data_dref)
    #     indptr_buf = await tsk.remote_get(w, indptr_dref)
    #     indices_buf = await tsk.remote_get(w, indices_dref)

    #     indptr = np.frombuffer(indptr_buf, dtype = np.int32)[start_row:end_row+1].copy()
    #     indices = np.frombuffer(indices_buf, dtype = np.int32)[indptr[0]:indptr[-1]].copy()
    #     data = np.frombuffer(data_buf, dtype = np.float64)[indptr[0]:indptr[-1]].copy()
    #     indptr -= indptr[0]

    #     out = np.empty(end_row - start_row)

    #     matrix = (indptr, indices, data, out)
    #     matrix_dref = tsk.put(w, value = matrix)

    #     v_buf = await tsk.remote_get(w, v_dref)
    #     v = np.frombuffer(v_buf, dtype = np.float64).copy()
    #     v_dref_out = tsk.put(w, value = v)
    #     t.report(str(w.addr) + ' distribute')

    #     return (v_dref_out, matrix_dref)
    # build_dref = put_fnc(w, build_local_matrix)

    # def rand_vec(w, args):
    #     start_row, end_row = args
    #     v_dref = tsk.put(w, value = np.random.rand(end_row - start_row).data.cast('B'), eager_alloc = 1)
    #     return v_dref
    # v_chunks = await map(w, rand_vec, nrows, n_super_chunks = n_super_chunks)

    # Store the vector as a flat unsigned-byte view of its buffer.
    v = np.random.rand(nrows)
    v_dref = w.memory.put(value=v.data.cast('B'), eager_alloc=1)
    t.report('gen v')

    async def build_matrix(w, args):
        """Build one chunk's matrix and vector copy; return their references.

        Args:
            w: worker/context object executing this chunk.
            args: tuple ``(start_row, end_row)``.

        Returns:
            ``(v_dref_out, matrix_dref)`` as produced by ``tsk.put``.
        """
        start_row, end_row = args
        # Each chunk gets its own test matrix sized by its row count, plus a
        # preallocated output array of the same length.
        A = make_test_matrix(end_row - start_row, 1)
        out = np.empty(end_row - start_row)

        matrix = (A.indptr, A.indices, A.data, out)
        matrix_dref = tsk.put(w, value=matrix)

        # v_dref_out = v_dref
        # vs = []
        # for vc in v_chunks:
        #     vs.append(np.frombuffer(await tsk.remote_get(w, vc), dtype = np.float64))
        # v = np.concatenate(vs)
        # Fetch the vector and store a private copy of it under a new
        # reference.
        v_buf = await tsk.remote_get(w, v_dref)
        v = np.frombuffer(v_buf, dtype=np.float64).copy()
        v_dref_out = tsk.put(w, value=v)

        return (v_dref_out, matrix_dref)

    build_dref = put_fnc(w, build_matrix)

    # `map` is awaited and takes keyword arguments, so it is a project helper
    # rather than the builtin. Its result is indexed per chunk inside `dot`.
    matrix_chunks = await map(w,
                              build_dref,
                              nrows,
                              n_super_chunks=n_super_chunks)
    t.report('distribute matrix')

    async def dot(w, args):
        """Run ``csrmv`` on the single chunk selected by ``args``.

        Args:
            w: worker/context object executing this chunk.
            args: tuple ``(istart, iend, st)``; ``iend`` must equal
                ``istart + 1``. ``st`` is only referenced by a commented-out
                debug print.
        """
        # Timer output is discarded unless this worker's address is 10.
        t = tsk.Timer(output_fnc=lambda x: None)
        if w.addr == 10:
            t = tsk.Timer()
        istart, iend, st = args
        # Each invocation must cover exactly one entry of matrix_chunks.
        assert (iend == istart + 1)
        # st, i = args
        v_dref, matrix_dref = matrix_chunks[istart]
        # print(w.addr, 'took', time.time() - st, 'to launch')

        # Both objects are read through w.memory.get_local.
        indptr, indices, data, out = w.memory.get_local(matrix_dref)
        v = w.memory.get_local(v_dref)
        # v_buf = await tsk.remote_get(w, v_dref)
        # v = np.frombuffer(v_buf, dtype = np.float64)

        # Larger than nrows (5e7), so with these sizes the loop below
        # presumably runs a single iteration per chunk.
        inner_chunk_size = int(1e9)
        t.report(str(w.addr) + ' setup')

        for i in range(0, indptr.shape[0], inner_chunk_size):
            # indptr gets one extra trailing entry per slice; `out` is
            # chunk-local, so it is indexed without a row offset.
            _sparse.csrmv(indptr[i:(i + inner_chunk_size + 1)], indices, data,
                          v, out[i:(i + inner_chunk_size)], False)
            # Yield to the event loop between inner chunks.
            await asyncio.sleep(0)
        t.report(str(w.addr) + ' dot')
        # return out_dref

    dot_dref = put_fnc(w, dot)

    # await run_dot()
    # NOTE(review): range(0) is empty; what the profiler does with an empty
    # second argument is not visible here -- TODO confirm.
    async with tsk.Profiler(w, range(0)):
        t.report('put/startprof')
        for i in range(4):
            # Blank lines to separate the output of each repetition.
            print('')
            print('')
            print('')
            # The current time is passed as an extra positional argument;
            # presumably it reaches `dot` as `st` -- TODO confirm against `map`.
            await map(w,
                      dot_dref,
                      len(matrix_chunks),
                      time.time(),
                      n_super_chunks=n_super_chunks)
            t.report('dot')