Example No. 1
    async def test_tsqr_blocked(placement=cpu):
        for i in range(WARMUP + ITERS):
            # Reset all iteration-specific timers and counters
            perf_stats.reset()

            # Original matrix
            np.random.seed(i)
            A = np.random.rand(NROWS, NCOLS)

            if PLACEMENT_STRING == 'puregpu':
                if (NROWS % NGPUS != 0):
                    raise ValueError(
                        "Pure GPU version requires NROWS %% NGPUS == 0 (currently %i %% %i)"
                        % (NROWS, NGPUS))

                # Partition matrix on GPUs
                mapper = LDeviceSequenceBlocked(
                    NGPUS, placement=[gpu(dev) for dev in range(NGPUS)])
                A_dev = mapper.partition_tensor(A)

                tot_start = time()
                Q_dev, R_dev = await tsqr_blocked_puregpu(A_dev, BLOCK_SIZE)
                tot_end = time()

                # Copy the data back
                if CHECK_RESULT:
                    Q = np.empty(shape=(0, NCOLS))
                    for dev in range(NGPUS):
                        with cp.cuda.Device(dev):
                            Q = np.vstack((Q, cp.asnumpy(Q_dev[dev])))

                    R = cp.asnumpy(R_dev)

            else:  # Normal version
                # Run and time the algorithm
                tot_start = time()
                Q, R = await tsqr_blocked(A, BLOCK_SIZE)
                tot_end = time()

            perf_stats.tot_time = tot_end - tot_start

            # Combine task timings into totals for this iteration
            perf_stats.consolidate_stats()

            if i >= WARMUP:
                timed_iter = i - WARMUP  # iteration index excluding warmup runs
                if CSV:
                    perf_stats.print_stats_csv(timed_iter)
                else:
                    perf_stats.print_stats(timed_iter)

            # Check the results
            if CHECK_RESULT:
                if check_result(A, Q, R):
                    print("\nCorrect result!\n")
                else:
                    print("%***** ERROR: Incorrect final result!!! *****%")
Example No. 2
def main():
    comm = MPI.COMM_WORLD
    print(comm.Get_rank(), comm.Get_size())

    a = np.random.rand(10000000).astype(dtype='d')
    b = np.random.rand(10000000).astype(dtype='d')

    divisions = 100

    comm.Barrier()
    start = time.perf_counter()
    # Map the divisions onto actual hardware locations
    mapper = LDeviceSequenceBlocked(divisions)
    # print(mapper.devices)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1, dtype='d')

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)
        async with finish():
            for i in range(divisions):

                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    overall_result = np.array(0.0, dtype='d') if comm.Get_rank() == 0 else None
    comm.Reduce([inner_result, MPI.DOUBLE], [overall_result, MPI.DOUBLE],
                op=MPI.SUM,
                root=0)
    if overall_result is not None:
        result = float(overall_result)
        print(result)
        end = time.perf_counter()
        print(end - start)

    assert np.allclose(np.inner(a, b), inner_result[0])

    other_results = np.empty(comm.Get_size(),
                             dtype='d') if comm.Get_rank() == 0 else None
    comm.Gather([inner_result, MPI.DOUBLE], [other_results, MPI.DOUBLE],
                root=0)
    if overall_result is not None:
        assert np.isclose(result, np.sum(other_results))
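
For comparison, here is the same cross-rank reduction with the tasking layer stripped out: each rank computes its local dot product directly with NumPy and the ranks combine the partial results with MPI Reduce. This is only a reference sketch; the script name in the launch comment is illustrative.

import numpy as np
from mpi4py import MPI


def main():
    comm = MPI.COMM_WORLD
    a = np.random.rand(10000000).astype(dtype='d')
    b = np.random.rand(10000000).astype(dtype='d')

    # Each rank's partial result, kept in a 1-element buffer for MPI.
    local = np.array([np.inner(a, b)], dtype='d')
    total = np.empty(1, dtype='d') if comm.Get_rank() == 0 else None
    comm.Reduce([local, MPI.DOUBLE], [total, MPI.DOUBLE], op=MPI.SUM, root=0)
    if comm.Get_rank() == 0:
        print(float(total[0]))  # sum of every rank's local dot product


if __name__ == '__main__':
    main()  # run with e.g. `mpirun -n 4 python inner_mpi_reference.py`
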
Example No. 3
def main():
    n = 3 * 100000000
    a = np.random.rand(n)
    b = np.random.rand(n)

    divisions = 100

    start = time.perf_counter()
    # Map the divisions onto actual hardware locations
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1)

    @spawn()
    async def inner_part():
        # Create array to store partial sums from each logical device
        partial_sums = np.empty(divisions)
        # Start a block of tasks that must all complete before leaving the block.
        async with finish():
            # For each logical device, compute the local inner product using the NumPy matrix-multiplication operator, @.
            for i in range(divisions):

                @spawn(devices=[
                    Req(mapper.device(i),
                        threads=1,
                        memory=storage_size(a_part[i], b_part[i]))
                ])
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        # Reduce the partial results (sequentially)
        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    @spawn(None, [inner_part])
    def check():
        end = time.perf_counter()
        print(end - start)

        assert np.allclose(np.inner(a, b), inner_result[0])
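
Setting the tasking aside, the work the inner_local tasks perform is just a blocked dot product; a plain-NumPy equivalent of the partial-sum structure, for reference:

import numpy as np


def blocked_inner(a, b, divisions=100):
    # Split both vectors into contiguous chunks, take the dot product of each
    # pair of chunks, then add up the partial results -- the sequential
    # analogue of the inner_local tasks and the final reduction above.
    partial_sums = [a_i @ b_i
                    for a_i, b_i in zip(np.array_split(a, divisions),
                                        np.array_split(b, divisions))]
    return sum(partial_sums)


a = np.random.rand(3 * 1000)
b = np.random.rand(3 * 1000)
assert np.allclose(blocked_inner(a, b), np.inner(a, b))
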
Example No. 4
def main():
    divisions = 10
    mapper = LDeviceSequenceBlocked(divisions)

    async def inner(a, b):
        a_part = mapper.partition_tensor(a)
        b_part = mapper.partition_tensor(b)
        # Create array to store partial sums from each logical device
        partial_sums = np.empty(len(a_part))
        # Define a space of task names for the product tasks
        P = TaskSpace("P")
        for i in range(len(a_part)):

            @spawn(P[i], data=[a_part[i], b_part[i]])
            def inner_local():
                # Compute the local inner product using the NumPy matrix-multiplication operator, @.
                copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        @spawn(dependencies=P, data=[partial_sums])
        def reduce():
            return np.sum(partial_sums)

        return await reduce

    @spawn()
    async def main_task():
        n = 3 * 1000
        a = np.random.rand(n)
        b = np.random.rand(n)
        print("Starting.", a.shape, b.shape)
        res = await inner(a, b)
        assert np.allclose(np.inner(a, b), res)
        print("Success.", res)
Example No. 5
def main():
    ngpus = int(sys.argv[1])
    runs = int(sys.argv[2])
    blocks_per_gpu = int(sys.argv[3])
    devices = gpu.devices[:ngpus]
    # 1D partition over available devices
    mapper = LDeviceSequenceBlocked(ngpus * blocks_per_gpu, placement=devices)

    # Number of random values to generate; each run below creates a flat
    # array of n elements and partitions it over the devices in use.
    n = 20000 * 20000

    # Main task that generates others.
    @spawn(placement=cpu)
    async def rerun_exp():
        for run in range(runs):

            @spawn(placement=cpu)
            async def launch_exp():
                np.random.seed(0)
                a_cpu = np.random.rand(n).astype(np.float32)
                #a_part = mapper.partition_tensor(a_cpu)
                a_part = []
                nblocks = ngpus * blocks_per_gpu
                block_size = (n - 1) // nblocks + 1
                for i in range(nblocks):
                    with cp.cuda.Device(i % ngpus):
                        a_part.append(
                            cp.asarray(a_cpu[i * block_size:(i + 1) *
                                             block_size]))
                start = time.perf_counter()
                # A place to store tasks in order to refer
                # to them later for dependencies.
                exp_runs = TaskSpace("exp_runs")
                for i in range(ngpus * blocks_per_gpu):
                    # Launch a task for each GPU.
                    # These execute asynchronously.
                    @spawn(exp_runs[i], placement=a_part[i])
                    def run_exp():
                        # Call cupy for exponentiation.
                        # More complicated kernels can use numba.
                        #local_start = time.perf_counter()
                        cp.exp(a_part[i], out=a_part[i])
                        #a_loc = a_part[i]
                        #blocks = a_loc.shape[0] // (1024)
                        #threads_per_block = 512
                        #inplace_exp[blocks, threads_per_block](a_loc)
                        #cuda.default_stream().synchronize()
                        #cp.cuda.get_current_stream().synchronize()
                        #local_stop = time.perf_counter()
                        #print("local:", local_stop - local_start)

                # Wait for the exp tasks to complete
                # before measuring the end time.
                await exp_runs
                stop = time.perf_counter()
                print(stop - start)

            await launch_exp
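
The mapper.partition_tensor call is commented out above and replaced by manual slicing; here is that block-size arithmetic in isolation, as a small NumPy-only sketch (the helper name is made up for illustration):

import numpy as np


def blocked_slices(n, nblocks):
    # Ceiling-divide n elements into nblocks contiguous slices, mirroring the
    # manual partitioning above; the last block may be shorter than the rest.
    block_size = (n - 1) // nblocks + 1
    return [slice(i * block_size, min((i + 1) * block_size, n))
            for i in range(nblocks)]


a = np.arange(10)
parts = [a[s] for s in blocked_slices(len(a), 4)]
# -> blocks of length 3, 3, 3, 1
assert sum(len(p) for p in parts) == len(a)
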
Example No. 6
def main():
    n = 3 * 100000000
    a = np.random.rand(n)
    b = np.random.rand(n)

    divisions = 100

    start = time.perf_counter()
    # Map the divisions onto actual hardware locations
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1)

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)
        async with finish():
            for i in range(divisions):

                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    end = time.perf_counter()
    print(end - start)

    assert np.allclose(np.inner(a, b), inner_result[0])
Example No. 7
def main():
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    divisions = len(devs)*2

    # Set up an "n" x "n" grid of values and run
    # "steps" number of iterations of the 4 point stencil on it.
    n = 25000
    steps = 200

    # Set up two arrays containing the input data.
    # This demo uses the standard technique of computing
    # from one array into another then swapping the
    # input and output arrays for the next iteration.
    # These are the two arrays that will be swapped back
    # and forth as input and output.
    a0 = np.random.rand(n, n)
    a1 = a0.copy()

    # An object that distributes arrays across all the given devices.
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    # Partition a0 and a1.
    # Here we just partition the rows across the different devices.
    # Other partitioning schemes are possible.
    a0_row_groups = mapper.partition_tensor(a0, overlap=1)
    a1_row_groups = mapper.partition_tensor(a1, overlap=1)

    # Trigger JIT
    @spawn(placement=cpu(0))
    async def warmups():
        warmup = TaskSpace()
        for i in range(divisions):
            @spawn(warmup[i], placement=mapper.device(i))
            async def w():
                jacobi(a1_row_groups[i], a0_row_groups[i])
                cupy.cuda.get_current_stream().synchronize()
                cupy.cuda.Stream.null.synchronize()
        await warmup

    time.sleep(5)

    start = time.perf_counter()
    # Main parla task.
    @spawn(placement=cpu(0))
    async def run_jacobi():
        assert steps > 0
        # Specify which set of blocks is used as input or output
        # (they will be swapped for each iteration).
        in_blocks = a0_row_groups
        out_blocks = a1_row_groups
        # Use an always-complete task space as the "previous iteration" so the
        # tasks of the first Jacobi step start with no real dependencies.
        previous_block_tasks = CompletedTaskSpace()
        # Now create the tasks for subsequent iteration steps.
        for i in range(steps):
            # Swap input and output blocks for the next step.
            in_blocks, out_blocks = out_blocks, in_blocks
            # Create a new set of labels for the tasks that do this iteration step.
            current_block_tasks = TaskSpace("block_tasks[{}]".format(i))
            # Create the tasks to do the i'th iteration.
            # As before, each task needs the following info:
            #  a block index "j"
            #  a "device" where it should execute (supplied by mapper used for partitioning)
            #  the "in_block" of data used as input
            #  the "out_block" to write the output to
            for j in range(divisions):
                device = mapper.device(j)
                in_block = in_blocks[j]
                out_block = out_blocks[j]
                # Make each task operating on each block depend on the tasks for
                # that block and its immediate neighbors from the previous iteration.
                @spawn(current_block_tasks[j],
                       dependencies=[previous_block_tasks[max(0, j-1):min(divisions, j+2)]],
                       placement=device)
                def device_local_jacobi_task():
                    # Read boundary values from adjacent blocks in the partition.
                    # This may communicate across device boundaries.
                    if j > 0:
                        copy(in_block[0], in_blocks[j - 1][-2])
                    if j < divisions - 1:
                        copy(in_block[-1], in_blocks[j + 1][1])
                    # Run the computation, dispatching to device specific code.
                    jacobi(in_block, out_block)
            # For the next iteration, use the newly created tasks as
            # the tasks from the previous step.
            previous_block_tasks = current_block_tasks
        await previous_block_tasks
        cupy.cuda.get_current_stream().synchronize()
        cupy.cuda.Stream.null.synchronize()
        end = time.perf_counter()
        print(end - start)

        # Copy the final results back into a1; every task from the last iteration step has completed at this point.
        for j in range(divisions):
            start_index = 1 if j > 0 else 0
            end_index = -1 if j < divisions - 1 else None  # None indicates the last element of the dimension
            copy(a1[mapper.slice(j, len(a1))], out_blocks[j][start_index:end_index])
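
The jacobi(in_block, out_block) helper is not shown in this snippet; for a 4-point stencil it presumably performs an update along these lines (a plain-NumPy guess, ignoring the CPU/GPU dispatch the real helper would do):

import numpy as np


def jacobi_reference(a_in, a_out):
    # One sweep of the 4-point Jacobi stencil: each interior point becomes the
    # average of its north/south/east/west neighbors; boundaries are untouched.
    a_out[1:-1, 1:-1] = 0.25 * (a_in[0:-2, 1:-1] + a_in[2:, 1:-1] +
                                a_in[1:-1, 0:-2] + a_in[1:-1, 2:])


a0 = np.random.rand(8, 8)
a1 = a0.copy()
jacobi_reference(a0, a1)  # compute from a0 into a1, then swap for the next step
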
Example No. 8
async def tsqr_blocked_puregpu(A, block_size):
    Q1 = [None] * NGPUS
    R1 = PartitionedTensor(
        [None] * NGPUS
    )  # CAVEAT: PartitionedTensor with None holes can be fragile! Be cautious!

    # Create tasks to perform qr factorization on each block and store them in lists
    t1_tot_start = time()
    T1 = TaskSpace()
    for i in range(NGPUS):

        @spawn(taskid=T1[i],
               placement=A.base[i])  # NB: A[i] dumbly moves the block here!
        def t1():
            #print("t1[", i, "] start on ", get_current_devices(), sep='', flush=True)

            perf_stats.t1_is_GPU_tasks[i] = True
            t1_ker_start = time()
            Q1[i], R1[i] = cp.linalg.qr(A[i])
            R1[i] = R1[i].flatten()
            t1_ker_end = time()
            perf_stats.t1_ker_tasks[i] = t1_ker_end - t1_ker_start
            A[i] = None  # Free up memory

            #print("t1[", i, "] end on ", get_current_devices(),  sep='', flush=True)

    await T1  # wait for every block's QR task, not just the last one spawned
    t1_tot_end = time()
    perf_stats.t1_tot = t1_tot_end - t1_tot_start

    # Perform intermediate qr factorization on R1 to get Q2 and final R
    t2_tot_start = time()

    @spawn(dependencies=T1, placement=gpu)
    def t2():
        #print("\nt2 start", flush=True)

        # Gather to this device
        t2_D2D_start = time()
        R1_reduced = cp.empty(shape=(0, NCOLS))  # accumulate on this GPU (cp, not np, so vstack stays on-device)
        for dev in range(NGPUS):
            r_block = R1[dev]
            r_block = r_block.reshape(NCOLS, NCOLS)
            R1_reduced = cp.vstack((R1_reduced, r_block))
            R1[dev] = None  # Free up memory
        t2_D2D_end = time()
        perf_stats.t2_D2D = t2_D2D_end - t2_D2D_start

        # R here is the final R result
        t2_ker_start = time()
        Q2, R = cp.linalg.qr(R1_reduced)
        Q2 = Q2.flatten()
        t2_ker_end = time()
        perf_stats.t2_ker = t2_ker_end - t2_ker_start

        return Q2, R

    Q2, R = await t2
    t2_tot_end = time()
    perf_stats.t2_tot = t2_tot_end - t2_tot_start
    #print("t2 end\n", flush=True)

    mapper = LDeviceSequenceBlocked(NGPUS, placement=Q2)
    Q2p = mapper.partition_tensor(Q2)
    Q = [None] * NGPUS
    t3_tot_start = time()
    # Create tasks to perform Q1 @ Q2 matrix multiplication by block
    T3 = TaskSpace()
    for i in range(NGPUS):

        @spawn(taskid=T3[i], dependencies=[T1[i], t2], placement=Q1[i])
        def t3():
            #print("t3[", i, "] start on ", get_current_devices(), sep='', flush=True)
            perf_stats.t3_is_GPU_tasks[i] = True

            # Copy the data to the processor
            # Q1 and Q2 must have an equal number of blocks, where Q1 blocks' ncols = Q2 blocks' nrows
            # Q2 is currently an (ncols * nblocks) x ncols matrix. Need nblocks of ncols rows each
            t3_H2D_start = time()
            Q2_local = Q2p[i]
            Q2_local = Q2_local.reshape(NCOLS, NCOLS)
            t3_H2D_end = time()
            perf_stats.t3_H2D_tasks[i] = t3_H2D_end - t3_H2D_start

            # Run the kernel. (Data is copied back within this call; timing annotations are added there)
            t3_ker_start = time()
            Q[i] = cp.matmul(Q1[i], Q2_local)
            t3_ker_end = time()
            perf_stats.t3_ker_tasks[i] = t3_ker_end - t3_ker_start

            #print("t3[", i, "] end on ", get_current_devices(), sep='', flush=True)

    await T3
    t3_tot_end = time()
    perf_stats.t3_tot = t3_tot_end - t3_tot_start
    return Q, R
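
For reference, the three stages of this blocked TSQR (per-block QR, QR of the stacked R factors, block-wise Q1 @ Q2) collapse to a few lines of plain NumPy on a single device:

import numpy as np


def tsqr_reference(A, nblocks):
    # Stage 1: reduced QR of each row block.
    blocks = np.array_split(A, nblocks, axis=0)
    qrs = [np.linalg.qr(block) for block in blocks]
    # Stage 2: stack the small R factors and factor them again; R is final.
    R1 = np.vstack([r for _, r in qrs])
    Q2, R = np.linalg.qr(R1)
    # Stage 3: multiply each block's Q1 by its slice of Q2 and stack the results.
    ncols = A.shape[1]
    Q = np.vstack([q1 @ Q2[i * ncols:(i + 1) * ncols]
                   for i, (q1, _) in enumerate(qrs)])
    return Q, R


A = np.random.rand(1000, 10)
Q, R = tsqr_reference(A, nblocks=4)
assert np.allclose(Q @ R, A)
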
Example No. 9
async def tsqr_blocked(A, block_size):

    nrows, ncols = A.shape

    # Check for block_size > ncols
    assert ncols <= block_size, "Block size must be greater than or equal to the number of columns in the input matrix"

    # Calculate the number of blocks
    nblocks = (nrows + block_size - 1) // block_size  # ceiling division
    mapper = LDeviceSequenceBlocked(nblocks, placement=A)
    A_blocked = mapper.partition_tensor(A)  # Partition A into blocks

    # Initialize and partition empty array to store blocks (same partitioning scheme, share the mapper)
    Q1_blocked = mapper.partition_tensor(np.empty_like(A))
    R1 = np.empty([nblocks * ncols, ncols])  # Concatenated view
    # Q2 is allocated in t2
    Q = np.empty([nrows, ncols])  # Concatenated view

    # Create tasks to perform qr factorization on each block and store them in lists
    t1_tot_start = time()
    T1 = TaskSpace()
    for i in range(nblocks):
        # Block view to store Q1 not needed since it's not contiguous

        # Get block view to store R1
        R1_lower = i * ncols
        R1_upper = (i + 1) * ncols

        T1_MEMORY = None
        if PLACEMENT_STRING in ('gpu', 'both'):
            # Estimate based on empirical evidence
            T1_MEMORY = int(4.2 * A_blocked[i:i + 1].nbytes)

        @spawn(taskid=T1[i], placement=PLACEMENT, memory=T1_MEMORY, vcus=ACUS)
        def t1():
            #print("t1[", i, "] start on ", get_current_devices(), sep='', flush=True)

            # Copy the data to the processor
            t1_H2D_start = time()
            A_block_local = A_blocked[i:i + 1]
            cp.cuda.get_current_stream().synchronize()
            t1_H2D_end = time()
            perf_stats.t1_H2D_tasks[i] = t1_H2D_end - t1_H2D_start

            # Run the kernel. (Data is copied back within this call; timing annotations are added there)
            Q1_blocked[i], R1[R1_lower:R1_upper] = qr_block(A_block_local, i)

            #print("t1[", i, "] end on ", get_current_devices(),  sep='', flush=True)

    await T1  # wait for every block's QR task, not just the last one spawned
    t1_tot_end = time()
    perf_stats.t1_tot = t1_tot_end - t1_tot_start

    # Perform intermediate qr factorization on R1 to get Q2 and final R
    t2_tot_start = time()

    @spawn(dependencies=T1, placement=cpu)
    def t2():
        #print("\nt2 start", flush=True)

        # R here is the final R result
        # This step could be done recursively, but for small column counts that's not necessary
        Q2, R = np.linalg.qr(R1)

        # Q1 and Q2 must have an equal number of blocks, where Q1 blocks' ncols = Q2 blocks' nrows
        # Q2 is currently an (ncols * nblocks) x ncols matrix. Need nblocks of ncols rows each
        return Q2, R

    Q2, R = await t2
    t2_tot_end = time()
    perf_stats.t2_tot = t2_tot_end - t2_tot_start
    #print("t2 end\n", flush=True)

    # Partition Q2 (same partitioning scheme, share the mapper)
    Q2_blocked = mapper.partition_tensor(Q2)
    t3_tot_start = time()
    # Create tasks to perform Q1 @ Q2 matrix multiplication by block
    T3 = TaskSpace()
    for i in range(nblocks):
        # Q1 is already in blocks

        # Get block view to store Q
        Q_lower = i * block_size  # first row of the block (inclusive)
        Q_upper = (i + 1) * block_size  # one past the last row of the block (exclusive)

        T3_MEMORY = None
        if PLACEMENT_STRING in ('gpu', 'both'):
            T3_MEMORY = 4 * Q1_blocked[i].nbytes  # This is a guess

        @spawn(taskid=T3[i],
               dependencies=[T1[i], t2],
               placement=PLACEMENT,
               memory=T3_MEMORY,
               vcus=ACUS)
        def t3():
            #print("t3[", i, "] start on ", get_current_devices(), sep='', flush=True)

            # Copy the data to the processor
            t3_H2D_start = time()
            Q1_block_local = Q1_blocked[i]
            Q2_block_local = Q2_blocked[i:i + 1]
            cp.cuda.get_current_stream().synchronize()
            t3_H2D_end = time()
            perf_stats.t3_H2D_tasks[i] = t3_H2D_end - t3_H2D_start

            # Run the kernel. (Data is copied back within this call; timing annotations are added there)
            Q[Q_lower:Q_upper] = matmul_block(Q1_block_local, Q2_block_local,
                                              i)

            #print("t3[", i, "] end on ", get_current_devices(), sep='', flush=True)

    await T3
    t3_tot_end = time()
    perf_stats.t3_tot = t3_tot_end - t3_tot_start
    return Q, R
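
Example No. 1 calls a check_result(A, Q, R) helper that is not included in these snippets. A plausible NumPy implementation, written here purely as an illustration of what the check might verify:

import numpy as np


def check_result(A, Q, R):
    # Hypothetical stand-in for the helper used in Example No. 1: accept the
    # factorization if A is reconstructed, Q has orthonormal columns, and R is
    # upper triangular (all up to floating-point tolerance).
    ncols = A.shape[1]
    reconstructed = np.allclose(Q @ R, A)
    orthonormal = np.allclose(Q.T @ Q, np.eye(ncols))
    upper_triangular = np.allclose(R, np.triu(R))
    return reconstructed and orthonormal and upper_triangular
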