Esempio n. 1
def cholesky_blocked_inplace(shape, num_gpus):
  """
  Spawn the task graph of a blocked, in-place Cholesky factorization.

  This is a less naive version of dpotrf with one level of blocking.
  Blocks are currently assumed to evenly divide the axes lengths.
  The input array is 4 dimensional. The first and second index select
  the block (row first, then column). The third and fourth index
  select the entry within the given block.

  Args:
    shape: 4-element shape of the blocked array, i.e.
      (block rows, block columns, rows per block, columns per block).
    num_gpus: Number of GPUs. It is forwarded to the block accessors and
      used to pick each task's device as ``gpu(r % num_gpus)``, where
      ``r`` is the row index of the block the task stores its result to.

  Returns:
    The ``subcholesky`` task-space entry of the last diagonal block.

  Raises:
    ValueError: If the overall matrix is not square, or if the block
      grid is not square.
  """
  if shape[0] * shape[2] != shape[1] * shape[3]:
    raise ValueError("A square matrix is required.")
  # With the check above satisfied, a square (non-empty) block grid also
  # means every block is square.
  if shape[0] != shape[1]:
    raise ValueError("Non-square blocks are not supported.")

  # Define task spaces
  gemm1 = TaskSpace("gemm1")        # Inter-block GEMM
  subcholesky = TaskSpace("subcholesky")  # Cholesky on block
  gemm2 = TaskSpace("gemm2")        # Inter-block GEMM
  solve = TaskSpace("solve")        # Triangular solve

  for j in range(shape[0]):
    for k in range(j):
      # Inter-block GEMM: update diagonal block (j, j) with block (j, k).
      # NOTE(review): all gemm1[j, k] tasks of one j load and store the same
      # block (j, j) but only depend on solve[j, k] -- confirm the runtime
      # never runs them concurrently, or chain them on gemm1[j, 0:k].
      @spawn(gemm1[j, k], [solve[j, k]], placement=[gpu(j%num_gpus)])
      def t1():
        out = get_gpu_memory(j, j, num_gpus)
        rhs = get_gpu_memory(j, k, num_gpus)
        out = update(rhs, rhs, out)
        set_gpu_memory_from_gpu(j, j, num_gpus, out)

    # Cholesky on the diagonal block, once all of its updates are done.
    @spawn(subcholesky[j], [gemm1[j, 0:j]], placement=[gpu(j%num_gpus)])
    def t2():
      dblock = get_gpu_memory(j, j, num_gpus)
      dblock = cholesky(dblock)
      set_gpu_memory_from_gpu(j, j, num_gpus, dblock)

    for i in range(j+1, shape[0]):
      for k in range(j):
        # Inter-block GEMM: update block (i, j) with blocks (i, k) and (j, k).
        # NOTE(review): as for gemm1, the gemm2[i, j, k] tasks of one (i, j)
        # all load and store block (i, j) without depending on each other.
        @spawn(gemm2[i, j, k], [solve[j, k], solve[i, k]], placement=[gpu(i%num_gpus)])
        def t3():
          out = get_gpu_memory(i, j, num_gpus)
          rhs1 = get_gpu_memory(i, k, num_gpus)
          rhs2 = get_gpu_memory(j, k, num_gpus)
          out = update(rhs1, rhs2, out)
          set_gpu_memory_from_gpu(i, j, num_gpus, out)

      # Triangular solve of block (i, j) against the factored diagonal block.
      @spawn(solve[i, j], [gemm2[i, j, 0:j], subcholesky[j]], placement=[gpu(i%num_gpus)])
      def t4():
        factor = get_gpu_memory(j, j, num_gpus)
        panel  = get_gpu_memory(i, j, num_gpus)
        out = ltriang_solve(factor, panel)
        set_gpu_memory_from_gpu(i, j, num_gpus, out)

  # The Cholesky task of the last diagonal block is the last task spawned
  # for the final block column.
  return subcholesky[shape[0]-1]
Esempio n. 2
    async def test_tsqr_blocked(placement=cpu):
        """Run and time the blocked TSQR factorization WARMUP + ITERS times.

        Each iteration factors a fresh random NROWS x NCOLS matrix, either
        with the pure-GPU variant (when PLACEMENT_STRING is 'puregpu') or
        with the normal variant, stores the elapsed wall-clock time in
        ``perf_stats.tot_time`` and, if CHECK_RESULT is set, verifies the
        factors with ``check_result``.

        Args:
            placement: Not used by the statements visible in this function.

        Raises:
            ValueError: In 'puregpu' mode when NROWS is not divisible by
                NGPUS.
        """
        for i in range(WARMUP + ITERS):
            # Reset all iteration-specific timers and counters
            # NOTE(review): no reset statements are present below this
            # comment -- confirm whether they were dropped.

            # Original matrix
            A = np.random.rand(NROWS, NCOLS)

            if PLACEMENT_STRING == 'puregpu':
                if (NROWS % NGPUS != 0):
                    raise ValueError(
                        "Pure GPU version requires NROWS %% NGPUS == 0 (currently %i %% %i)"
                        % (NROWS, NGPUS))

                # Partition matrix on GPUs
                mapper = LDeviceSequenceBlocked(
                    NGPUS, placement=[gpu(dev) for dev in range(NGPUS)])
                A_dev = mapper.partition_tensor(A)

                tot_start = time()
                Q_dev, R_dev = await tsqr_blocked_puregpu(A_dev, BLOCK_SIZE)
                tot_end = time()

                # Copy the data back (only needed when the result is checked)
                if CHECK_RESULT:
                    # Stack the per-device pieces of Q row-wise, in device order.
                    Q = np.empty(shape=(0, NCOLS))
                    for dev in range(NGPUS):
                        with cp.cuda.Device(dev):
                            Q = np.vstack((Q, cp.asnumpy(Q_dev[dev])))

                    R = cp.asnumpy(R_dev)

            else:  # Normal version
                # Run and time the algorithm
                tot_start = time()
                Q, R = await tsqr_blocked(A, BLOCK_SIZE)
                tot_end = time()

            perf_stats.tot_time = tot_end - tot_start

            # Combine task timings into totals for this iteration
            # NOTE(review): no combining statements are present below this
            # comment -- confirm whether they were dropped.

            if (i >= WARMUP):
                # Index of this iteration among the timed (non-warmup) ones.
                iter = i - WARMUP
                if CSV:
                    # NOTE(review): the per-iteration reporting statements are
                    # missing from this block; restore them here.
                    pass

            # Check the results
            if CHECK_RESULT:
                if check_result(A, Q, R):
                    print("\nCorrect result!\n")
                else:
                    # Previously this message was printed in the success
                    # branch; it belongs to the failing case.
                    print("%***** ERROR: Incorrect final result!!! *****%")
Esempio n. 3
def test_placement_await():
    """Check which devices tasks report before and after an ``await``.

    The test is skipped when ``parla.cuda`` cannot be imported. For each
    repetition it expects ``task_results`` to end up as two ``cpu(0)``
    entries followed by two ``gpu(0)`` entries.
    """
    # The `except` clause needs its `try:`; guard the optional CUDA import.
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("CUDA required for this test.")

    devices = [cpu(0), gpu(0)]

    for rep in repetitions():
        task_results = []
        for i in range(2):
            # NOTE(review): as written, `task` is never spawned, `devices` is
            # never used and nothing appends to `task_results`; the spawn
            # decorator and the result-recording statements appear to be
            # missing -- restore them from the original test.
            async def task():
                await tasks()  # Await nothing to force a new task.

            # Wait until the task of this round has recorded both results.
            sleep_until(lambda: len(task_results) == (i + 1) * 2)

        assert task_results == [cpu(0), cpu(0), gpu(0), gpu(0)]
Esempio n. 4
def test_placement_data(runtime_sched):
    """Check that tasks report the device holding their data.

    The test is skipped when ``parla.cuda`` cannot be imported. For each
    repetition it creates a small array in the memory of ``cpu(0)`` and
    ``gpu(0)`` in turn and expects ``task_results`` to equal that device
    list afterwards.

    Args:
        runtime_sched: Test fixture; not referenced by the visible body.
    """
    # Guard the optional CUDA import the same way test_placement_await does.
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("Test needs cuda.")
    devices = [cpu(0), gpu(0)]
    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            # Small array created through the device's memory object.
            d = dev.memory()(np.array([1, 2, 3]))
            # NOTE(review): the decorator that spawns this task (presumably
            # placed by `d`) and the statement that records its device in
            # `task_results` are missing -- restore them from the original
            # test; until then the wait below cannot be satisfied.
            def task():
                pass
            sleep_until(lambda: len(task_results) == i+1)

        assert task_results == devices
Esempio n. 5
    # Copy the options held by `args` into the benchmark's settings.
    NCOLS = args.cols
    BLOCK_SIZE = args.block_size
    ITERS = args.iterations
    WARMUP = args.warmup
    NTHREADS = args.threads
    NGPUS = args.ngpus
    PLACEMENT_STRING = args.placement
    CHECK_RESULT = args.check_result
    CSV = args.csv

    # Set up PLACEMENT variable
    if PLACEMENT_STRING == 'cpu':
        PLACEMENT = cpu
        ACUS = None
    elif PLACEMENT_STRING == 'gpu':
        PLACEMENT = [gpu(i) for i in range(NGPUS)]
        ACUS = None
    elif PLACEMENT_STRING == 'both':
        PLACEMENT = [cpu(0)] + [gpu(i) for i in range(NGPUS)]
        ACUS = 1
    elif PLACEMENT_STRING == 'puregpu':
        PLACEMENT = [gpu(i) for i in range(NGPUS)]
        ACUS = None
        # One block per GPU: the requested block size is overridden.
        BLOCK_SIZE = int(NROWS / NGPUS)
    else:
        # Reject unknown placement names instead of leaving PLACEMENT and
        # ACUS undefined.
        raise ValueError(
            "Invalid value for placement. Must be 'cpu' or 'gpu' or 'both' or 'puregpu'"
        )

    perf_stats = perfStats(ITERS, NROWS, BLOCK_SIZE)