def start(self, A_matrix):
        """Decomposes A_matrix into two matrices L and U.

        @param A_matrix Coefficient matrix.
        @return float64[:,:], float64[:,:]
        """
        A = A_matrix.flatten()
        L = np.zeros_like(A)

        rows = len(A_matrix)
        columns = len(A_matrix)
        tpb = 32
        matrix_size = rows * columns

        with cuda.pinned(A, L):
            stream = cuda.stream()
            gpu_A = cuda.to_device(A, stream=stream)
            gpu_L = cuda.to_device(L, stream=stream)
            bpg = 1

            for i in range(0, rows):
                self.gaussian_lu_decomposition[(bpg, bpg), (tpb, tpb)](
                    gpu_A, gpu_L, rows, i)

        gpu_A.copy_to_host(A, stream=stream)
        gpu_L.copy_to_host(L, stream=stream)
        stream.synchronize()

        U = A.reshape(rows, columns)
        L = L.reshape(rows, columns)
        del stream
        return L, U
    def start(self, A_matrix, b_vector):
        """Launches parallel Gauss Jordan elimination for a SLAE and returns
        its answer.

        @param A_matrix     Coefficient matrix of a SLAE.
        @param b_vector     Right-hand side vector of a SLAE.

        @return float64[:]
        """
        if 0 in A_matrix.diagonal():
            return None

        b = b_vector.reshape(len(b_vector), 1)
        A = np.hstack((A_matrix, b))
        A = A.flatten()

        n = len(b)
        tpb = 32  # CUDA threads per block (assumed, matching the LU-decomposition method above)

        with cuda.pinned(A):
            stream = cuda.stream()
            gpu_A = cuda.to_device(A, stream=stream)
            bpg = 1

            for i in range(0, n):
                self.gauss_jordan[(bpg, bpg), (tpb, tpb)](gpu_A, n, i)
                self.normalize[(bpg, bpg), (tpb, tpb)](gpu_A, n)

        gpu_A.copy_to_host(A, stream=stream)
        stream.synchronize()

        x = A.reshape(n, (n + 1))[:, n]

        if np.isnan(x).any() or np.isinf(x).any():
            return None
        return x
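Both methods above share the same host-side pattern: flatten the system, pin the host buffer with cuda.pinned, copy it to the device on a stream, launch one kernel per pivot row, and copy the result back. As a minimal self-contained sketch of that pattern alone (the elimination kernels live elsewhere in the class, so a trivial scaling kernel stands in for them here):

import numpy as np
from numba import cuda


@cuda.jit
def scale_kernel(a, factor):
    # Stand-in for the elimination kernels: scale every element of a.
    i = cuda.grid(1)
    if i < a.size:
        a[i] *= factor


def pinned_launch(A_matrix, factor=2.0):
    A = np.ascontiguousarray(A_matrix, dtype=np.float64).flatten()
    tpb = 32
    bpg = (A.size + tpb - 1) // tpb

    with cuda.pinned(A):                       # page-lock A so the stream copies are asynchronous
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        scale_kernel[bpg, tpb, stream](gpu_A, factor)
        gpu_A.copy_to_host(A, stream=stream)   # async copy back into the still-pinned buffer
        stream.synchronize()                   # wait before the host touches A again

    return A.reshape(A_matrix.shape)

Keeping the copy-back and the synchronize inside the with cuda.pinned(...) block means the buffer is still page-locked while the asynchronous transfer is in flight.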
Example #3
def start(A_matrix):
    A = A_matrix.flatten()
    L = np.zeros_like(A)
    U = np.zeros_like(A)

    rows = len(A_matrix)
    columns = len(A_matrix)
    tpb = 32
    n = rows * columns

    with cuda.pinned(A, L, U):
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        gpu_L = cuda.to_device(L, stream=stream)
        gpu_U = cuda.to_device(U, stream=stream)
        bpg = (n + tpb - 1) // tpb  # ceiling division for the grid size

        crout[(bpg, bpg), (tpb, tpb)](gpu_A, gpu_L, gpu_U, rows)

    gpu_L.copy_to_host(L, stream=stream)
    gpu_U.copy_to_host(U, stream=stream)
    stream.synchronize()

    L = L.reshape(rows, columns)
    U = U.reshape(rows, columns)

    print(L)
    print(U)
    print(np.matmul(L, U))
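This variant is a free function that expects a crout kernel to be compiled alongside it; instead of returning L and U it prints them together with their product, so, assuming that kernel is available, a call is simply:

import numpy as np

A = np.array([[4.0, 3.0],
              [6.0, 3.0]], dtype=np.float64)
start(A)   # prints L, U, and np.matmul(L, U), which should reproduce A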
Example #4
 def test_pinned(self):
     A = np.arange(2*1024*1024) # 16 MB
     total = 0
     with cuda.pinned(A):
         for i in range(REPEAT):
             total += self._template('pinned', A)
     print('pinned', total / REPEAT)
Example #5
 def test_pinned(self):
     A = np.arange(2 * 1024 * 1024)  # 16 MB
     total = 0
     with cuda.pinned(A):
         for i in range(REPEAT):
             total += self._template('pinned', A)
     print('pinned', total / REPEAT)
Example #6
 def test_pinned(self):
     machine = platform.machine()
     if machine.startswith('arm') or machine.startswith('aarch64'):
         count = 262144   # 2MB
     else:
         count = 2097152  # 16MB
     A = np.arange(count)
     with cuda.pinned(A):
         self._run_copies(A)
Example #7
    def test_pinned_contextmanager(self):
        # Check that temporarily pinned memory is unregistered immediately,
        # such that it can be re-pinned at any time
        class PinnedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.pinned(arr):
                pass
            with cuda.pinned(arr):
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.pinned(arr):
                    pass
                with cuda.pinned(arr):
                    pass
            # Should also work when breaking out of the block due to an exception
            try:
                with cuda.pinned(arr):
                    raise PinnedException
            except PinnedException:
                with cuda.pinned(arr):
                    pass
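Because cuda.pinned unregisters the memory as soon as the block exits, the same array can be pinned again immediately, which is what the test above verifies. When a buffer is reused across many transfers, an alternative is cuda.pinned_array, which allocates memory that stays page-locked for its whole lifetime; a brief sketch of both options:

import numpy as np
from numba import cuda

# Temporarily pin an existing array for one round of transfers.
arr = np.zeros(1024, dtype=np.float64)
with cuda.pinned(arr):
    d_arr = cuda.to_device(arr)

# Allocate a buffer that is page-locked from the start and reuse it freely.
pinned_buf = cuda.pinned_array(1024, dtype=np.float64)
pinned_buf[:] = 0.0
d_buf = cuda.to_device(pinned_buf)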
Example #9
    def _send_arrays_to_gpu(self, arrays_to_transfer, n_gpu_arrays_needed):

        gpu_arrays = []
        stream = cuda.stream()
        self.streams.append(stream)
        with cuda.pinned(*arrays_to_transfer):
            for arr in arrays_to_transfer:
                try:
                    gpu_array = cuda.to_device(arr, stream)
                except cuda.cudadrv.driver.CudaAPIError:
                    print_memory_info_after_transfer_failure(arr, n_gpu_arrays_needed)
                    self.clear_cuda_memory(gpu_arrays)
                    return []
                gpu_arrays.append(gpu_array)
        return gpu_arrays
Example #10
def corr_FD(x1, x2):
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(x1.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for a small dataset.
    # We should not include that time in the measurement below.

    #ft.FFTPlan(shape=x1.shape, itype=np.float32, otype=np.complex64)

    X1 = x1.astype(np.float32)
    X2 = x2.astype(np.float32)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = ft.FFTPlan(shape=x1.shape,
                          itype=np.float32,
                          otype=np.complex64,
                          stream=stream1)
    fftplan2 = ft.FFTPlan(shape=x2.shape,
                          itype=np.float32,
                          otype=np.complex64,
                          stream=stream2)

    # pagelock memory
    with cuda.pinned(X1, X2):

        # We can overlap the transfer of response_complex with the forward FFT
        # on image_complex.
        d_X1 = cuda.to_device(X1, stream=stream1)
        d_X2 = cuda.to_device(X2, stream=stream2)

        fftplan1.forward(d_X1, out=d_X1)
        fftplan2.forward(d_X2, out=d_X2)
        print('d_X1 is ', np.shape(d_X1), type(d_X1), np.max(d_X1))
        print('d_X2 is ', np.shape(d_X2), type(d_X2), np.max(d_X2))

        stream2.synchronize()

        mult_inplace[blockpergrid, threadperblock, stream1](d_X1, d_X2)
        fftplan1.inverse(d_X1, out=d_X1)

        # implicitly synchronizes the streams
        c = d_X1.copy_to_host().real / np.prod(x1.shape)

    return c
Example #11
def driver(pricer, pinned=False):
    paths = np.zeros((NumPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice
    DT = Maturity / NumStep

    if pinned:
        from numba import cuda
        with cuda.pinned(paths):
            ts = timer()
            pricer(paths, DT, InterestRate, Volatility)
            te = timer()
    else:
        ts = timer()
        pricer(paths, DT, InterestRate, Volatility)
        te = timer()

    ST = paths[:, -1]
    PaidOff = np.maximum(paths[:, -1] - StrikePrice, 0)
    print('Result')
    fmt = '%20s: %s'
    print(fmt % ('stock price', np.mean(ST)))
    print(fmt % ('standard error', np.std(ST) / sqrt(NumPath)))
    print(fmt % ('paid off', np.mean(PaidOff)))
    optionprice = np.mean(PaidOff) * exp(-InterestRate * Maturity)
    print(fmt % ('option price', optionprice))

    print('Performance')
    NumCompute = NumPath * NumStep
    print(fmt % ('Mstep/second', '%.2f' % (NumCompute / (te - ts) / 1e6)))
    print(fmt % ('time elapsed', '%.3fs' % (te - ts)))

    if '--plot' in sys.argv:
        from matplotlib import pyplot
        pathct = min(NumPath, 100)
        for i in range(pathct):
            pyplot.plot(paths[i])
        print('Plotting %d/%d paths' % (pathct, NumPath))
        pyplot.show()
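driver only handles pinning, timing, and reporting; path generation is delegated to the pricer callable, and constants such as NumPath, NumStep, StockPrice, Maturity, InterestRate, Volatility and StrikePrice are defined elsewhere in the script. Purely to illustrate the expected signature (this is not the project's CUDA kernel), a NumPy stand-in could look like:

import numpy as np


def numpy_pricer(paths, dt, interest, volatility):
    # Fill each successive column of paths with one geometric-Brownian-motion step.
    n_paths = paths.shape[0]
    n_steps = paths.shape[1] - 1
    drift = (interest - 0.5 * volatility ** 2) * dt
    for j in range(n_steps):
        noise = np.random.normal(0.0, 1.0, n_paths)
        paths[:, j + 1] = paths[:, j] * np.exp(drift + volatility * np.sqrt(dt) * noise)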
Example #13
def driver(MCUDA, pinned=False):
    paths = np.zeros((Number_of_Paths, Number_of_Steps + 1), order='F')
    paths[:, 0] = SP
    Delta_T = Maturity / Number_of_Steps

    if pinned:
        with cuda.pinned(paths):
            time_s = timer()
            MCUDA(paths, Delta_T, IR, Beta)
            time_a = timer()
    else:
        time_s = timer()
        MCUDA(paths, Delta_T, IR, Beta)
        time_a = timer()

    stk = paths[:, -1]
    pOff = np.maximum(paths[:, -1] - K, 0)
    o_Price = np.mean(pOff) * exp(Maturity * -IR)
    print('error ', np.std(stk) / sqrt(Number_of_Paths))
    print('payoff ', np.mean(pOff))
    print('Option Price', o_Price)
    print('Run Time')
    print(time_a - time_s)
Example #14
    def start(self, A_matrix, b_matrix):
        """Launches parallel Gaussian elimination for a SLAE and returns its answer.

        @param A_matrix   Coefficient matrix of a SLAE.
        @param b_matrix   Right-hand side vector of a SLAE.

        @return float64[:] Solution vector, or None if the SLAE cannot be solved.
        """
        if 0 in A_matrix.diagonal():
            return None

        b = b_matrix.reshape(len(b_matrix), 1)
        A = np.hstack((A_matrix, b))
        A = A.flatten()

        n = len(b)
        tpb = 32  # CUDA threads per block (assumed)

        with cuda.pinned(A):
            stream = cuda.stream()
            gpu_A = cuda.to_device(A, stream=stream)
            bpg = 1

            for i in range(0, n):
                self.gaussian_elimination[(bpg, bpg), (tpb, tpb)](gpu_A, n, i)

        gpu_A.copy_to_host(A, stream=stream)
        stream.synchronize()

        # Restore A and b from augmented matrix Ab
        b = A.reshape(n, (n + 1))[:, n]
        A = A.reshape(n, (n + 1))[..., :-1]

        x = substitution.back_substitution(A, b)

        if np.isnan(x).any() or np.isinf(x).any():
            return None
        return x
Example #15
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

print('regular', te - ts)

del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()  # use stream to trigger async memory transfer
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst, stream=stream)

    d_dst.copy_to_host(dst, stream=stream)
    stream.synchronize()
    te = timer()
    print('pinned', te - ts)

assert np.allclose(dst, src)
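The excerpt above starts mid-script: src, dst, copy_kernel, timer and the earlier ts are defined before the quoted part. A self-contained sketch of the pinned half, with an illustrative element-wise copy kernel and time.perf_counter standing in for timer, might read:

import numpy as np
from numba import cuda
from time import perf_counter as timer


@cuda.jit
def copy_elements(src, out):
    # Illustrative element-wise device copy.
    i = cuda.grid(1)
    if i < src.size:
        out[i] = src[i]


src = np.arange(2 * 1024 * 1024, dtype=np.float64)
dst = np.zeros_like(src)
tpb = 128
bpg = (src.size + tpb - 1) // tpb

with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()                       # async transfers need pinned memory and a stream
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)
    copy_elements[bpg, tpb, stream](d_src, d_dst)
    d_dst.copy_to_host(dst, stream=stream)
    stream.synchronize()
    print('pinned', timer() - ts)

assert np.allclose(dst, src)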
Example #16
 def test_pinned(self):
     A = np.arange(2 * 1024 * 1024)  # 16 MB
     with cuda.pinned(A):
         self._run_copies(A)
Example #17
def main():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.lena().astype(np.float32)

    print("Image size: %s" % (image.shape, ))

    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for a small dataset.
    # We should not include that time in the measurement below.
    FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = FFTPlan(shape=image.shape,
                       itype=np.complex64,
                       otype=np.complex64,
                       stream=stream1)
    fftplan2 = FFTPlan(shape=image.shape,
                       itype=np.complex64,
                       otype=np.complex64,
                       stream=stream2)

    # pagelock memory
    with cuda.pinned(image_complex, response_complex):

        # We can overlap the transfer of response_complex with the forward FFT
        # on image_complex.
        d_image_complex = cuda.to_device(image_complex, stream=stream1)
        d_response_complex = cuda.to_device(response_complex, stream=stream2)

        fftplan1.forward(d_image_complex, out=d_image_complex)
        fftplan2.forward(d_response_complex, out=d_response_complex)

        stream2.synchronize()

        mult_inplace[blockpergrid, threadperblock, stream1](d_image_complex,
                                                            d_response_complex)
        fftplan1.inverse(d_image_complex, out=d_image_complex)

        # implicitly synchronizes the streams
        cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(
            image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    # Plot the results
    plt.subplot(1, 2, 1)
    plt.title('CPU')
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.title('GPU')
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.show()
Example #18
def main():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.face(gray=True).astype(np.float32)

    print("Image size: %s" % (image.shape,))

    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for a small dataset.
    # We should not include that time in the measurement below.
    FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = FFTPlan(shape=image.shape, itype=np.complex64,
                       otype=np.complex64, stream=stream1)
    fftplan2 = FFTPlan(shape=image.shape, itype=np.complex64,
                       otype=np.complex64, stream=stream2)

    # pagelock memory
    with cuda.pinned(image_complex, response_complex):

        # We can overlap the transfer of response_complex with the forward FFT
        # on image_complex.
        d_image_complex = cuda.to_device(image_complex, stream=stream1)
        d_response_complex = cuda.to_device(response_complex, stream=stream2)

        fftplan1.forward(d_image_complex, out=d_image_complex)
        fftplan2.forward(d_response_complex, out=d_response_complex)

        stream2.synchronize()

        mult_inplace[blockpergrid, threadperblock, stream1](d_image_complex,
                                                            d_response_complex)
        fftplan1.inverse(d_image_complex, out=d_image_complex)

        # implicitly synchronizes the streams
        cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    # Plot the results
    plt.subplot(1, 2, 1)
    plt.title('CPU')
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.title('GPU')
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.show()
Example #19
 def test_pinned(self):
     A = np.arange(2*1024*1024) # 16 MB
     with cuda.pinned(A):
         self._run_copies(A)
Example #21
def train_tree(dataset,
               max_depth=6,
               min_samples_per_node=1,
               tpb=1024,
               nstreams=3):
    """
    Entrena un árbol con nuestro algoritmo para GPU basado en CUDT.
    :param dataset Conjunto de datos con el que entrenar el árbol.
    :param max_depth Profundida máxima del árbol.
    :param min_samples_per_node Mínimo de elementos en un nodo para
           seguir siendo evaluado.
    :param tpb Hebras por bloque CUDA.
    :pre dataset ha de ser un array de Numpy con todas las variables
         de tipo np.float32 y la última columna las etiquetas. Dichas
         etiquetas han de corresponderse a una clasificación binaria [0,1].
         tpb ha de ser un valor válido para bloques unidimensionales de CUDA.
    """

    N, d = dataset.shape
    d -= 1

    values = cuda.device_array((d, N), dtype=np.float32)
    labels = cuda.device_array((d, N), dtype=np.int32)
    # 1.1 Build the attribute lists and sort them in ascending
    #     order of value

    blocks = N // tpb + 1
    dataset = np.ascontiguousarray(dataset.T)

    streams = [cuda.stream() for i in range(3)]

    d_scan = cuda.device_array((d, N), np.float32, stream=streams[2])

    best_flag = cuda.device_array(N, np.bool_, stream=streams[0])
    my_flag = cuda.device_array((d, N), np.bool_, stream=streams[1])
    buffer_int = cuda.device_array((d, N), np.int32, stream=streams[2])
    buffer_int2 = cuda.device_array((d, N), np.int32, stream=streams[0])

    locks = [
        cuda.device_array(1, np.int32, stream=stream) for stream in streams
    ]
    for lock in locks:
        lock[:] = 0  # initialise each per-stream lock to zero
    my_mins = [
        cuda.device_array(1, np.float32, stream=stream) for stream in streams
    ]
    my_min_idxs = [
        cuda.device_array(2, np.int32, stream=stream) for stream in streams
    ]
    my_totals = [
        cuda.device_array(1, dtype=np.int32, stream=stream)
        for stream in streams
    ]

    address = cuda.device_array((d, N), np.int32, stream=streams[2])
    set_range[(N // tpb + 1, d), tpb, streams[2]](address)
    with cuda.pinned(dataset):
        d_labels = cuda.to_device(dataset[-1], stream=streams[0])
        d_dataset = cuda.to_device(dataset[:-1], stream=streams[1])

        indexes = cp.argsort(cp.array(dataset[:-1]), axis=1)
        indexes = cuda.to_device(indexes, stream=0)

        fill_2d[d * N // tpb + 1, tpb, streams[0]](values, indexes, d_dataset)
        fill_2d_label[d * N // tpb + 1, tpb, streams[1]](labels, indexes,
                                                         d_labels)

    cuda.synchronize()

    # 1.2 Create the initial node
    outputs = []
    ActiveNode = collections.namedtuple('ActiveNode', 'idx start end')

    start_node = ActiveNode(idx=0, start=0, end=N)
    active_list = [start_node]

    # 2. Walk through the depth levels
    for current_depth in range(max_depth):
        best_flag[:] = True
        level = {}
        next_active_list = []

        # 2.1 Look for split points
        for i, node in enumerate(active_list):
            n = node.end - node.start
            s = node.start
            e = node.end
            node_tpb = min(max(32, 2**math.ceil(math.log2(n))), tpb)
            id_stream = i % 3
            my_stream = streams[id_stream]
            # Pruning criterion: minimum number of elements in a leaf, or the last depth level.
            if n == 1:
                level[node.idx] = (False, values[0, node.start])
                continue
            elif n <= min_samples_per_node or current_depth == max_depth - 1:
                my_totals[id_stream][0] = 0
                utils.warp_based_reduce_sum[n // node_tpb + 1, node_tpb,
                                            my_stream](
                                                labels[0, node.start:node.end],
                                                my_totals[id_stream])
                my_total = my_totals[id_stream].copy_to_host(
                    stream=my_stream)[0]
                label = 0 if my_total / n <= 0.5 else 1
                level[node.idx] = (False, label)
                continue
            else:
                # Scan the labels
                aux = cuda.device_array((d, n // node_tpb + 1),
                                        dtype=np.float32,
                                        stream=my_stream)
                aux[:] = 0

                my_total = utils.multi_scan_with_gini(labels[:, s:e],
                                                      d_scan[:,
                                                             s:e], values[:,
                                                                          s:e],
                                                      my_mins[id_stream], aux,
                                                      node_tpb, my_stream)
                if my_total == 0 or my_total == n:
                    level[node.idx] = (False, my_total)
                    aux[:, :] = 0
                    continue

                blocks = (n // node_tpb + 1, d)
                utils.min_index_reduction[blocks, node_tpb,
                                          my_stream](d_scan[:, s:e],
                                                     my_mins[id_stream],
                                                     my_min_idxs[id_stream],
                                                     aux, locks[id_stream])
                my_host_idx = my_min_idxs[id_stream].copy_to_host(
                    stream=my_stream)
                my_attr_list = my_host_idx[0]
                my_index = my_host_idx[1]

                # Set best_flag to False (0) for the attributes that are reorganized to the left.
                set_best_flag[n // node_tpb + 1, node_tpb, my_stream](
                    best_flag, indexes[my_attr_list,
                                       node.start:node.start + my_index + 1])
                set_my_flag[(n // tpb + 1, d), tpb,
                            my_stream](my_flag[:, s:e], buffer_int[:, s:e],
                                       best_flag, indexes[:, s:e])
                utils.multi_scan_with_address(buffer_int[:, s:e],
                                              d_scan[:, s:e], address[:, s:e],
                                              my_flag[:, s:e], aux, node.start,
                                              node_tpb, my_stream)

                # Add the new node to the tree output.
                my_values = values[my_attr_list,
                                   node.start + my_index:node.start +
                                   my_index + 2].copy_to_host(stream=my_stream)
                the_value = (my_values[0] + my_values[1]) / 2
                level[node.idx] = (True, my_attr_list, my_index, the_value)

                # Add the newly generated nodes to the pending list.
                left_node = ActiveNode(idx=2 * node.idx,
                                       start=node.start,
                                       end=node.start + my_index + 1)
                right_node = ActiveNode(idx=2 * node.idx + 1,
                                        start=node.start + my_index + 1,
                                        end=node.end)

                next_active_list.append(left_node)
                next_active_list.append(right_node)

        # Add this tree level to the output
        cuda.synchronize()
        outputs.append(level)

        if current_depth == max_depth - 1:
            return outputs

        # 2.2 Reorganize the attribute lists

        fill_buffer[(N // tpb + 1, d), tpb, streams[0]](d_scan, values)
        fill_buffer[(N // tpb + 1, d), tpb, streams[1]](buffer_int, indexes)
        fill_buffer[(N // tpb + 1, d), tpb, streams[2]](buffer_int2, labels)
        fill_2d_b[d * N // tpb + 1, tpb, streams[0]](values, address, d_scan)
        fill_2d_b[d * N // tpb + 1, tpb, streams[1]](indexes, address,
                                                     buffer_int)

        fill_2d_b[d * N // tpb + 1, tpb, streams[2]](labels, address,
                                                     buffer_int2)

        # 2.3 Switch the active node list to the next level's
        active_list = next_active_list
        cuda.synchronize()

    return outputs
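The docstring spells out train_tree's preconditions: a float32 NumPy array whose last column holds binary 0/1 labels, and a tpb valid for one-dimensional CUDA blocks. Assuming the module's kernels are compiled and a CUDA device is available, a call could look like:

import numpy as np

# Synthetic binary-classification data: four float32 features plus a 0/1 label column.
rng = np.random.default_rng(0)
features = rng.random((10_000, 4), dtype=np.float32)
labels = (features[:, 0] + features[:, 1] > 1.0).astype(np.float32)
dataset = np.hstack([features, labels[:, None]]).astype(np.float32)

tree_levels = train_tree(dataset, max_depth=6, min_samples_per_node=8, tpb=1024)
print(len(tree_levels), 'levels built')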