Ejemplo n.º 1
0
def split_lattices(lattices, lcoor, gcoor, split_grid, N):
    """Copy ``lattices`` into newly created lattices on ``split_grid``.

    The input list is processed in ``Q = len(lattices) // N`` consecutive
    groups of ``N`` lattices.  Each group is peeked at ``gcoor`` and poked at
    ``lcoor`` into the same ``N`` lattices on ``split_grid``; for group ``i``
    only ranks with ``srank // (sranks // Q) == i`` pass the real coordinates,
    all other ranks pass empty coordinate arrays.

    The coordinates are stored on every returned lattice as ``split_lcoor``
    and ``split_gcoor``.

    All inputs must share grid, checkerboard and otype, and ``len(lattices)``
    must be a multiple of ``N``; violations raise ``AssertionError``.

    Returns the list of ``N`` lattices on ``split_grid``.
    """
    # N is desired number of parallel split lattices per unsplit lattice
    # 1 <= N <= sranks, sranks % N == 0
    assert len(lcoor) == len(gcoor)
    n = len(lattices)
    assert n > 0

    # every remaining lattice has to agree with the first one
    head, tail = lattices[0], lattices[1:]
    grid = head.grid
    assert all(x.grid.obj == grid.obj for x in tail)
    cb = head.checkerboard()
    assert all(x.checkerboard() is cb for x in tail)
    otype = head.otype
    assert all(x.otype.__name__ == otype.__name__ for x in tail)

    assert n % N == 0
    Q = n // N

    # destination lattices, tagged with the coordinates used for the transfer
    split = []
    for _ in range(N):
        dst = gpt.lattice(split_grid, otype)
        dst.checkerboard(cb)
        dst.split_lcoor = lcoor
        dst.split_gcoor = gcoor
        split.append(dst)

    sranks = split_grid.sranks
    srank = split_grid.srank
    for i in range(Q):
        # only the ranks assigned to group i contribute coordinates
        active = i == srank // (sranks // Q)
        lc = (
            lcoor
            if active
            else numpy.empty(shape=(0, split_grid.nd), dtype=numpy.int32)
        )
        gc = gcoor if active else lc
        group = lattices[i * N:(i + 1) * N]
        gpt.poke(split, lc, gpt.peek(group, gc))

    return split
Ejemplo n.º 2
0
def unsplit(first, second):
    """Copy data from the split lattices ``second`` back into ``first``.

    ``first`` and ``second`` are lists of lattices (a non-list ``first`` is
    wrapped together with ``second`` into one-element lists).  The transfer
    coordinates are read from the ``split_lcoor`` / ``split_gcoor`` attributes
    of ``second[0]``.  ``first`` is processed in ``len(first) // len(second)``
    consecutive groups; for group ``i`` only ranks with
    ``srank // (sranks // Q) == i`` pass real coordinates, all others pass
    empty coordinate arrays.

    ``len(first)`` must be a multiple of ``len(second)``.  Nothing is returned;
    the lattices in ``first`` are modified in place.
    """
    if type(first) is not list:
        return unsplit([first], [second])

    n, N = len(first), len(second)
    split_grid = second[0].grid
    sranks = split_grid.sranks
    srank = split_grid.srank
    Q = n // N
    assert n % N == 0

    # coordinates remembered on the split lattices
    lcoor = second[0].split_lcoor
    gcoor = second[0].split_gcoor

    for i in range(Q):
        # only the ranks assigned to group i contribute coordinates
        active = i == srank // (sranks // Q)
        lc = (
            lcoor
            if active
            else numpy.empty(shape=(0, split_grid.nd), dtype=numpy.int32)
        )
        gc = gcoor if active else lc
        group = first[i * N:(i + 1) * N]
        gpt.poke(group, gc, gpt.peek(second, lc))
Ejemplo n.º 3
0
def merge(lattices, dimension=-1, N=-1):
    """Merge lattices along a newly inserted dimension.

    ``lattices`` is split into ``len(lattices) // N`` consecutive batches of
    ``N`` lattices; each batch becomes one lattice on a grid obtained from
    ``grid.inserted_dimension(dimension, N, cb_mask=...)``, with lattice ``i``
    of the batch written to index ``i`` of the new dimension.

    dimension: position of the new dimension; negative values count from the
        end, so the default ``-1`` puts it last.
    N: extent of the new dimension; ``-1`` means all lattices form one batch.

    A non-list argument is returned unchanged.  With a single batch the merged
    lattice itself is returned, otherwise a list of merged lattices.
    Inconsistent grids, otypes or checkerboards raise ``AssertionError``.
    """
    # anything that is not a list is treated as a single lattice
    if type(lattices) is not list:
        return lattices

    n = len(lattices)
    assert n > 0

    # batching
    if N == -1:
        N = n
    batches = n // N
    assert n % N == 0

    # common grid
    grid = lattices[0].grid
    assert all(x.grid.obj == grid.obj for x in lattices[1:])

    # resolve position of the new dimension (negative counts from the end)
    if dimension >= 0:
        assert dimension <= grid.nd
    else:
        dimension += grid.nd + 1
        assert dimension >= 0

    # the new dimension is checkerboarded only if the inputs are; in that case
    # neighbouring lattices within a batch must have opposite parity
    cb = [x.checkerboard() for x in lattices]
    if cb[0] is gpt.none:
        assert all(c is gpt.none for c in cb[1:])
        cb_mask = 0
    else:
        for i in range(N - 1):
            for j in range(batches):
                assert cb[j * N + i] is cb[j * N + i + 1].inv()
        cb_mask = 1

    # common otype
    otype = lattices[0].otype
    assert all(x.otype.__name__ == otype.__name__ for x in lattices[1:])

    # destination grid and lattices
    merged_grid = grid.inserted_dimension(dimension, N, cb_mask=cb_mask)
    merged_lattices = []
    for _ in range(batches):
        merged = gpt.lattice(merged_grid, otype)
        merged.checkerboard(cb[0])
        merged_lattices.append(merged)

    # source coordinates in internal ordering (speeds up access); a second set
    # is only needed when consecutive slices alternate in checkerboard
    coor_first = lattices[0].mview_coordinates()
    if N > 1 and cb_mask == 1:
        coor_second = lattices[1].mview_coordinates()
    else:
        coor_second = coor_first
    gcoor = (coor_first, coor_second)

    # transfer slice i of every batch in one peek/poke
    for i in range(N):
        src_coor = gcoor[i % 2]
        dst_coor = cgpt.coordinates_inserted_dimension(src_coor, dimension, [i])
        sources = [lattices[j * N + i] for j in range(batches)]
        gpt.poke(merged_lattices, dst_coor, gpt.peek(sources, src_coor))

    # a single batch is returned without the surrounding list
    return merged_lattices[0] if len(merged_lattices) == 1 else merged_lattices
Ejemplo n.º 4
0
def load(filename, params):
    """Load compressed eigenvector data from the directory ``filename``.

    The directory is recognised by the presence of
    ``00/0000000000.compressed`` and ``metadata.txt``; eigenvalues are read
    from ``eigen-values.txt``.

    params (dict):
        "grids"           required, a single-precision five-dimensional
                          ``gpt.grid`` with checkerboard mask [0, 1, 1, 1, 1]
        "nmax"            optional, upper bound on the number of basis and
                          coarse vectors that are kept
        "advise_basis"    optional, forwarded to ``gpt.advise`` for the basis
        "advise_cevec"    optional, forwarded to ``gpt.advise`` for the
                          coarse vectors
        "max_read_blocks" optional (default 8), blocks are pairwise merged
                          while there are more than this many and the count
                          is even

    Returns ``(basis, cevec, ev)``: the list of fine-grid basis vectors, the
    list of coarse-grid vectors and the list of eigenvalues as floats.

    Raises ``NotImplementedError`` if ``filename`` does not contain the
    expected files.  Inconsistent metadata, an unsuitable grid or a checksum
    mismatch raise ``AssertionError``.
    """

    # first check if this is right file format
    if not os.path.exists(filename + "/00/0000000000.compressed"
                          ) or not os.path.exists(filename + "/metadata.txt"):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert "grids" in params
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata; the five-component vectors are reordered so that the
    # last file entry comes first
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    # number of basis vectors stored in single precision; the remaining
    # nbasis - nsingleCap are stored as fp16
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if "nmax" in params:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # progress of the coarse vectors is reported about ten times per view;
    # the stride is clamped to 1 so that fewer than ten eigenvectors do not
    # lead to a modulo by zero
    coarse_report_stride = max(neigen // 10, 1)

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis_max) for i in range(neigen_max)]
    if "advise_basis" in params:
        gpt.advise(basis, params["advise_basis"])
    if "advise_cevec" in params:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing; the accumulators start at a tiny non-zero value, presumably so
    # that the rate printouts below never divide by zero
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" %
                    (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2,
                                          24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file; fn is None when this view has no file to read, in which case
        # the node still takes part in all collective calls below
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb,
                                        "canonicalOdd") for b in range(blocks)
        ]

        # group blocks: merge neighbouring pairs until at most
        # max_read_blocks reads remain (or the count becomes odd)
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single *
                                           nsingleCap))
        reduced_size = len(data_munged) // block_reduce
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # each of the block_reduce merged sub-blocks is munged
                # separately
                for l in range(block_reduce):
                    cgpt.munge_inner_outer(
                        data_munged[reduced_size * l:reduced_size * (l + 1)],
                        data[reduced_size * l:reduced_size * (l + 1)],
                        len(pos[b]) // block_reduce,
                        nsingleCap,
                    )
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            gpt.poke(
                basis[0:nsingleCap_max],
                pos[b],
                truncate(data_munged, nsingleCap, nsingleCap_max, len(pos[b])),
            )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    ))

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            reduced_size = len(data_munged) // block_reduce
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap)))
                    globalReadGB = len(data) / 1024.0**3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    # NOTE: the count passed here is nbasis - nsingleCap,
                    # matching the size of data_fp32 / data_munged and the
                    # truncate() call below (the single-precision pass uses
                    # nsingleCap for a buffer sized with nsingleCap)
                    for l in range(block_reduce):
                        cgpt.munge_inner_outer(
                            data_munged[reduced_size * l:reduced_size *
                                        (l + 1)],
                            data_fp32[reduced_size * l:reduced_size * (l + 1)],
                            len(pos[b]) // block_reduce,
                            nbasis - nsingleCap,
                        )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    gpt.poke(
                        basis[nsingleCap:nbasis_max],
                        pos[b],
                        truncate(
                            data_munged,
                            nbasis - nsingleCap,
                            nbasis_max - nsingleCap,
                            len(pos[b]),
                        ),
                    )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        ))

        # coarse grid data; all neigen vectors are read (the checksum covers
        # the whole file) but only the first neigen_max are stored
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                cevec[j][pos_coarse] = truncate(data, nbasis, nbasis_max,
                                                len(pos_coarse))
            dt_distr += gpt.time()

            if verbose and j % coarse_report_stride == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    ))

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message("* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB /
                                                 (t1 - t0)))

    # eigenvalues: first non-empty line is the count, the rest the values;
    # the file handle is closed as soon as the content has been read
    with open(filename + "/eigen-values.txt") as evfile:
        evln = [x for x in evfile.read().split("\n") if x != ""]
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
Ejemplo n.º 5
0
def separate(lattices, dimension=-1):
    """Split lattices into slices along ``dimension``.

    Every input lattice (a single lattice is wrapped in a list, expressions
    are evaluated with ``gpt.eval``) is cut into
    ``N = grid.fdimensions[dimension]`` lattices on the grid returned by
    ``grid.removed_dimension(dimension)``.  If the removed dimension is
    checkerboarded, odd slices get the inverted checkerboard of the input.

    dimension: index of the dimension to remove; negative values count from
        the end, so the default ``-1`` removes the last one.

    Returns a flat list of ``N * len(lattices)`` lattices in which slice ``i``
    of input ``j`` is stored at position ``j * N + i``.  Inconsistent grids,
    checkerboards or otypes raise ``AssertionError``.
    """
    # work on a list from here on
    if type(lattices) is not list:
        lattices = [lattices]

    # expressions are evaluated to lattices first
    lattices = [gpt.eval(x) for x in lattices]

    batches = len(lattices)
    assert batches > 0

    head, tail = lattices[0], lattices[1:]

    # common grid
    grid = head.grid
    assert all(x.grid.obj == grid.obj for x in tail)

    # resolve negative dimension
    if dimension >= 0:
        assert dimension < grid.nd
    else:
        dimension += grid.nd
        assert dimension >= 0

    # slices per batch and in total
    N = grid.fdimensions[dimension]
    n = N * batches

    # common checkerboard
    cb = head.checkerboard()
    assert all(x.checkerboard() is cb for x in tail)

    # common otype
    otype = head.otype
    assert all(x.otype.__name__ == otype.__name__ for x in tail)

    # target grid; cb_mask tells whether the removed dimension took part in
    # the checkerboarding
    separated_grid = grid.removed_dimension(dimension)
    cb_mask = grid.cb.cb_mask[dimension]

    # allocate slices; parity alternates with the slice index if needed
    separated_lattices = [gpt.lattice(separated_grid, otype) for _ in range(n)]
    for idx, slice_lattice in enumerate(separated_lattices):
        keeps_parity = cb_mask == 0 or (idx % N) % 2 == 0
        slice_lattice.checkerboard(cb if keeps_parity else cb.inv())

    # coordinates of even and odd slices
    coor_even = separated_lattices[0].mview_coordinates()
    if N > 1 and cb_mask == 1:
        coor_odd = separated_lattices[1].mview_coordinates()
    else:
        coor_odd = coor_even
    separated_gcoor = (coor_even, coor_odd)

    # move slice i of every batch in one peek/poke
    for i in range(N):
        dst_coor = separated_gcoor[i % 2]
        src_coor = cgpt.coordinates_inserted_dimension(dst_coor, dimension, [i])
        targets = [separated_lattices[j * N + i] for j in range(batches)]
        gpt.poke(targets, dst_coor, gpt.peek(lattices, src_coor))

    return separated_lattices