Beispiel #1
0
 def save(self, obj):
     """Serialize ``obj`` into the checkpoint file ``self.f``.

     Lists are saved element by element, a ``gpt.lattice`` through its
     ``mview()``, and ``float`` / ``complex`` values as packed doubles
     (``struct`` formats ``"d"`` and ``"dd"``).  All of these end up in the
     ``memoryview`` case, which writes one record: the payload length
     (8 bytes, little endian), its crc32 (4 bytes, little endian) and then
     the payload itself.  Any other type hits the final ``assert 0``.
     """
     kind = type(obj)

     if kind == list:
         for item in obj:
             self.save(item)
         return

     if kind == gpt.lattice:
         self.save(obj.mview())
         return

     if kind == float:
         self.save(memoryview(struct.pack("d", obj)))
         return

     if kind == complex:
         self.save(memoryview(struct.pack("dd", obj.real, obj.imag)))
         return

     if kind == memoryview:
         # zero-offset relative seek; presumably needed before switching from
         # reading to writing on the same handle -- TODO confirm
         self.f.seek(0, 1)

         nbytes = len(obj)
         size_gb = nbytes / 1024.0**3

         # record layout: length, crc32, payload
         self.f.write(nbytes.to_bytes(8, "little"))
         t_start = gpt.time()
         self.f.write(gpt.crc32(obj).to_bytes(4, "little"))
         t_crc = gpt.time()
         self.f.write(obj)
         self.f.flush()
         t_end = gpt.time()

         if not self.verbose:
             return

         if self.grid is None:
             template = "Checkpoint %g GB on head node at %g GB/s for crc32 and %g GB/s for write in %g s total"
         else:
             # report the size summed over the grid instead of the local one
             size_gb = self.grid.globalsum(size_gb)
             template = "Checkpoint %g GB at %g GB/s for crc32 and %g GB/s for write in %g s total"

         gpt.message(template % (
             size_gb,
             size_gb / (t_crc - t_start),
             size_gb / (t_end - t_crc),
             t_end - t_start,
         ))
         return

     # unsupported type
     assert 0
Beispiel #2
0
def save(filename, objs, params):
    """Write ``objs`` in gpt format below the directory ``filename``.

    The data itself is written through ``gpt_io.create_index``; the textual
    index that call produces is then stored by rank 0 as ``filename/index``
    together with its crc32 (upper-case hex) in ``filename/index.crc32``.

    Parameters:
        filename: target directory path (used as a string prefix).
        objs: object(s) handed to ``create_index``.
        params: parameters forwarded to ``gpt_io``.
    """
    t0 = gpt.time()

    # create io
    x = gpt_io(filename, params, True)

    # create index
    f = io.StringIO("")
    x.create_index(f, "", objs)
    mvidx = memoryview(f.getvalue().encode("utf-8"))

    # write index to fs; the crc is computed on every rank, only rank 0 writes
    index_crc = gpt.crc32(mvidx)
    if gpt.rank() == 0:
        # context managers guarantee the files are flushed and closed
        with open(filename + "/index", "wb") as findex:
            findex.write(mvidx)
        with open(filename + "/index.crc32", "wt") as fcrc:
            fcrc.write("%X\n" % index_crc)

    # close
    x.close()

    # goodbye
    if x.verbose:
        t1 = gpt.time()
        gpt.message("Completed writing %s in %g s" % (filename, t1 - t0))
Beispiel #3
0
 def write_numpy(self, a):
     """Append array ``a`` to ``self.glb`` and return its byte range.

     The array is serialized with ``numpy.save`` (pickling disabled) and
     stored as a 4-byte little-endian crc32 followed by the serialized
     bytes.  Returns ``(start, end)`` file offsets of the record, or
     ``(0, 0)`` when ``self.glb`` is ``None``.
     """
     if self.glb is None:
         return 0, 0

     start = self.glb.tell()

     # serialize into memory first so the checksum can precede the payload
     stream = io.BytesIO()
     numpy.save(stream, a, allow_pickle=False)
     payload = memoryview(stream.getvalue())

     checksum = gpt.crc32(payload)
     self.glb.write(checksum.to_bytes(4, byteorder='little'))
     self.glb.write(payload)

     return start, self.glb.tell()
Beispiel #4
0
 def read_numpy(self, start, end):
     """Read the array stored in ``self.glb`` between ``start`` and ``end``.

     Rank 0 reads the 4-byte little-endian crc32 and the remaining
     ``end - start - 4`` payload bytes; the payload is then broadcast from
     rank 0.  Every rank computes the crc32 of the payload, but only a rank
     that read the stored checksum (rank 0) compares against it.  The
     payload is finally decoded with ``numpy.load``.
     """
     if gpt.rank() != 0:
         # nothing read locally; payload arrives through the broadcast
         data, crc32_compare = None, None
     else:
         self.glb.seek(start, 0)
         crc32_compare = int.from_bytes(self.glb.read(4), byteorder='little')
         data = self.glb.read(end - start - 4)

     data = gpt.broadcast(0, data)
     crc32_computed = gpt.crc32(memoryview(data))

     if crc32_compare is not None:
         assert crc32_computed == crc32_compare

     return numpy.load(io.BytesIO(data))
Beispiel #5
0
    def flush(self):
        """Write the current index and its crc32 below ``self.root``.

        Does nothing when ``self.index_file`` is ``None`` (read mode).
        Otherwise the index text is encoded as UTF-8 and rank 0 stores it in
        ``<root>/index`` and its crc32 (upper-case hex) in
        ``<root>/index.crc32``.
        """
        # if we read, no need to flush
        if self.index_file is None:
            return

        # get memoryview of current index
        mvidx = memoryview(self.index_file.getvalue().encode("utf-8"))

        # write index to fs; the crc is computed on every rank, only rank 0 writes
        index_crc = gpt.crc32(mvidx)
        if gpt.rank() == 0:
            # context managers close the files even if a write fails
            with open(self.root + "/index", "wb") as f:
                f.write(mvidx)
            with open(self.root + "/index.crc32", "wt") as f:
                f.write("%X\n" % index_crc)
Beispiel #6
0
    def read_view(self, obj):
        """Try to fill buffer ``obj`` from the next record in ``self.f``.

        A record is an 8-byte little-endian length, a 4-byte little-endian
        crc32 and the payload.  The local attempt succeeds only if the file
        has data beyond the current position, the stored length equals
        ``len(obj)``, the full payload could be read and its crc32 matches.

        The outcome is combined over ``self.grid`` through ``globalsum`` on a
        three-entry array: [successes, participants, size in GB].  Returns
        ``True`` if the first two entries agree afterwards; otherwise the
        file position is reset to where it was on entry and ``False`` is
        returned.
        """
        start = self.f.tell()

        # jump to the end to find out whether anything is left to read
        self.f.seek(0, 2)
        status = numpy.array([0.0, 1.0, 0.0], dtype=numpy.float64)
        t_begin = gpt.time()

        if self.f.tell() != start:
            self.f.seek(start, 0)

            # try to read
            nbytes = int.from_bytes(self.f.read(8), "little")
            status[2] = nbytes / 1024.0**3
            crc32_expected = int.from_bytes(self.f.read(4), "little")

            if len(obj) == nbytes:
                payload = self.f.read(nbytes)
                if len(payload) == nbytes:
                    obj[:] = payload
                    if gpt.crc32(obj) == crc32_expected:
                        # flag success on this node
                        status[0] = 1.0

        # compare global; the return value is unused, so globalsum presumably
        # updates the array in place -- TODO confirm
        assert self.grid is not None
        self.grid.globalsum(status)
        t_end = gpt.time()

        all_ok = status[0] == status[1]

        # report status
        if self.verbose and status[2] != 0.0:
            if status[0] != status[1]:
                gpt.message("Checkpoint %g GB failed on %g out of %g nodes" %
                            (status[2], status[1] - status[0], status[1]))
            else:
                gpt.message(
                    "Checkpoint %g GB at %g GB/s for crc32 and read combined in %g s total"
                    % (status[2], status[2] / (t_end - t_begin),
                       t_end - t_begin))

        if not all_ok:
            # reset position to overwrite corruption
            self.f.seek(start, 0)
            return False

        return True
Beispiel #7
0
def load(filename, params):
    """Load a gpt-format checkpoint from the directory ``filename``.

    Raises ``NotImplementedError`` unless both ``filename/index.crc32`` and
    ``filename/global`` exist, which signals "not this file format".  The
    index file is verified against its stored crc32 (hex text) before it is
    parsed.

    Returns the object produced by ``read_index``; if the index holds more
    than one chunk, a list with one entry per chunk.
    """
    # first check if this is right file format
    if not (os.path.exists(filename + "/index.crc32")
            and os.path.exists(filename + "/global")):
        raise NotImplementedError()

    # timing
    t0 = gpt.time()

    # create io
    # NOTE(review): arguments are passed as (filename, False, params); confirm
    # this matches the parameter order of gpt_io's constructor
    x = gpt_io(filename, False, params)
    if x.verbose:
        gpt.message("Reading %s" % filename)

    # read index; context managers make sure the handles are closed
    with open(filename + "/index", "rb") as findex:
        idx = findex.read()
    with open(filename + "/index.crc32", "rt") as fcrc:
        crc_expected = int(fcrc.read(), 16)
    crc_computed = gpt.crc32(memoryview(idx))
    assert crc_expected == crc_computed

    p = index_parser(idx.decode("utf-8", "strict").split("\n"))
    res = x.read_index(p)

    # if multiple chunks are available, return them as a list
    if not p.eof():
        res = [res]
        while not p.eof():
            res.append(x.read_index(p))

    # close
    x.close()

    # goodbye
    if x.verbose:
        t1 = gpt.time()
        gpt.message("Completed reading %s in %g s" % (filename, t1 - t0))

    return res
Beispiel #8
0
def load(filename, *a):
    """Load a gpt-format checkpoint from the directory ``filename``.

    Raises ``NotImplementedError`` unless ``filename/index.crc32`` exists,
    which signals "not this file format".  An optional first extra
    positional argument is used as the parameter dictionary; without it an
    empty dictionary is used.  The index file is verified against its stored
    crc32 (hex text) before it is parsed.

    Returns the object produced by ``read_index``.
    """
    # first check if this is right file format
    if not os.path.exists(filename + "/index.crc32"):
        raise NotImplementedError()

    # parameters
    params = a[0] if len(a) != 0 else {}

    # timing
    t0 = gpt.time()

    # create io
    x = gpt_io(filename, params, False)
    if x.verbose:
        gpt.message("Reading %s" % filename)

    # read index; context managers make sure the handles are closed
    with open(filename + "/index", "rb") as findex:
        idx = findex.read()
    with open(filename + "/index.crc32", "rt") as fcrc:
        crc_expected = int(fcrc.read(), 16)
    crc_computed = gpt.crc32(memoryview(idx))
    assert (crc_expected == crc_computed)

    p = index_parser(idx.decode("utf-8", "strict").split("\n"))
    res = x.read_index(p)

    # close
    x.close()

    # goodbye
    if x.verbose:
        t1 = gpt.time()
        gpt.message("Completed reading %s in %g s" % (filename, t1 - t0))

    return res
Beispiel #9
0
    def read_lattice(self, a):
        """Read one lattice described by the index entry ``a``.

        ``a`` holds, in order: the grid description, the cartesian-view
        description, the lattice description, and then one file offset per
        view.  Grids are created on demand and kept in
        ``self.params["grids"]`` keyed by their description.

        For every view assigned to this node the record is read from the
        view file (tag, crc32, dimension block, payload), the payload crc32
        is checked, and the data is assigned into the lattice.  Returns the
        filled lattice.
        """
        g_desc = a[0]
        cv_desc = a[1]
        l_desc = a[2]
        filepos = [int(x) for x in a[3:]]

        # first find grid (create and remember it if not yet known)
        if g_desc not in self.params["grids"]:
            self.params["grids"][g_desc] = gpt.grid(g_desc)
        g = self.params["grids"][g_desc]

        # create a cartesian view and lattice to load
        l = gpt.lattice(g, l_desc)
        cv0 = gpt.cartesian_view(-1, cv_desc, g.fdimensions, g.cb,
                                 l.checkerboard())

        # find tasks for my node
        views_for_node = self.views_for_node(cv0, g)

        # performance
        dt_distr, dt_crc, dt_read = 0.0, 0.0, 0.0
        szGB = 0.0
        g.barrier()
        t0 = gpt.time()

        # need to load all views
        for xk, iview in enumerate(views_for_node):
            g.barrier()
            dt_read -= gpt.time()

            f, pos = self.open_view(xk, iview, False, cv_desc, g.fdimensions,
                                    g.cb, l.checkerboard())

            if f is not None:
                # record layout: ntag, tag, crc32, nd, 8*nd bytes, size, data
                f.seek(filepos[iview], 0)
                ntag = int.from_bytes(f.read(4), byteorder='little')
                f.read(ntag)  # not needed if index is present
                crc_exp = int.from_bytes(f.read(4), byteorder='little')
                nd = int.from_bytes(f.read(4), byteorder='little')
                f.read(8 * nd)  # not needed if index is present
                sz = int.from_bytes(f.read(8), byteorder='little')
                data = memoryview(f.read(sz))
                dt_crc -= gpt.time()
                crc_comp = gpt.crc32(data)
                dt_crc += gpt.time()
                assert (crc_comp == crc_exp)
                sys.stdout.flush()
                szGB += len(data) / 1024.**3.
            else:
                # no file for this view on this node: nothing to contribute
                assert (len(pos) == 0)
                data = None

            g.barrier()
            dt_read += gpt.time()
            dt_distr -= gpt.time()
            # executed on every node, including those without data
            l[pos] = data
            g.barrier()
            dt_distr += gpt.time()

        g.barrier()
        t1 = gpt.time()

        szGB = g.globalsum(szGB)
        if self.verbose and dt_crc != 0.0:
            gpt.message(
                "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d views per node)"
                % (szGB, szGB / (t1 - t0), szGB / dt_distr, szGB / dt_read,
                   szGB / dt_crc, len(views_for_node)))

        # TODO:
        # split grid exposure, allow cgpt_distribute to be given a communicator
        # and take it in importexport.h, add debug info here
        # more benchmarks, useful to create a plan for cgpt_distribute and cache? immutable numpy array returned from coordinates, attach plan

        return l
Beispiel #10
0
    def write_lattice(self, ctx, l):
        """Write lattice ``l`` to the view files and return its index entry.

        Each view handled by this node is written as one record: tag length,
        NUL-terminated UTF-8 tag built from ``ctx``, crc32 of the payload,
        number of dimensions, the grid's ``gdimensions`` and ``mpi`` entries
        (4 bytes each), payload length (8 bytes) and the payload.  All
        integers are little endian.

        The returned string is the grid, view and lattice descriptions
        followed by the file offset of every view, separated by spaces.
        """
        grid = l.grid
        tag = (ctx + "\0").encode("utf-8")
        ntag = len(tag)
        nd = len(grid.fdimensions)

        def le(value, nbytes):
            # little-endian encoding used for every integer field
            return value.to_bytes(nbytes, byteorder='little')

        # create cartesian view for writing; "mpi" parameter overrides the grid
        mpi = self.params["mpi"] if "mpi" in self.params else grid.mpi
        cv0 = gpt.cartesian_view(-1, mpi, grid.fdimensions, grid.cb,
                                 l.checkerboard())

        # file positions, one slot per view
        pos = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

        # describe
        res = " ".join([grid.describe(), cv0.describe(), l.describe()])

        # find tasks for my node
        views_for_node = self.views_for_node(cv0, grid)

        # performance
        dt_distr, dt_crc, dt_write = 0.0, 0.0, 0.0
        t0 = gpt.time()
        szGB = 0.0

        # need to write all views
        for xk, iview in enumerate(views_for_node):

            f, p = self.open_view(xk, iview, True, mpi, grid.fdimensions,
                                  grid.cb, l.checkerboard())

            # all nodes are needed to communicate
            dt_distr -= gpt.time()
            mv = gpt.mview(l[p])
            dt_distr += gpt.time()

            # only nodes that own a file for this view write
            if f is None:
                continue

            # checksum
            dt_crc -= gpt.time()
            crc = gpt.crc32(mv)
            dt_crc += gpt.time()

            # description and data
            dt_write -= gpt.time()
            pos[iview] = f.tell()
            f.write(le(ntag, 4))
            f.write(tag)
            f.write(le(crc, 4))
            f.write(le(nd, 4))
            for mu in range(nd):
                f.write(le(grid.gdimensions[mu], 4))
            for mu in range(nd):
                f.write(le(grid.mpi[mu], 4))
            f.write(le(len(mv), 8))
            f.write(mv)
            f.flush()
            dt_write += gpt.time()
            szGB += len(mv) / 1024.0**3.0

        t1 = gpt.time()

        szGB = grid.globalsum(szGB)
        if self.verbose and dt_crc != 0.0:
            gpt.message(
                "Wrote %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for checksum, %g GB/s for writing, %d views per node)"
                % (szGB, szGB / (t1 - t0), szGB / dt_distr, szGB / dt_crc,
                   szGB / dt_write, len(views_for_node)))

        # combine offsets over the grid; result is used from ``pos`` itself,
        # so globalsum presumably works in place -- TODO confirm
        grid.globalsum(pos)
        return res + " " + " ".join(["%d" % offset for offset in pos])
Beispiel #11
0
    def read_lattice(self, a):
        """Read one lattice described by the index entry ``a``.

        ``a`` holds the grid description, the cartesian-view description and
        the lattice description, followed by one file offset per view.
        Grids are created on demand via ``gpt.grid_from_description`` and
        kept in ``self.params["grids"]``.

        For each view of this node the record is read, its payload crc32 is
        checked and the data is assigned into the lattice, passing a
        per-view dictionary from ``self.cache`` along with the coordinates.
        Returns the filled lattice.
        """
        g_desc, cv_desc, l_desc = a[0], a[1], a[2]
        filepos = [int(offset) for offset in a[3:]]

        # first find grid
        grids = self.params["grids"]
        if g_desc not in grids:
            grids[g_desc] = gpt.grid_from_description(g_desc)
        grid = grids[g_desc]

        # create a cartesian view and lattice to load
        l = gpt.lattice(grid, l_desc)
        cv0 = gpt.cartesian_view(-1, cv_desc, grid.fdimensions, grid.cb,
                                 l.checkerboard())

        # find tasks for my node
        views_for_node = self.views_for_node(cv0, grid)

        # performance
        dt_distr = 0.0
        dt_crc = 0.0
        dt_read = 0.0
        szGB = 0.0
        grid.barrier()
        t0 = gpt.time()

        # need to load all views
        for xk, iview in enumerate(views_for_node):
            grid.barrier()
            dt_read -= gpt.time()

            f, pos = self.open_view(xk, iview, False, cv_desc,
                                    grid.fdimensions, grid.cb,
                                    l.checkerboard())

            # one cache dictionary per (entry, grid object, view)
            cache_key = f"{a[0:3]}_{grid.obj}_{iview}_read"
            if cache_key not in self.cache:
                self.cache[cache_key] = {}

            if f is None:
                # no file for this view on this node
                assert len(pos) == 0
                data = None
            else:
                # record layout: ntag, tag, crc32, nd, 8*nd bytes, size, data
                f.seek(filepos[iview], 0)
                ntag = int.from_bytes(f.read(4), byteorder="little")
                f.read(ntag)  # not needed if index is present
                crc_exp = int.from_bytes(f.read(4), byteorder="little")
                nd = int.from_bytes(f.read(4), byteorder="little")
                f.read(8 * nd)  # not needed if index is present
                sz = int.from_bytes(f.read(8), byteorder="little")
                data = memoryview(f.read(sz))

                dt_crc -= gpt.time()
                crc_comp = gpt.crc32(data)
                dt_crc += gpt.time()
                assert crc_comp == crc_exp

                sys.stdout.flush()
                szGB += len(data) / 1024.0**3.0

            grid.barrier()
            dt_read += gpt.time()

            dt_distr -= gpt.time()
            l[pos, self.cache[cache_key]] = data
            grid.barrier()
            dt_distr += gpt.time()

        grid.barrier()
        t1 = gpt.time()

        szGB = grid.globalsum(szGB)
        if self.verbose and dt_crc != 0.0:
            report = (
                szGB,
                szGB / (t1 - t0),
                szGB / dt_distr,
                szGB / dt_read,
                szGB / dt_crc,
                len(views_for_node),
            )
            gpt.message(
                "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d views per node)"
                % report)

        return l
Beispiel #12
0
    def read_lattice_single(self):
        """Read the single lattice stored in ``self.path`` using several readers.

        The grid is built from ``self.fdimensions`` and ``self.precision``.
        ``distribute_cartesian_file`` yields the coordinates this rank reads
        and the number of readers; a rank with coordinates reads its
        contiguous slice (located by ``g.processor``) after the header.  The
        per-reader crc32 values are combined with
        ``cgpt.util_crc32_combine`` and compared against ``self.crc_exp``
        before the data is assigned into the lattice.

        Raises:
            RuntimeError: if ``self.bytes_header`` is negative.
        """
        if self.bytes_header < 0:
            # a bare ``raise`` here would only produce a generic
            # "no active exception" RuntimeError; keep the type, add a message
            raise RuntimeError(
                "read_lattice_single: invalid header size (bytes_header < 0)")

        # define grid from header
        g = gpt.grid(self.fdimensions, self.precision)
        # create lattice
        l = gpt.lattice(g, self.otype)

        # performance
        dt_distr, dt_crc, dt_read, dt_misc = 0.0, 0.0, 0.0, 0.0
        szGB = 0.0
        crc_comp = 0
        g.barrier()
        t0 = gpt.time()
        dt_read -= gpt.time()

        pos, nreader = distribute_cartesian_file(self.fdimensions, g, l.checkerboard())

        if len(pos) > 0:
            f = gpt.FILE(self.path, "rb")
            sz = self.size * len(pos)
            f.seek(self.bytes_header + g.processor * sz, 0)
            data = memoryview(f.read(sz))
            f.close()

            dt_crc -= gpt.time()
            crc_comp = gpt.crc32(data)
            dt_crc += gpt.time()

            dt_misc -= gpt.time()
            self.swap(data)
            dt_misc += gpt.time()

            szGB += len(data) / 1024.0 ** 3.0
        else:
            data = memoryview(bytearray())

        g.barrier()
        dt_read += gpt.time()

        # (size, crc32) pair per reader, combined over the grid by globalsum
        crc_array = numpy.array([0] * (2 * nreader), numpy.uint64)
        if g.processor < nreader:
            crc_array[2 * g.processor + 0] = sz
            crc_array[2 * g.processor + 1] = crc_comp
        g.globalsum(crc_array)
        crc_comp = 0x0
        for i in range(nreader):
            crc_comp = cgpt.util_crc32_combine(
                crc_comp, crc_array[2 * i + 1], crc_array[2 * i + 0]
            )
        # the expected checksum is compared as text in this format
        crc_comp = f"{crc_comp:8X}"
        assert crc_comp == self.crc_exp

        # distributes data accordingly
        dt_distr -= gpt.time()
        l[pos] = data
        g.barrier()
        dt_distr += gpt.time()

        g.barrier()
        t1 = gpt.time()

        szGB = g.globalsum(szGB)
        if self.verbose and dt_crc != 0.0:
            gpt.message(
                "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d nreaders)"
                % (
                    szGB,
                    szGB / (t1 - t0),
                    szGB / dt_distr,
                    szGB / dt_read,
                    szGB / dt_crc,
                    nreader,
                )
            )
        return l
Beispiel #13
0
#!/usr/bin/env python3
#
# Authors: Christoph Lehner 2020
#
# Desc.: Check gpt.sha256 and gpt.crc32 against hashlib and zlib
#
import gpt
import hashlib
import zlib

# input shared by both checksum comparisons
test = b"Test this string"

# sha256: gpt result (formatted as hex) against hashlib
sha256_comp = "%x" % gpt.sha256(test)
sha256_ref = hashlib.sha256(test).hexdigest()
gpt.message(sha256_comp, sha256_ref)
assert sha256_comp == sha256_ref

# crc32: gpt result against zlib, both formatted as hex
crc32_comp = "%x" % gpt.crc32(test)
crc32_ref = format(zlib.crc32(test), "x")
gpt.message(crc32_comp, crc32_ref)
assert crc32_comp == crc32_ref

gpt.message("Tests successful")
Beispiel #14
0
    def read_lattice_single(self):
        """Read the single lattice stored in ``self.path`` through rank 0.

        The grid is built from ``self.fdimensions`` and ``self.precision``.
        Rank 0 reads the full payload after the header, checks its crc32
        (formatted as text) against ``self.crc_exp`` and passes it through
        ``self.swap``; all other ranks contribute no data.  The data is then
        assigned into the lattice, which is returned.

        Raises:
            RuntimeError: if ``self.bytes_header`` is negative.
        """
        if self.bytes_header < 0:
            # a bare ``raise`` here would only produce a generic
            # "no active exception" RuntimeError; keep the type, add a message
            raise RuntimeError(
                "read_lattice_single: invalid header size (bytes_header < 0)")

        # define grid from header
        g = gpt.grid(self.fdimensions, self.precision)
        # create lattice
        l = gpt.lattice(g, self.otype)

        # performance
        dt_distr, dt_crc, dt_read, dt_misc = 0.0, 0.0, 0.0, 0.0
        szGB = 0.0
        crc_comp = 0
        g.barrier()
        t0 = gpt.time()

        # single file: only rank 0 opens it and reads the full payload
        g.barrier()
        dt_read -= gpt.time()

        cv = gpt.cartesian_view(gpt.rank(), self.cv_desc, g.fdimensions, g.cb,
                                l.checkerboard())
        pos = gpt.coordinates(cv)

        if gpt.rank() == 0:
            f = gpt.FILE(self.path, "rb")
            f.seek(self.bytes_header, 0)
            sz = self.size * int(numpy.prod(g.fdimensions))
            # bytearray copy gives self.swap a writable buffer
            data = memoryview(bytearray(f.read(sz)))
            f.close()

            dt_crc -= gpt.time()
            crc_comp = gpt.crc32(data)
            # the expected checksum is compared as text in this format
            crc_comp = f"{crc_comp:8X}"
            assert crc_comp == self.crc_exp
            dt_crc += gpt.time()

            dt_misc -= gpt.time()
            self.swap(data)
            dt_misc += gpt.time()

            sys.stdout.flush()
            szGB += len(data) / 1024.0**3.0
        else:
            # the view is expected to assign no coordinates to other ranks
            assert len(pos) == 0
            data = None

        g.barrier()
        dt_read += gpt.time()

        # distributes data accordingly
        dt_distr -= gpt.time()
        l[pos] = data
        g.barrier()
        dt_distr += gpt.time()

        g.barrier()
        t1 = gpt.time()

        szGB = g.globalsum(szGB)
        if self.verbose and dt_crc != 0.0:
            gpt.message(
                "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d views per node)"
                % (
                    szGB,
                    szGB / (t1 - t0),
                    szGB / dt_distr,
                    szGB / dt_read,
                    szGB / dt_crc,
                    1,
                ))
        return l
Beispiel #15
0
def load(filename, params):
    """Load compressed eigenvector data from the directory ``filename``.

    The directory must contain ``00/0000000000.compressed`` and
    ``metadata.txt``; otherwise ``NotImplementedError`` is raised to signal
    "not this file format".

    Parameters read from ``params``:
        "grids": a single ``gpt.grid`` in single precision (the fine grid).
        "nmax": optional cap on the number of basis / coarse vectors kept.
        "advise_basis", "advise_cevec": optional arguments for ``gpt.advise``.
        "max_read_blocks": looked up through ``get_param`` with default 8.

    Each view file is read in three sections (single-precision basis data,
    fp16 basis data, coarse-grid vectors) while a running crc32 is
    accumulated and finally compared against the value from the metadata.

    Returns:
        Tuple ``(basis, cevec, ev)``: fine-grid basis lattices, coarse-grid
        vectors and the eigenvalues read from ``eigen-values.txt``.
    """
    # first check if this is right file format
    if not os.path.exists(filename + "/00/0000000000.compressed"
                          ) or not os.path.exists(filename + "/metadata.txt"):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert params["grids"] is not None
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata; vectors are reordered so that index 4 comes first
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if params["nmax"] is not None:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis) for i in range(neigen_max)]
    if params["advise_basis"] is not None:
        gpt.advise(basis, params["advise_basis"])
    if params["advise_cevec"] is not None:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing; accumulators start at a tiny positive value so the rate
    # divisions in the progress messages never divide by zero
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" %
                    (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2,
                                          24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum, accumulated over all reads of this view file
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb,
                                        "canonicalOdd") for b in range(blocks)
        ]

        # group blocks: merge neighbouring pairs until at most
        # max_read_blocks remain (or the count becomes odd)
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2
        gpt.message("Read blocks", blocks)

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer used on nodes without a file for this view
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single *
                                           nsingleCap))
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # data: lattice0_posA lattice1_posA .... lattice0_posB lattice1_posB
                cgpt.munge_inner_outer(data_munged, data, nsingleCap,
                                       block_reduce)
                # data_munged: lattice0 lattice1 lattice2 ...
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            # build the copy plan once per block from the first slice and
            # reuse it for every basis vector
            rhs = data_munged[0:block_data_size_single]
            distribute_plan = gpt.copy_plan(basis[0], rhs)
            distribute_plan.destination += basis[0].view[pos[b]]
            distribute_plan.source += gpt.global_memory_view(
                fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]])
            rhs = None
            distribute_plan = distribute_plan()
            for i in range(nsingleCap_max):
                distribute_plan(
                    basis[i],
                    data_munged[block_data_size_single *
                                i:block_data_size_single * (i + 1)],
                )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    ))

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap)))
                    globalReadGB = len(data) / 1024.0**3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_munged,
                        data_fp32,
                        nbasis - nsingleCap,
                        block_reduce,
                    )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                # the file is always read in full (needed for the crc), but
                # data is only distributed for vectors that are kept
                if nsingleCap < nbasis_max:
                    rhs = data_munged[0:block_data_size_single]
                    distribute_plan = gpt.copy_plan(basis[0], rhs)
                    distribute_plan.destination += basis[0].view[pos[b]]
                    distribute_plan.source += gpt.global_memory_view(
                        fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]])
                    rhs = None
                    distribute_plan = distribute_plan()
                    for i in range(nsingleCap, nbasis_max):
                        j = i - nsingleCap
                        distribute_plan(
                            basis[i],
                            data_munged[block_data_size_single *
                                        j:block_data_size_single * (j + 1)],
                        )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        ))

        # coarse grid data
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        distribute_plan = None
        # report roughly ten times over the loop; the interval is at least 1
        # so that fewer than ten eigenvectors do not cause a modulo by zero
        report_every = max(1, neigen // 10)
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                # the plan is created on first use and reused afterwards
                if distribute_plan is None:
                    distribute_plan = gpt.copy_plan(cevec[j], data)
                    distribute_plan.destination += cevec[j].view[pos_coarse]
                    distribute_plan.source += gpt.global_memory_view(
                        cgrid, [[cgrid.processor, data, 0, data.nbytes]])
                    distribute_plan = distribute_plan()
                distribute_plan(cevec[j], data)
            dt_distr += gpt.time()

            if verbose and j % report_every == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    ))

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message("* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB /
                                                 (t1 - t0)))

    # eigenvalues: first non-empty line is the count, the rest the values
    with open(filename + "/eigen-values.txt") as fev:
        evln = list(filter(lambda x: x != "", fev.read().split("\n")))
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
Beispiel #16
0
def save(filename, objs, params):
    """Save basis vectors, coarse-grid vectors and eigenvalues below ``filename``.

    The directory ``filename`` is created (by rank 0) and populated with

    * ``eigen-values.txt``: the number of eigenvalues followed by one value
      per line,
    * ``metadata.txt``: layout information and one crc32 per cartesian-view
      rank,
    * one binary data file per cartesian view, at the path returned by
      ``get_local_name``.

    Each data file holds, in this order: the first ``nsingle`` basis vectors
    in single precision, the remaining basis vectors converted to fp16, and
    all coarse-grid vectors in a mixed fp32/fp16 encoding.

    Args:
        filename: Target directory.
        objs: Sequence of exactly three items ``(basis, cevec, ev)``.  The
            basis must be non-empty and live on a five-dimensional,
            single-precision fine grid with odd checkerboard; ``cevec`` must
            be non-empty.
        params: Mapping of options.  ``params["mpi"]`` is always read (``None``
            selects ``fgrid.mpi``) and its first entry must be 1.  Optional
            keys: ``"nsingle"`` (number of basis vectors kept in single
            precision, defaults to all of them) and ``"max_read_blocks"``
            (upper bound on the number of block groups per view, default 8).

    Raises:
        AssertionError: If any of the layout requirements above is violated.
    """

    # split data to save
    assert len(objs) == 3
    basis = objs[0]
    cevec = objs[1]
    ev = objs[2]

    # verbosity
    verbose = gpt.default.is_verbose("io")
    if verbose:
        gpt.message(
            "Saving %d basis vectors, %d coarse-grid vectors, %d eigenvalues to %s"
            % (len(basis), len(cevec), len(ev), filename))

    # create directory
    if gpt.rank() == 0:
        os.makedirs(filename, exist_ok=True)

    # now sync since only root has created directory
    gpt.barrier()

    # write eigenvalues; the context manager closes the file even if a
    # write fails
    if gpt.rank() == 0:
        with open("%s/eigen-values.txt" % filename, "wt") as fev:
            fev.write("%d\n" % len(ev))
            for value in ev:
                fev.write("%.15E\n" % value)

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # grids
    assert len(basis) > 0
    assert len(cevec) > 0
    fgrid = basis[0].grid
    cgrid = cevec[0].grid

    # mpi layout
    if params["mpi"] is not None:
        mpi = params["mpi"]
    else:
        mpi = fgrid.mpi
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # params
    assert basis[0].checkerboard() == site_cb
    nd = 5
    assert len(fgrid.ldimensions) == nd
    fdimensions = fgrid.fdimensions
    ldimensions = [conformDiv(fdimensions[i], mpi[i]) for i in range(nd)]
    assert fgrid.precision == gpt.single
    s = ldimensions
    # b: fine sites per coarse site in each direction, nb: blocks per direction
    b = [
        conformDiv(fgrid.fdimensions[i], cgrid.fdimensions[i])
        for i in range(nd)
    ]
    nb = [conformDiv(s[i], b[i]) for i in range(nd)]
    neigen = len(cevec)
    nbasis = len(basis)
    if "nsingle" in params:
        nsingle = params["nsingle"]
        assert nsingle <= nbasis
    else:
        nsingle = nbasis
    nsingleCap = min([nsingle, nbasis])
    blocks = numpy.prod(nb)
    FP16_COEF_EXP_SHARE_FLOATS = 10

    # write metadata; the per-direction entries are written rotated by one,
    # i.e. entry i holds the value of direction (i + 1) % nd
    if gpt.rank() == 0:
        fmeta = open("%s/metadata.txt" % filename, "wt")
        for i in range(nd):
            fmeta.write("s[%d] = %d\n" % (i, s[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("b[%d] = %d\n" % (i, b[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("nb[%d] = %d\n" % (i, nb[(i + 1) % nd]))
        fmeta.write("neig = %d\n" % neigen)
        fmeta.write("nkeep = %d\n" % nbasis)
        fmeta.write("nkeep_single = %d\n" % nsingle)
        fmeta.write("blocks = %d\n" % blocks)
        fmeta.write("FP16_COEF_EXP_SHARE_FLOATS = %d\n" %
                    FP16_COEF_EXP_SHARE_FLOATS)
        fmeta.flush()  # write crc32 later

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)
    crc32 = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

    # timing; the dt_* accumulators start at a tiny positive value so that
    # the rate computations below never divide by zero
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fwrite = 1e-30
    t0 = gpt.time()

    # progress of the coarse-vector loop is reported roughly ten times;
    # clamp the stride to 1 so that fewer than ten vectors do not trigger a
    # modulo-by-zero
    report_stride = max(neigen // 10, 1)

    # save all views
    if verbose:
        gpt.message("Saving %s with %d views per node" %
                    (filename, len(views)))

    for v in views:
        # a view of None is mapped to -1 when building the cartesian views
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)
        if fn is not None:
            os.makedirs(dn, exist_ok=True)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2,
                                          24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        totalSize = (
            blocks *
            (block_data_size_single * nsingleCap + block_data_size_fp16 *
             (nbasis - nsingleCap)) + neigen * coarse_vector_size)
        # NOTE(review): the expected size is added here and every write below
        # adds the globally summed written size again; this looks like double
        # counting in the reported total -- confirm before changing
        totalSizeGB += totalSize / 1024.0**3.0 if v is not None else 0.0

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "wb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, blk, nb,
                                        "canonicalOdd")
            for blk in range(blocks)
        ]

        # group blocks: merge neighbouring coordinate lists pairwise until at
        # most max_read_blocks groups remain or the count becomes odd
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # single-precision data
        data = memoryview(bytearray(block_data_size_single * nsingleCap))
        data_munged = memoryview(bytearray(block_data_size_single *
                                           nsingleCap))

        for blk in range(read_blocks):
            fgrid.barrier()
            dt_distr -= gpt.time()
            # build the copy plan once per block group from basis[0] and
            # reuse it for every basis vector
            lhs_size = basis[0].otype.nfloats * 4 * len(pos[blk])
            lhs = data_munged[0:lhs_size]
            distribute_plan = gpt.copy_plan(lhs, basis[0])
            distribute_plan.destination += gpt.global_memory_view(
                fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]])
            distribute_plan.source += basis[0].view[pos[blk]]
            distribute_plan = distribute_plan()
            lhs = None
            for i in range(nsingleCap):
                distribute_plan(
                    data_munged[block_data_size_single *
                                i:block_data_size_single * (i + 1)],
                    basis[i],
                )
            dt_distr += gpt.time()

            if f is not None:
                dt_munge -= gpt.time()
                cgpt.munge_inner_outer(
                    data,
                    data_munged,
                    block_reduce,
                    nsingleCap,
                )
                dt_munge += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            # every rank takes part in the sum, including those without a file
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                    ))

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data = memoryview(
                bytearray(block_data_size_fp16 * (nbasis - nsingleCap)))
            for blk in range(read_blocks):
                fgrid.barrier()
                dt_distr -= gpt.time()
                lhs_size = basis[0].otype.nfloats * 4 * len(pos[blk])
                lhs = data_munged[0:lhs_size]
                distribute_plan = gpt.copy_plan(lhs, basis[0])
                distribute_plan.destination += gpt.global_memory_view(
                    fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]])
                distribute_plan.source += basis[0].view[pos[blk]]
                distribute_plan = distribute_plan()
                lhs = None
                for i in range(nsingleCap, nbasis):
                    # slot index within the fp16 part of the basis
                    j = i - nsingleCap
                    distribute_plan(
                        data_munged[j * block_data_size_single:(j + 1) *
                                    block_data_size_single],
                        basis[i],
                    )
                dt_distr += gpt.time()

                if f is not None:
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_fp32,
                        data_munged,
                        block_reduce,
                        nbasis - nsingleCap,
                    )
                    dt_munge += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp32_to_fp16(data, data_fp32, 24)
                    dt_fp16 += gpt.time()
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()

                fgrid.barrier()
                dt_fwrite -= gpt.time()
                if f is not None:
                    f.write(data)
                    globalWriteGB = len(data) / 1024.0**3.0
                else:
                    globalWriteGB = 0.0
                globalWriteGB = fgrid.globalsum(globalWriteGB)
                dt_fwrite += gpt.time()
                totalSizeGB += globalWriteGB

                if verbose:
                    gpt.message(
                        "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fwrite,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                        ))

        # coarse grid data; the copy plan is built once from cevec[0] and
        # reused for every coarse vector
        data = memoryview(bytearray(coarse_vector_size))
        data_fp32 = memoryview(
            bytearray(cevec[0].otype.nfloats * 4 * len(pos_coarse)))
        distribute_plan = gpt.copy_plan(data_fp32, cevec[0])
        distribute_plan.destination += gpt.global_memory_view(
            cgrid, [[cgrid.processor, data_fp32, 0, data_fp32.nbytes]])
        distribute_plan.source += cevec[0].view[pos_coarse]
        distribute_plan = distribute_plan()
        for j in range(neigen):
            fgrid.barrier()
            dt_distr -= gpt.time()
            distribute_plan(data_fp32, cevec[j])
            dt_distr += gpt.time()

            if f is not None:
                dt_fp16 -= gpt.time()
                cgpt.fp32_to_mixed_fp32fp16(
                    data,
                    data_fp32,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose and j % report_stride == 0:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                    ))

        # save crc
        crc32[cv.rank] = crc32_comp

    # synchronize crc32
    fgrid.globalsum(crc32)

    # timing
    t1 = gpt.time()

    # write crc to metadata
    if gpt.rank() == 0:
        for i in range(len(crc32)):
            fmeta.write("crc32[%d] = %X\n" % (i, crc32[i]))
        fmeta.close()

    # verbosity
    if verbose:
        gpt.message("* save %g GB at %g GB/s" % (totalSizeGB, totalSizeGB /
                                                 (t1 - t0)))