def save(self, obj):
    """Append ``obj`` to the checkpoint file ``self.f``.

    Lists are saved element by element, a ``gpt.lattice`` through its
    ``mview()``, and ``float`` / ``complex`` values as packed doubles.  A
    ``memoryview`` is the terminal case: it is written as an 8-byte
    little-endian length, a 4-byte little-endian crc32 and then the raw
    payload.  Any other type trips an assertion.
    """
    kind = type(obj)

    if kind == list:
        for item in obj:
            self.save(item)
        return

    if kind == gpt.lattice:
        self.save(obj.mview())
        return

    if kind == float:
        self.save(memoryview(struct.pack("d", obj)))
        return

    if kind == complex:
        self.save(memoryview(struct.pack("dd", obj.real, obj.imag)))
        return

    if kind != memoryview:
        # unsupported object type
        assert 0
        return

    # raw buffer: size, checksum, payload
    self.f.seek(0, 1)
    nbytes = len(obj)
    size_gb = nbytes / 1024.0**3
    self.f.write(nbytes.to_bytes(8, "little"))

    t_start = gpt.time()
    self.f.write(gpt.crc32(obj).to_bytes(4, "little"))
    t_crc_done = gpt.time()
    self.f.write(obj)
    self.f.flush()
    t_write_done = gpt.time()

    if not self.verbose:
        return

    crc_seconds = t_crc_done - t_start
    write_seconds = t_write_done - t_crc_done
    total_seconds = t_write_done - t_start

    if self.grid is None:
        gpt.message(
            "Checkpoint %g GB on head node at %g GB/s for crc32 and %g GB/s for write in %g s total"
            % (size_gb, size_gb / crc_seconds, size_gb / write_seconds, total_seconds)
        )
    else:
        # report the size summed over the grid
        size_gb = self.grid.globalsum(size_gb)
        gpt.message(
            "Checkpoint %g GB at %g GB/s for crc32 and %g GB/s for write in %g s total"
            % (size_gb, size_gb / crc_seconds, size_gb / write_seconds, total_seconds)
        )
def save(filename, objs, params):
    """Write ``objs`` below the directory ``filename`` and store the index.

    A ``gpt_io`` writer is created for ``filename``; ``create_index`` is
    called with an in-memory text buffer that collects the index.  Rank 0
    then writes the UTF-8 encoded index to ``<filename>/index`` and its crc32
    (upper-case hex, newline terminated) to ``<filename>/index.crc32``.

    Parameters
    ----------
    filename : str
        Target directory prefix.
    objs
        Object passed on to ``gpt_io.create_index``.
    params
        Parameters passed on to the ``gpt_io`` constructor.
    """
    t0 = gpt.time()

    # create io
    x = gpt_io(filename, params, True)

    # create index
    f = io.StringIO("")
    x.create_index(f, "", objs)
    mvidx = memoryview(f.getvalue().encode("utf-8"))

    # write index to fs; context managers make sure both files are closed
    # (and therefore flushed) even if a write fails
    index_crc = gpt.crc32(mvidx)
    if gpt.rank() == 0:
        with open(filename + "/index", "wb") as findex:
            findex.write(mvidx)
        with open(filename + "/index.crc32", "wt") as fcrc:
            fcrc.write("%X\n" % index_crc)

    # close
    x.close()

    # goodbye
    if x.verbose:
        t1 = gpt.time()
        gpt.message("Completed writing %s in %g s" % (filename, t1 - t0))
def write_numpy(self, a):
    """Append numpy array ``a`` to ``self.glb`` and return its byte range.

    The array is serialized with ``numpy.save`` (pickling disabled) and
    written as a 4-byte little-endian crc32 followed by the serialized bytes.
    Returns ``(start, end)`` file offsets, or ``(0, 0)`` when ``self.glb`` is
    ``None``.
    """
    if self.glb is None:
        return 0, 0

    start = self.glb.tell()

    # serialize into memory first so that the checksum covers the exact payload
    stream = io.BytesIO()
    numpy.save(stream, a, allow_pickle=False)
    payload = memoryview(stream.getvalue())

    checksum = gpt.crc32(payload)
    self.glb.write(checksum.to_bytes(4, byteorder="little"))
    self.glb.write(payload)

    end = self.glb.tell()
    return start, end
def read_numpy(self, start, end):
    """Read the numpy array stored in ``self.glb`` between ``start`` and ``end``.

    Rank 0 reads a 4-byte little-endian crc32 followed by ``end - start - 4``
    payload bytes; the payload is then passed through ``gpt.broadcast`` with
    root 0.  The checksum is only compared where the expected value is known
    (rank 0).

    Returns
    -------
    numpy.ndarray
        The array decoded with ``numpy.load``.

    Raises
    ------
    AssertionError
        If the computed crc32 differs from the stored one on rank 0.
    """
    if gpt.rank() == 0:
        self.glb.seek(start, 0)
        crc32_compare = int.from_bytes(self.glb.read(4), byteorder="little")
        data = self.glb.read(end - start - 4)
    else:
        data = None
        crc32_compare = None

    data = gpt.broadcast(0, data)
    crc32_computed = gpt.crc32(memoryview(data))

    if crc32_compare is not None:
        assert crc32_computed == crc32_compare

    # write_numpy stores with allow_pickle=False; be equally explicit when
    # loading so that file content can never trigger unpickling, independent
    # of the numpy version's default
    return numpy.load(io.BytesIO(data), allow_pickle=False)
def flush(self):
    """Write the current in-memory index and its crc32 to disk.

    Does nothing when ``self.index_file`` is ``None`` (read mode).  Otherwise
    the UTF-8 encoded content of ``self.index_file`` is written by rank 0 to
    ``<root>/index`` and its crc32 (upper-case hex, newline terminated) to
    ``<root>/index.crc32``.
    """
    # if we read, no need to flush
    if self.index_file is None:
        return

    # get memoryview of current index
    mvidx = memoryview(self.index_file.getvalue().encode("utf-8"))

    # write index to fs; context managers close the files even if a write
    # raises
    index_crc = gpt.crc32(mvidx)
    if gpt.rank() == 0:
        with open(self.root + "/index", "wb") as f:
            f.write(mvidx)
        with open(self.root + "/index.crc32", "wt") as f:
            f.write("%X\n" % index_crc)
def read_view(self, obj):
    """Try to restore buffer ``obj`` from the checkpoint file ``self.f``.

    If the file holds data beyond the current position, one record is read
    (8-byte length, 4-byte crc32, payload) and copied into ``obj`` when the
    length matches.  Success is agreed upon via ``self.grid.globalsum``.

    Returns ``True`` if all nodes succeeded.  Otherwise the file position is
    reset to where it was on entry and ``False`` is returned.
    """
    start = self.f.tell()
    self.f.seek(0, 2)

    # status[0]: success on this node, status[1]: node count, status[2]: size in GB
    status = numpy.array([0.0, 1.0, 0.0], dtype=numpy.float64)
    t_begin = gpt.time()

    if self.f.tell() != start:
        self.f.seek(start, 0)

        # try to read
        nbytes = int.from_bytes(self.f.read(8), "little")
        status[2] = nbytes / 1024.0**3
        expected_crc = int.from_bytes(self.f.read(4), "little")

        if len(obj) == nbytes:
            payload = self.f.read(nbytes)
            if len(payload) == nbytes:
                obj[:] = payload
                if gpt.crc32(obj) == expected_crc:
                    # flag success on this node
                    status[0] = 1.0

    # compare global
    assert self.grid is not None
    self.grid.globalsum(status)
    t_end = gpt.time()

    # report status
    if self.verbose and status[2] != 0.0:
        if status[0] != status[1]:
            gpt.message(
                "Checkpoint %g GB failed on %g out of %g nodes"
                % (status[2], status[1] - status[0], status[1])
            )
        else:
            elapsed = t_end - t_begin
            gpt.message(
                "Checkpoint %g GB at %g GB/s for crc32 and read combined in %g s total"
                % (status[2], status[2] / elapsed, elapsed)
            )

    # all nodes OK?
    if status[0] == status[1]:
        return True

    # reset position to overwrite corruption
    self.f.seek(start, 0)
    return False
def load(filename, params):
    """Load the content stored below the directory ``filename``.

    The index in ``<filename>/index`` is verified against the crc32 stored in
    ``<filename>/index.crc32`` and then handed to ``gpt_io.read_index``.

    Parameters
    ----------
    filename : str
        Directory prefix; must contain ``index.crc32`` and ``global``.
    params
        Parameters passed on to the ``gpt_io`` constructor.

    Returns
    -------
    The result of ``read_index``; a list of results if the index holds more
    than one chunk.

    Raises
    ------
    NotImplementedError
        If ``filename`` does not look like this file format.
    AssertionError
        If the index checksum does not match.
    """
    # first check if this is right file format
    if not (
        os.path.exists(filename + "/index.crc32")
        and os.path.exists(filename + "/global")
    ):
        raise NotImplementedError()

    # timing
    t0 = gpt.time()

    # create io
    x = gpt_io(filename, False, params)
    if x.verbose:
        gpt.message("Reading %s" % filename)

    # read index; context managers close the files right away
    with open(filename + "/index", "rb") as findex:
        idx = findex.read()
    with open(filename + "/index.crc32", "rt") as fcrc:
        crc_expected = int(fcrc.read(), 16)
    crc_computed = gpt.crc32(memoryview(idx))
    assert crc_expected == crc_computed

    p = index_parser(idx.decode("utf-8", "strict").split("\n"))
    res = x.read_index(p)

    # if multiple chunks are available, return them as a list
    if not p.eof():
        res = [res]
        while not p.eof():
            res.append(x.read_index(p))

    # close
    x.close()

    # goodbye
    if x.verbose:
        t1 = gpt.time()
        gpt.message("Completed reading %s in %g s" % (filename, t1 - t0))

    return res
def load(filename, *a):
    """Load the content stored below the directory ``filename``.

    The index in ``<filename>/index`` is verified against the crc32 stored in
    ``<filename>/index.crc32`` and then handed to ``gpt_io.read_index``.

    Parameters
    ----------
    filename : str
        Directory prefix; must contain ``index.crc32``.
    *a
        Optional; the first extra positional argument is used as the
        parameters for ``gpt_io`` (an empty dict if omitted).

    Returns
    -------
    The result of ``gpt_io.read_index``.

    Raises
    ------
    NotImplementedError
        If ``filename`` does not look like this file format.
    AssertionError
        If the index checksum does not match.
    """
    # first check if this is right file format
    if not os.path.exists(filename + "/index.crc32"):
        raise NotImplementedError()

    # parameters
    params = a[0] if len(a) > 0 else {}

    # timing
    t0 = gpt.time()

    # create io
    x = gpt_io(filename, params, False)
    if x.verbose:
        gpt.message("Reading %s" % filename)

    # read index; context managers close the files right away
    with open(filename + "/index", "rb") as findex:
        idx = findex.read()
    with open(filename + "/index.crc32", "rt") as fcrc:
        crc_expected = int(fcrc.read(), 16)
    crc_computed = gpt.crc32(memoryview(idx))
    assert crc_expected == crc_computed

    p = index_parser(idx.decode("utf-8", "strict").split("\n"))
    res = x.read_index(p)

    # close
    x.close()

    # goodbye
    if x.verbose:
        t1 = gpt.time()
        gpt.message("Completed reading %s in %g s" % (filename, t1 - t0))

    return res
def read_lattice(self, a):
    """Read one lattice described by the index entry ``a``.

    Parameters
    ----------
    a : sequence of str
        ``a[0]`` is the grid description, ``a[1]`` the cartesian-view
        description, ``a[2]`` the lattice description and ``a[3:]`` the file
        offsets (one per view, parsed as integers).

    Returns
    -------
    The lattice created from ``a[2]`` on the grid of ``a[0]``, filled with
    the data of all views handled by this node.

    Raises
    ------
    AssertionError
        If a view's crc32 does not match the stored value, or if a node
        without a file has a non-empty coordinate set.
    """
    g_desc = a[0]
    cv_desc = a[1]
    l_desc = a[2]
    filepos = [int(x) for x in a[3:]]

    # first find grid; grids are cached in params["grids"] by description
    if g_desc not in self.params["grids"]:
        # the original referenced the misspelled name g_cesc here, which
        # raised a NameError for every grid not yet in the cache
        self.params["grids"][g_desc] = gpt.grid(g_desc)
    g = self.params["grids"][g_desc]

    # create a cartesian view and lattice to load
    l = gpt.lattice(g, l_desc)
    cv0 = gpt.cartesian_view(-1, cv_desc, g.fdimensions, g.cb, l.checkerboard())

    # find tasks for my node
    views_for_node = self.views_for_node(cv0, g)

    # performance
    dt_distr, dt_crc, dt_read = 0.0, 0.0, 0.0
    szGB = 0.0
    g.barrier()
    t0 = gpt.time()

    # need to load all views
    for xk, iview in enumerate(views_for_node):
        g.barrier()
        dt_read -= gpt.time()

        f, pos = self.open_view(
            xk, iview, False, cv_desc, g.fdimensions, g.cb, l.checkerboard()
        )

        if f is not None:
            # record layout: tag length, tag, crc32, nd, 8*nd bytes of
            # dimension info, payload size, payload
            f.seek(filepos[iview], 0)
            ntag = int.from_bytes(f.read(4), byteorder="little")
            f.read(ntag)  # not needed if index is present
            crc_exp = int.from_bytes(f.read(4), byteorder="little")
            nd = int.from_bytes(f.read(4), byteorder="little")
            f.read(8 * nd)  # not needed if index is present
            sz = int.from_bytes(f.read(8), byteorder="little")
            data = memoryview(f.read(sz))
            dt_crc -= gpt.time()
            crc_comp = gpt.crc32(data)
            dt_crc += gpt.time()
            assert crc_comp == crc_exp
            sys.stdout.flush()
            szGB += len(data) / 1024.0**3.0
        else:
            assert len(pos) == 0
            data = None

        g.barrier()
        dt_read += gpt.time()
        dt_distr -= gpt.time()
        l[pos] = data
        g.barrier()
        dt_distr += gpt.time()

    g.barrier()
    t1 = gpt.time()

    szGB = g.globalsum(szGB)
    if self.verbose and dt_crc != 0.0:
        gpt.message(
            "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d views per node)"
            % (
                szGB,
                szGB / (t1 - t0),
                szGB / dt_distr,
                szGB / dt_read,
                szGB / dt_crc,
                len(views_for_node),
            )
        )

    # TODO:
    # split grid exposure, allow cgpt_distribute to be given a communicator
    # and take it in importexport.h, add debug info here
    # more benchmarks, useful to create a plan for cgpt_distribute and cache?
    # immutable numpy array returned from coordinates, attach plan
    return l
def write_lattice(self, ctx, l):
    """Write lattice ``l`` to the view files and return its index entry.

    For every view handled by this node that has an open file, a record is
    appended: 4-byte tag length, the NUL-terminated UTF-8 tag built from
    ``ctx``, 4-byte crc32, 4-byte number of dimensions, the grid's
    ``gdimensions`` and ``mpi`` entries (4 bytes each), 8-byte payload size
    and the payload.  All integers are little-endian.

    Returns the grid, view and lattice descriptions followed by the file
    offset of each view's record, joined by single spaces.
    """

    def put_int(fh, value, width):
        # little-endian fixed-width integer
        fh.write(value.to_bytes(width, byteorder="little"))

    g = l.grid
    tag = (ctx + "\0").encode("utf-8")
    ntag = len(tag)
    nd = len(g.fdimensions)

    # create cartesian view for writing
    mpi = self.params["mpi"] if "mpi" in self.params else g.mpi
    cv0 = gpt.cartesian_view(-1, mpi, g.fdimensions, g.cb, l.checkerboard())

    # file positions
    offsets = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

    # describe
    description = " ".join([g.describe(), cv0.describe(), l.describe()])

    # find tasks for my node
    views_for_node = self.views_for_node(cv0, g)

    # performance
    dt_distr, dt_crc, dt_write = 0.0, 0.0, 0.0
    t_begin = gpt.time()
    size_gb = 0.0

    # need to write all views
    for xk, iview in enumerate(views_for_node):
        fh, coords = self.open_view(
            xk, iview, True, mpi, g.fdimensions, g.cb, l.checkerboard()
        )

        # all nodes are needed to communicate
        dt_distr -= gpt.time()
        payload = gpt.mview(l[coords])
        dt_distr += gpt.time()

        if fh is None:
            continue

        # description and data
        dt_crc -= gpt.time()
        checksum = gpt.crc32(payload)
        dt_crc += gpt.time()

        dt_write -= gpt.time()
        offsets[iview] = fh.tell()
        put_int(fh, ntag, 4)
        fh.write(tag)
        put_int(fh, checksum, 4)
        put_int(fh, nd, 4)
        for mu in range(nd):
            put_int(fh, g.gdimensions[mu], 4)
        for mu in range(nd):
            put_int(fh, g.mpi[mu], 4)
        put_int(fh, len(payload), 8)
        fh.write(payload)
        fh.flush()
        dt_write += gpt.time()

        size_gb += len(payload) / 1024.0**3.0

    t_end = gpt.time()

    size_gb = g.globalsum(size_gb)
    if self.verbose and dt_crc != 0.0:
        gpt.message(
            "Wrote %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for checksum, %g GB/s for writing, %d views per node)"
            % (
                size_gb,
                size_gb / (t_end - t_begin),
                size_gb / dt_distr,
                size_gb / dt_crc,
                size_gb / dt_write,
                len(views_for_node),
            )
        )

    g.globalsum(offsets)
    return description + " " + " ".join("%d" % x for x in offsets)
def read_lattice(self, a):
    """Read one lattice described by the index entry ``a``.

    ``a[0]``, ``a[1]`` and ``a[2]`` are the grid, cartesian-view and lattice
    descriptions; ``a[3:]`` are the per-view file offsets.  Grids are looked
    up in (and added to) ``self.params["grids"]``.  For every view a
    dictionary kept in ``self.cache`` is passed along with the coordinates
    when assigning the data to the lattice.

    Returns the filled lattice.  Raises ``AssertionError`` on a crc32
    mismatch.
    """
    g_desc, cv_desc, l_desc = a[0], a[1], a[2]
    filepos = [int(entry) for entry in a[3:]]

    # first find grid
    known_grids = self.params["grids"]
    if g_desc not in known_grids:
        known_grids[g_desc] = gpt.grid_from_description(g_desc)
    g = known_grids[g_desc]

    # create a cartesian view and lattice to load
    l = gpt.lattice(g, l_desc)
    cv0 = gpt.cartesian_view(-1, cv_desc, g.fdimensions, g.cb, l.checkerboard())

    # find tasks for my node
    views_for_node = self.views_for_node(cv0, g)

    # performance
    dt_distr = dt_crc = dt_read = 0.0
    szGB = 0.0
    g.barrier()
    t_begin = gpt.time()

    # need to load all views
    for xk, iview in enumerate(views_for_node):
        g.barrier()
        dt_read -= gpt.time()

        fh, coords = self.open_view(
            xk, iview, False, cv_desc, g.fdimensions, g.cb, l.checkerboard()
        )

        cache_key = f"{a[0:3]}_{g.obj}_{iview}_read"
        if cache_key not in self.cache:
            self.cache[cache_key] = {}

        if fh is None:
            assert len(coords) == 0
            payload = None
        else:
            fh.seek(filepos[iview], 0)
            tag_len = int.from_bytes(fh.read(4), byteorder="little")
            fh.read(tag_len)  # not needed if index is present
            expected_crc = int.from_bytes(fh.read(4), byteorder="little")
            ndim = int.from_bytes(fh.read(4), byteorder="little")
            fh.read(8 * ndim)  # not needed if index is present
            nbytes = int.from_bytes(fh.read(8), byteorder="little")
            payload = memoryview(fh.read(nbytes))

            dt_crc -= gpt.time()
            computed_crc = gpt.crc32(payload)
            dt_crc += gpt.time()
            assert computed_crc == expected_crc

            sys.stdout.flush()
            szGB += len(payload) / 1024.0**3.0

        g.barrier()
        dt_read += gpt.time()

        dt_distr -= gpt.time()
        l[coords, self.cache[cache_key]] = payload
        g.barrier()
        dt_distr += gpt.time()

    g.barrier()
    t_end = gpt.time()

    szGB = g.globalsum(szGB)
    if self.verbose and dt_crc != 0.0:
        rates = (
            szGB,
            szGB / (t_end - t_begin),
            szGB / dt_distr,
            szGB / dt_read,
            szGB / dt_crc,
            len(views_for_node),
        )
        gpt.message(
            "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d views per node)"
            % rates
        )

    return l
def read_lattice_single(self):
    """Read one lattice from the file ``self.path`` using several reader ranks.

    The grid is built from ``self.fdimensions`` and ``self.precision``.
    ``distribute_cartesian_file`` yields the coordinates this rank reads and
    the number of readers.  Each reader loads its slice (located at
    ``bytes_header + g.processor * sz``), computes its crc32 and applies
    ``self.swap``.  The per-reader checksums are combined with
    ``cgpt.util_crc32_combine`` and compared with ``self.crc_exp``.

    Returns
    -------
    The filled lattice.

    Raises
    ------
    RuntimeError
        If ``self.bytes_header`` is negative.
    AssertionError
        If the combined checksum does not equal ``self.crc_exp``.
    """
    if self.bytes_header < 0:
        # the original used a bare ``raise`` here, which outside of an
        # exception handler surfaces as an uninformative RuntimeError; keep
        # the exception type but say what is wrong
        raise RuntimeError("read_lattice_single: invalid header size (bytes_header < 0)")

    # define grid from header
    g = gpt.grid(self.fdimensions, self.precision)
    # create lattice
    l = gpt.lattice(g, self.otype)

    # performance
    dt_distr, dt_crc, dt_read, dt_misc = 0.0, 0.0, 0.0, 0.0
    szGB = 0.0
    crc_comp = 0
    # stays zero for ranks that read nothing, so the bookkeeping below never
    # touches an unbound name
    sz = 0
    g.barrier()
    t0 = gpt.time()

    dt_read -= gpt.time()

    pos, nreader = distribute_cartesian_file(self.fdimensions, g, l.checkerboard())

    if len(pos) > 0:
        f = gpt.FILE(self.path, "rb")
        try:
            sz = self.size * len(pos)
            f.seek(self.bytes_header + g.processor * sz, 0)
            data = memoryview(f.read(sz))
        finally:
            # release the file handle even if seek/read fails
            f.close()

        dt_crc -= gpt.time()
        crc_comp = gpt.crc32(data)
        dt_crc += gpt.time()
        dt_misc -= gpt.time()
        self.swap(data)
        dt_misc += gpt.time()
        szGB += len(data) / 1024.0**3.0
    else:
        data = memoryview(bytearray())

    g.barrier()
    dt_read += gpt.time()

    # collect (size, crc32) of every reader; entries of other ranks stay zero
    # and are filled in by the global sum
    crc_array = numpy.array([0] * (2 * nreader), numpy.uint64)
    if g.processor < nreader:
        crc_array[2 * g.processor + 0] = sz
        crc_array[2 * g.processor + 1] = crc_comp
    g.globalsum(crc_array)
    crc_comp = 0x0
    for i in range(nreader):
        crc_comp = cgpt.util_crc32_combine(
            crc_comp, crc_array[2 * i + 1], crc_array[2 * i + 0]
        )
    # compared as text: upper-case hex, padded to a width of 8
    crc_comp = f"{crc_comp:8X}"
    assert crc_comp == self.crc_exp

    # distributes data accordingly
    dt_distr -= gpt.time()
    l[pos] = data
    g.barrier()
    dt_distr += gpt.time()

    g.barrier()
    t1 = gpt.time()

    szGB = g.globalsum(szGB)
    if self.verbose and dt_crc != 0.0:
        gpt.message(
            "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d nreaders)"
            % (
                szGB,
                szGB / (t1 - t0),
                szGB / dt_distr,
                szGB / dt_read,
                szGB / dt_crc,
                nreader,
            )
        )

    return l
#!/usr/bin/env python3
#
# Authors: Christoph Lehner 2020
#
# Desc.: Illustrate core concepts and features
#
# Compares gpt's sha256 and crc32 with the reference implementations of the
# Python standard library (hashlib, zlib) for a short test string.
#
import gpt
import hashlib
import zlib

test = b"Test this string"

# hexdigest() always yields 64 hex digits; pad the integer formatting
# accordingly so that a digest with leading zero nibbles still compares equal
sha256_comp = "%064x" % gpt.sha256(test)
m = hashlib.sha256()
m.update(test)
sha256_ref = m.hexdigest()
gpt.message(sha256_comp, sha256_ref)
assert sha256_comp == sha256_ref

# both sides are formatted the same way here, so no padding is needed
crc32_comp = "%x" % gpt.crc32(test)
crc32_ref = "%x" % zlib.crc32(test)
gpt.message(crc32_comp, crc32_ref)
assert crc32_comp == crc32_ref

gpt.message("Tests successful")
def read_lattice_single(self):
    """Read one lattice from the single file ``self.path``.

    The grid is built from ``self.fdimensions`` and ``self.precision``.  Rank
    0 reads ``self.size * prod(g.fdimensions)`` bytes after the header,
    checks the crc32 against ``self.crc_exp`` and applies ``self.swap``; all
    other ranks contribute no data.  The data is then assigned to the lattice
    at the coordinates of this rank's cartesian view.

    Returns
    -------
    The filled lattice.

    Raises
    ------
    RuntimeError
        If ``self.bytes_header`` is negative.
    AssertionError
        If the checksum does not equal ``self.crc_exp`` or a rank other than
        0 has a non-empty coordinate set.
    """
    if self.bytes_header < 0:
        # the original used a bare ``raise`` here, which outside of an
        # exception handler surfaces as an uninformative RuntimeError; keep
        # the exception type but say what is wrong
        raise RuntimeError("read_lattice_single: invalid header size (bytes_header < 0)")

    # define grid from header
    g = gpt.grid(self.fdimensions, self.precision)
    # create lattice
    l = gpt.lattice(g, self.otype)

    # performance
    dt_distr, dt_crc, dt_read, dt_misc = 0.0, 0.0, 0.0, 0.0
    szGB = 0.0
    crc_comp = 0
    g.barrier()
    t0 = gpt.time()

    # single file: only rank 0 opens it and reads it all
    g.barrier()
    dt_read -= gpt.time()

    cv = gpt.cartesian_view(
        gpt.rank(), self.cv_desc, g.fdimensions, g.cb, l.checkerboard()
    )
    pos = gpt.coordinates(cv)

    if gpt.rank() == 0:
        f = gpt.FILE(self.path, "rb")
        try:
            f.seek(self.bytes_header, 0)
            sz = self.size * int(numpy.prod(g.fdimensions))
            # bytearray copy: presumably needed so that swap can modify the
            # buffer in place -- TODO confirm
            data = memoryview(bytearray(f.read(sz)))
        finally:
            # release the file handle even if seek/read fails
            f.close()

        dt_crc -= gpt.time()
        crc_comp = gpt.crc32(data)
        # compared as text: upper-case hex, padded to a width of 8
        crc_comp = f"{crc_comp:8X}"
        assert crc_comp == self.crc_exp
        dt_crc += gpt.time()
        dt_misc -= gpt.time()
        self.swap(data)
        dt_misc += gpt.time()
        sys.stdout.flush()
        szGB += len(data) / 1024.0**3.0
    else:
        assert len(pos) == 0
        data = None

    g.barrier()
    dt_read += gpt.time()

    # distributes data accordingly
    dt_distr -= gpt.time()
    l[pos] = data
    g.barrier()
    dt_distr += gpt.time()

    g.barrier()
    t1 = gpt.time()

    szGB = g.globalsum(szGB)
    if self.verbose and dt_crc != 0.0:
        gpt.message(
            "Read %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for reading + checksum, %g GB/s for checksum, %d views per node)"
            % (
                szGB,
                szGB / (t1 - t0),
                szGB / dt_distr,
                szGB / dt_read,
                szGB / dt_crc,
                1,
            )
        )

    return l
def load(filename, params):
    """Load compressed eigenvector data stored below the directory ``filename``.

    Parameters
    ----------
    filename : str
        Directory that contains ``00/0000000000.compressed``,
        ``metadata.txt`` and ``eigen-values.txt``.
    params : dict
        ``params["grids"]`` must be a single-precision ``gpt.grid`` (the fine
        grid).  ``params["nmax"]`` optionally limits the number of vectors
        that are kept; ``params["advise_basis"]`` / ``params["advise_cevec"]``
        are forwarded to ``gpt.advise`` when not ``None``.  The optional
        entry ``"max_read_blocks"`` (default 8) bounds the number of block
        groups read per view.

    Returns
    -------
    tuple
        ``(basis, cevec, ev)``: fine-grid basis vectors, coarse-grid vectors
        and the list of eigenvalues.

    Raises
    ------
    NotImplementedError
        If ``filename`` does not look like this file format.
    AssertionError
        If a consistency or crc32 check fails.
    """
    # first check if this is right file format
    if not os.path.exists(
        filename + "/00/0000000000.compressed"
    ) or not os.path.exists(filename + "/metadata.txt"):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert params["grids"] is not None
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata; the five-component vectors are rotated so that the last
    # entry of the file becomes the first entry here
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if params["nmax"] is not None:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis) for i in range(neigen_max)]
    if params["advise_basis"] is not None:
        gpt.advise(basis, params["advise_basis"])
    if params["advise_cevec"] is not None:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing; the tiny non-zero start values keep the rate computations in
    # the progress messages free of divisions by zero
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # progress of the coarse-grid loop is reported about ten times; for fewer
    # than ten vectors ``neigen // 10`` is zero and the original modulo
    # raised a ZeroDivisionError, so report every vector in that case
    report_stride = max(neigen // 10, 1)

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" % (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(
            v if v is not None else -1, mpi, fdimensions, fgrid.cb, site_cb
        )
        cvc = gpt.cartesian_view(
            v if v is not None else -1, mpi, cgrid.fdimensions, gpt.full, gpt.none
        )
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (
            FP_16_SIZE(nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS)
        )
        coarse_vector_size = (
            coarse_block_size_part_fp32 + coarse_block_size_part_fp16
        ) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2
        gpt.message("Read blocks", blocks)

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single * nsingleCap))
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # data: lattice0_posA lattice1_posA .... lattice0_posB lattice1_posB
                cgpt.munge_inner_outer(data_munged, data, nsingleCap, block_reduce)
                # data_munged: lattice0 lattice1 lattice2 ...
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            rhs = data_munged[0:block_data_size_single]
            distribute_plan = gpt.copy_plan(basis[0], rhs)
            distribute_plan.destination += basis[0].view[pos[b]]
            distribute_plan.source += gpt.global_memory_view(
                fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]]
            )
            rhs = None
            distribute_plan = distribute_plan()
            for i in range(nsingleCap_max):
                distribute_plan(
                    basis[i],
                    data_munged[
                        block_data_size_single * i : block_data_size_single * (i + 1)
                    ],
                )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    )
                )

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap))
                    )
                    globalReadGB = len(data) / 1024.0**3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_munged,
                        data_fp32,
                        nbasis - nsingleCap,
                        block_reduce,
                    )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    rhs = data_munged[0:block_data_size_single]
                    distribute_plan = gpt.copy_plan(basis[0], rhs)
                    distribute_plan.destination += basis[0].view[pos[b]]
                    distribute_plan.source += gpt.global_memory_view(
                        fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]]
                    )
                    rhs = None
                    distribute_plan = distribute_plan()
                    for i in range(nsingleCap, nbasis_max):
                        j = i - nsingleCap
                        distribute_plan(
                            basis[i],
                            data_munged[
                                block_data_size_single
                                * j : block_data_size_single
                                * (j + 1)
                            ],
                        )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        )
                    )

        # coarse grid data
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        distribute_plan = None
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            # every vector is read (it enters the checksum) but only the
            # first neigen_max are kept
            if j < neigen_max:
                if distribute_plan is None:
                    distribute_plan = gpt.copy_plan(cevec[j], data)
                    distribute_plan.destination += cevec[j].view[pos_coarse]
                    distribute_plan.source += gpt.global_memory_view(
                        cgrid, [[cgrid.processor, data, 0, data.nbytes]]
                    )
                    distribute_plan = distribute_plan()
                distribute_plan(cevec[j], data)
            dt_distr += gpt.time()

            if verbose and j % report_stride == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    )
                )

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]
            # done with this view's file; do not keep the handle open
            f.close()

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message(
            "* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB / (t1 - t0))
        )

    # eigenvalues: first non-empty line is the count, the rest are the values
    with open(filename + "/eigen-values.txt") as fev:
        evln = list(filter(lambda x: x != "", fev.read().split("\n")))
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
def save(filename, objs, params):
    """Save eigenvector data in the compressed format below ``filename``.

    Parameters
    ----------
    filename : str
        Target directory; created by rank 0 if necessary.
    objs : sequence
        Exactly three entries: the fine-grid basis vectors, the coarse-grid
        vectors and the eigenvalues.
    params : dict
        ``params["mpi"]`` selects the layout of the written views (the fine
        grid's ``mpi`` is used when it is ``None``).  The optional entry
        ``"nsingle"`` is the number of basis vectors stored in single
        precision (default: all); ``"max_read_blocks"`` (default 8) bounds
        the number of block groups per view.

    Rank 0 writes ``eigen-values.txt`` and ``metadata.txt`` (including one
    crc32 per view); the view data is written to the files named by
    ``get_local_name``.

    Raises
    ------
    AssertionError
        If ``objs`` or the grids do not satisfy the format's requirements.
    """
    # split data to save
    assert len(objs) == 3
    basis = objs[0]
    cevec = objs[1]
    ev = objs[2]

    # verbosity
    verbose = gpt.default.is_verbose("io")
    if verbose:
        gpt.message(
            "Saving %d basis vectors, %d coarse-grid vectors, %d eigenvalues to %s"
            % (len(basis), len(cevec), len(ev), filename)
        )

    # create directory
    if gpt.rank() == 0:
        os.makedirs(filename, exist_ok=True)

    # now sync since only root has created directory
    gpt.barrier()

    # write eigenvalues; the context manager closes the file even if
    # formatting a value fails
    if gpt.rank() == 0:
        with open("%s/eigen-values.txt" % filename, "wt") as f:
            f.write("%d\n" % len(ev))
            for v in ev:
                f.write("%.15E\n" % v)

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # grids
    assert len(basis) > 0
    assert len(cevec) > 0
    fgrid = basis[0].grid
    cgrid = cevec[0].grid

    # mpi layout
    if params["mpi"] is not None:
        mpi = params["mpi"]
    else:
        mpi = fgrid.mpi
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # params
    assert basis[0].checkerboard() == site_cb
    nd = 5
    assert len(fgrid.ldimensions) == nd
    fdimensions = fgrid.fdimensions
    ldimensions = [conformDiv(fdimensions[i], mpi[i]) for i in range(nd)]
    assert fgrid.precision == gpt.single
    s = ldimensions
    b = [conformDiv(fgrid.fdimensions[i], cgrid.fdimensions[i]) for i in range(nd)]
    nb = [conformDiv(s[i], b[i]) for i in range(nd)]
    neigen = len(cevec)
    nbasis = len(basis)
    if "nsingle" in params:
        nsingle = params["nsingle"]
        assert nsingle <= nbasis
    else:
        nsingle = nbasis
    nsingleCap = min([nsingle, nbasis])
    blocks = numpy.prod(nb)
    FP16_COEF_EXP_SHARE_FLOATS = 10

    # write metadata; the five-component vectors are written rotated by one
    # so that entry 0 of the internal order ends up last in the file
    if gpt.rank() == 0:
        fmeta = open("%s/metadata.txt" % filename, "wt")
        for i in range(nd):
            fmeta.write("s[%d] = %d\n" % (i, s[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("b[%d] = %d\n" % (i, b[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("nb[%d] = %d\n" % (i, nb[(i + 1) % nd]))
        fmeta.write("neig = %d\n" % neigen)
        fmeta.write("nkeep = %d\n" % nbasis)
        fmeta.write("nkeep_single = %d\n" % nsingle)
        fmeta.write("blocks = %d\n" % blocks)
        fmeta.write(
            "FP16_COEF_EXP_SHARE_FLOATS = %d\n" % FP16_COEF_EXP_SHARE_FLOATS
        )
        fmeta.flush()  # write crc32 later

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)
    crc32 = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

    # timing; the tiny non-zero start values keep the rate computations in
    # the progress messages free of divisions by zero
    t0 = gpt.time()
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fwrite = 1e-30
    t0 = gpt.time()

    # progress of the coarse-grid loop is reported about ten times; for fewer
    # than ten vectors ``neigen // 10`` is zero and the original modulo
    # raised a ZeroDivisionError, so report every vector in that case
    report_stride = max(neigen // 10, 1)

    # save all views
    if verbose:
        gpt.message("Saving %s with %d views per node" % (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(
            v if v is not None else -1, mpi, fdimensions, fgrid.cb, site_cb
        )
        cvc = gpt.cartesian_view(
            v if v is not None else -1, mpi, cgrid.fdimensions, gpt.full, gpt.none
        )
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)
        if fn is not None:
            os.makedirs(dn, exist_ok=True)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (
            FP_16_SIZE(nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS)
        )
        coarse_vector_size = (
            coarse_block_size_part_fp32 + coarse_block_size_part_fp16
        ) * blocks
        totalSize = (
            blocks
            * (
                block_data_size_single * nsingleCap
                + block_data_size_fp16 * (nbasis - nsingleCap)
            )
            + neigen * coarse_vector_size
        )
        # NOTE(review): the per-write sizes are added to totalSizeGB again
        # further below, so the reported total looks double counted -- confirm
        totalSizeGB += totalSize / 1024.0**3.0 if v is not None else 0.0

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "wb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # single-precision data
        data = memoryview(bytearray(block_data_size_single * nsingleCap))
        data_munged = memoryview(bytearray(block_data_size_single * nsingleCap))

        for b in range(read_blocks):
            fgrid.barrier()
            dt_distr -= gpt.time()
            lhs_size = basis[0].otype.nfloats * 4 * len(pos[b])
            lhs = data_munged[0:lhs_size]
            distribute_plan = gpt.copy_plan(lhs, basis[0])
            distribute_plan.destination += gpt.global_memory_view(
                fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]]
            )
            distribute_plan.source += basis[0].view[pos[b]]
            distribute_plan = distribute_plan()
            lhs = None
            for i in range(nsingleCap):
                distribute_plan(
                    data_munged[
                        block_data_size_single * i : block_data_size_single * (i + 1)
                    ],
                    basis[i],
                )
            dt_distr += gpt.time()

            if f is not None:
                dt_munge -= gpt.time()
                cgpt.munge_inner_outer(
                    data,
                    data_munged,
                    block_reduce,
                    nsingleCap,
                )
                dt_munge += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                    )
                )

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data = memoryview(bytearray(block_data_size_fp16 * (nbasis - nsingleCap)))
            for b in range(read_blocks):
                fgrid.barrier()
                dt_distr -= gpt.time()
                lhs_size = basis[0].otype.nfloats * 4 * len(pos[b])
                lhs = data_munged[0:lhs_size]
                distribute_plan = gpt.copy_plan(lhs, basis[0])
                distribute_plan.destination += gpt.global_memory_view(
                    fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]]
                )
                distribute_plan.source += basis[0].view[pos[b]]
                distribute_plan = distribute_plan()
                lhs = None
                for i in range(nsingleCap, nbasis):
                    j = i - nsingleCap
                    distribute_plan(
                        data_munged[
                            j * block_data_size_single : (j + 1) * block_data_size_single
                        ],
                        basis[i],
                    )
                dt_distr += gpt.time()

                if f is not None:
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_fp32,
                        data_munged,
                        block_reduce,
                        nbasis - nsingleCap,
                    )
                    dt_munge += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp32_to_fp16(data, data_fp32, 24)
                    dt_fp16 += gpt.time()
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()

                fgrid.barrier()
                dt_fwrite -= gpt.time()
                if f is not None:
                    f.write(data)
                    globalWriteGB = len(data) / 1024.0**3.0
                else:
                    globalWriteGB = 0.0
                globalWriteGB = fgrid.globalsum(globalWriteGB)
                dt_fwrite += gpt.time()
                totalSizeGB += globalWriteGB

                if verbose:
                    gpt.message(
                        "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fwrite,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                        )
                    )

        # coarse grid data
        data = memoryview(bytearray(coarse_vector_size))
        data_fp32 = memoryview(bytearray(cevec[0].otype.nfloats * 4 * len(pos_coarse)))
        distribute_plan = gpt.copy_plan(data_fp32, cevec[0])
        distribute_plan.destination += gpt.global_memory_view(
            cgrid, [[cgrid.processor, data_fp32, 0, data_fp32.nbytes]]
        )
        distribute_plan.source += cevec[0].view[pos_coarse]
        distribute_plan = distribute_plan()
        for j in range(neigen):
            fgrid.barrier()
            dt_distr -= gpt.time()
            distribute_plan(data_fp32, cevec[j])
            dt_distr += gpt.time()

            if f is not None:
                dt_fp16 -= gpt.time()
                cgpt.fp32_to_mixed_fp32fp16(
                    data,
                    data_fp32,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose and j % report_stride == 0:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                    )
                )

        # save crc
        crc32[cv.rank] = crc32_comp

        # this view is complete; close its file so the data is on disk before
        # the checksums are recorded in the metadata
        if f is not None:
            f.close()

    # synchronize crc32
    fgrid.globalsum(crc32)

    # timing
    t1 = gpt.time()

    # write crc to metadata
    if gpt.rank() == 0:
        for i in range(len(crc32)):
            fmeta.write("crc32[%d] = %X\n" % (i, crc32[i]))
        fmeta.close()

    # verbosity
    if verbose:
        gpt.message(
            "* save %g GB at %g GB/s" % (totalSizeGB, totalSizeGB / (t1 - t0))
        )