import os

import numpy

import cgpt
import gpt


def split_lattices(lattices, lcoor, gcoor, split_grid, N):
    # N is the desired number of parallel split lattices per unsplit lattice
    # 1 <= N <= sranks, sranks % N == 0
    assert len(lcoor) == len(gcoor)

    # all lattices need to live on the same grid with the same
    # checkerboard and otype
    n = len(lattices)
    assert n > 0
    grid = lattices[0].grid
    assert all([lattices[i].grid.obj == grid.obj for i in range(1, n)])
    cb = lattices[0].checkerboard()
    assert all([lattices[i].checkerboard() is cb for i in range(1, n)])
    otype = lattices[0].otype
    assert all([lattices[i].otype.__name__ == otype.__name__ for i in range(1, n)])

    # the n unsplit lattices are processed in Q groups ("slots") of N
    assert n % N == 0
    Q = n // N

    # create split lattices and remember their coordinate mapping
    l = [gpt.lattice(split_grid, otype) for i in range(N)]
    for x in l:
        x.checkerboard(cb)
        x.split_lcoor = lcoor
        x.split_gcoor = gcoor
    sranks = split_grid.sranks
    srank = split_grid.srank

    # each split-rank group only transfers data for its own slot;
    # for all other slots it contributes empty coordinate lists
    for i in range(Q):
        if i == srank // (sranks // Q):
            lc = lcoor
            gc = gcoor
        else:
            lc = numpy.empty(shape=(0, split_grid.nd), dtype=numpy.int32)
            gc = lc
        gpt.poke(l, lc, gpt.peek(lattices[i * N : (i + 1) * N], gc))

    return l
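
# Worked example of the slot arithmetic above (illustrative numbers, not taken
# from any particular run): with n = 8 unsplit lattices, N = 2 split lattices,
# and sranks = 4 split-rank groups, we get Q = n // N = 4 slots.  A rank with
# srank = 2 satisfies i == srank // (sranks // Q) = 2 // (4 // 4) = 2 only for
# slot i = 2, so it receives lattices[4:6] and contributes empty coordinate
# lists for the other three slots.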

def unsplit(first, second):
    # allow single lattices as arguments
    if type(first) != list:
        return unsplit([first], [second])

    n = len(first)
    N = len(second)
    split_grid = second[0].grid
    sranks = split_grid.sranks
    srank = split_grid.srank
    Q = n // N
    assert n % N == 0

    # coordinate mapping remembered by split_lattices
    lcoor = second[0].split_lcoor
    gcoor = second[0].split_gcoor

    # inverse of the data motion in split_lattices
    for i in range(Q):
        if i == srank // (sranks // Q):
            lc = lcoor
            gc = gcoor
        else:
            lc = numpy.empty(shape=(0, split_grid.nd), dtype=numpy.int32)
            gc = lc
        gpt.poke(first[i * N : (i + 1) * N], gc, gpt.peek(second, lc))
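
# Minimal round-trip sketch (assumes an initialized gpt/MPI environment;
# split_grid, lcoor, and gcoor must come from however the caller set up the
# split grid, which is outside this module):
#
#   lats = [gpt.vspincolor(grid) for _ in range(8)]
#   split = split_lattices(lats, lcoor, gcoor, split_grid, N=2)
#   ...  # operate on the split lattices
#   unsplit(lats, split)  # copies the (possibly modified) data back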

def merge(lattices, dimension=-1, N=-1):
    # if only one lattice is given, return immediately
    if type(lattices) != list:
        return lattices

    # number of lattices
    n = len(lattices)
    assert n > 0

    # number of batches
    if N == -1:
        N = n
    batches = n // N
    assert n % N == 0

    # all grids need to be the same
    grid = lattices[0].grid
    assert all([lattices[i].grid.obj == grid.obj for i in range(1, n)])

    # allow negative indexing
    if dimension < 0:
        dimension += grid.nd + 1
        assert dimension >= 0
    else:
        assert dimension <= grid.nd

    # infer checkerboarding of new dimension
    cb = [x.checkerboard() for x in lattices]
    if cb[0] is gpt.none:
        assert all([x is gpt.none for x in cb[1:]])
        cb_mask = 0
    else:
        assert all(
            [
                cb[j * N + i] is cb[j * N + i + 1].inv()
                for i in range(N - 1)
                for j in range(batches)
            ]
        )
        cb_mask = 1

    # otypes must be consistent
    otype = lattices[0].otype
    assert all([lattices[i].otype.__name__ == otype.__name__ for i in range(1, n)])

    # create merged grid
    merged_grid = grid.inserted_dimension(dimension, N, cb_mask=cb_mask)

    # create merged lattices and set checkerboard
    merged_lattices = [gpt.lattice(merged_grid, otype) for i in range(batches)]
    for x in merged_lattices:
        x.checkerboard(cb[0])

    # coordinates of source lattices
    # (internal ordering to speed up access)
    gcoor_zero = lattices[0].mview_coordinates()
    gcoor_one = lattices[1].mview_coordinates() if N > 1 and cb_mask == 1 else gcoor_zero
    gcoor = [gcoor_zero, gcoor_one]

    # data transfer
    for i in range(N):
        merged_gcoor = cgpt.coordinates_inserted_dimension(gcoor[i % 2], dimension, [i])
        gpt.poke(
            merged_lattices,
            merged_gcoor,
            gpt.peek([lattices[j * N + i] for j in range(batches)], gcoor[i % 2]),
        )

    # if only one batch, remove list
    if len(merged_lattices) == 1:
        return merged_lattices[0]

    # return
    return merged_lattices
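
# Usage sketch (grid extents and precision are illustrative only):
#
#   grid = gpt.grid([8, 8, 8, 8], gpt.single)
#   rng = gpt.random("example")
#   vecs = [gpt.vspincolor(grid) for _ in range(4)]
#   rng.cnormal(vecs)
#   merged = merge(vecs, dimension=-1)  # one lattice on a 5-d grid with the
#                                       # new dimension of extent 4 appended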

def load(filename, params):
    # first check if this is the right file format
    if not os.path.exists(filename + "/00/0000000000.compressed") or not os.path.exists(
        filename + "/metadata.txt"
    ):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert "grids" in params
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if "nmax" in params:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis_max) for i in range(neigen_max)]
    if "advise_basis" in params:
        gpt.advise(basis, params["advise_basis"])
    if "advise_cevec" in params:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" % (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(
            v if v is not None else -1, mpi, fdimensions, fgrid.cb, site_cb
        )
        cvc = gpt.cartesian_view(
            v if v is not None else -1, mpi, cgrid.fdimensions, gpt.full, gpt.none
        )
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (
            FP_16_SIZE(nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS)
        )
        coarse_vector_size = (
            coarse_block_size_part_fp32 + coarse_block_size_part_fp16
        ) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single * nsingleCap))
        reduced_size = len(data_munged) // block_reduce
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0 ** 3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                for l in range(block_reduce):
                    cgpt.munge_inner_outer(
                        data_munged[reduced_size * l : reduced_size * (l + 1)],
                        data[reduced_size * l : reduced_size * (l + 1)],
                        len(pos[b]) // block_reduce,
                        nsingleCap,
                    )
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            gpt.poke(
                basis[0:nsingleCap_max],
                pos[b],
                truncate(data_munged, nsingleCap, nsingleCap_max, len(pos[b])),
            )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    )
                )

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            reduced_size = len(data_munged) // block_reduce
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(f.read(block_data_size_fp16 * (nbasis - nsingleCap)))
                    globalReadGB = len(data) / 1024.0 ** 3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    for l in range(block_reduce):
                        cgpt.munge_inner_outer(
                            data_munged[reduced_size * l : reduced_size * (l + 1)],
                            data_fp32[reduced_size * l : reduced_size * (l + 1)],
                            len(pos[b]) // block_reduce,
                            nbasis - nsingleCap,  # number of fp16 vectors in this chunk
                        )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    gpt.poke(
                        basis[nsingleCap:nbasis_max],
                        pos[b],
                        truncate(
                            data_munged,
                            nbasis - nsingleCap,
                            nbasis_max - nsingleCap,
                            len(pos[b]),
                        ),
                    )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        )
                    )

        # coarse grid data
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0 ** 3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                cevec[j][pos_coarse] = truncate(data, nbasis, nbasis_max, len(pos_coarse))
            dt_distr += gpt.time()

            # report roughly ten times; max() guards against neigen < 10
            if verbose and j % max(neigen // 10, 1) == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    )
                )

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message("* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB / (t1 - t0)))

    # eigenvalues
    evln = list(
        filter(lambda x: x != "", open(filename + "/eigen-values.txt").read().split("\n"))
    )
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
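
# Invocation sketch for the loader above (hedged: the path and "nmax" value are
# illustrative; "grids" must be the 5-d single-precision red-black fine grid
# the file was written for, as enforced by the asserts at the top of load):
#
#   basis, cevec, ev = load("/path/to/compressed/evecs", {"grids": fgrid, "nmax": 100})
#   # basis: fine-grid basis vectors, cevec: coarse-grid eigenvectors,
#   # ev: eigenvalues from eigen-values.txt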

def separate(lattices, dimension=-1):
    # expect list below
    if type(lattices) != list:
        lattices = [lattices]

    # evaluate in case it is an expression
    lattices = [gpt.eval(x) for x in lattices]

    # number of batches to separate
    batches = len(lattices)
    assert batches > 0

    # make sure all have the same grid
    grid = lattices[0].grid
    assert all([lattices[i].grid.obj == grid.obj for i in range(1, batches)])

    # allow negative indexing
    if dimension < 0:
        dimension += grid.nd
        assert dimension >= 0
    else:
        assert dimension < grid.nd

    # number of slices (per batch)
    N = grid.fdimensions[dimension]
    n = N * batches

    # all lattices need to have same checkerboard
    cb = lattices[0].checkerboard()
    assert all([lattices[i].checkerboard() is cb for i in range(1, batches)])

    # all lattices need to have same otype
    otype = lattices[0].otype
    assert all([lattices[i].otype.__name__ == otype.__name__ for i in range(1, batches)])

    # create grid with dimension removed
    separated_grid = grid.removed_dimension(dimension)
    cb_mask = grid.cb.cb_mask[dimension]

    # create separate lattices and set their checkerboard
    separated_lattices = [gpt.lattice(separated_grid, otype) for i in range(n)]
    for i, x in enumerate(separated_lattices):
        j = i % N
        if cb_mask == 0 or j % 2 == 0:
            x.checkerboard(cb)
        else:
            x.checkerboard(cb.inv())

    # construct coordinates
    separated_gcoor_zero = separated_lattices[0].mview_coordinates()
    separated_gcoor_one = (
        separated_lattices[1].mview_coordinates()
        if N > 1 and cb_mask == 1
        else separated_gcoor_zero
    )
    separated_gcoor = [separated_gcoor_zero, separated_gcoor_one]

    # move data
    for i in range(N):
        gcoor = cgpt.coordinates_inserted_dimension(separated_gcoor[i % 2], dimension, [i])
        gpt.poke(
            [separated_lattices[j * N + i] for j in range(batches)],
            separated_gcoor[i % 2],
            gpt.peek(lattices, gcoor),
        )

    # return
    return separated_lattices
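
# Round-trip sketch (illustrative grid; separate inverts merge along the same
# dimension):
#
#   grid = gpt.grid([8, 8, 8, 8], gpt.single)
#   vecs = [gpt.vspincolor(grid) for _ in range(4)]
#   merged = merge(vecs, dimension=-1)      # 5-d lattice, extent 4 in new dimension
#   parts = separate(merged, dimension=-1)  # list of 4 lattices on the original grid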