def testMethod(file, usearray, testwrite, testread, complib, complevel,
               ngroups, ntables, nrows):
    if complevel > 0:
        print("Compression library:", complib)
    if testwrite:
        t1 = clock()
        cpu1 = cpuclock()
        if usearray:
            (rowsw, rowsz) = createFileArr(file, ngroups, ntables, nrows)
        else:
            (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows,
                                        complevel, complib, recsize)
        t2 = clock()
        cpu2 = cpuclock()
        tapprows = t2 - t1
        cpuapprows = cpu2 - cpu1
        print(f"Rows written: {rowsw}  Row size: {rowsz}")
        print(
            f"Time writing rows: {tapprows:.3f} s (real) "
            f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}")
        print(f"Write rows/sec: {rowsw / tapprows}")
        print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}")
    if testread:
        t1 = clock()
        cpu1 = cpuclock()
        if usearray:
            (rowsr, rowsz, bufsz) = readFileArr(file, ngroups, recsize,
                                                verbose)
        else:
            (rowsr, rowsz, bufsz) = readFile(file, ngroups, recsize, verbose)
        t2 = clock()
        cpu2 = cpuclock()
        treadrows = t2 - t1
        cpureadrows = cpu2 - cpu1
        # Report the rows actually read (rowsr), not the rows written
        print(f"Rows read: {rowsr}  Row size: {rowsz}, Buf size: {bufsz}")
        print(
            f"Time reading rows: {treadrows:.3f} s (real) "
            f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}")
        print(f"Read rows/sec: {rowsr / treadrows}")
        print(f"Read KB/s : {rowsr * rowsz / (treadrows * 1024):.0f}")
        elif option[0] == '-i':
            nrows = int(option[1])

    if debug:
        gc.enable()
        gc.set_debug(gc.DEBUG_LEAK)

    # Catch the hdf5 file passed as the last argument
    file = pargs[0]

    print("Compression level:", complevel)
    if complevel > 0:
        print("Compression library:", complib)
    if testwrite:
        t1 = clock()
        cpu1 = cpuclock()
        if psyco_imported and usepsyco:
            psyco.bind(createFile)
        if usearray:
            (rowsw, rowsz) = createFileArr(file, ngroups, ntables, nrows)
        else:
            (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows,
                                        complevel, complib, recsize)
        t2 = clock()
        cpu2 = cpuclock()
        tapprows = t2 - t1
        cpuapprows = cpu2 - cpu1
        print(f"Rows written: {rowsw}  Row size: {rowsz}")
        print(f"Time writing rows: {tapprows:.3f} s (real) "
              f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}")
        print(f"Write rows/sec: {rowsw / tapprows}")
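# The timing helpers used throughout these benchmarks are not defined in
# this excerpt; they are assumed to be thin aliases over the standard
# library's wall-clock and CPU-time counters, roughly:

from time import perf_counter as clock     # wall-clock ("real") time
from time import process_time as cpuclock  # CPU time of this process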
def main():
    global verbose
    global regoldindexes
    global createsysattrs

    parser = _get_parser()
    args = parser.parse_args()

    # Check arguments
    if args.rng:
        try:
            args.rng = eval("slice(" + args.rng + ")")
        except Exception:
            parser.error("Error when getting the range parameter.")

    if args.chunkshape.isdigit() or args.chunkshape.startswith('('):
        args.chunkshape = eval(args.chunkshape)

    if args.complevel < 0 or args.complevel > 9:
        parser.error(
            'invalid "complevel" value, it should be in the range [0, 9]')

    # Catch the files passed as the last arguments
    src = args.src.rsplit(':', 1)
    dst = args.dst.rsplit(':', 1)
    if len(src) == 1:
        srcfile, srcnode = src[0], "/"
    else:
        srcfile, srcnode = src
    if len(dst) == 1:
        dstfile, dstnode = dst[0], "/"
    else:
        dstfile, dstnode = dst

    if srcnode == "":
        # case where filename == "filename:" instead of "filename:/"
        srcnode = "/"
    if dstnode == "":
        # case where filename == "filename:" instead of "filename:/"
        dstnode = "/"

    # Ignore the warnings for tables that contain old indexes
    # (these will be handled by the copying routines)
    warnings.filterwarnings("ignore", category=tb.exceptions.OldIndexWarning)

    # Ignore the flavors warnings during upgrading flavor operations
    if args.upgradeflavors:
        warnings.filterwarnings("ignore",
                                category=tb.exceptions.FlavorWarning)

    # Build the Filters instance
    filter_params = (
        args.complevel,
        args.complib,
        args.shuffle,
        args.bitshuffle,
        args.fletcher32,
    )
    # There are five filter parameters, so compare against five Nones
    if filter_params == (None,) * 5 or args.keepfilters:
        filters = None
    else:
        if args.complevel is None:
            args.complevel = 0
        if args.shuffle is None:
            args.shuffle = args.complevel > 0
        if args.bitshuffle is None:
            args.bitshuffle = False
        if args.bitshuffle:
            # Shuffle and bitshuffle are mutually exclusive
            args.shuffle = False
        if args.complib is None:
            args.complib = "zlib"
        if args.fletcher32 is None:
            args.fletcher32 = False
        filters = tb.Filters(complevel=args.complevel,
                             complib=args.complib,
                             shuffle=args.shuffle,
                             bitshuffle=args.bitshuffle,
                             fletcher32=args.fletcher32)

    # The start, stop and step params
    start, stop, step = None, None, 1  # defaults
    if args.rng:
        start, stop, step = args.rng.start, args.rng.stop, args.rng.step

    # Set globals
    verbose = args.verbose
    regoldindexes = args.regoldindexes
    createsysattrs = args.createsysattrs

    # Some timing
    t1 = clock()
    cpu1 = cpuclock()

    # Copy the file
    if verbose:
        print("+=+" * 20)
        print("Recursive copy:", args.recursive)
        print("Applying filters:", filters)
        if args.sortby is not None:
            print("Sorting table(s) by column:", args.sortby)
            print("Forcing a CSI creation:", args.checkCSI)
        if args.propindexes:
            print("Recreating indexes in copied table(s)")
        print(f"Start copying {srcfile}:{srcnode} to {dstfile}:{dstnode}")
        print("+=+" * 20)

    allow_padding = not args.dont_allow_padding

    # Check whether the specified source node is a group or a leaf
    h5srcfile = tb.open_file(srcfile, 'r', allow_padding=allow_padding)
    srcnodeobject = h5srcfile.get_node(srcnode)
    # Close the file again
    h5srcfile.close()

    stats = {'groups': 0, 'leaves': 0, 'links': 0, 'bytes': 0,
             'hardlinks': 0}
    if isinstance(srcnodeobject, tb.group.Group):
        copy_children(
            srcfile, dstfile, srcnode, dstnode,
            title=args.title, recursive=args.recursive, filters=filters,
            copyuserattrs=args.copyuserattrs,
            overwritefile=args.overwritefile, overwrtnodes=args.overwrtnodes,
            stats=stats, start=start, stop=stop, step=step,
            chunkshape=args.chunkshape, sortby=args.sortby,
            check_CSI=args.checkCSI, propindexes=args.propindexes,
            upgradeflavors=args.upgradeflavors, allow_padding=allow_padding,
            use_hardlinks=True)
    else:
        # If not a Group, it should be a Leaf
        copy_leaf(
            srcfile, dstfile, srcnode, dstnode,
            title=args.title, filters=filters,
            copyuserattrs=args.copyuserattrs,
            overwritefile=args.overwritefile, overwrtnodes=args.overwrtnodes,
            stats=stats, start=start, stop=stop, step=step,
            chunkshape=args.chunkshape, sortby=args.sortby,
            check_CSI=args.checkCSI, propindexes=args.propindexes,
            upgradeflavors=args.upgradeflavors, allow_padding=allow_padding,
        )

    # Gather some statistics
    t2 = clock()
    cpu2 = cpuclock()
    tcopy = t2 - t1
    cpucopy = cpu2 - cpu1
    if verbose:
        ngroups = stats['groups']
        nleaves = stats['leaves']
        nlinks = stats['links']
        nhardlinks = stats['hardlinks']
        nbytescopied = stats['bytes']
        nnodes = ngroups + nleaves + nlinks + nhardlinks

        print(
            "Groups copied:", ngroups,
            ", Leaves copied:", nleaves,
            ", Links copied:", nlinks,
            ", Hard links copied:", nhardlinks,
        )
        if args.copyuserattrs:
            print("User attrs copied")
        else:
            print("User attrs not copied")
        print(f"KBytes copied: {nbytescopied / 1024:.3f}")
        print(f"Time copying: {tcopy:.3f} s (real) {cpucopy:.3f} s "
              f"(cpu) {cpucopy / tcopy:.0%}")
        print(f"Copied nodes/sec: {nnodes / tcopy:.1f}")
        print(f"Copied KB/s : {nbytescopied / tcopy / 1024:.0f}")
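# How main() interprets the "file:node" command-line arguments (a small
# illustrative sketch; the sample values are hypothetical):

for arg in ("data.h5:/group/table", "data.h5:", "data.h5"):
    parts = arg.rsplit(':', 1)
    fname, node = parts if len(parts) == 2 else (parts[0], "/")
    if node == "":
        node = "/"  # "data.h5:" is shorthand for the root group
    print(arg, "->", fname, node)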
def createFile(filename, nrows, filters, index, heavy, noise, verbose):

    # Open a file in "w"rite mode
    fileh = tb.open_file(filename, mode="w", title="Searchsorted Benchmark",
                         filters=filters)
    rowswritten = 0

    # Create the test table
    table = fileh.create_table(fileh.root, 'table', Small, "test table",
                               None, nrows)
    t1 = clock()
    cpu1 = cpuclock()
    nrowsbuf = table.nrowsinbuf
    minimum = 0
    maximum = nrows
    for i in range(0, nrows, nrowsbuf):
        if i + nrowsbuf > nrows:
            j = nrows
        else:
            j = i + nrowsbuf
        if randomvalues:
            var3 = np.random.uniform(minimum, maximum, size=j - i)
        else:
            var3 = np.arange(i, j, dtype=np.float64)
            if noise > 0:
                var3 += np.random.uniform(-noise, noise, size=j - i)
        var2 = np.array(var3, dtype=np.int32)
        var1 = np.empty(shape=[j - i], dtype="S4")
        if not heavy:
            var1[:] = var2
        table.append([var3, var2, var1])
    table.flush()
    rowswritten += nrows
    time1 = clock() - t1
    tcpu1 = cpuclock() - cpu1
    print(
        f"Time for filling: {time1:.3f} Krows/s: {nrows / 1000 / time1:.3f}",
        end=' ')
    fileh.close()
    size1 = Path(filename).stat().st_size
    print(f", File size: {size1 / 1024 / 1024:.3f} MB")
    fileh = tb.open_file(filename, mode="a", title="Searchsorted Benchmark",
                         filters=filters)
    table = fileh.root.table
    rowsize = table.rowsize
    if index:
        t1 = clock()
        cpu1 = cpuclock()
        # Index all entries; keep the rows-indexed count from the last
        # create_index() call so it is defined even in heavy mode
        indexrows = 0
        if not heavy:
            indexrows = table.cols.var1.create_index(filters=filters)
        for colname in ['var2', 'var3']:
            indexrows = table.colinstances[colname].create_index(
                filters=filters)
        time2 = clock() - t1
        tcpu2 = cpuclock() - cpu1
        print(
            f"Time for indexing: {time2:.3f} "
            f"iKrows/s: {indexrows / 1000 / time2:.3f}", end=' ')
    else:
        indexrows = 0
        time2 = 0.000_000_000_1  # tiny sentinel to avoid division by zero
        tcpu2 = 0

    if verbose:
        if index:
            idx = table.cols.var1.index
            print("Index parameters:", repr(idx))
        else:
            print("NOT indexing rows")

    # Close the file
    fileh.close()

    size2 = Path(filename).stat().st_size - size1
    if index:
        print(f", Index size: {size2 / 1024 / 1024:.3f} MB")
    return (rowswritten, indexrows, rowsize, time1, time2,
            tcpu1, tcpu2, size1, size2)
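# The `Small` record description referenced above is not part of this
# excerpt. Judging from the columns the benchmark handles (a 4-character
# string, a 32-bit integer and a float, mirroring the sqlite schema further
# below), it is assumed to look roughly like this:

class Small(tb.IsDescription):
    var1 = tb.StringCol(itemsize=4)
    var2 = tb.Int32Col()
    var3 = tb.Float64Col()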
def readFile(filename, atom, riter, indexmode, dselect, verbose):
    # Open the HDF5 file in read-only mode
    fileh = tb.open_file(filename, mode="r")
    table = fileh.root.table
    var1 = table.cols.var1
    var2 = table.cols.var2
    var3 = table.cols.var3
    if indexmode == "indexed":
        if var2.index.nelements > 0:
            where = table._whereIndexed
        else:
            warnings.warn(
                "Not indexed table or empty index. Defaulting to in-kernel "
                "selection")
            indexmode = "inkernel"
            where = table._whereInRange
    elif indexmode == "inkernel":
        where = table.where
    if verbose:
        print("Max rows in buf:", table.nrowsinbuf)
        print("Rows in", table._v_pathname, ":", table.nrows)
        print("Buffersize:", table.rowsize * table.nrowsinbuf)
        print("MaxTuples:", table.nrowsinbuf)
        if indexmode == "indexed":
            print("Chunk size:", var2.index.sorted.chunksize)
            print("Number of elements per slice:", var2.index.nelemslice)
            print("Slice number in", table._v_pathname, ":",
                  var2.index.nrows)

    rowselected = 0
    time2 = 0
    tcpu2 = 0
    results = []
    print("Select mode:", indexmode, ". Selecting for type:", atom)
    # Seed the random generators with the same integer so that every read
    # iteration gives reproducible results
    random.seed(19)
    np.random.seed(19)
    for i in range(riter):
        # Pick the start of the lookup interval; the interval width is
        # approximately the number of elements to select
        rnd = np.random.randint(table.nrows)
        cpu1 = cpuclock()
        t1 = clock()
        if atom == "string":
            val = str(rnd)[-4:]
            if indexmode in ["indexed", "inkernel"]:
                results = [p.nrow for p in where('var1 == val')]
            else:
                results = [p.nrow for p in table if p["var1"] == val]
        elif atom == "int":
            val = rnd + dselect
            if indexmode in ["indexed", "inkernel"]:
                results = [
                    p.nrow for p in where('(rnd <= var3) & (var3 < val)')
                ]
            else:
                results = [p.nrow for p in table if rnd <= p["var2"] < val]
        elif atom == "float":
            val = rnd + dselect
            if indexmode in ["indexed", "inkernel"]:
                results = [
                    p.nrow for p in where('(rnd <= var3) & (var3 < val)')
                ]
            else:
                results = [
                    p.nrow for p in table
                    if float(rnd) <= p["var3"] < float(val)
                ]
        else:
            raise ValueError(f"Value for atom '{atom}' not supported.")
        rowselected += len(results)
        if i == 0:
            # First iteration
            time1 = clock() - t1
            tcpu1 = cpuclock() - cpu1
        elif indexmode == "indexed":
            # If indexed, wait until the 5th iteration (in order to ensure
            # that the index is effectively cached) to take times
            if i >= 5:
                time2 += clock() - t1
                tcpu2 += cpuclock() - cpu1
        else:
            time2 += clock() - t1
            tcpu2 += cpuclock() - cpu1

    if riter > 1:
        if indexmode == "indexed" and riter >= 5:
            correction = 5
        else:
            correction = 1
        time2 = time2 / (riter - correction)
        tcpu2 = tcpu2 / (riter - correction)
    if verbose:
        print("Values that fulfill the conditions:")
        print(results)

    rowsread = table.nrows
    rowsize = table.rowsize

    # Close the file
    fileh.close()

    return (rowsread, rowselected, rowsize, time1, time2, tcpu1, tcpu2)
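# For reference, the in-kernel selection path used above can be exercised on
# its own. A minimal sketch (the file name and values are hypothetical;
# condition variables such as `val` are picked up from the local namespace
# by Table.where):

with tb.open_file("bench.h5", mode="r") as fileh:
    table = fileh.root.table
    val = 100.0
    rows = [r.nrow for r in table.where('(50.0 <= var3) & (var3 < val)')]
    print(len(rows), "rows matched")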
def createFile(dbfile, nrows, filters, indexmode, heavy, noise, bfile,
               verbose):

    # Initialize some variables
    t1 = 0
    t2 = 0
    tcpu1 = 0
    tcpu2 = 0
    rowsecf = 0
    rowseci = 0
    size1 = 0
    size2 = 0

    if indexmode == "standard":
        print("Creating a new database:", dbfile)
        instd = os.popen("/usr/local/bin/sqlite " + dbfile, "w")
        CREATESTD = """
CREATE TABLE small (
-- Name         Type            -- Example
---------------------------------------
recnum          INTEGER PRIMARY KEY,  -- 345
var1            char(4),        -- Abronia villosa
var2            INTEGER,        -- 111
var3            FLOAT           -- 12.32
);
"""
        CREATEIDX = """
CREATE TABLE small (
-- Name         Type            -- Example
---------------------------------------
recnum          INTEGER PRIMARY KEY,  -- 345
var1            char(4),        -- Abronia villosa
var2            INTEGER,        -- 111
var3            FLOAT           -- 12.32
);
CREATE INDEX ivar1 ON small(var1);
CREATE INDEX ivar2 ON small(var2);
CREATE INDEX ivar3 ON small(var3);
"""
        # Creating the table first and indexing afterwards is a bit faster
        instd.write(CREATESTD)
        instd.close()

    conn = sqlite.connect(dbfile)
    cursor = conn.cursor()
    if indexmode == "standard":
        place_holders = ",".join(['%s'] * 3)
        # Insert rows
        SQL = "insert into small values(NULL, %s)" % place_holders
        time1 = clock()
        cpu1 = cpuclock()
        # This way of filling mimics the PyTables benchmark
        nrowsbuf = 1000
        minimum = 0
        maximum = nrows
        for i in range(0, nrows, nrowsbuf):
            if i + nrowsbuf > nrows:
                j = nrows
            else:
                j = i + nrowsbuf
            if randomvalues:
                var3 = np.random.uniform(minimum, maximum, size=j - i)
            else:
                var3 = np.arange(i, j, dtype=np.float64)
                if noise:
                    var3 += np.random.uniform(-3, 3, size=j - i)
            var2 = np.array(var3, dtype=np.int32)
            var1 = np.empty(shape=[j - i], dtype="S4")
            if not heavy:
                for n in range(j - i):
                    var1[n] = "%.4s" % var2[n]
            for n in range(j - i):
                fields = (var1[n], var2[n], var3[n])
                cursor.execute(SQL, fields)
        conn.commit()
        t1 = clock() - time1
        tcpu1 = cpuclock() - cpu1
        rowsecf = nrows / t1
        size1 = os.stat(dbfile).st_size
        print(f"******** Results for writing nrows = {nrows} *********")
        print(f"Insert time: {t1:.5f}, KRows/s: {nrows / 1000 / t1:.3f}",
              end=' ')
        print(f", File size: {size1 / 1024 / 1024:.3f} MB")

    # Create the indexes
    if indexmode == "indexed":
        time1 = clock()
        cpu1 = cpuclock()
        if not heavy:
            cursor.execute("CREATE INDEX ivar1 ON small(var1)")
            conn.commit()
        cursor.execute("CREATE INDEX ivar2 ON small(var2)")
        conn.commit()
        cursor.execute("CREATE INDEX ivar3 ON small(var3)")
        conn.commit()
        t2 = clock() - time1
        tcpu2 = cpuclock() - cpu1
        rowseci = nrows / t2
        print(f"Index time: {t2:.5f}, IKRows/s: {nrows / 1000 / t2:.3f}",
              end=' ')
        size2 = os.stat(dbfile).st_size - size1
        print(f", Final size with index: {size2 / 1024 / 1024:.3f} MB")

    conn.close()

    # Collect benchmark data
    bf = open_file(bfile, "a")
    recsize = "sqlite_small"
    if indexmode == "indexed":
        table = bf.get_node("/" + recsize + "/create_indexed")
    else:
        table = bf.get_node("/" + recsize + "/create_standard")
    table.row["nrows"] = nrows
    table.row["irows"] = nrows
    table.row["tfill"] = t1
    table.row["tidx"] = t2
    table.row["tcfill"] = tcpu1
    table.row["tcidx"] = tcpu2
    table.row["psyco"] = psycon
    table.row["rowsecf"] = rowsecf
    table.row["rowseci"] = rowseci
    table.row["fsize"] = size1
    table.row["isize"] = size2
    table.row.append()
    bf.close()

    return
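# The benchmark-record tables written above are created elsewhere (not shown
# in this excerpt). A sketch of what their description is assumed to look
# like, inferred from the row fields used:

class CreateBenchRecord(tb.IsDescription):  # hypothetical name
    nrows = tb.Int64Col()
    irows = tb.Int64Col()
    tfill = tb.Float64Col()
    tidx = tb.Float64Col()
    tcfill = tb.Float64Col()
    tcidx = tb.Float64Col()
    psyco = tb.BoolCol()
    rowsecf = tb.Float64Col()
    rowseci = tb.Float64Col()
    fsize = tb.Float64Col()
    isize = tb.Float64Col()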
def readFile(dbfile, nrows, indexmode, heavy, dselect, bfile, riter):
    # Connect to the database
    conn = sqlite.connect(db=dbfile, mode=755)
    # Obtain a cursor
    cursor = conn.cursor()

    # select count(*), avg(var2)
    SQL1 = """
    select recnum from small where var1 = %s
    """
    SQL2 = """
    select recnum from small where var2 >= %s and var2 < %s
    """
    SQL3 = """
    select recnum from small where var3 >= %s and var3 < %s
    """

    # Open the benchmark database
    bf = open_file(bfile, "a")
    # Default values for the case that columns are not indexed
    t2 = 0
    tcpu2 = 0
    # Some previous computations for the case of random values
    if randomvalues:
        # Choose a value separated from the mean: when nrows/2 is much
        # larger than the standard deviation, pick a value several
        # deviations away; otherwise stay close to the mean so that the
        # selectivity remains comparable
        if nrows / 2 > standarddeviation * 10:
            dev = standarddeviation * 4
        elif nrows / 2 > standarddeviation:
            dev = standarddeviation * 2
        elif nrows / 2 > standarddeviation / 10:
            dev = standarddeviation / 10
        else:
            dev = standarddeviation / 100

        valmax = round(nrows / 2 - dev)
        # Split the selection range in regular chunks
        if riter > valmax * 2:
            riter = valmax * 2
        chunksize = (valmax * 2 // riter) * 10
        # Get a list of integers for the intervals
        randlist = list(range(0, valmax, chunksize))
        randlist.extend(range(nrows - valmax, nrows, chunksize))
        # Expand the list ten times so as to use the cache
        randlist = randlist * 10
        # Shuffle the list
        random.shuffle(randlist)
        # Reset the value of chunksize
        chunksize = chunksize // 10
    else:
        chunksize = 3
    if heavy:
        searchmodelist = ["int", "float"]
    else:
        searchmodelist = ["string", "int", "float"]

    # Execute queries
    for atom in searchmodelist:
        time2 = 0
        cpu2 = 0
        rowsel = 0
        for i in range(riter):
            rnd = random.randrange(nrows)
            time1 = clock()
            cpu1 = cpuclock()
            if atom == "string":
                cursor.execute(SQL1, str(rnd)[-4:])
            elif atom == "int":
                cursor.execute(SQL2 % (rnd, rnd + dselect))
            elif atom == "float":
                cursor.execute(SQL3 % (float(rnd), float(rnd + dselect)))
            else:
                raise ValueError(
                    "atom must take a value in ['string','int','float']")
            if i == 0:
                t1 = clock() - time1
                tcpu1 = cpuclock() - cpu1
            elif indexmode == "indexed":
                # If indexed, wait until the 5th iteration to take times
                # (so as to ensure that the index is effectively cached)
                if i >= 5:
                    time2 += clock() - time1
                    cpu2 += cpuclock() - cpu1
            else:
                time2 += clock() - time1
                cpu2 += cpuclock() - cpu1  # CPU time accumulates in cpu2
        if riter > 1:
            if indexmode == "indexed" and riter >= 5:
                correction = 5
            else:
                correction = 1
            t2 = time2 / (riter - correction)
            tcpu2 = cpu2 / (riter - correction)

        print(
            f"*** Query results for atom = {atom}, "
            f"nrows = {nrows}, indexmode = {indexmode} ***")
        print(f"Query time: {t1:.5f}, cached time: {t2:.5f}")
        print(f"MRows/s: {nrows / 1_000_000 / t1:.3f}", end=' ')
        if t2 > 0:
            print(f", cached MRows/s: {nrows / 1_000_000 / t2:.3f}")
        else:
            print()

        # Collect benchmark data
        recsize = "sqlite_small"
        tablepath = "/" + recsize + "/search/" + indexmode + "/" + atom
        table = bf.get_node(tablepath)
        table.row["nrows"] = nrows
        table.row["rowsel"] = rowsel
        table.row["time1"] = t1
        table.row["time2"] = t2
        table.row["tcpu1"] = tcpu1
        table.row["tcpu2"] = tcpu2
        table.row["psyco"] = psycon
        table.row["rowsec1"] = nrows / t1
        if t2 > 0:
            table.row["rowsec2"] = nrows / t2
        table.row.append()
        table.flush()  # Flush the data

    # Close the database
    conn.close()
    bf.close()  # the bench database

    return