Example #1
def testMethod(file, usearray, testwrite, testread, complib, complevel,
               ngroups, ntables, nrows):

    if complevel > 0:
        print("Compression library:", complib)
    if testwrite:
        t1 = clock()
        cpu1 = cpuclock()
        if usearray:
            (rowsw, rowsz) = createFileArr(file, ngroups, ntables, nrows)
        else:
            (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows,
                                        complevel, complib, recsize)
        t2 = clock()
        cpu2 = cpuclock()
        tapprows = t2 - t1
        cpuapprows = cpu2 - cpu1
        print(f"Rows written: {rowsw}  Row size: {rowsz}")
        print(
            f"Time writing rows: {tapprows:.3f} s (real) "
            f"{cpuapprows:.3f} s (cpu)  {cpuapprows / tapprows:.0%}")
        print(f"Write rows/sec:  {rowsw / tapprows}")
        print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}")

    if testread:
        t1 = clock()
        cpu1 = cpuclock()
        if usearray:
            (rowsr, rowsz, bufsz) = readFileArr(file,
                                                ngroups, recsize, verbose)
        else:
            (rowsr, rowsz, bufsz) = readFile(file, ngroups, recsize, verbose)
        t2 = clock()
        cpu2 = cpuclock()
        treadrows = t2 - t1
        cpureadrows = cpu2 - cpu1
        print(f"Rows read: {rowsw}  Row size: {rowsz}, Buf size: {bufsz}")
        print(
            f"Time reading rows: {treadrows:.3f} s (real) "
            f"{cpureadrows:.3f} s (cpu)  {cpureadrows / treadrows:.0%}")
        print(f"Read rows/sec:  {rowsr / treadrows}")
        print(f"Read KB/s : {rowsr * rowsz / (treadrows * 1024):.0f}")
Example #2
        elif option[0] == '-i':
            nrows = int(option[1])

    if debug:
        gc.enable()
        gc.set_debug(gc.DEBUG_LEAK)

    # Catch the hdf5 file passed as the last argument
    file = pargs[0]

    print("Compression level:", complevel)
    if complevel > 0:
        print("Compression library:", complib)
    if testwrite:
        t1 = clock()
        cpu1 = cpuclock()
        if psyco_imported and usepsyco:
            psyco.bind(createFile)
        if usearray:
            (rowsw, rowsz) = createFileArr(file, ngroups, ntables, nrows)
        else:
            (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows,
                                        complevel, complib, recsize)
        t2 = clock()
        cpu2 = cpuclock()
        tapprows = t2 - t1
        cpuapprows = cpu2 - cpu1
        print(f"Rows written: {rowsw}  Row size: {rowsz}")
        print(f"Time writing rows: {tapprows:.3f} s (real) "
              f"{cpuapprows:.3f} s (cpu)  {cpuapprows / tapprows:.0%}")
        print(f"Write rows/sec:  {rowsw / tapprows}")
Example #3
def main():
    global verbose
    global regoldindexes
    global createsysattrs

    parser = _get_parser()
    args = parser.parse_args()

    # check arguments
    if args.rng:
        try:
            args.rng = eval("slice(" + args.rng + ")")
        except Exception:
            parser.error("Error when getting the range parameter.")

    if args.chunkshape.isdigit() or args.chunkshape.startswith('('):
        args.chunkshape = eval(args.chunkshape)

    if args.complevel < 0 or args.complevel > 9:
        parser.error(
            'invalid "complevel" value, it should be in the range [0, 9]')

    # Catch the files passed as the last arguments
    src = args.src.rsplit(':', 1)
    dst = args.dst.rsplit(':', 1)
    if len(src) == 1:
        srcfile, srcnode = src[0], "/"
    else:
        srcfile, srcnode = src
    if len(dst) == 1:
        dstfile, dstnode = dst[0], "/"
    else:
        dstfile, dstnode = dst

    if srcnode == "":
        # case where filename == "filename:" instead of "filename:/"
        srcnode = "/"

    if dstnode == "":
        # case where filename == "filename:" instead of "filename:/"
        dstnode = "/"

    # Ignore the warnings for tables that contain old indexes
    # (these will be handled by the copying routines)
    warnings.filterwarnings("ignore", category=tb.exceptions.OldIndexWarning)

    # Ignore the flavors warnings during upgrading flavor operations
    if args.upgradeflavors:
        warnings.filterwarnings("ignore", category=tb.exceptions.FlavorWarning)

    # Build the Filters instance
    filter_params = (
        args.complevel,
        args.complib,
        args.shuffle,
        args.bitshuffle,
        args.fletcher32,
    )
    if filter_params == (None,) * 5 or args.keepfilters:
        filters = None
    else:
        if args.complevel is None:
            args.complevel = 0
        if args.shuffle is None:
            if args.complevel > 0:
                args.shuffle = True
            else:
                args.shuffle = False
        if args.bitshuffle is None:
            args.bitshuffle = False
        if args.bitshuffle:
            # Shuffle and bitshuffle are mutually exclusive
            args.shuffle = False
        if args.complib is None:
            args.complib = "zlib"
        if args.fletcher32 is None:
            args.fletcher32 = False
        filters = tb.leaf.Filters(complevel=args.complevel,
                                  complib=args.complib,
                                  shuffle=args.shuffle,
                                  bitshuffle=args.bitshuffle,
                                  fletcher32=args.fletcher32)

    # The start, stop and step params:
    start, stop, step = None, None, 1  # Defaults
    if args.rng:
        start, stop, step = args.rng.start, args.rng.stop, args.rng.step

    # Set globals
    verbose = args.verbose
    regoldindexes = args.regoldindexes
    createsysattrs = args.createsysattrs

    # Some timing
    t1 = clock()
    cpu1 = cpuclock()
    # Copy the file
    if verbose:
        print("+=+" * 20)
        print("Recursive copy:", args.recursive)
        print("Applying filters:", filters)
        if args.sortby is not None:
            print("Sorting table(s) by column:", args.sortby)
            print("Forcing a CSI creation:", args.checkCSI)
        if args.propindexes:
            print("Recreating indexes in copied table(s)")
        print(f"Start copying {srcfile}:{srcnode} to {dstfile}:{dstnode}")
        print("+=+" * 20)

    allow_padding = not args.dont_allow_padding
    # Check whether the specified source node is a group or a leaf
    h5srcfile = tb.open_file(srcfile, 'r', allow_padding=allow_padding)
    srcnodeobject = h5srcfile.get_node(srcnode)

    # Close the file again
    h5srcfile.close()

    stats = {'groups': 0, 'leaves': 0, 'links': 0, 'bytes': 0, 'hardlinks': 0}
    if isinstance(srcnodeobject, tb.group.Group):
        copy_children(srcfile,
                      dstfile,
                      srcnode,
                      dstnode,
                      title=args.title,
                      recursive=args.recursive,
                      filters=filters,
                      copyuserattrs=args.copyuserattrs,
                      overwritefile=args.overwritefile,
                      overwrtnodes=args.overwrtnodes,
                      stats=stats,
                      start=start,
                      stop=stop,
                      step=step,
                      chunkshape=args.chunkshape,
                      sortby=args.sortby,
                      check_CSI=args.checkCSI,
                      propindexes=args.propindexes,
                      upgradeflavors=args.upgradeflavors,
                      allow_padding=allow_padding,
                      use_hardlinks=True)
    else:
        # If not a Group, it should be a Leaf
        copy_leaf(
            srcfile,
            dstfile,
            srcnode,
            dstnode,
            title=args.title,
            filters=filters,
            copyuserattrs=args.copyuserattrs,
            overwritefile=args.overwritefile,
            overwrtnodes=args.overwrtnodes,
            stats=stats,
            start=start,
            stop=stop,
            step=step,
            chunkshape=args.chunkshape,
            sortby=args.sortby,
            check_CSI=args.checkCSI,
            propindexes=args.propindexes,
            upgradeflavors=args.upgradeflavors,
            allow_padding=allow_padding,
        )

    # Gather some statistics
    t2 = clock()
    cpu2 = cpuclock()
    tcopy = t2 - t1
    cpucopy = cpu2 - cpu1
    if verbose:
        ngroups = stats['groups']
        nleaves = stats['leaves']
        nlinks = stats['links']
        nhardlinks = stats['hardlinks']
        nbytescopied = stats['bytes']
        nnodes = ngroups + nleaves + nlinks + nhardlinks

        print(
            "Groups copied:",
            ngroups,
            ", Leaves copied:",
            nleaves,
            ", Links copied:",
            nlinks,
            ", Hard links copied:",
            nhardlinks,
        )
        if args.copyuserattrs:
            print("User attrs copied")
        else:
            print("User attrs not copied")
        print(f"KBytes copied: {nbytescopied / 1024:.3f}")
        print(f"Time copying: {tcopy:.3f} s (real) {cpucopy:.3f} s "
              f"(cpu)  {cpucopy / tcopy:.0%}")
        print(f"Copied nodes/sec: {nnodes / tcopy:.1f}")
        print(f"Copied KB/s : {nbytescopied / tcopy / 1024:.0f}")
Example #4
def createFile(filename, nrows, filters, index, heavy, noise, verbose):

    # Open a file in "w"rite mode
    fileh = tb.open_file(filename,
                         mode="w",
                         title="Searchsorted Benchmark",
                         filters=filters)
    rowswritten = 0

    # Create the test table
    table = fileh.create_table(fileh.root, 'table', Small, "test table", None,
                               nrows)

    t1 = clock()
    cpu1 = cpuclock()
    nrowsbuf = table.nrowsinbuf
    minimum = 0
    maximum = nrows
    for i in range(0, nrows, nrowsbuf):
        if i + nrowsbuf > nrows:
            j = nrows
        else:
            j = i + nrowsbuf
        if randomvalues:
            var3 = np.random.uniform(minimum, maximum, size=j - i)
        else:
            var3 = np.arange(i, j, dtype=np.float64)
            if noise > 0:
                var3 += np.random.uniform(-noise, noise, size=j - i)
        var2 = np.array(var3, dtype=np.int32)
        var1 = np.empty(shape=[j - i], dtype="S4")
        if not heavy:
            var1[:] = var2
        table.append([var3, var2, var1])
    table.flush()
    rowswritten += nrows
    time1 = clock() - t1
    tcpu1 = cpuclock() - cpu1
    print(f"Time for filling: {time1:.3f} Krows/s: {nrows / 1000 / time1:.3f}",
          end=' ')
    fileh.close()
    size1 = Path(filename).stat().st_size
    print(f", File size: {size1 / 1024 / 1024:.3f} MB")
    fileh = tb.open_file(filename,
                         mode="a",
                         title="Searchsorted Benchmark",
                         filters=filters)
    table = fileh.root.table
    rowsize = table.rowsize
    if index:
        t1 = clock()
        cpu1 = cpuclock()
        # Index all entries
        if not heavy:
            indexrows = table.cols.var1.create_index(filters=filters)
        for colname in ['var2', 'var3']:
            # create_index returns the number of indexed rows (the same
            # for every column); capturing it here also covers the heavy
            # case, where var1 is not indexed and indexrows would
            # otherwise be undefined below.
            indexrows = table.colinstances[colname].create_index(
                filters=filters)
        time2 = clock() - t1
        tcpu2 = cpuclock() - cpu1
        print(
            f"Time for indexing: {time2:.3f} "
            f"iKrows/s: {indexrows / 1000 / time2:.3f}",
            end=' ')
    else:
        indexrows = 0
        time2 = 0.000_000_000_1  # an ugly hack to avoid dividing by zero later
        tcpu2 = 0

    if verbose:
        if index:
            idx = table.cols.var1.index
            print("Index parameters:", repr(idx))
        else:
            print("NOT indexing rows")
    # Close the file
    fileh.close()

    size2 = Path(filename).stat().st_size - size1
    if index:
        print(f", Index size: {size2 / 1024 / 1024:.3f} MB")
    return (rowswritten, indexrows, rowsize, time1, time2, tcpu1, tcpu2, size1,
            size2)
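
The snippet relies on a Small record description that is not shown. Judging from the dtypes used in the fill loop (S4 strings, int32, float64) and the column order in table.append([var3, var2, var1]), it plausibly looks like this reconstruction:

import tables as tb

class Small(tb.IsDescription):
    # Reconstructed, not the original definition: positions chosen so
    # that table.append([var3, var2, var1]) matches the column order.
    var3 = tb.Float64Col(pos=1)
    var2 = tb.Int32Col(pos=2)
    var1 = tb.StringCol(itemsize=4, pos=3)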
Example #5
def readFile(filename, atom, riter, indexmode, dselect, verbose):
    # Open the HDF5 file in read-only mode

    fileh = tb.open_file(filename, mode="r")
    table = fileh.root.table
    var1 = table.cols.var1
    var2 = table.cols.var2
    var3 = table.cols.var3
    if indexmode == "indexed":
        if var2.index.nelements > 0:
            where = table._whereIndexed
        else:
            warnings.warn(
                "Not indexed table or empty index. Defaulting to in-kernel "
                "selection")
            indexmode = "inkernel"
            where = table._whereInRange
    elif indexmode == "inkernel":
        where = table.where
    if verbose:
        print("Max rows in buf:", table.nrowsinbuf)
        print("Rows in", table._v_pathname, ":", table.nrows)
        print("Buffersize:", table.rowsize * table.nrowsinbuf)
        print("MaxTuples:", table.nrowsinbuf)
        if indexmode == "indexed":
            print("Chunk size:", var2.index.sorted.chunksize)
            print("Number of elements per slice:", var2.index.nelemslice)
            print("Slice number in", table._v_pathname, ":", var2.index.nrows)

    #table.nrowsinbuf = 10
    # print "nrowsinbuf-->", table.nrowsinbuf
    rowselected = 0
    time2 = 0
    tcpu2 = 0
    results = []
    print("Select mode:", indexmode, ". Selecting for type:", atom)
    # Always initialize the random generator with the same integer
    # in order to have reproducible results on each read iteration
    random.seed(19)
    np.random.seed(19)
    for i in range(riter):
        # The interval to look up values in. This is approximately
        # equivalent to the number of elements to select
        rnd = np.random.randint(table.nrows)
        cpu1 = cpuclock()
        t1 = clock()
        if atom == "string":
            val = str(rnd)[-4:]
            if indexmode in ["indexed", "inkernel"]:
                results = [p.nrow for p in where('var1 == val')]
            else:
                results = [p.nrow for p in table if p["var1"] == val]
        elif atom == "int":
            val = rnd + dselect
            if indexmode in ["indexed", "inkernel"]:
                results = [
                    p.nrow for p in where('(rnd <= var2) & (var2 < val)')
                ]
            else:
                results = [p.nrow for p in table if rnd <= p["var2"] < val]
        elif atom == "float":
            val = rnd + dselect
            if indexmode in ["indexed", "inkernel"]:
                t1 = clock()
                results = [
                    p.nrow for p in where('(rnd <= var3) & (var3 < val)')
                ]
            else:
                results = [
                    p.nrow for p in table
                    if float(rnd) <= p["var3"] < float(val)
                ]
        else:
            raise ValueError("Value for atom '%s' not supported." % atom)
        rowselected += len(results)
        # print "selected values-->", results
        if i == 0:
            # First iteration
            time1 = clock() - t1
            tcpu1 = cpuclock() - cpu1
        else:
            if indexmode == "indexed":
                # if indexed, wait until the 5th iteration (in order to
                # insure that the index is effectively cached) to take times
                if i >= 5:
                    time2 += clock() - t1
                    tcpu2 += cpuclock() - cpu1
            else:
                time2 += clock() - t1
                tcpu2 += cpuclock() - cpu1

    if riter > 1:
        if indexmode == "indexed" and riter >= 5:
            correction = 5
        else:
            correction = 1
        time2 = time2 / (riter - correction)
        tcpu2 = tcpu2 / (riter - correction)
    if verbose:
        print("Values that fulfill the conditions:")
        print(results)

    #rowsread = table.nrows * riter
    rowsread = table.nrows
    rowsize = table.rowsize

    # Close the file
    fileh.close()

    return (rowsread, rowselected, rowsize, time1, time2, tcpu1, tcpu2)
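
The in-kernel selection used above relies on table.where() picking up the condition variables (rnd, val) from the caller's scope. A self-contained sketch of the same query pattern, with made-up file and data:

import tables as tb

# Minimal illustration of the in-kernel query pattern used above.
class Demo(tb.IsDescription):
    var2 = tb.Int32Col()
    var3 = tb.Float64Col()

with tb.open_file("demo.h5", "w") as fileh:
    table = fileh.create_table("/", "table", Demo)
    row = table.row
    for i in range(1000):
        row['var2'] = i
        row['var3'] = float(i)
        row.append()
    table.flush()
    rnd, val = 10, 20
    # rnd and val are resolved from the local scope by PyTables.
    hits = [r.nrow for r in table.where('(rnd <= var3) & (var3 < val)')]
    print(len(hits))  # 10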
Example #6
def createFile(dbfile, nrows, filters, indexmode, heavy, noise, bfile,
               verbose):

    # Initialize some variables
    t1 = 0
    t2 = 0
    tcpu1 = 0
    tcpu2 = 0
    rowsecf = 0
    rowseci = 0
    size1 = 0
    size2 = 0

    if indexmode == "standard":
        print("Creating a new database:", dbfile)
        instd = os.popen("/usr/local/bin/sqlite " + dbfile, "w")
        CREATESTD = """
CREATE TABLE small (
-- Name         Type            -- Example
---------------------------------------
recnum  INTEGER PRIMARY KEY,  -- 345
var1            char(4),        -- Abronia villosa
var2            INTEGER,        -- 111
var3            FLOAT        --  12.32
);
"""
        CREATEIDX = """
CREATE TABLE small (
-- Name         Type            -- Example
---------------------------------------
recnum  INTEGER PRIMARY KEY,  -- 345
var1            char(4),        -- Abronia villosa
var2            INTEGER,        -- 111
var3            FLOAT        --  12.32
);
CREATE INDEX ivar1 ON small(var1);
CREATE INDEX ivar2 ON small(var2);
CREATE INDEX ivar3 ON small(var3);
"""
        # Creating the table first and indexing afterwards is a bit faster
        instd.write(CREATESTD)
        instd.close()

    conn = sqlite.connect(dbfile)
    cursor = conn.cursor()
    if indexmode == "standard":
        place_holders = ",".join(['%s'] * 3)
        # Insert rows
        SQL = "insert into small values(NULL, %s)" % place_holders
        time1 = clock()
        cpu1 = cpuclock()
        # This way of filling is to copy the PyTables benchmark
        nrowsbuf = 1000
        minimum = 0
        maximum = nrows
        for i in range(0, nrows, nrowsbuf):
            if i + nrowsbuf > nrows:
                j = nrows
            else:
                j = i + nrowsbuf
            if randomvalues:
                var3 = np.random.uniform(minimum, maximum, size=j - i)
            else:
                var3 = np.arange(i, j, dtype=np.float64)
                if noise:
                    var3 += np.random.uniform(-3, 3, size=j - i)
            var2 = np.array(var3, dtype=np.int32)
            var1 = np.empty(shape=[j - i], dtype='S4')
            if not heavy:
                for n in range(j - i):
                    var1[n] = str("%.4s" % var2[n])
            for n in range(j - i):
                fields = (var1[n], var2[n], var3[n])
                cursor.execute(SQL, fields)
            conn.commit()
        t1 = clock() - time1
        tcpu1 = cpuclock() - cpu1
        rowsecf = nrows / t1
        size1 = os.stat(dbfile).st_size
        print(f"******** Results for writing nrows = {nrows} *********")
        print(f"Insert time: {t1:.5f}, KRows/s: {nrows / 1000 / t1:.3f}")
        print(f", File size: {size1 / 1024 / 1024:.3f} MB")

    # Create the indexes
    if indexmode == "indexed":
        time1 = clock()
        cpu1 = cpuclock()
        if not heavy:
            cursor.execute("CREATE INDEX ivar1 ON small(var1)")
            conn.commit()
        cursor.execute("CREATE INDEX ivar2 ON small(var2)")
        conn.commit()
        cursor.execute("CREATE INDEX ivar3 ON small(var3)")
        conn.commit()
        t2 = clock() - time1
        tcpu2 = cpuclock() - cpu1
        rowseci = nrows / t2
        print(f"Index time: {t2:.5f}, IKRows/s: {nrows / 1000 / t2:.3f}")
        size2 = os.stat(dbfile).st_size - size1
        print(f", Final size with index: {size2 / 1024 / 1024:.3f} MB")

    conn.close()

    # Collect benchmark data
    bf = open_file(bfile, "a")
    recsize = "sqlite_small"
    if indexmode == "indexed":
        table = bf.get_node("/" + recsize + "/create_indexed")
    else:
        table = bf.get_node("/" + recsize + "/create_standard")
    table.row["nrows"] = nrows
    table.row["irows"] = nrows
    table.row["tfill"] = t1
    table.row["tidx"] = t2
    table.row["tcfill"] = tcpu1
    table.row["tcidx"] = tcpu2
    table.row["psyco"] = psycon
    table.row["rowsecf"] = rowsecf
    table.row["rowseci"] = rowseci
    table.row["fsize"] = size1
    table.row["isize"] = size2
    table.row.append()
    bf.close()

    return
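
The bfile results file is assumed to already contain tables such as /sqlite_small/create_standard with the fields written above. A plausible reconstruction of their description (the column types are guesses):

import tables as tb

class CreateBench(tb.IsDescription):
    # Reconstructed from the row fields used above; the real benchmark
    # file may declare these columns differently.
    nrows = tb.Int64Col()
    irows = tb.Int64Col()
    tfill = tb.Float64Col()
    tidx = tb.Float64Col()
    tcfill = tb.Float64Col()
    tcidx = tb.Float64Col()
    psyco = tb.BoolCol()
    rowsecf = tb.Float64Col()
    rowseci = tb.Float64Col()
    fsize = tb.Float64Col()
    isize = tb.Float64Col()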
Example #7
def readFile(dbfile, nrows, indexmode, heavy, dselect, bfile, riter):
    # Connect to the database.
    conn = sqlite.connect(db=dbfile, mode=755)
    # Obtain a cursor
    cursor = conn.cursor()

    #      select count(*), avg(var2)
    SQL1 = """
    select recnum
    from small where var1 = %s
    """
    SQL2 = """
    select recnum
    from small where var2 >= %s and var2 < %s
    """
    SQL3 = """
    select recnum
    from small where var3 >= %s and var3 < %s
    """

    # Open the benchmark database
    bf = open_file(bfile, "a")
    # default values for the case that columns are not indexed
    t2 = 0
    tcpu2 = 0
    # Some previous computations for the case of random values
    if randomvalues:
        # Algorithm to choose a value separated from the mean.
        # To select fewer values, this variant could be used instead:
        #     if nrows / 2 > standarddeviation * 3:
        #         # five standard deviations away from the mean value
        #         dev = standarddeviation * 5
        #     dev = standarddeviation * math.log10(nrows / 1000.)
        #
        # This variant gives rise to too asymmetric result values:
        #     if standarddeviation * 10 < nrows / 2:
        #         # four standard deviations away from the mean value
        #         dev = standarddeviation * 4
        #     else:
        #         dev = 100
        # Yet another algorithm:
        if nrows / 2 > standarddeviation * 10:
            dev = standarddeviation * 4
        elif nrows / 2 > standarddeviation:
            dev = standarddeviation * 2
        elif nrows / 2 > standarddeviation / 10:
            dev = standarddeviation / 10
        else:
            dev = standarddeviation / 100

        valmax = round(nrows / 2 - dev)
        # split the selection range in regular chunks
        if riter > valmax * 2:
            riter = valmax * 2
        # Integer division: range() below needs integer steps
        chunksize = (valmax * 2 // riter) * 10
        # Get a list of integers for the intervals
        randlist = list(range(0, valmax, chunksize))
        randlist.extend(range(nrows - valmax, nrows, chunksize))
        # expand the list ten times so as to use the cache
        randlist = randlist * 10
        # shuffle the list
        random.shuffle(randlist)
        # reset the value of chunksize
        chunksize = chunksize // 10
        # print "chunksize-->", chunksize
        # randlist.sort();print "randlist-->", randlist
    else:
        chunksize = 3
    if heavy:
        searchmodelist = ["int", "float"]
    else:
        searchmodelist = ["string", "int", "float"]

    # Execute queries
    for atom in searchmodelist:
        time2 = 0
        cpu2 = 0
        rowsel = 0
        for i in range(riter):
            rnd = random.randrange(nrows)
            time1 = clock()
            cpu1 = cpuclock()
            if atom == "string":
                #cursor.execute(SQL1, "1111")
                cursor.execute(SQL1, str(rnd)[-4:])
            elif atom == "int":
                #cursor.execute(SQL2 % (rnd, rnd+3))
                cursor.execute(SQL2 % (rnd, rnd + dselect))
            elif atom == "float":
                #cursor.execute(SQL3 % (float(rnd), float(rnd+3)))
                cursor.execute(SQL3 % (float(rnd), float(rnd + dselect)))
            else:
                raise ValueError(
                    "atom must take a value in ['string','int','float']")
            if i == 0:
                t1 = clock() - time1
                tcpu1 = cpuclock() - cpu1
            else:
                if indexmode == "indexed":
                    # if indexed, wait until the 5th iteration to take
                    # times (so as to insure that the index is
                    # effectively cached)
                    if i >= 5:
                        time2 += clock() - time1
                        cpu2 += cpuclock() - cpu1
                else:
                    time2 += clock() - time1
                    cpu2 += cpuclock() - cpu1
        if riter > 1:
            if indexmode == "indexed" and riter >= 5:
                correction = 5
            else:
                correction = 1
            t2 = time2 / (riter - correction)
            tcpu2 = cpu2 / (riter - correction)

        print(
            f"*** Query results for atom = {atom}, "
            f"nrows = {nrows}, indexmode = {indexmode} ***")
        print(f"Query time: {t1:.5f}, cached time: {t2:.5f}")
        print(f"MRows/s: {nrows / 1_000_000 / t1:.3f}", end=' ')
        if t2 > 0:
            print(f", cached MRows/s: {nrows / 10 ** 6 / t2:.3f}")
        else:
            print()

        # Collect benchmark data
        recsize = "sqlite_small"
        tablepath = "/" + recsize + "/search/" + indexmode + "/" + atom
        table = bf.get_node(tablepath)
        table.row["nrows"] = nrows
        table.row["rowsel"] = rowsel
        table.row["time1"] = t1
        table.row["time2"] = t2
        table.row["tcpu1"] = tcpu1
        table.row["tcpu2"] = tcpu2
        table.row["psyco"] = psycon
        table.row["rowsec1"] = nrows / t1
        if t2 > 0:
            table.row["rowsec2"] = nrows / t2
        table.row.append()
        table.flush()  # Flush the data

    # Close the database
    conn.close()
    bf.close()  # the bench database

    return
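
The read benchmarks share a cold/warm timing pattern: the first query is timed separately, and for indexed runs the next few iterations are discarded so the index cache is warm before averaging. Factored out as a sketch (not part of the original code; clock() as in the helper sketch after Example #1):

def timed_queries(run_query, riter, warmup=1):
    # Time riter runs of run_query(); return (cold, warm) where cold is
    # the first-run time and warm is the mean over runs i >= warmup.
    # The benchmarks above use warmup=5 for indexed runs, 1 otherwise.
    cold = warm_total = 0.0
    for i in range(riter):
        t0 = clock()
        run_query()
        elapsed = clock() - t0
        if i == 0:
            cold = elapsed
        elif i >= warmup:
            warm_total += elapsed
    warm = warm_total / max(riter - warmup, 1)
    return cold, warm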