def test_many_fp_formats(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (100, 100, 'cB', 180), (100000, 10, 'cA', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() print "Will do %s" % NUM_CASES for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) hex_key = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100) print "Parse result['destination_key']:", hex_key inspect = h2o_cmd.runInspect(None, hex_key) print "Removing", hex_key h2o.nodes[0].remove_key(hex_key)
def test_many_fp_formats(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (100, 100, 'cB', 180), (100000, 10, 'cA', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() print "Will do %s" % NUM_CASES for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) hex_key = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100) print "Parse result['destination_key']:", hex_key inspect = h2o_cmd.runInspect(None, hex_key) print "Removing", hex_key h2o.nodes[0].remove_key(hex_key)
def test_many_fp_formats(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, 'cA', 180), (100, 1000, 'cB', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename
def test_many_cols_and_values_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, 'cA', 30), (100, 1000, 'cB', 30), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write a rowCount x colCount csv where each row repeats one random
    triangular value, formatted with the fp format case chosen by `sel`.

    Raises Exception if a non-zero `sel` is outside the valid format range.
    """
    # we can do all sorts of methods off the r object
    r = random.Random(SEEDPERFILE)

    NUM_CASES = h2o_util.fp_format()
    # a falsy sel (None/0) means default/first case, which is always valid
    if sel and (sel < 0 or sel >= NUM_CASES):
        # BUG FIX: the original passed the format string and the args tuple as
        # two separate Exception args, so the message was never %-formatted.
        raise Exception("sel used to select from possible fp formats is out of range: %s %s"
            % (sel, NUM_CASES))

    ## MIN = -1e20
    ## MAX = 1e20
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        val = r.triangular(-1e9, 1e9, 0)
        s = h2o_util.fp_format(val, sel=sel)  # use same format for all numbers
        rowData = [s for j in range(colCount)]
        rowDataCsv = ",".join(rowData) + "\n"
        dsf.write(rowDataCsv)
    dsf.close()
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 100, 'cG', 400), (200000, 100, 'cH', 400), (400000, 100, 'cI', 400), (800000, 100, 'cJ', 400), (1000000, 100, 'cK', 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES-1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) iA = h2o_cmd.InspectObj(pA.parse_key) parseElapsed = pA.python_elapsed parse_key = pA.parse_key byteSize = pA.byteSize numRows = iA.numRows numCols = iA.numCols print parse_key, parseElapsed, byteSize, numRows, numCols labelList = iA.labelList node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(byteSize) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1==1: xLabel = 'byteSize' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write a rowCount x colCount csv; each row repeats one random triangular
    value formatted via the fp format case `sel`.

    Raises Exception when a non-zero `sel` is outside the valid format range.
    """
    # we can do all sorts of methods off the r object
    r = random.Random(SEEDPERFILE)

    NUM_CASES = h2o_util.fp_format()
    # falsy sel (None/0) is treated as "no explicit selection" and not validated
    if sel and (sel < 0 or sel >= NUM_CASES):
        # BUG FIX: actually %-format the message; the original gave Exception two
        # separate args (format string, tuple) so no interpolation happened.
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s"
            % (sel, NUM_CASES))

    ## MIN = -1e20
    ## MAX = 1e20
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        val = r.triangular(-1e9, 1e9, 0)
        s = h2o_util.fp_format(val, sel=sel)  # use same format for all numbers
        rowData = [s for j in range(colCount)]
        rowDataCsv = ",".join(rowData) + "\n"
        dsf.write(rowDataCsv)
    dsf.close()
def test_fp_many_cols_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if H2O_SUPPORTS_OVER_500K_COLS: tryList = [ (100, 200000, 'cG', 120, 120), (100, 300000, 'cH', 120, 120), (100, 400000, 'cI', 120, 120), (100, 500000, 'cJ', 120, 120), (100, 700000, 'cL', 120, 120), (100, 800000, 'cM', 120, 120), (100, 900000, 'cN', 120, 120), (100, 1000000, 'cO', 120, 120), (100, 1200000, 'cK', 120, 120), ] else: print "Restricting number of columns tested to <=500,000" tryList = [ (100, 50000, 'cG', 400, 400), ] for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) start = time.time() print csvFilename, "parse starting" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o.check_sandbox_for_errors() print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount))
def addRandValToRowStuff(colNumber, valMin, valMax, rowData, synColSumDict):
    """Maybe append one "col:value" libsvm token for colNumber.

    Returns the generated value, or None when the value landed in the
    middle range that is deliberately dropped (treated as zero for svm).

    NOTE(review): depends on module-level `r` (random source) and `sel`
    (fp format selector) — confirm both exist at module scope.
    """
    # colNumber should not be 0, because the output will be there
    ## val = r.uniform(MIN,MAX)
    val = r.triangular(valMin, valMax, 0)
    valFormatted = h2o_util.fp_format(val, sel)

    # force it to be zero in this range. so we don't print zeroes for svm!
    if valMin / 2 < val < valMax / 2:
        return None

    rowData.append("%s:%s" % (colNumber, valFormatted))  # f should always return string
    # sum of column (dict)
    synColSumDict[colNumber] = synColSumDict.get(colNumber, 0) + val
    return val
def addValToRowStuff(colNumber, val, rowData, synColSumDict):
    """Accumulate val into the per-column sum dict; append a libsvm token
    unless val is zero. Returns val, or None when nothing was appended.
    """
    # want to add here, so we can have cols with 0 expected value
    # but we need to track max col that actually goes in the libsvm, so we know
    # how many cols should be in the parsed data
    synColSumDict[colNumber] = synColSumDict.get(colNumber, 0) + val  # sum of column (dict)

    # don't want to print zero values in row data, because if fp format, then h2o will parse to 4 bytes (even if 0)
    valFormatted = h2o_util.fp_format(val, sel)
    if val == 0:
        return None
    rowData.append("%s:%s" % (colNumber, valFormatted))  # f should always return string
    return val
def addValToRowStuff(colNumber, val, rowData, synColSumDict):
    """Track val in the column-sum dict; emit a "col:value" token when nonzero.

    Returns val when a token was appended, otherwise None.
    """
    # want to add here, so we can have cols with 0 expected value
    # but we need to track max col that actually goes in the libsvm, so we know
    # how many cols should be in the parsed data
    if colNumber not in synColSumDict:
        synColSumDict[colNumber] = 0
    synColSumDict[colNumber] += val  # sum of column (dict)

    # don't want to print zero values in row data, because if fp format, then h2o will parse to 4 bytes (even if 0)
    formatted = h2o_util.fp_format(val, sel)
    if val != 0:
        rowData.append(str(colNumber) + ":" + formatted)  # f should always return string
        return val
    return None
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write rowCount identical csv rows of one fixed random value, formatted
    with fp format case `sel`.
    """
    # we can do all sorts of methods off the r object
    rng = random.Random(SEEDPERFILE)
    ## MIN = -1e20
    ## MAX = 1e20
    # okay to use the same value across the whole dataset?
    ## val = r.uniform(MIN,MAX)
    cellValue = h2o_util.fp_format(rng.triangular(-1e9, 1e9, 0), sel)  # f should always return string

    # every row is identical, so the line can be built once and repeated
    line = ",".join([cellValue] * colCount) + "\n"
    dsf = open(csvPathname, "w+")
    for _ in range(rowCount):
        dsf.write(line)
    dsf.close()
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write a csv whose every cell is the same random value formatted via `sel`."""
    # we can do all sorts of methods off the r object
    rand = random.Random(SEEDPERFILE)
    ## MIN = -1e20
    ## MAX = 1e20
    # okay to use the same value across the whole dataset?
    ## val = r.uniform(MIN,MAX)
    baseVal = rand.triangular(-1e9, 1e9, 0)
    formatted = h2o_util.fp_format(baseVal, sel)  # f should always return string

    dsf = open(csvPathname, "w+")
    for rowIdx in range(rowCount):
        cells = [formatted for _ in range(colCount)]
        dsf.write(",".join(cells) + "\n")
    dsf.close()
def test_exec2_xorsum2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(20): ullResultList = [] NUM_FORMAT_CASES = h2o_util.fp_format() for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname sel = random.randint(0, NUM_FORMAT_CASES-1) (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: start = time.time() (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print 'exec took', time.time() - start, 'seconds' print "execResult:", h2o.dump_json(execResult) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum 
(0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way. needed when integers are parsed # okay for a couple of lsbs to be wrong, due to conversion from stringk # ullResult (0.16x): 0x02c1a21f923cee96 2.15698793923e-295 # expectedUllSum (0.16x): 0x02c1a21f923cee97 2.15698793923e-295 # expectedFpSum (0.16x): 0x42f054af32b3c408 2.87294442126e+14 # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them. # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues ALLOWED_BIT_ERR = 0x1f # seeing this amount of error! if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR): raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum) # print "%30s" % "hex(bitResult):", hex(ullResult) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a csv of random big-exponent fp values ('e' format per `sel`) and
    return (expectedUllSum, expectedFpSum) computed python-side from the
    formatted strings (so expectations match what h2o will actually parse).

    NOTE(review): SEEDPERFILE is accepted but not used here; values come from
    the module-level `random` state — confirm that is intended.
    """
    dsf = open(csvPathname, 'w')
    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    for row in range(rowCount):
        rowData = []
        for j in range(colCount):
            value = expectedMin + (random.random() * expectedRange)
            # always-taken branch kept for easy experimentation with the
            # alternative value recipes preserved in the comments below
            if 1==1:
                # value = row * 2 # bad sum
                # value = 5555555555555 + row # bad
                # value = 555555555555 + row
                # value = 55555555555 + row # fail
                # value = 5555555555 + row
                # exp = random.randint(0,120) # 50 bad?
                # constrain the dynamic range of the numbers to be within IEEE-754 support
                # without loss of precision when adding. Why do we care though?
                # could h2o compress if values are outside that kind of dynamic range ?
                # we want a big exponent?
                exp = random.randint(40,71)
                # skip over the current bug around int boundaries?
                # have a fixed base
                value = random.random() + (2 ** exp)
                # value = -1 * value
                # value = 2e9 + row
                # value = 3 * row
                r = random.randint(0,1)
                # negation path deliberately disabled (False and ...)
                if False and r==0:
                    value = -1 * value # hack

            # print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x" % expectedUllSum
            # Now that you know how many decimals you want,
            # say, 15, just use a rstrip("0") to get rid of the unnecessary 0s:
            # fix. can't rstrip if .16e is used because trailing +00 becomes +, causes NA
            # dead branch (1==0) kept: original expected-sum computation from the
            # raw value instead of the formatted string
            if 1==0:
                # get the expected patterns from python
                fpResult = float(value)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                s = ("%.16f" % value).rstrip("0")
            # since we're printing full fp precision always here, we shouldn't have
            # to suck the formatted fp string (shorter?) back in
            # use a random fp format (string). use sel to force one you like
            else:
                NUM_CASES = h2o_util.fp_format()
                # s = h2o_util.fp_format(value, sel=None) # random
                s = h2o_util.fp_format(value, sel=sel, only='e') # use same case for all numbers
                # FIX! strip the trailing zeroes for now because they trigger a bug
                s = s.rstrip("0")
                # now our string formatting will lead to different values when we parse and use it
                # so we move the expected value generation down here..i.e after we've formatted the string
                # we'll suck it back in as a fp number
                # get the expected patterns from python
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                # s = ("%.16e" % value)
            rowData.append(s)
        rowDataCsv = ",".join(map(str,rowData))
        dsf.write(rowDataCsv + "\n")

    dsf.close()
    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    # print hex(~(0xf << 60))
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 100, 'cG', 400), (200000, 100, 'cH', 400), (400000, 100, 'cI', 400), (800000, 100, 'cJ', 400), (1000000, 100, 'cK', 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES - 1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) iA = h2o_cmd.InspectObj(pA.parse_key) parseElapsed = pA.python_elapsed parse_key = pA.parse_key byteSize = pA.byteSize numRows = iA.numRows numCols = iA.numCols print parse_key, parseElapsed, byteSize, numRows, numCols labelList = iA.labelList node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(byteSize) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1 == 1: xLabel = 'byteSize' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_many_fp_formats_libsvm(self):
    """Generate a libsvm dataset in a randomly-chosen fp format, parse it, and
    validate per-column metadata and column sums against python-side
    expectations (fvec variant: numCols/numRows, 'C*' column names).

    NOTE(review): relies on module globals defined elsewhere in this file
    (DO_SUMMARY, zeroList, exprList, classMin/classMax, valMin/valMax) and on
    the sibling write_syn_dataset(..., distribution) — confirm they exist.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # single random format case per config (list-of-one keeps loop shape)
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use numCols?. numCols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols))

                # build the expected per-column metadata for this k
                syn = {}
                if k == 0:
                    syn['name'] = "C1"
                    syn['type'] = {'Int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn['name'] = "C2"
                    syn['type'] = {'Int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    # syn['scale'] = {1}
                else:
                    syn['name'] = "C" + str(k + 1)
                    syn['type'] = {'Int', 'Real'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn['naCnt'] = 0
                syn['cardinality'] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols['min'], cols['max']
                            print "syn min/max:", syn['min'], syn['max']
                            raise Exception(
                                'col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(
                            syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_many_fp_formats_libsvm (self):
    """Older (pre-fvec) variant: generate a libsvm dataset in a random fp
    format, parse it, and validate per-column metadata (size/scale/type keys,
    'Target'/'V*' names, num_cols/num_rows) plus column sums.

    NOTE(review): relies on module globals defined elsewhere in this file
    (DO_SUMMARY, zeroList, exprList, classMin/classMax, valMin/valMax) and on
    the sibling write_syn_dataset(..., distribution) — confirm they exist.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30,'sparse'),
        (100, 100, 'cF', 30,'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random format case per config
        for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax+1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use num_cols?. num_cols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if k > colNumberMax: # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k>=0 and k<len(colSumList),
                    msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                # build the expected per-column metadata for this k
                syn = {}
                if k==0:
                    syn['name'] = "Target"
                    syn['size'] = {1,2} # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                    syn['type'] = {'int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    syn['scale'] = {1}
                    # syn['base'] = 0
                    # syn['variance'] = 0
                elif k==1: # we forced this to always be 0
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1}
                    syn['type'] = {'int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    syn['scale'] = {1}
                    syn['base'] = 0
                    syn['variance'] = 0
                else:
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1,2,4,8} # can be 2, 4 or 8? maybe make this a set for membership check
                    syn['type'] = {'int', 'float'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    syn['scale'] = {1,10,100,1000}
                    # syn['base'] = 0
                    # syn['variance'] = 0

                syn['num_missing_values'] = 0
                syn['enum_domain_size'] = 0
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            # for debug of why it was a bad size
                            print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                            print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a single-column csv of deliberately "nasty" fp values (pattern
    varies per chunk position) and return (expectedUllSum, expectedFpSum)
    computed python-side from the formatted strings.

    NOTE(review): relies on module globals CHUNKING_CNT, DO_NEGATIVE,
    RANDOM_E_FP_FORMATS, DO_BUG defined elsewhere in this file; SEEDPERFILE is
    accepted but unused here — confirm intended.
    """
    # this only does the sum stuff for single cols right now
    if colCount != 1:
        raise Exception("only support colCount == 1 here right now %s", colCount)

    NUM_CASES = h2o_util.fp_format()
    if sel and (sel < 0 or sel >= NUM_CASES):
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s",
            (sel, NUM_CASES))

    dsf = open(csvPathname, 'w')
    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    for row in range(rowCount):
        rowData = []
        for j in range(colCount):
            # Be Nasty!. We know fp compression varies per chunk
            # so...adjust the random fp data, depending on what rows your are at
            # i.e. cluster results per chunk, smaller variance within chunk, larger variance outside of chunk
            # Actually: generate "different" data depending on where you are in the rows
            method = row % CHUNKING_CNT
            if method == 1:
                value = expectedMin + (random.random() * expectedRange)
            elif method == 2:
                value = random.randint(1, 1e6)
            elif method == 3:
                value = 5555555555555 + row
            else:  # method == 0 and > 3
                # value = row * 2 # bad sum
                # value = 5555555555555 + row # bad
                # value = 555555555555 + row
                # value = 55555555555 + row # fail
                # value = 5555555555 + row
                # exp = random.randint(0,120) # 50 bad?
                # constrain the dynamic range of the numbers to be within IEEE-754 support
                # without loss of precision when adding. Why do we care though?
                # could h2o compress if values are outside that kind of dynamic range ?
                # we want a big exponent?
                # was
                # exp = random.randint(40,71)
                exp = random.randint(0, 120)
                # skip over the current bug around int boundaries?
                # have a fixed base
                value = random.random() + (2**exp)
                # value = -1 * value
                # value = 2e9 + row
                # value = 3 * row

            r = random.randint(0, 4)
            # 20% negative
            if DO_NEGATIVE and r == 0:
                value = -1 * value

            # print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x" % expectedUllSum
            # Now that you know how many decimals you want,
            # say, 15, just use a rstrip("0") to get rid of the unnecessary 0s:
            # old bugs was: can't rstrip if .16e is used because trailing +00 becomes +, causes NA

            # use a random fp format (string). use sel to force one you like
            # only keeps it to formats with "e"
            if RANDOM_E_FP_FORMATS:
                # s = h2o_util.fp_format(value, sel=sel) # this is e/f/g formats for a particular sel within each group
                # s = h2o_util.fp_format(value, sel=None) # this would be random
                s = h2o_util.fp_format(
                    value, sel=None,
                    only='e')  # this would be random, within 'e' only
            else:
                s = h2o_util.fp_format(
                    value, sel=sel, only='e')  # use same format for all numbers

            # FIX! strip the trailing zeroes for now because they trigger a bug
            if DO_BUG:
                pass
            else:
                s = s.rstrip("0")

            # now our string formatting will lead to different values when we parse and use it
            # so we move the expected value generation down here..i.e after we've formatted the string
            # we'll suck it back in as a fp number
            # get the expected patterns from python
            fpResult = float(s)
            expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
            expectedFpSum += fpResult
            # s = ("%.16e" % value)
            rowData.append(s)

        rowDataCsv = ",".join(map(str, rowData))
        dsf.write(rowDataCsv + "\n")

    dsf.close()
    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    # print hex(~(0xf << 60))
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a CSV of nasty random fp values; return (expectedUllSum, expectedFpSum).

    Same generator as the other xorsum variant in this file: value pattern varies
    with row % CHUNKING_CNT to stress h2o's per-chunk fp compression; each value
    is rendered with an 'e'-format string and the expected xor-of-bits sum and
    float sum are computed from the formatted string re-parsed to float.

    NOTE(review): SEEDPERFILE is accepted but not used here (no random.seed call
    visible) — presumably the caller seeds the RNG; confirm.
    """
    # this only does the sum stuff for single cols right now
    if colCount != 1:
        # was raise Exception("... %s", colCount): format string never interpolated
        raise Exception("only support colCount == 1 here right now %s" % colCount)

    NUM_CASES = h2o_util.fp_format()
    # was 'if sel and ...', which skipped the range check when sel == 0
    if sel is not None and (sel < 0 or sel >= NUM_CASES):
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s" % (sel, NUM_CASES))

    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    # 'with' guarantees the file is closed even on exception
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for j in range(colCount):
                # Be Nasty!. We know fp compression varies per chunk, so adjust
                # the random fp data depending on what rows you are at:
                # cluster results per chunk, smaller variance within chunk,
                # larger variance outside of chunk
                method = row % CHUNKING_CNT
                if method == 1:
                    value = expectedMin + (random.random() * expectedRange)
                elif method == 2:
                    # int(1e6): randint with a float bound is deprecated/fragile
                    value = random.randint(1, int(1e6))
                elif method == 3:
                    value = 5555555555555 + row
                else:  # method == 0 and > 3
                    # constrain the dynamic range to stay within IEEE-754 support
                    # without loss of precision when adding; we want a big exponent
                    exp = random.randint(0, 120)
                    # have a fixed base; skip over the current bug around int boundaries?
                    value = random.random() + (2 ** exp)

                # 20% negative: r == 0 out of 0..4
                r = random.randint(0, 4)
                if DO_NEGATIVE and r == 0:
                    value = -1 * value

                # use a random fp format (string); sel can force one. 'e' formats only.
                if RANDOM_E_FP_FORMATS:
                    s = h2o_util.fp_format(value, sel=None, only='e')  # random within 'e'
                else:
                    s = h2o_util.fp_format(value, sel=sel, only='e')  # same format for all

                # FIX! strip the trailing zeroes for now because they trigger a bug
                if not DO_BUG:
                    s = s.rstrip("0")

                # formatting can change the parsed value, so compute expected sums
                # from the formatted string rather than from 'value'
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                rowData.append(s)

            dsf.write(",".join(map(str, rowData)) + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_plot_remove_keys(self):
    """Time parse and remove_key over increasingly large synthetic datasets, then plot.

    For each (rowCount, colCount, ...) config: generate a random CSV, parse it
    into h2o, verify row/col counts via Inspect, delete the key, and record
    (parsedBytes, parseElapsed, removeElapsed) for a final h2o_gbm.plotLists call.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, parse timeout, inspect timeout)
    tryList = [
        (100000, 50, 'cG', 400, 400),
        (200000, 50, 'cH', 400, 400),
        (400000, 50, 'cI', 400, 400),
        (800000, 50, 'cJ', 400, 400),
        (1000000, 50, 'cK', 400, 400),
    ]

    # x axis and the two measured series for the plot at the end
    xList = []
    eList = []
    fList = []
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        NUM_CASES = h2o_util.fp_format()
        # pick one fp format case at random for this dataset
        sel = random.randint(0, NUM_CASES-1)
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        start = time.time()
        print csvFilename, "parse starting"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        parseElapsed = time.time() - start
        print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds"
        h2o.check_sandbox_for_errors()

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))

        parsedBytes = inspect['byteSize']

        node = h2o.nodes[0]
        print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
        start = time.time()
        node.remove_key(hex_key, timeoutSecs=30)
        removeElapsed = time.time() - start
        print "Deleting", hex_key, "took", removeElapsed, "seconds"

        # xList.append(ntrees)
        xList.append(parsedBytes)
        eList.append(parseElapsed)
        fList.append(removeElapsed)

    # just plot the last one
    if 1==1:
        xLabel = 'parsedBytes'
        eLabel = 'parseElapsed'
        fLabel = 'removeElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_many_fp_formats_libsvm_2_fvec(self):
    """Parse sparse SVMLight datasets with varied fp formats; verify col sums and means.

    Generates a random libsvm file per config (one randomly chosen fp format
    'sel'), forces parser_type='SVMLight', then checks numRows/numCols against
    the generator, optionally compares per-column sums from exec against the
    remembered generator sums, and requires each column's mean and naCnt to
    match expectations.
    """
    #h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, sparsity distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random fp format per config (list keeps the loop shape)
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(
                    key=selKey2, max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                colNumberMax + 1, numCols,
                msg=
                "generated %s cols (including output). parsed to %s cols" %
                (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs, print_params=False)
                #print "\n*************"
                #print "colResultList", colResultList
                #print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            # sort by col index so failures are reported in column order
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(
                        v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    # on mismatch, dump everything useful before raising
                    execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(
                        resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception(
                        'col %s mean %0.6f is not equal to generated mean %0.6f' %
                        (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_many_fp_formats_libsvm_2(self):
    """Parse sparse SVMLight datasets with varied fp formats (pre-fvec API variant).

    Same flow as the _fvec version but uses the older response fields
    ('response']['time'], 'num_missing_values') and checks each column's mean
    against the generator's remembered sum via assertAlmostEqual.
    """
    h2o.beta_features = True
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, sparsity distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random fp format per config (list keeps the loop shape)
        for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax+1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax+1, timeoutSecs=timeoutSecs)
                print "\n*************"
                print "colResultList", colResultList
                print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            ### print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0)/rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                self.assertAlmostEqual(mean, synMean, places=0,
                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                num_missing_values = inspect['cols'][k]['num_missing_values']
                self.assertEqual(0, num_missing_values,
                    msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a CSV of random fp values; return (expectedUllSum, expectedFpSum).

    Each value is random.random() + 2**exp with exp in [0, 50], formatted with
    the fp format selected by 'sel' (same format for every number). The expected
    xor-of-bits sum and float sum are computed from the formatted string re-parsed
    to float, since formatting can change the value h2o will parse. The upper 4
    bits of the xorsum are zeroed, like h2o does, to prevent inf/nan.

    Cleanup notes vs. the original: the always-true 'if 1==1:', the never-true
    'if False and r==0:' negation, and the dead 'if 1==0:' branch (which would
    have used a plain "%.16f" format) were removed; only the live path remains.
    All random draws are kept, in order, so the generated data stream matches
    the original byte-for-byte for the same RNG state.

    NOTE(review): SEEDPERFILE, expectedMin and expectedMax do not affect the
    output (the first 'value' draw is immediately overwritten) — presumably
    kept for signature compatibility with the other generators; confirm.
    """
    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    # 'with' guarantees the file is closed even if a format/convert step raises
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for j in range(colCount):
                # draw kept (though overwritten below) to preserve the RNG sequence
                value = expectedMin + (random.random() * expectedRange)
                # constrain the dynamic range of the numbers to be within IEEE-754
                # support without loss of precision when adding; we want a big exponent
                exp = random.randint(0,50)
                value = random.random() + (2 ** exp)
                # draw kept to preserve the RNG sequence; the negation it guarded
                # was disabled ('if False and r==0') in the original
                r = random.randint(0,1)

                s = h2o_util.fp_format(value, sel=sel)  # use same case for all numbers
                # our string formatting will lead to different values when we parse
                # and use it, so generate the expected sums from the formatted string
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                rowData.append(s)

            dsf.write(",".join(map(str, rowData)) + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_many_fp_formats_libsvm_fvec(self):
    """Parse sparse libsvm datasets and verify per-column metadata and sums.

    For each config: generate a random libsvm file with one randomly chosen fp
    format, parse it, then for every generated column check Inspect metadata
    (name, type, min/max subset, naCnt, cardinality) against expectations and
    compare the exec column sum to the generator's remembered sum.

    NOTE(review): classMin/classMax/valMin/valMax/zeroList/exprList/DO_SUMMARY
    are module-level names defined elsewhere in this file.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, sparsity distribution)
    tryList = [
        (10, 10, "cA", 30, "sparse50"),
        (100, 10, "cB", 30, "sparse"),
        (100000, 100, "cC", 30, "sparse"),
        (1000, 10, "cD", 30, "sparse50"),
        (100, 100, "cE", 30, "sparse"),
        (100, 100, "cF", 30, "sparse50"),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random fp format per config (list keeps the loop shape)
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution
            )

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=selKey2,
                timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            numCols = inspect["numCols"]
            numRows = inspect["numRows"]
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult["destination_key"],
                timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1,
                numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols),
            )

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs
            )

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols),
                )

                # build the expected Inspect metadata for this column
                syn = {}
                if k == 0:
                    syn["name"] = "C1"
                    syn["type"] = {"Int"}
                    syn["min"] = classMin
                    syn["max"] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn["name"] = "C2"
                    syn["type"] = {"Int"}
                    syn["min"] = 0
                    syn["max"] = 0
                    # syn['scale'] = {1}
                else:
                    syn["name"] = "C" + str(k + 1)
                    syn["type"] = {"Int", "Real"}
                    syn["min"] = valMin
                    syn["max"] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn["naCnt"] = 0
                syn["cardinality"] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect["cols"][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == "min":
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg="col %s %s %s should be <= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "max":
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg="col %s %s %s should be >= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "type":
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols["min"], cols["max"]
                            print "syn min/max:", syn["min"], syn["max"]
                            raise Exception(
                                "col %s %s %s should be in this allowed %s" % (k, synKey, cols[synKey], syn[synKey])
                            )
                    else:
                        self.assertEqual(
                            syn[synKey],
                            cols[synKey],
                            msg="col %s %s %s should be %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0,
                    msg="%0.6f col sum is not equal to expected %0.6f" % (v, colSum)
                )
def test_plot_remove_keys(self):
    """Time parse and remove_key over increasingly large synthetic datasets, then plot.

    Beta-features variant: same flow as the other test_plot_remove_keys in this
    file — generate, parse, Inspect-verify, delete — recording
    (parsedBytes, parseElapsed, removeElapsed) for a final h2o_gbm.plotLists call.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, parse timeout, inspect timeout)
    tryList = [
        (100000, 50, 'cG', 400, 400),
        (200000, 50, 'cH', 400, 400),
        (400000, 50, 'cI', 400, 400),
        (800000, 50, 'cJ', 400, 400),
        (1000000, 50, 'cK', 400, 400),
    ]

    # x axis and the two measured series for the plot at the end
    xList = []
    eList = []
    fList = []
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        NUM_CASES = h2o_util.fp_format()
        # pick one fp format case at random for this dataset
        sel = random.randint(0, NUM_CASES - 1)
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        start = time.time()
        print csvFilename, "parse starting"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        parseElapsed = time.time() - start
        print "Parse only:", parseResult[
            'destination_key'], "took", parseElapsed, "seconds"
        h2o.check_sandbox_for_errors()

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
            timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult[
            'destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(
            inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" %
            (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))

        parsedBytes = inspect['byteSize']

        node = h2o.nodes[0]
        print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
        start = time.time()
        node.remove_key(hex_key, timeoutSecs=30)
        removeElapsed = time.time() - start
        print "Deleting", hex_key, "took", removeElapsed, "seconds"

        # xList.append(ntrees)
        xList.append(parsedBytes)
        eList.append(parseElapsed)
        fList.append(removeElapsed)

    # just plot the last one
    if 1 == 1:
        xLabel = 'parsedBytes'
        eLabel = 'parseElapsed'
        fLabel = 'removeElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_xorsum2(self):
    """Compare h2o's exec xorsum/sum against Python-computed expected values.

    Over 3 trials: write a random fp dataset (expected xor-of-bits and float
    sums computed by write_syn_dataset), parse it, run each expression in
    exprList 3 times, and compare h2o's result bits to the expected xorsum,
    tolerating up to ALLOWED_DELTA lsb difference (string-conversion slop).
    Raises only if STOP_ON_ERROR is set; otherwise just prints the mismatch.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(3):
        ullResultList = []
        NUM_FORMAT_CASES = h2o_util.fp_format()
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(
                None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            # pick one fp format case at random for this dataset
            sel = random.randint(0, NUM_FORMAT_CASES - 1)
            (expectedUllSum, expectedFpSum) = write_syn_dataset(
                csvPathname, rowCount, colCount, expectedMin, expectedMax,
                SEEDPERFILE, sel)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                # repeat each expression to check result stability
                for repeate in range(3):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                        resultKey=None, timeoutSecs=300)
                    print 'exec took', time.time() - start, 'seconds'
                    print "execResult:", h2o.dump_json(execResult)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                        ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
                        expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
                        expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way. needed when integers are parsed
                    # okay for a couple of lsbs to be wrong, due to conversion from stringk
                    # ullResult (0.16x): 0x02c1a21f923cee96   2.15698793923e-295
                    # expectedUllSum (0.16x): 0x02c1a21f923cee97   2.15698793923e-295
                    # expectedFpSum (0.16x): 0x42f054af32b3c408   2.87294442126e+14

                    # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                    # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                    if ullResult != expectedUllSum and (
                            abs(ullResult - expectedUllSum) > ALLOWED_DELTA):
                        emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                            ullResult, expectedUllSum)
                        if STOP_ON_ERROR:
                            raise Exception(emsg)
                        else:
                            print emsg

                    # print "%30s" % "hex(bitResult):", hex(ullResult)

        h2o.check_sandbox_for_errors()

        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                ullResult, fpResult)

        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
            expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
            expectedFpSumAsLongLong, expectedFpSum)