def test_many_fp_formats(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (100, 100, 'cB', 180), (100000, 10, 'cA', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() print "Will do %s" % NUM_CASES for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) hex_key = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100) print "Parse result['destination_key']:", hex_key inspect = h2o_cmd.runInspect(None, hex_key) print "Removing", hex_key h2o.nodes[0].remove_key(hex_key)
def test_many_fp_formats(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (100, 100, 'cB', 180), (100000, 10, 'cA', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() print "Will do %s" % NUM_CASES for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) hex_key = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100) print "Parse result['destination_key']:", hex_key inspect = h2o_cmd.runInspect(None, hex_key) print "Removing", hex_key h2o.nodes[0].remove_key(hex_key)
def test_many_fp_formats(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, 'cA', 180), (100, 1000, 'cB', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename
def test_many_cols_and_values_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, 'cA', 30), (100, 1000, 'cB', 30), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write a rowCount x colCount csv where each row repeats one random
    triangular value, formatted with the fp format case chosen by `sel`.

    Raises Exception if a non-zero `sel` is outside the valid format range.
    """
    # we can do all sorts of methods off the r object
    r = random.Random(SEEDPERFILE)

    NUM_CASES = h2o_util.fp_format()
    # a falsy sel (None/0) means default/first case, which is always valid
    if sel and (sel < 0 or sel >= NUM_CASES):
        # BUG FIX: the original passed the format string and the args tuple as
        # two separate Exception args, so the message was never %-formatted.
        raise Exception("sel used to select from possible fp formats is out of range: %s %s"
            % (sel, NUM_CASES))

    ## MIN = -1e20
    ## MAX = 1e20
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        val = r.triangular(-1e9, 1e9, 0)
        s = h2o_util.fp_format(val, sel=sel)  # use same format for all numbers
        rowData = [s for j in range(colCount)]
        rowDataCsv = ",".join(rowData) + "\n"
        dsf.write(rowDataCsv)
    dsf.close()
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 100, 'cG', 400), (200000, 100, 'cH', 400), (400000, 100, 'cI', 400), (800000, 100, 'cJ', 400), (1000000, 100, 'cK', 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES-1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) iA = h2o_cmd.InspectObj(pA.parse_key) parseElapsed = pA.python_elapsed parse_key = pA.parse_key byteSize = pA.byteSize numRows = iA.numRows numCols = iA.numCols print parse_key, parseElapsed, byteSize, numRows, numCols labelList = iA.labelList node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(byteSize) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1==1: xLabel = 'byteSize' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write a rowCount x colCount csv; each row repeats one random triangular
    value formatted via the fp format case `sel`.

    Raises Exception when a non-zero `sel` is outside the valid format range.
    """
    # we can do all sorts of methods off the r object
    r = random.Random(SEEDPERFILE)

    NUM_CASES = h2o_util.fp_format()
    # falsy sel (None/0) is treated as "no explicit selection" and not validated
    if sel and (sel < 0 or sel >= NUM_CASES):
        # BUG FIX: actually %-format the message; the original gave Exception two
        # separate args (format string, tuple) so no interpolation happened.
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s"
            % (sel, NUM_CASES))

    ## MIN = -1e20
    ## MAX = 1e20
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        val = r.triangular(-1e9, 1e9, 0)
        s = h2o_util.fp_format(val, sel=sel)  # use same format for all numbers
        rowData = [s for j in range(colCount)]
        rowDataCsv = ",".join(rowData) + "\n"
        dsf.write(rowDataCsv)
    dsf.close()
def test_fp_many_cols_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if H2O_SUPPORTS_OVER_500K_COLS: tryList = [ (100, 200000, 'cG', 120, 120), (100, 300000, 'cH', 120, 120), (100, 400000, 'cI', 120, 120), (100, 500000, 'cJ', 120, 120), (100, 700000, 'cL', 120, 120), (100, 800000, 'cM', 120, 120), (100, 900000, 'cN', 120, 120), (100, 1000000, 'cO', 120, 120), (100, 1200000, 'cK', 120, 120), ] else: print "Restricting number of columns tested to <=500,000" tryList = [ (100, 50000, 'cG', 400, 400), ] for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) start = time.time() print csvFilename, "parse starting" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o.check_sandbox_for_errors() print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount))
def addRandValToRowStuff(colNumber, valMin, valMax, rowData, synColSumDict):
    """Maybe append one "col:value" libsvm token for colNumber.

    Returns the generated value, or None when the value landed in the
    middle range that is deliberately dropped (treated as zero for svm).

    NOTE(review): depends on module-level `r` (random source) and `sel`
    (fp format selector) — confirm both exist at module scope.
    """
    # colNumber should not be 0, because the output will be there
    ## val = r.uniform(MIN,MAX)
    val = r.triangular(valMin, valMax, 0)
    valFormatted = h2o_util.fp_format(val, sel)

    # force it to be zero in this range. so we don't print zeroes for svm!
    if valMin / 2 < val < valMax / 2:
        return None

    rowData.append("%s:%s" % (colNumber, valFormatted))  # f should always return string
    # sum of column (dict)
    synColSumDict[colNumber] = synColSumDict.get(colNumber, 0) + val
    return val
def addValToRowStuff(colNumber, val, rowData, synColSumDict):
    """Accumulate val into the per-column sum dict; append a libsvm token
    unless val is zero. Returns val, or None when nothing was appended.
    """
    # want to add here, so we can have cols with 0 expected value
    # but we need to track max col that actually goes in the libsvm, so we know
    # how many cols should be in the parsed data
    synColSumDict[colNumber] = synColSumDict.get(colNumber, 0) + val  # sum of column (dict)

    # don't want to print zero values in row data, because if fp format, then h2o will parse to 4 bytes (even if 0)
    valFormatted = h2o_util.fp_format(val, sel)
    if val == 0:
        return None
    rowData.append("%s:%s" % (colNumber, valFormatted))  # f should always return string
    return val
def addValToRowStuff(colNumber, val, rowData, synColSumDict):
    """Track val in the column-sum dict; emit a "col:value" token when nonzero.

    Returns val when a token was appended, otherwise None.
    """
    # want to add here, so we can have cols with 0 expected value
    # but we need to track max col that actually goes in the libsvm, so we know
    # how many cols should be in the parsed data
    if colNumber not in synColSumDict:
        synColSumDict[colNumber] = 0
    synColSumDict[colNumber] += val  # sum of column (dict)

    # don't want to print zero values in row data, because if fp format, then h2o will parse to 4 bytes (even if 0)
    formatted = h2o_util.fp_format(val, sel)
    if val != 0:
        rowData.append(str(colNumber) + ":" + formatted)  # f should always return string
        return val
    return None
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write rowCount identical csv rows of one fixed random value, formatted
    with fp format case `sel`.
    """
    # we can do all sorts of methods off the r object
    rng = random.Random(SEEDPERFILE)
    ## MIN = -1e20
    ## MAX = 1e20
    # okay to use the same value across the whole dataset?
    ## val = r.uniform(MIN,MAX)
    cellValue = h2o_util.fp_format(rng.triangular(-1e9, 1e9, 0), sel)  # f should always return string

    # every row is identical, so the line can be built once and repeated
    line = ",".join([cellValue] * colCount) + "\n"
    dsf = open(csvPathname, "w+")
    for _ in range(rowCount):
        dsf.write(line)
    dsf.close()
def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
    """Write a csv whose every cell is the same random value formatted via `sel`."""
    # we can do all sorts of methods off the r object
    rand = random.Random(SEEDPERFILE)
    ## MIN = -1e20
    ## MAX = 1e20
    # okay to use the same value across the whole dataset?
    ## val = r.uniform(MIN,MAX)
    baseVal = rand.triangular(-1e9, 1e9, 0)
    formatted = h2o_util.fp_format(baseVal, sel)  # f should always return string

    dsf = open(csvPathname, "w+")
    for rowIdx in range(rowCount):
        cells = [formatted for _ in range(colCount)]
        dsf.write(",".join(cells) + "\n")
    dsf.close()
def test_exec2_xorsum2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(20): ullResultList = [] NUM_FORMAT_CASES = h2o_util.fp_format() for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname sel = random.randint(0, NUM_FORMAT_CASES-1) (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: start = time.time() (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print 'exec took', time.time() - start, 'seconds' print "execResult:", h2o.dump_json(execResult) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum 
(0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way. needed when integers are parsed # okay for a couple of lsbs to be wrong, due to conversion from stringk # ullResult (0.16x): 0x02c1a21f923cee96 2.15698793923e-295 # expectedUllSum (0.16x): 0x02c1a21f923cee97 2.15698793923e-295 # expectedFpSum (0.16x): 0x42f054af32b3c408 2.87294442126e+14 # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them. # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues ALLOWED_BIT_ERR = 0x1f # seeing this amount of error! if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR): raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum) # print "%30s" % "hex(bitResult):", hex(ullResult) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a csv of random big-exponent fp values ('e' format per `sel`) and
    return (expectedUllSum, expectedFpSum) computed python-side from the
    formatted strings (so expectations match what h2o will actually parse).

    NOTE(review): SEEDPERFILE is accepted but not used here; values come from
    the module-level `random` state — confirm that is intended.
    """
    dsf = open(csvPathname, 'w')
    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    for row in range(rowCount):
        rowData = []
        for j in range(colCount):
            value = expectedMin + (random.random() * expectedRange)
            # always-taken branch kept for easy experimentation with the
            # alternative value recipes preserved in the comments below
            if 1==1:
                # value = row * 2 # bad sum
                # value = 5555555555555 + row # bad
                # value = 555555555555 + row
                # value = 55555555555 + row # fail
                # value = 5555555555 + row
                # exp = random.randint(0,120) # 50 bad?
                # constrain the dynamic range of the numbers to be within IEEE-754 support
                # without loss of precision when adding. Why do we care though?
                # could h2o compress if values are outside that kind of dynamic range ?
                # we want a big exponent?
                exp = random.randint(40,71)
                # skip over the current bug around int boundaries?
                # have a fixed base
                value = random.random() + (2 ** exp)
                # value = -1 * value
                # value = 2e9 + row
                # value = 3 * row
                r = random.randint(0,1)
                # negation path deliberately disabled (False and ...)
                if False and r==0:
                    value = -1 * value # hack

            # print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x" % expectedUllSum
            # Now that you know how many decimals you want,
            # say, 15, just use a rstrip("0") to get rid of the unnecessary 0s:
            # fix. can't rstrip if .16e is used because trailing +00 becomes +, causes NA
            # dead branch (1==0) kept: original expected-sum computation from the
            # raw value instead of the formatted string
            if 1==0:
                # get the expected patterns from python
                fpResult = float(value)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                s = ("%.16f" % value).rstrip("0")
            # since we're printing full fp precision always here, we shouldn't have
            # to suck the formatted fp string (shorter?) back in
            # use a random fp format (string). use sel to force one you like
            else:
                NUM_CASES = h2o_util.fp_format()
                # s = h2o_util.fp_format(value, sel=None) # random
                s = h2o_util.fp_format(value, sel=sel, only='e') # use same case for all numbers
                # FIX! strip the trailing zeroes for now because they trigger a bug
                s = s.rstrip("0")
                # now our string formatting will lead to different values when we parse and use it
                # so we move the expected value generation down here..i.e after we've formatted the string
                # we'll suck it back in as a fp number
                # get the expected patterns from python
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                # s = ("%.16e" % value)
            rowData.append(s)
        rowDataCsv = ",".join(map(str,rowData))
        dsf.write(rowDataCsv + "\n")

    dsf.close()
    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    # print hex(~(0xf << 60))
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 100, 'cG', 400), (200000, 100, 'cH', 400), (400000, 100, 'cI', 400), (800000, 100, 'cJ', 400), (1000000, 100, 'cK', 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES - 1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) iA = h2o_cmd.InspectObj(pA.parse_key) parseElapsed = pA.python_elapsed parse_key = pA.parse_key byteSize = pA.byteSize numRows = iA.numRows numCols = iA.numCols print parse_key, parseElapsed, byteSize, numRows, numCols labelList = iA.labelList node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(byteSize) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1 == 1: xLabel = 'byteSize' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_many_fp_formats_libsvm(self):
    """Generate a libsvm dataset in a randomly-chosen fp format, parse it, and
    validate per-column metadata and column sums against python-side
    expectations (fvec variant: numCols/numRows, 'C*' column names).

    NOTE(review): relies on module globals defined elsewhere in this file
    (DO_SUMMARY, zeroList, exprList, classMin/classMax, valMin/valMax) and on
    the sibling write_syn_dataset(..., distribution) — confirm they exist.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # single random format case per config (list-of-one keeps loop shape)
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use numCols?. numCols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols))

                # build the expected per-column metadata for this k
                syn = {}
                if k == 0:
                    syn['name'] = "C1"
                    syn['type'] = {'Int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn['name'] = "C2"
                    syn['type'] = {'Int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    # syn['scale'] = {1}
                else:
                    syn['name'] = "C" + str(k + 1)
                    syn['type'] = {'Int', 'Real'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn['naCnt'] = 0
                syn['cardinality'] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols['min'], cols['max']
                            print "syn min/max:", syn['min'], syn['max']
                            raise Exception(
                                'col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(
                            syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_many_fp_formats_libsvm (self):
    """Older (pre-fvec) variant: generate a libsvm dataset in a random fp
    format, parse it, and validate per-column metadata (size/scale/type keys,
    'Target'/'V*' names, num_cols/num_rows) plus column sums.

    NOTE(review): relies on module globals defined elsewhere in this file
    (DO_SUMMARY, zeroList, exprList, classMin/classMax, valMin/valMax) and on
    the sibling write_syn_dataset(..., distribution) — confirm they exist.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30,'sparse'),
        (100, 100, 'cF', 30,'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random format case per config
        for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax+1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use num_cols?. num_cols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if k > colNumberMax: # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k>=0 and k<len(colSumList),
                    msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                # build the expected per-column metadata for this k
                syn = {}
                if k==0:
                    syn['name'] = "Target"
                    syn['size'] = {1,2} # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                    syn['type'] = {'int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    syn['scale'] = {1}
                    # syn['base'] = 0
                    # syn['variance'] = 0
                elif k==1: # we forced this to always be 0
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1}
                    syn['type'] = {'int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    syn['scale'] = {1}
                    syn['base'] = 0
                    syn['variance'] = 0
                else:
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1,2,4,8} # can be 2, 4 or 8? maybe make this a set for membership check
                    syn['type'] = {'int', 'float'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    syn['scale'] = {1,10,100,1000}
                    # syn['base'] = 0
                    # syn['variance'] = 0

                syn['num_missing_values'] = 0
                syn['enum_domain_size'] = 0
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            # for debug of why it was a bad size
                            print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                            print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a single-column csv of deliberately "nasty" fp values (pattern
    varies per chunk position) and return (expectedUllSum, expectedFpSum)
    computed python-side from the formatted strings.

    NOTE(review): relies on module globals CHUNKING_CNT, DO_NEGATIVE,
    RANDOM_E_FP_FORMATS, DO_BUG defined elsewhere in this file; SEEDPERFILE is
    accepted but unused here — confirm intended.
    """
    # this only does the sum stuff for single cols right now
    if colCount != 1:
        raise Exception("only support colCount == 1 here right now %s", colCount)

    NUM_CASES = h2o_util.fp_format()
    if sel and (sel < 0 or sel >= NUM_CASES):
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s",
            (sel, NUM_CASES))

    dsf = open(csvPathname, 'w')
    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    for row in range(rowCount):
        rowData = []
        for j in range(colCount):
            # Be Nasty!. We know fp compression varies per chunk
            # so...adjust the random fp data, depending on what rows your are at
            # i.e. cluster results per chunk, smaller variance within chunk, larger variance outside of chunk
            # Actually: generate "different" data depending on where you are in the rows
            method = row % CHUNKING_CNT
            if method == 1:
                value = expectedMin + (random.random() * expectedRange)
            elif method == 2:
                value = random.randint(1, 1e6)
            elif method == 3:
                value = 5555555555555 + row
            else:  # method == 0 and > 3
                # value = row * 2 # bad sum
                # value = 5555555555555 + row # bad
                # value = 555555555555 + row
                # value = 55555555555 + row # fail
                # value = 5555555555 + row
                # exp = random.randint(0,120) # 50 bad?
                # constrain the dynamic range of the numbers to be within IEEE-754 support
                # without loss of precision when adding. Why do we care though?
                # could h2o compress if values are outside that kind of dynamic range ?
                # we want a big exponent?
                # was
                # exp = random.randint(40,71)
                exp = random.randint(0, 120)
                # skip over the current bug around int boundaries?
                # have a fixed base
                value = random.random() + (2**exp)
                # value = -1 * value
                # value = 2e9 + row
                # value = 3 * row

            r = random.randint(0, 4)
            # 20% negative
            if DO_NEGATIVE and r == 0:
                value = -1 * value

            # print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x" % expectedUllSum
            # Now that you know how many decimals you want,
            # say, 15, just use a rstrip("0") to get rid of the unnecessary 0s:
            # old bugs was: can't rstrip if .16e is used because trailing +00 becomes +, causes NA

            # use a random fp format (string). use sel to force one you like
            # only keeps it to formats with "e"
            if RANDOM_E_FP_FORMATS:
                # s = h2o_util.fp_format(value, sel=sel) # this is e/f/g formats for a particular sel within each group
                # s = h2o_util.fp_format(value, sel=None) # this would be random
                s = h2o_util.fp_format(
                    value, sel=None,
                    only='e')  # this would be random, within 'e' only
            else:
                s = h2o_util.fp_format(
                    value, sel=sel, only='e')  # use same format for all numbers

            # FIX! strip the trailing zeroes for now because they trigger a bug
            if DO_BUG:
                pass
            else:
                s = s.rstrip("0")

            # now our string formatting will lead to different values when we parse and use it
            # so we move the expected value generation down here..i.e after we've formatted the string
            # we'll suck it back in as a fp number
            # get the expected patterns from python
            fpResult = float(s)
            expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
            expectedFpSum += fpResult
            # s = ("%.16e" % value)
            rowData.append(s)

        rowDataCsv = ",".join(map(str, rowData))
        dsf.write(rowDataCsv + "\n")

    dsf.close()
    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    # print hex(~(0xf << 60))
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a CSV of nasty random fp values; return (expectedUllSum, expectedFpSum).

    Same generator as the other xorsum variant in this file: value pattern varies
    with row % CHUNKING_CNT to stress h2o's per-chunk fp compression; each value
    is rendered with an 'e'-format string and the expected xor-of-bits sum and
    float sum are computed from the formatted string re-parsed to float.

    NOTE(review): SEEDPERFILE is accepted but not used here (no random.seed call
    visible) — presumably the caller seeds the RNG; confirm.
    """
    # this only does the sum stuff for single cols right now
    if colCount != 1:
        # was raise Exception("... %s", colCount): format string never interpolated
        raise Exception("only support colCount == 1 here right now %s" % colCount)

    NUM_CASES = h2o_util.fp_format()
    # was 'if sel and ...', which skipped the range check when sel == 0
    if sel is not None and (sel < 0 or sel >= NUM_CASES):
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s" % (sel, NUM_CASES))

    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    # 'with' guarantees the file is closed even on exception
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for j in range(colCount):
                # Be Nasty!. We know fp compression varies per chunk, so adjust
                # the random fp data depending on what rows you are at:
                # cluster results per chunk, smaller variance within chunk,
                # larger variance outside of chunk
                method = row % CHUNKING_CNT
                if method == 1:
                    value = expectedMin + (random.random() * expectedRange)
                elif method == 2:
                    # int(1e6): randint with a float bound is deprecated/fragile
                    value = random.randint(1, int(1e6))
                elif method == 3:
                    value = 5555555555555 + row
                else:  # method == 0 and > 3
                    # constrain the dynamic range to stay within IEEE-754 support
                    # without loss of precision when adding; we want a big exponent
                    exp = random.randint(0, 120)
                    # have a fixed base; skip over the current bug around int boundaries?
                    value = random.random() + (2 ** exp)

                # 20% negative: r == 0 out of 0..4
                r = random.randint(0, 4)
                if DO_NEGATIVE and r == 0:
                    value = -1 * value

                # use a random fp format (string); sel can force one. 'e' formats only.
                if RANDOM_E_FP_FORMATS:
                    s = h2o_util.fp_format(value, sel=None, only='e')  # random within 'e'
                else:
                    s = h2o_util.fp_format(value, sel=sel, only='e')  # same format for all

                # FIX! strip the trailing zeroes for now because they trigger a bug
                if not DO_BUG:
                    s = s.rstrip("0")

                # formatting can change the parsed value, so compute expected sums
                # from the formatted string rather than from 'value'
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                rowData.append(s)

            dsf.write(",".join(map(str, rowData)) + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_plot_remove_keys(self):
    """Time parse and remove_key over increasingly large synthetic datasets, then plot.

    For each (rowCount, colCount, ...) config: generate a random CSV, parse it
    into h2o, verify row/col counts via Inspect, delete the key, and record
    (parsedBytes, parseElapsed, removeElapsed) for a final h2o_gbm.plotLists call.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, parse timeout, inspect timeout)
    tryList = [
        (100000, 50, 'cG', 400, 400),
        (200000, 50, 'cH', 400, 400),
        (400000, 50, 'cI', 400, 400),
        (800000, 50, 'cJ', 400, 400),
        (1000000, 50, 'cK', 400, 400),
    ]

    # x axis and the two measured series for the plot at the end
    xList = []
    eList = []
    fList = []
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        NUM_CASES = h2o_util.fp_format()
        # pick one fp format case at random for this dataset
        sel = random.randint(0, NUM_CASES-1)
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        start = time.time()
        print csvFilename, "parse starting"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        parseElapsed = time.time() - start
        print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds"
        h2o.check_sandbox_for_errors()

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))

        parsedBytes = inspect['byteSize']

        node = h2o.nodes[0]
        print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
        start = time.time()
        node.remove_key(hex_key, timeoutSecs=30)
        removeElapsed = time.time() - start
        print "Deleting", hex_key, "took", removeElapsed, "seconds"

        # xList.append(ntrees)
        xList.append(parsedBytes)
        eList.append(parseElapsed)
        fList.append(removeElapsed)

    # just plot the last one
    if 1==1:
        xLabel = 'parsedBytes'
        eLabel = 'parseElapsed'
        fLabel = 'removeElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_many_fp_formats_libsvm_2_fvec(self):
    """Parse sparse SVMLight datasets with varied fp formats; verify col sums and means.

    Generates a random libsvm file per config (one randomly chosen fp format
    'sel'), forces parser_type='SVMLight', then checks numRows/numCols against
    the generator, optionally compares per-column sums from exec against the
    remembered generator sums, and requires each column's mean and naCnt to
    match expectations.
    """
    #h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, sparsity distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random fp format per config (list keeps the loop shape)
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(
                    key=selKey2, max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                colNumberMax + 1, numCols,
                msg=
                "generated %s cols (including output). parsed to %s cols" %
                (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs, print_params=False)
                #print "\n*************"
                #print "colResultList", colResultList
                #print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            # sort by col index so failures are reported in column order
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(
                        v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    # on mismatch, dump everything useful before raising
                    execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(
                        resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception(
                        'col %s mean %0.6f is not equal to generated mean %0.6f' %
                        (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_many_fp_formats_libsvm_2(self):
    """Parse sparse SVMLight datasets with varied fp formats (pre-fvec API variant).

    Same flow as the _fvec version but uses the older response fields
    ('response']['time'], 'num_missing_values') and checks each column's mean
    against the generator's remembered sum via assertAlmostEqual.
    """
    h2o.beta_features = True
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, sparsity distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random fp format per config (list keeps the loop shape)
        for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax+1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax+1, timeoutSecs=timeoutSecs)
                print "\n*************"
                print "colResultList", colResultList
                print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            ### print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0)/rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                self.assertAlmostEqual(mean, synMean, places=0,
                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                num_missing_values = inspect['cols'][k]['num_missing_values']
                self.assertEqual(0, num_missing_values,
                    msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a CSV of random fp values; return (expectedUllSum, expectedFpSum).

    Each value is random.random() + 2**exp with exp in [0, 50], formatted with
    the fp format selected by 'sel' (same format for every number). The expected
    xor-of-bits sum and float sum are computed from the formatted string re-parsed
    to float, since formatting can change the value h2o will parse. The upper 4
    bits of the xorsum are zeroed, like h2o does, to prevent inf/nan.

    Cleanup notes vs. the original: the always-true 'if 1==1:', the never-true
    'if False and r==0:' negation, and the dead 'if 1==0:' branch (which would
    have used a plain "%.16f" format) were removed; only the live path remains.
    All random draws are kept, in order, so the generated data stream matches
    the original byte-for-byte for the same RNG state.

    NOTE(review): SEEDPERFILE, expectedMin and expectedMax do not affect the
    output (the first 'value' draw is immediately overwritten) — presumably
    kept for signature compatibility with the other generators; confirm.
    """
    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)
    # 'with' guarantees the file is closed even if a format/convert step raises
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for j in range(colCount):
                # draw kept (though overwritten below) to preserve the RNG sequence
                value = expectedMin + (random.random() * expectedRange)
                # constrain the dynamic range of the numbers to be within IEEE-754
                # support without loss of precision when adding; we want a big exponent
                exp = random.randint(0,50)
                value = random.random() + (2 ** exp)
                # draw kept to preserve the RNG sequence; the negation it guarded
                # was disabled ('if False and r==0') in the original
                r = random.randint(0,1)

                s = h2o_util.fp_format(value, sel=sel)  # use same case for all numbers
                # our string formatting will lead to different values when we parse
                # and use it, so generate the expected sums from the formatted string
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                rowData.append(s)

            dsf.write(",".join(map(str, rowData)) + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_many_fp_formats_libsvm_fvec(self):
    """Parse sparse libsvm datasets and verify per-column metadata and sums.

    For each config: generate a random libsvm file with one randomly chosen fp
    format, parse it, then for every generated column check Inspect metadata
    (name, type, min/max subset, naCnt, cardinality) against expectations and
    compare the exec column sum to the generator's remembered sum.

    NOTE(review): classMin/classMax/valMin/valMax/zeroList/exprList/DO_SUMMARY
    are module-level names defined elsewhere in this file.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, sparsity distribution)
    tryList = [
        (10, 10, "cA", 30, "sparse50"),
        (100, 10, "cB", 30, "sparse"),
        (100000, 100, "cC", 30, "sparse"),
        (1000, 10, "cD", 30, "sparse50"),
        (100, 100, "cE", 30, "sparse"),
        (100, 100, "cF", 30, "sparse50"),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one random fp format per config (list keeps the loop shape)
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution
            )

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=selKey2,
                timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            numCols = inspect["numCols"]
            numRows = inspect["numRows"]
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult["destination_key"],
                timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1,
                numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols),
            )

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs
            )

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols),
                )

                # build the expected Inspect metadata for this column
                syn = {}
                if k == 0:
                    syn["name"] = "C1"
                    syn["type"] = {"Int"}
                    syn["min"] = classMin
                    syn["max"] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn["name"] = "C2"
                    syn["type"] = {"Int"}
                    syn["min"] = 0
                    syn["max"] = 0
                    # syn['scale'] = {1}
                else:
                    syn["name"] = "C" + str(k + 1)
                    syn["type"] = {"Int", "Real"}
                    syn["min"] = valMin
                    syn["max"] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn["naCnt"] = 0
                syn["cardinality"] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect["cols"][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == "min":
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg="col %s %s %s should be <= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "max":
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg="col %s %s %s should be >= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "type":
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols["min"], cols["max"]
                            print "syn min/max:", syn["min"], syn["max"]
                            raise Exception(
                                "col %s %s %s should be in this allowed %s" % (k, synKey, cols[synKey], syn[synKey])
                            )
                    else:
                        self.assertEqual(
                            syn[synKey],
                            cols[synKey],
                            msg="col %s %s %s should be %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0,
                    msg="%0.6f col sum is not equal to expected %0.6f" % (v, colSum)
                )
def test_plot_remove_keys(self):
    """Time parse and remove_key over increasingly large synthetic datasets, then plot.

    Beta-features variant: same flow as the other test_plot_remove_keys in this
    file — generate, parse, Inspect-verify, delete — recording
    (parsedBytes, parseElapsed, removeElapsed) for a final h2o_gbm.plotLists call.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, parse timeout, inspect timeout)
    tryList = [
        (100000, 50, 'cG', 400, 400),
        (200000, 50, 'cH', 400, 400),
        (400000, 50, 'cI', 400, 400),
        (800000, 50, 'cJ', 400, 400),
        (1000000, 50, 'cK', 400, 400),
    ]

    # x axis and the two measured series for the plot at the end
    xList = []
    eList = []
    fList = []
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        NUM_CASES = h2o_util.fp_format()
        # pick one fp format case at random for this dataset
        sel = random.randint(0, NUM_CASES - 1)
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        start = time.time()
        print csvFilename, "parse starting"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        parseElapsed = time.time() - start
        print "Parse only:", parseResult[
            'destination_key'], "took", parseElapsed, "seconds"
        h2o.check_sandbox_for_errors()

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
            timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult[
            'destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(
            inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" %
            (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))

        parsedBytes = inspect['byteSize']

        node = h2o.nodes[0]
        print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
        start = time.time()
        node.remove_key(hex_key, timeoutSecs=30)
        removeElapsed = time.time() - start
        print "Deleting", hex_key, "took", removeElapsed, "seconds"

        # xList.append(ntrees)
        xList.append(parsedBytes)
        eList.append(parseElapsed)
        fList.append(removeElapsed)

    # just plot the last one
    if 1 == 1:
        xLabel = 'parsedBytes'
        eLabel = 'parseElapsed'
        fLabel = 'removeElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_xorsum2(self):
    """Compare h2o's exec xorsum/sum against Python-computed expected values.

    Over 3 trials: write a random fp dataset (expected xor-of-bits and float
    sums computed by write_syn_dataset), parse it, run each expression in
    exprList 3 times, and compare h2o's result bits to the expected xorsum,
    tolerating up to ALLOWED_DELTA lsb difference (string-conversion slop).
    Raises only if STOP_ON_ERROR is set; otherwise just prints the mismatch.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(3):
        ullResultList = []
        NUM_FORMAT_CASES = h2o_util.fp_format()
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(
                None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            # pick one fp format case at random for this dataset
            sel = random.randint(0, NUM_FORMAT_CASES - 1)
            (expectedUllSum, expectedFpSum) = write_syn_dataset(
                csvPathname, rowCount, colCount, expectedMin, expectedMax,
                SEEDPERFILE, sel)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                # repeat each expression to check result stability
                for repeate in range(3):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                        resultKey=None, timeoutSecs=300)
                    print 'exec took', time.time() - start, 'seconds'
                    print "execResult:", h2o.dump_json(execResult)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                        ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
                        expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
                        expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way. needed when integers are parsed
                    # okay for a couple of lsbs to be wrong, due to conversion from stringk
                    # ullResult (0.16x): 0x02c1a21f923cee96   2.15698793923e-295
                    # expectedUllSum (0.16x): 0x02c1a21f923cee97   2.15698793923e-295
                    # expectedFpSum (0.16x): 0x42f054af32b3c408   2.87294442126e+14

                    # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                    # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                    if ullResult != expectedUllSum and (
                            abs(ullResult - expectedUllSum) > ALLOWED_DELTA):
                        emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                            ullResult, expectedUllSum)
                        if STOP_ON_ERROR:
                            raise Exception(emsg)
                        else:
                            print emsg

                    # print "%30s" % "hex(bitResult):", hex(ullResult)

        h2o.check_sandbox_for_errors()

        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                ullResult, fpResult)

        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
            expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
            expectedFpSumAsLongLong, expectedFpSum)