def test_exec2_xorsum(self):
    """Parse a synthetic single-column CSV and print h2o exec results as bit patterns.

    For each entry in tryList a CSV is generated with write_syn_dataset,
    parsed under hex_key, inspected, and then every expression in the
    module-level exprList is run on node 0. The 64-bit pattern of each
    scalar result is printed next to the values computed in python.
    Nothing is asserted on the numbers here; the only failure paths are
    exceptions raised by the helpers and h2o.check_sandbox_for_errors().
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (10000, 1, 'r1', 0, 10, None),
    ]

    # accumulates (bit pattern, float) for every expression that gets run
    ullResultList = []
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # dynamic range of the data may be useful for estimating error
        # (maxDelta is not read again in this method)
        maxDelta = expectedMax - expectedMin

        csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # result is not used below; presumably called so a bad path fails early -- TODO confirm
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        print "Creating random", csvPathname
        (expectedUll, expectedFpSum) = write_syn_dataset(csvPathname,
            rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=3000, retryDelaySecs=2)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        # looking at the 8 bytes of bits for the h2o doubles
        # xorsum will zero out the sign and exponent
        for execExpr in exprList:
            start = time.time()
            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                resultKey=None, timeoutSecs=300)
            print 'exec took', time.time() - start, 'seconds'
            print "execResult:", h2o.dump_json(execResult)
            print ""
            print "%30s" % "fpResult:", "%.15f" % fpResult
            # reinterpret the returned double as an unsigned 64-bit integer
            ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
            print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult
            print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll
            # print "%30s" % "hex(bitResult):", hex(ullResult)
            ullResultList.append((ullResult, fpResult))

        h2o.check_sandbox_for_errors()

        # summary: every result seen, then the python-side expected values
        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)

        expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll)
        print "%30s" % "expectedUll (0.16x):", "0x%0.16x %s" % (expectedUll, expectedUllAsDouble)

        expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE):
    """Write a synthetic CSV of signed reals; return the expected xorsum and sum.

    Every cell is +/-(2.0**rexp + 3.0*row) with rexp drawn from 0..20 and the
    sign chosen by a coin flip. Cells are written with "%.16f".

    Args:
        csvPathname: path of the CSV file to create (truncated if present).
        rowCount: number of rows to write.
        colCount: number of comma-separated values per row.
        expectedMin, expectedMax: kept for interface compatibility; the
            uniform draw that used them was always overwritten.
        SEEDPERFILE: accepted but not used; values come from the module-level
            `random` generator.

    Returns:
        (expectedUllSum, expectedFpSum): the XOR of the 64-bit patterns of all
        values with the top 4 bits cleared, and the floating point sum.
    """
    expectedFpSum = float(0)
    expectedUllSum = int(0)

    # 'with' closes the file even when generating a value raises part way.
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for _ in range(colCount):
                # Two draws whose results were never used (a uniform value that
                # is overwritten, and a sign flip that was disabled). They are
                # kept so the sequence pulled from the shared generator, and so
                # any seeded run, is unchanged.
                random.random()
                random.randint(0, 1)

                rexp = random.randint(0, 20)
                value = 2.0**rexp + 3.0*row
                if random.randint(0, 1) == 0:
                    value = -1 * value

                # get the expected patterns from python
                fpResult = float(value)
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                expectedUllSum = expectedUllSum ^ ullResult

                # fixed-point text, so there is no exponent suffix to damage;
                # trailing zeros are deliberately not stripped
                rowData.append("%.16f" % value)

            dsf.write(",".join(rowData) + "\n")

    # zero 4 bits of sign/exponent like h2o does, to prevent inf/nan
    expectedUllSum = expectedUllSum & ~(0xf << 60)
    return (expectedUllSum, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE):
    """Generate a CSV of signed reals and compute the reference xorsum and sum.

    Each value is +/-(2.0**rexp + 3.0 * row), where rexp is a random integer
    in 0..20 and the sign is a coin flip. Values are written with "%.16f".

    Args:
        csvPathname: output file path; the file is created or truncated.
        rowCount: number of rows.
        colCount: number of values per row.
        expectedMin, expectedMax: retained only so existing callers keep
            working; the draw they parameterized was always discarded.
        SEEDPERFILE: accepted but not read here; the module-level `random`
            generator supplies all values.

    Returns:
        Tuple (expectedUllSum, expectedFpSum): XOR of the 64-bit patterns of
        every value with the four highest bits cleared, and the float sum.
    """
    expectedFpSum = float(0)
    expectedUllSum = int(0)

    # The context manager closes the handle on every exit path, including an
    # exception raised while a row is being built.
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for _ in range(colCount):
                # Draws with no effect on the output (overwritten uniform value
                # and a disabled sign flip); kept so the generator is advanced
                # exactly as before and seeded runs stay reproducible.
                random.random()
                random.randint(0, 1)

                rexp = random.randint(0, 20)
                value = 2.0**rexp + 3.0 * row
                if random.randint(0, 1) == 0:
                    value = -1 * value

                # get the expected patterns from python
                fpResult = float(value)
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                expectedUllSum = expectedUllSum ^ ullResult

                # "%.16f" has no exponent suffix; zeros are left in place
                rowData.append("%.16f" % value)

            dsf.write(",".join(rowData) + "\n")

    # zero 4 bits of sign/exponent like h2o does, to prevent inf/nan
    expectedUllSum = expectedUllSum & ~(0xf << 60)
    return (expectedUllSum, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE):
    """Write a synthetic CSV of values 3 * 2**exp; return the last bit pattern and the sum.

    Each cell is 3 * (2 ** exp) with exp drawn from 0..10, written with
    "%.16e".

    Args:
        csvPathname: output file path; the file is created or truncated.
        rowCount: number of rows.
        colCount: number of values per row.
        expectedMin, expectedMax: retained for interface compatibility; the
            uniform draw based on them was always overwritten.
        SEEDPERFILE: accepted but not read here.

    Returns:
        (expectedUll, expectedFpSum). expectedUll is the 64-bit pattern of the
        LAST value generated (it is not an xor over all values), or 0 when no
        value was generated. expectedFpSum is the float sum of all values.
    """
    expectedFpSum = 0.0
    # Bound up front: previously an empty dataset (rowCount or colCount of 0)
    # hit the return with this name unassigned and raised UnboundLocalError.
    expectedUll = 0

    # 'with' closes the file even if a value fails to generate.
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for _ in range(colCount):
                # unused draw, kept so the shared generator advances as before
                random.random()

                exp = random.randint(0, 10)
                value = 3 * (2 ** exp)

                # draw for a sign flip that is disabled; kept for the same reason
                random.randint(0, 1)

                # get the expected patterns from python
                fpResult = float(value)
                expectedUll = h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult

                # full-precision exponent form; not rstrip'ed
                rowData.append("%.16e" % value)

            dsf.write(",".join(rowData) + "\n")

    return (expectedUll, expectedFpSum)
def test_exec2_xorsum(self):
    """Check that h2o's xorsum over a parsed synthetic column matches python's.

    Runs 10 trials. Each trial writes a fresh CSV with write_syn_dataset,
    parses it with schema='put', asserts the parsed shape and that no
    column reports missing values, then runs every expression in the
    module-level exprList 10 times through h2o_cmd.runExec. The returned
    scalar is reinterpreted as an unsigned 64-bit integer and must equal
    the python-computed expectedUllSum exactly, otherwise an Exception is
    raised.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(10):
        # (bit pattern, float) for every exec in this trial
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            # (maxDelta is not read again in this method)
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # result is not used below
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # alternate views of the expected values, used only for printing
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            assert parse_key == hex_key
            assert numCols == colCount
            assert numRows == rowCount

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert len(missingList) == 0

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                # the same expression is repeated 10 times against the same key
                for r in range(10):
                    start = time.time()
                    execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                    fpResult = execResult['scalar']
                    # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                    print r, 'exec took', time.time() - start, 'seconds'
                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()

                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way
                    # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                    if ullResult!=expectedUllSum:
                        raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))
                        # NOTE(review): this print looks unreachable because it follows the
                        # raise; it reads like a leftover warn-only alternative -- confirm
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)

            h2o.check_sandbox_for_errors()

            # summary of everything seen in this trial
            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)

            print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_parse_multiprocess_fvec(self): h2o.beta_features = True # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandboxIgnoreErrors = True OUTSTANDING = min(10, len(h2o.nodes)) if DO_IRIS: global DO_BIGFILE DO_BIGFILE = False bucket = 'smalldata' importFolderPath = "iris" csvFilename = "iris2.csv" csvFilePattern = "iris2.csv" if localhost: trialMax = 20 else: trialMax = 100 elif DO_BIGFILE: bucket = 'home-0xdiag-datasets' importFolderPath = "standard" csvFilename = "covtype20x.data" csvFilePattern = "covtype20x.data" trialMax = 2 * OUTSTANDING else: bucket = 'home-0xdiag-datasets' importFolderPath = "standard" csvFilename = "covtype.data" csvFilePattern = "covtype.data" trialMax = 40 * OUTSTANDING # add one just to make it odd # OUTSTANDING = min(10, len(h2o.nodes) + 1) # don't have more than one source file per node OUTSTANDING? (think of the node increment rule) # okay to reuse the src_key name. h2o deletes? use unique hex to make sure it's not reused. # might go to unique src keys also ..oops have to, to prevent complaints about the key (lock) # can't repeatedly import the folder # only if not noPoll. otherwise parse isn't done # I guess I have to use 'put' so I can name the src key unique, to get overlap # I could tell h2o to not delete, but it's nice to get the keys in a new place? # maybe rebalance? FIX! todo parseTrial = 0 summaryTrial = 0 uploader_resultq = multiprocessing.Queue() while parseTrial <= trialMax: start = time.time() uploaders = [] if not DO_IRIS: assert OUTSTANDING<=10 , "we only have 10 links with unique names to covtype.data" for o in range(OUTSTANDING): src_key = csvFilename + "_" + str(parseTrial) hex_key = csvFilename + "_" + str(parseTrial) + ".hexxx" # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", # hacked hard ln so source keys would have different names? 
was getting h2o locking issues if DO_IRIS: csvPathname = importFolderPath + "/" + csvFilePattern else: csvPathname = importFolderPath + "/" + csvFilePattern + "_" + str(o) start = time.time() # walk the nodes # if this rule is matched for exec/summary below, it should find the name okay? (npe with xorsum) # summary2 not seeing it? np = parseTrial % len(h2o.nodes) retryDelaySecs=5 if DO_BIGFILE else 1 timeoutSecs=60 if DO_BIGFILE else 15 tmp = multiprocessing.Process(target=function_no_keyboard_intr, args=(uploader_resultq, uploadit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs)) tmp.start() uploaders.append(tmp) parseTrial += 1 # now sync on them for uploader in uploaders: try: uploader.join() # don't need him any more uploader.terminate() (importPattern, hex_key) = uploader_resultq.get(timeout=10) except KeyboardInterrupt: print 'parent received ctrl-c' for uploader in uploaders: uploader.terminate() uploader.join() elapsed = time.time() - start print "Parse group end at #", parseTrial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "We might have parses that haven't completed. The join just says we can reuse some files (parse still going)" if PARSE_NOPOLL: h2o_jobs.pollWaitJobs(timeoutSecs=180) h2o_cmd.runStoreView() # h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=0.25) if DO_PARSE_ALSO: # only if we parsed print "These all go to node [0]" # getting a NPE if I do xorsum (any exec?) ..just do summary for now..doesn't seem to have the issue # suspect it's about the multi-node stuff above for summaryTrial in range(trialMax): # do last to first..to get race condition? 
firstXorUll = None firstQuantileUll = None hex_key = csvFilename + "_" + str(summaryTrial) + ".hexxx" if DO_EXEC_QUANT: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, thresholds) (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) print "%30s" % "median ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) if firstQuantileUll: self.assertEqual(ullResult, firstQuantileUll) else: firstQuantileUll = ullResult if DO_XORSUM: execExpr = "r2=c(1); r2=xorsum(%s[,1], c(%s));" % (hex_key, thresholds) (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) print "%30s" % "xorsum ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) if firstXorUll: self.assertEqual(ullResult, firstXorUll) else: firstXorUll = ullResult if DO_SUMMARY: h2o_cmd.runSummary(key=hex_key)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a synthetic CSV using a chosen fp text format; return expected xorsum and sum.

    Each value is random.random() + 2**exp with exp drawn from 0..50. It is
    rendered to text with h2o_util.fp_format(value, sel=sel), and the
    expected results are computed from float() of that text, i.e. from what
    a parser of the file will actually see.

    Args:
        csvPathname: output file path; the file is created or truncated.
        rowCount: number of rows.
        colCount: number of values per row.
        expectedMin, expectedMax: retained for interface compatibility; the
            uniform draw based on them was always overwritten.
        SEEDPERFILE: accepted but not read here.
        sel: passed through to h2o_util.fp_format to force one format for
            every number.

    Returns:
        (expectedUllSum, expectedFpSum): XOR of the 64-bit patterns of the
        values with the upper 4 bits cleared, and their float sum.
    """
    expectedFpSum = float(0)
    expectedUllSum = int(0)

    # 'with' closes the file even if formatting or conversion raises.
    with open(csvPathname, 'w') as dsf:
        for _ in range(rowCount):
            rowData = []
            for _ in range(colCount):
                # unused draw, kept so the shared generator advances as before
                random.random()

                # constrain the dynamic range of the numbers to be within
                # IEEE-754 support without loss of precision when adding
                exp = random.randint(0, 50)
                value = random.random() + (2 ** exp)

                # draw for a sign flip that is disabled; kept for the same reason
                random.randint(0, 1)

                # NOTE(review): the returned case count was never used. The call
                # is kept only in case it has side effects -- confirm, then drop.
                h2o_util.fp_format()
                # use same case for all numbers
                s = h2o_util.fp_format(value, sel=sel)

                # the string formatting can change the value that gets parsed,
                # so the expected patterns come from the formatted text
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult

                rowData.append(s)

            dsf.write(",".join(map(str, rowData)) + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_exec2_xorsum2(self):
    """Compare h2o's xorsum with python's across randomly chosen fp text formats.

    Runs 20 trials. Each trial picks a format selector in
    [0, NUM_FORMAT_CASES-1], writes a CSV with write_syn_dataset using it,
    parses the file, and runs every expression in the module-level exprList
    on node 0. The result's 64-bit pattern may differ from the python
    value by at most ALLOWED_BIT_ERR; a larger difference raises an
    Exception.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(20):
        # (bit pattern, float) for every exec in this trial
        ullResultList = []
        # called with no arguments, fp_format's return value is used as the
        # number of available format cases
        NUM_FORMAT_CASES = h2o_util.fp_format()
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            # (maxDelta is not read again in this method)
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # result is not used below
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname

            # one format for the whole file, chosen per trial
            sel = random.randint(0, NUM_FORMAT_CASES-1)
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel)
            # alternate views of the expected values, used only for printing
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                start = time.time()
                (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                    resultKey=None, timeoutSecs=300)
                print 'exec took', time.time() - start, 'seconds'
                print "execResult:", h2o.dump_json(execResult)
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                ullResultList.append((ullResult, fpResult))

                print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)

                # allow diff of the lsb..either way. needed when integers are parsed
                # okay for a couple of lsbs to be wrong, due to conversion from string
                # ullResult (0.16x): 0x02c1a21f923cee96 2.15698793923e-295
                # expectedUllSum (0.16x): 0x02c1a21f923cee97 2.15698793923e-295
                # expectedFpSum (0.16x): 0x42f054af32b3c408 2.87294442126e+14

                # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                ALLOWED_BIT_ERR = 0x1f # seeing this amount of error!
                if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR):
                    raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))
                    # NOTE(review): this print looks unreachable because it follows the
                    # raise; it reads like a leftover warn-only alternative -- confirm
                    print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)

                # print "%30s" % "hex(bitResult):", hex(ullResult)

            h2o.check_sandbox_for_errors()

            # summary of everything seen in this trial
            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)

            print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_exec2_xorsum(self):
    """Check that h2o's xorsum over a parsed synthetic column matches python's.

    Runs 10 trials. Each trial writes a fresh CSV with write_syn_dataset,
    parses it with schema='put', inspects it, then runs every expression
    in the module-level exprList 10 times on node 0 with resultKey='h'.
    The returned scalar, reinterpreted as an unsigned 64-bit integer, must
    equal the python-computed expectedUllSum exactly, otherwise an
    Exception is raised.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(10):
        # (bit pattern, float) for every exec in this trial
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            # (maxDelta is not read again in this method)
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # result is not used below
            csvPathnameFull = h2i.find_folder_and_filename(
                None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            (expectedUllSum,
             expectedFpSum) = write_syn_dataset(csvPathname, rowCount,
                                                colCount, expectedMin,
                                                expectedMax, SEEDPERFILE)
            # alternate views of the expected values, used only for printing
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=3000,
                                           retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                # the same expression is repeated 10 times against the same key
                for r in range(10):
                    start = time.time()
                    (execResult,
                     fpResult) = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey='h',
                                               timeoutSecs=300)
                    print r, 'exec took', time.time() - start, 'seconds'
                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()

                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                        ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
                        expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
                        expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way
                    # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                    if ullResult != expectedUllSum:
                        raise Exception(
                            "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x"
                            % (ullResult, expectedUllSum))
                        # NOTE(review): this print looks unreachable because it follows the
                        # raise; it reads like a leftover warn-only alternative -- confirm
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                            ullResult, expectedUllSum)

            h2o.check_sandbox_for_errors()

            # summary of everything seen in this trial
            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                    ullResult, fpResult)

            print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
                expectedUllSum, expectedUllSumAsDouble)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
                expectedFpSumAsLongLong, expectedFpSum)
def test_parse_multiprocess_fvec(self): h2o.beta_features = True # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandboxIgnoreErrors = True OUTSTANDING = min(10, len(h2o.nodes)) if DO_IRIS: global DO_BIGFILE DO_BIGFILE = False bucket = 'smalldata' importFolderPath = "iris" csvFilename = "iris2.csv" csvFilePattern = "iris2.csv" if localhost: trialMax = 20 else: trialMax = 100 elif DO_BIGFILE: bucket = 'home-0xdiag-datasets' importFolderPath = "standard" csvFilename = "covtype20x.data" csvFilePattern = "covtype20x.data" trialMax = 2 * OUTSTANDING else: bucket = 'home-0xdiag-datasets' importFolderPath = "standard" csvFilename = "covtype.data" csvFilePattern = "covtype.data" trialMax = 40 * OUTSTANDING # add one just to make it odd # OUTSTANDING = min(10, len(h2o.nodes) + 1) # don't have more than one source file per node OUTSTANDING? (think of the node increment rule) # okay to reuse the src_key name. h2o deletes? use unique hex to make sure it's not reused. # might go to unique src keys also ..oops have to, to prevent complaints about the key (lock) # can't repeatedly import the folder # only if not noPoll. otherwise parse isn't done # I guess I have to use 'put' so I can name the src key unique, to get overlap # I could tell h2o to not delete, but it's nice to get the keys in a new place? # maybe rebalance? FIX! todo parseTrial = 0 summaryTrial = 0 uploader_resultq = multiprocessing.Queue() while parseTrial <= trialMax: start = time.time() uploaders = [] if not DO_IRIS: assert OUTSTANDING<=10 , "we only have 10 links with unique names to covtype.data" for o in range(OUTSTANDING): src_key = csvFilename + "_" + str(parseTrial) hex_key = csvFilename + "_" + str(parseTrial) + ".hexxx" # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", # hacked hard ln so source keys would have different names? 
was getting h2o locking issues if DO_IRIS: csvPathname = importFolderPath + "/" + csvFilePattern else: csvPathname = importFolderPath + "/" + csvFilePattern + "_" + str(o) start = time.time() # walk the nodes # if this rule is matched for exec/summary below, it should find the name okay? (npe with xorsum) # summary2 not seeing it? np = parseTrial % len(h2o.nodes) retryDelaySecs=5 if DO_BIGFILE else 1 timeoutSecs=60 if DO_BIGFILE else 15 tmp = multiprocessing.Process(target=function_no_keyboard_intr, args=(uploader_resultq, uploadit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs)) tmp.start() uploaders.append(tmp) parseTrial += 1 # now sync on them for uploader in uploaders: try: uploader.join() # don't need him any more uploader.terminate() (importPattern, hex_key) = uploader_resultq.get(timeout=10) except KeyboardInterrupt: print 'parent received ctrl-c' for uploader in uploaders: uploader.terminate() uploader.join() elapsed = time.time() - start print "Parse group end at #", parseTrial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "We might have parses that haven't completed. The join just says we can reuse some files (parse still going)" if PARSE_NOPOLL: h2o_jobs.pollWaitJobs(timeoutSecs=180) h2o_cmd.runStoreView() # h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=0.25) if DO_PARSE_ALSO: # only if we parsed print "These all go to node [0]" # getting a NPE if I do xorsum (any exec?) ..just do summary for now..doesn't seem to have the issue # suspect it's about the multi-node stuff above for summaryTrial in range(trialMax): # do last to first..to get race condition? 
firstXorUll = None firstQuantileUll = None hex_key = csvFilename + "_" + str(summaryTrial) + ".hexxx" if DO_EXEC_QUANT: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, thresholds) (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) print "%30s" % "median ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) if firstQuantileUll: self.assertEqual(ullResult, firstQuantileUll) else: firstQuantileUll = ullResult if DO_XORSUM: execExpr = "r2=c(1); r2=xorsum(%s[,1], c(%s));" % (hex_key, thresholds) (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) print "%30s" % "xorsum ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) if firstXorUll: self.assertEqual(ullResult, firstXorUll) else: firstXorUll = ullResult if DO_SUMMARY: h2o_cmd.runSummary(key=hex_key)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel):
    """Write a one-column synthetic CSV whose data style varies by row; return expected xorsum and sum.

    The generator used for a row is picked by row % CHUNKING_CNT:
      1 -> uniform real in [expectedMin, expectedMax)
      2 -> random integer in 1..1000000
      3 -> 5555555555555 + row
      anything else -> random.random() + 2**exp, exp drawn from 0..120
    When DO_NEGATIVE is set, about one value in five is negated. Values are
    rendered with h2o_util.fp_format(..., only='e'), using a random format
    per value when RANDOM_E_FP_FORMATS is set and the format `sel`
    otherwise. Unless DO_BUG is set, trailing "0" characters are stripped
    from the text. Expected results are computed from float() of the final
    text, i.e. from what a parser of the file will see.

    Args:
        csvPathname: output file path; the file is created or truncated.
        rowCount: number of rows.
        colCount: must be 1.
        expectedMin, expectedMax: range for the uniform generator.
        SEEDPERFILE: accepted but not read here.
        sel: format selector for h2o_util.fp_format, or None.

    Returns:
        (expectedUllSum, expectedFpSum): XOR of the 64-bit patterns of the
        values with the upper 4 bits cleared, and their float sum.

    Raises:
        Exception: if colCount != 1, or sel is outside [0, NUM_CASES).
    """
    # this only does the sum stuff for single cols right now
    if colCount != 1:
        # the value is interpolated with %; previously it was passed as a
        # second Exception argument and the message kept a literal "%s"
        raise Exception("only support colCount == 1 here right now %s" % colCount)

    NUM_CASES = h2o_util.fp_format()
    if sel is not None and (sel < 0 or sel >= NUM_CASES):
        raise Exception("sel used to select from possible fp formats is out of range: %s %s" % (sel, NUM_CASES))

    expectedRange = (expectedMax - expectedMin)
    expectedFpSum = float(0)
    expectedUllSum = int(0)

    # 'with' closes the file even if formatting or conversion raises.
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for _ in range(colCount):
                # Be Nasty!. We know fp compression varies per chunk, so
                # generate "different" data depending on where we are in the rows
                method = row % CHUNKING_CNT
                if method == 1:
                    value = expectedMin + (random.random() * expectedRange)
                elif method == 2:
                    # integer bound: a float bound (1e6) is rejected by newer
                    # Python versions
                    value = random.randint(1, 1000000)
                elif method == 3:
                    value = 5555555555555 + row
                else:  # method == 0 and > 3
                    exp = random.randint(0, 120)
                    # have a fixed base
                    value = random.random() + (2 ** exp)

                # 20% negative
                r = random.randint(0, 4)
                if DO_NEGATIVE and r == 0:
                    value = -1 * value

                # only keeps it to formats with "e"
                if RANDOM_E_FP_FORMATS:
                    # this would be random, within 'e' only
                    s = h2o_util.fp_format(value, sel=None, only='e')
                else:
                    # use same format for all numbers
                    s = h2o_util.fp_format(value, sel=sel, only='e')

                # FIX! strip the trailing zeroes for now because they trigger a bug
                # NOTE(review): on an exponent-form string this can also eat
                # zeros from the exponent -- confirm that is intended
                if not DO_BUG:
                    s = s.rstrip("0")

                # the string formatting can change the value that gets parsed,
                # so the expected patterns come from the formatted text
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult

                rowData.append(s)

            dsf.write(",".join(map(str, rowData)) + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel=None):
    """Generate a single-column CSV of random fp values plus the results expected from it.

    The text written for each value comes from h2o_util.fp_format(). The same
    text is read back with float() and that parsed number (not the original
    draw) is accumulated into the expected xorsum and sum.

    Args:
        csvPathname: file to create (opened with mode 'w').
        rowCount: how many rows to emit.
        colCount: must be 1; anything else raises.
        expectedMin: low end of the uniform draw taken when
            row % CHUNKING_CNT == 1.
        expectedMax: high end of that draw.
        SEEDPERFILE: part of the signature but never read in this function.
        sel: fp_format() selector used for every value when
            RANDOM_E_FP_FORMATS is false. None disables the range check.

    Returns:
        Tuple (expectedUllSum, expectedFpSum): XOR of the 64-bit patterns of the
        parsed values with the top 4 bits masked off, and the float sum.

    Raises:
        Exception: when colCount != 1 or sel is not in [0, NUM_CASES).
    """
    # this only does the sum stuff for single cols right now
    if colCount != 1:
        raise Exception("only support colCount == 1 here right now %s" % colCount)

    # With no value argument, the result of fp_format() bounds the valid sel range.
    NUM_CASES = h2o_util.fp_format()
    if sel is not None and not (0 <= sel < NUM_CASES):
        raise Exception(
            "sel used to select from possible fp formats is out of range: %s %s" % (sel, NUM_CASES))

    expectedRange = expectedMax - expectedMin
    expectedFpSum = float(0)
    expectedUllSum = int(0)

    # Context manager closes the file even when an exception escapes the loop.
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for j in range(colCount):
                # Vary the kind of data by row position (author's intent: stress
                # per-chunk fp compression with different value distributions).
                method = row % CHUNKING_CNT
                if method == 1:
                    value = expectedMin + (random.random() * expectedRange)
                elif method == 2:
                    # Use an int bound; randint(1, 1e6) fails on newer Pythons.
                    value = random.randint(1, 1000000)
                elif method == 3:
                    value = 5555555555555 + row
                else:
                    # method == 0 and > 3: random fraction on top of 2**exp
                    exp = random.randint(0, 120)
                    value = random.random() + (2 ** exp)

                # 20% negative; r is drawn regardless of DO_NEGATIVE
                r = random.randint(0, 4)
                if DO_NEGATIVE and r == 0:
                    value = -1 * value

                # Restrict to 'e' formats: random per value, or fixed by sel.
                if RANDOM_E_FP_FORMATS:
                    s = h2o_util.fp_format(value, sel=None, only='e')
                else:
                    s = h2o_util.fp_format(value, sel=sel, only='e')

                # FIX! strip the trailing zeroes for now because they trigger a bug
                # NOTE(review): per the original comments, rstrip on a .16e string
                # can turn "+00" into "+"; verify against fp_format() output.
                if not DO_BUG:
                    s = s.rstrip("0")

                # Expected patterns come from the formatted string, because the
                # formatting may have changed the value.
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                rowData.append(s)

            rowDataCsv = ",".join(map(str, rowData))
            dsf.write(rowDataCsv + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)
def test_exec2_xorsum2(self):
    """Compare h2o exec results against a python-computed xorsum of a synthetic CSV.

    For each of 3 trials and each tryList entry: write a random dataset with
    write_syn_dataset() using a randomly chosen fp format index, parse it into
    h2o under hex_key, then run every expression in exprList 3 times and compare
    the 64-bit pattern of the result with the expected xorsum.

    A mismatch larger than ALLOWED_DELTA raises when STOP_ON_ERROR is set;
    otherwise it is only printed.

    NOTE(review): the original indentation of this method was lost. The nesting
    of the final check_sandbox_for_errors() call and of the summary prints was
    reconstructed (per dataset, after all expressions ran) - confirm.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(3):
        # one (bit pattern, float) pair per exec call in this trial
        ullResultList = []
        # called without a value; used below as the exclusive bound for sel
        NUM_FORMAT_CASES = h2o_util.fp_format()
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname

            # pick one fp format index for the whole file
            sel = random.randint(0, NUM_FORMAT_CASES - 1)
            (expectedUllSum, expectedFpSum) = write_syn_dataset(
                csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel)
            # the same expected values in the "other" representation, for printing only
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                # each expression is executed 3 times and every result is checked
                for repeate in range(3):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                        resultKey=None, timeoutSecs=300)
                    print 'exec took', time.time() - start, 'seconds'
                    print "execResult:", h2o.dump_json(execResult)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                        ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                        expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                        expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way. needed when integers are parsed
                    # okay for a couple of lsbs to be wrong, due to conversion from string
                    # ullResult (0.16x): 0x02c1a21f923cee96   2.15698793923e-295
                    # expectedUllSum (0.16x): 0x02c1a21f923cee97   2.15698793923e-295
                    # expectedFpSum (0.16x): 0x42f054af32b3c408   2.87294442126e+14

                    # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                    # I guess we don't even care about sign, since we zero the first 4 bits
                    # (xorsum) to avoid nan/inf issues
                    if ullResult != expectedUllSum and (
                            abs(ullResult - expectedUllSum) > ALLOWED_DELTA):
                        emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                            ullResult, expectedUllSum)
                        if STOP_ON_ERROR:
                            raise Exception(emsg)
                        else:
                            print emsg

                    # print "%30s" % "hex(bitResult):", hex(ullResult)

            h2o.check_sandbox_for_errors()

            # summary of every result collected so far in this trial
            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                    ullResult, fpResult)

            print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                expectedUllSum, expectedUllSumAsDouble)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                expectedFpSumAsLongLong, expectedFpSum)
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): if 1==0: execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30) fpResult = execResult['scalar'] else: (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="h")) # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="r1")) print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, 
fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel=None):
    """Write a CSV of large random fp values and return the expected results for it.

    Every cell is random.random() + 2**exp with exp drawn from [40, 71]. It is
    rendered with h2o_util.fp_format(value, sel=sel, only='e'), has trailing
    "0" characters stripped, and the resulting text is parsed back with float()
    before being folded into the expected xorsum and sum.

    Args:
        csvPathname: path of the CSV file to create (truncated if it exists).
        rowCount: number of rows to write.
        colCount: number of comma-separated values per row.
        expectedMin: not used for the written data in this variant.
        expectedMax: not used for the written data in this variant.
        SEEDPERFILE: accepted but not used anywhere in this function.
        sel: format selector forwarded to h2o_util.fp_format() for every value.

    Returns:
        (expectedUllSum, expectedFpSum): the XOR of the 64-bit patterns of all
        written values with the upper 4 bits cleared, and their float sum.
    """
    expectedFpSum = float(0)
    expectedUllSum = int(0)

    # 'with' guarantees the file is closed even if formatting or parsing raises.
    with open(csvPathname, 'w') as dsf:
        for row in range(rowCount):
            rowData = []
            for j in range(colCount):
                # Result unused. The draw is kept so the sequence of random
                # numbers consumed per cell stays the same as before.
                random.random()

                # constrain the dynamic range of the numbers to be within IEEE-754
                # support without loss of precision when adding: a fixed
                # power-of-two base plus a random fraction
                exp = random.randint(40, 71)
                value = random.random() + (2 ** exp)

                # Sign flipping is disabled in this variant; the draw is kept
                # for the same reason as above.
                random.randint(0, 1)

                # use same format case for all numbers
                s = h2o_util.fp_format(value, sel=sel, only='e')
                # FIX! strip the trailing zeroes for now because they trigger a bug
                # NOTE(review): per the original comments, rstrip on a .16e string
                # can turn "+00" into "+"; verify against fp_format() output.
                s = s.rstrip("0")

                # The string formatting can change the value, so the expected
                # patterns are computed from the formatted string, not from value.
                fpResult = float(s)
                expectedUllSum ^= h2o_util.doubleToUnsignedLongLong(fpResult)
                expectedFpSum += fpResult
                rowData.append(s)

            rowDataCsv = ",".join(map(str, rowData))
            dsf.write(rowDataCsv + "\n")

    # zero the upper 4 bits of xorsum like h2o does to prevent inf/nan
    expectedUllSum &= (~(0xf << 60))
    return (expectedUllSum, expectedFpSum)