def test_exec2_quantile_na_scalar(self):
    """Compare exec-expression quantile results against the Quantiles page.

    Runs the module-level `initList` expressions for setup, then for each
    (execExpr, num) in `exprList`: executes the expression, checks the exec
    result against `expectedP`, re-computes the same quantile via the
    Quantiles REST page on key 'ddd', and finally inspects key 'abc' to
    verify its shape (1 col, `num` rows).
    """
    # Setup expressions; results are discarded (resultKey=None).
    for execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
    for (execExpr, num) in exprList:
        start = time.time()
        resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
        print 'exec end took', time.time() - start, 'seconds'
        h2p.blue_print("h2o exec quantiles result:", result)
        # expectedP is module-level; exec result must match it exactly.
        self.assertEqual(result, expectedP,
            msg="Checking exec quantiles median, expectedP: %s result: %s" % (expectedP, result))
        print h2o.dump_json(resultExec)
        # do the quantiles page on the created key
        # NOTE(review): assumes the exec expressions above created key 'ddd' — confirm against exprList.
        kwargs = {
            'column': 0,
            'quantile': QUANTILE,
            'multiple_pass': 2,
            'max_qbins': 1000,
        }
        q = h2o.nodes[0].quantiles(source_key='ddd', **kwargs)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)
        # The multi-pass algorithm is expected to converge in exactly 3 iterations here.
        self.assertEqual(qresult_iterations, 3, msg="should take 3 iterations")
        # self.assertEqual(qresult_interpolated, True, msg="Should say it's interpolating")
        # Quantiles-page answer must agree with the exec answer's expected value.
        self.assertEqual(qresult, expectedP,
            msg="Checking quantilespage median, expectedP: %s result: %s" % (expectedP, qresult))
        # NOTE(review): assumes key 'abc' was created by the exec expressions — confirm.
        inspect = h2o_cmd.runInspect(key='abc')
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        print "numCols:", numCols
        print "numRows:", numRows
        self.assertEqual(numCols, 1)
        self.assertEqual(numRows, num)
        h2o.check_sandbox_for_errors()
def test_exec2_quantile_na_scalar(self):
    """Variant of the exec-vs-Quantiles-page comparison with beta features on.

    Identical flow to the sibling test: run `initList` setup expressions,
    then for each (execExpr, num) in `exprList` compare the exec quantile
    result and the Quantiles page result against `expectedP`, and verify the
    shape of key 'abc'.
    """
    # Enable the beta (FluidVecs) API paths for this test.
    h2o.beta_features = True
    for execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
    for (execExpr, num) in exprList:
        start = time.time()
        resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
        print 'exec end took', time.time() - start, 'seconds'
        h2p.blue_print("h2o exec quantiles result:", result)
        self.assertEqual(result, expectedP,
            msg="Checking exec quantiles median, expectedP: %s result: %s" % (expectedP, result))
        print h2o.dump_json(resultExec)
        # do the quantiles page on the created key
        # NOTE(review): assumes the exec expressions above created key 'ddd' — confirm against exprList.
        kwargs = {
            'column': 0,
            'quantile': QUANTILE,
            'multiple_pass': 2,
            'max_qbins': 1000,
        }
        q = h2o.nodes[0].quantiles(source_key='ddd', **kwargs)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)
        # Multi-pass quantile is expected to converge in exactly 3 iterations here.
        self.assertEqual(qresult_iterations, 3, msg="should take 3 iterations")
        # self.assertEqual(qresult_interpolated, True, msg="Should say it's interpolating")
        self.assertEqual(qresult, expectedP,
            msg="Checking quantilespage median, expectedP: %s result: %s" % (expectedP, qresult))
        # NOTE(review): assumes key 'abc' was created by the exec expressions — confirm.
        inspect = h2o_cmd.runInspect(key='abc')
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        print "numCols:", numCols
        print "numRows:", numRows
        self.assertEqual(numCols, 1)
        self.assertEqual(numRows, num)
        h2o.check_sandbox_for_errors()
def test_build_for_clone(self):
    """Keep a built cloud alive for up to 4 hours so other tests can clone it.

    Sleeps in 60s increments; each pass re-verifies cloud size and scans the
    sandbox logs for h2o errors (a cloud shutdown counts as an error). On
    exit, deletes all keys at all nodes.
    """
    # python gets confused about which 'start' if I used start here
    # 'beginning' is a module-level timestamp set when the script started.
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed
    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()
    maxTime = 4 * 3600  # total sleep budget: 4 hours
    totalTime = 0
    incrTime = 60  # seconds between health checks
    h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"
    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")
    while (totalTime < maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
    # Clean up: remove every key on every node before the harness tears down.
    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self): # python gets confused about which 'start' if I used start here elapsed = time.time() - beginning print "\n%0.2f seconds to get here from start" % elapsed # might as well open a browser on it? (because the ip/port will vary # maybe just print the ip/port for now ## h2b.browseTheCloud() maxTime = 4*3600 totalTime = 0 incrTime = 60 h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.") print "Will check h2o logs every", incrTime, "seconds" print "Should be able to run another test using h2o-nodes.json to clone cloud" print "i.e. h2o.build_cloud_with_json()" print "Bad test if a running test shuts down the cloud. I'm supposed to!\n" h2p.green_print("To watch cloud in browser follow address:") h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port)) h2p.blue_print("You can start a test (or tests) now!") h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime) h2p.red_print("This is just for fun") h2p.yellow_print("So is this") while (totalTime<maxTime): # die after 4 hours h2o.sleep(incrTime) totalTime += incrTime # good to touch all the nodes to see if they're still responsive # give them up to 120 secs to respond (each individually) h2o.verify_cloud_size(timeoutSecs=120) print "Checking sandbox log files" h2o.check_sandbox_for_errors(cloudShutdownIsError=True) start = time.time() h2i.delete_keys_at_all_nodes() elapsed = time.time() - start print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    """Keep the cloud alive for up to 4 hours for cloning by other tests.

    Variant that makes log/cloud checking optional via the module-level
    CHECK_WHILE_SLEEPING flag: when set, sandbox logs are scanned each
    minute; otherwise it just prints a heartbeat line.
    """
    # python gets confused about which 'start' if I used start here
    # 'beginning' is a module-level timestamp set when the script started.
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed
    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()
    maxTime = 4*3600  # total sleep budget: 4 hours
    totalTime = 0
    incrTime = 60  # seconds between checks/heartbeats
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"
    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    # NOTE(review): these messages describe the CHECK_WHILE_SLEEPING=False case;
    # they print unconditionally here — confirm intended placement vs. an else branch.
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")
    while (totalTime<maxTime):  # die after 4 hours
        time.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        ### h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            # Heartbeat only: show we're alive and how far through the budget we are.
            print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime
    # don't do this, as the cloud may be hung?
    # Disabled cleanup path, kept for reference (1==0 is never true).
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    """Keep the cloud alive for up to 4 hours for cloning by other tests.

    Variant that always re-verifies cloud size each minute and, when the
    module-level CHECK_WHILE_SLEEPING flag is set, also scans sandbox logs
    for h2o errors; otherwise prints a heartbeat line.
    """
    # python gets confused about which 'start' if I used start here
    # 'beginning' is a module-level timestamp set when the script started.
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed
    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()
    maxTime = 4*3600  # total sleep budget: 4 hours
    totalTime = 0
    incrTime = 60  # seconds between checks/heartbeats
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"
    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    # NOTE(review): these messages describe the CHECK_WHILE_SLEEPING=False case;
    # they print unconditionally here — confirm intended placement vs. an else branch.
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")
    while (totalTime<maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            # Heartbeat only: show we're alive and how far through the budget we are.
            print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime
    # don't do this, as the cloud may be hung?
    # Disabled cleanup path, kept for reference (1==0 is never true).
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
# an approx? (was good when comparing to h2o type 2) alphap=0.4 betap=0.4 # this is type 7 alphap=1 betap=1 from scipy import stats a1 = stats.scoreatpercentile(target, per=100*OTHER_T, interpolation_method='fraction') h2p.red_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.red_print("scipy stats.mstats.mquantiles:", a2) targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.red_print("sort algo:", b) h2p.red_print( "from h2o (multi):", quantiles[0]) print "Now looking at the sorted list..same thing" h2p.blue_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.blue_print("scipy stats.mstats.mquantiles:", a2) b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.blue_print("sort algo:", b) h2p.blue_print( "from h2o (multi):", quantiles[0])
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. 
expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i!=0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1==0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols,1) self.assertEqual(numRows,len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, **kwargs):
    """Import a file/folder into h2o without parsing it.

    Dispatches on `schema`:
      - 'put': uploads a single local file via node.put_file and returns
        (None, key) immediately.
      - 'local' (when the node isn't redirecting local imports to s3/s3n):
        imports a local folder via node.import_files.
      - 's3', 's3n', 'maprfs', 'hdfs': builds the schema URI from bucket/head
        and imports via node.import_s3 / node.import_hdfs. Missing cloud-side
        config (aws credentials, hdfs settings) is printed as an ERROR rather
        than raised (the raise lines are commented out).
    Returns (importResult, importPattern) where importPattern is
    folderURI + "/" + pattern. Raises if path is missing, contains regex
    metacharacters in the folder part, or the schema is unknown. Also raises
    when h2o.abort_after_import is set (-aai harness flag).
    Many keyword args (retryDelaySecs, noise, noPoll, doSummary, ...) are
    accepted for signature compatibility but unused here.
    """
    # no bucket is sometimes legal (fixed path)
    if not node:
        node = h2o.nodes[0]
    if path is None:
        raise Exception("import_only: path parameter needs to be specified")
    # Split into folder part (head) and basename/glob part (pattern).
    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)
    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)
    # to train users
    # okay here
    # The folder part must never contain regex/glob metacharacters.
    if re.search(r"[\*<>{}[\]~`]", head):
        raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    if schema=='put':
        # to train users
        # For put, the basename can't be a regex/glob either (and no '/').
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))
        if not path:
            raise Exception("path= didn't say what file to put")
        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)
        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        # put has no import result and no pattern; return the created key.
        return (None, key)
    if schema=='local' and not \
        (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
        folderURI = 'nfs:/' + folderPath
        importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
    else:
        # Remote schemas: build the folder offset from bucket + head.
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head
        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
        # Cloud-level config checks read state off node 0.
        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)
        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "maprfs:///" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"
            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
        else:
            raise Exception("schema not understood: %s" % schema)
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 2, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 10, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 100, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 1000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else .999 q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? 
mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = twoDecimals(pctile) mx = twoDecimals(maxs) mn = twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if DO_TRY_SCIPY and colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() print scipyCol, pctile[10] generate_scipy_comparison(csvPathnameFull, col=scipyCol, # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single) h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult) h2i.delete_keys_at_all_nodes()
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ('cars.csv', 'c.hex', [ (None, None,None,None,None,None), ('economy (mpg)', None,None,None,None,None), ('cylinders', None,None,None,None,None), ], ), ('runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ('runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ('runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100,00), ], ), ('runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
def import_only(node=None, schema='local', bucket=None, path=None, timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, importParentDir=True, **kwargs): # FIX! hack all put to local, since h2o-dev doesn't have put yet? # multi-machine put will fail as a result. if schema=='put': h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." + "\nMeans multi-machine with 'put' will fail") schema = 'local' if src_key and schema!='put': raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key)) # no bucket is sometimes legal (fixed path) if not node: node = h2o_nodes.nodes[0] if path is None: raise Exception("import_only: path parameter needs to be specified") if "/" in path: (head, pattern) = os.path.split(path) else: (head, pattern) = ("", path) verboseprint("head:", head) verboseprint("pattern:", pattern) # to train users / okay here # normally we import the folder above, but if we import exactly, the path can't have regex # the folder can't have regex in any case if importParentDir: if re.search(r"[\*<>{}[\]~`]", head): raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path)) else: if re.search(r"[\*<>{}[\]~`]", path): raise Exception("h2o path %s can't be regex. path= was %s" % (head, path)) if schema=='put': # to train users if re.search(r"[/\*<>{}[\]~`]", pattern): raise Exception("h2o putfile basename %s can't be regex. 
path= was %s" % (pattern, path)) if not path: raise Exception("path= didn't say what file to put") (folderPath, filename) = find_folder_and_filename(bucket, path, schema) filePath = os.path.join(folderPath, filename) verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath) if not noPrint: h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath) h2p.green_print("Local path to file that will be uploaded: %s" % filePath) h2p.blue_print("That path resolves as:", os.path.realpath(filePath)) if h2o_args.abort_after_import: raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()") key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs) # hmm.. what should importResult be in the put case # set it to None. No import is done, and shouldn't be used if you're doing schema='put' importResult = None return (None, key) if schema=='local' and not \ (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path): (folderPath, pattern) = find_folder_and_filename(bucket, path, schema) filePath = os.path.join(folderPath, pattern) h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath) h2p.green_print("Path h2o will be told to use: %s" % filePath) h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath)) if h2o_args.abort_after_import: raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()") # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder? # is it used for key matching by others? # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets # importPattern = folderURI + "/" + pattern # could include this on the entire importPattern if we no longer have regex basename in h2o-dev? 
# folderURI = 'nfs:/' + folderPath folderURI = 'nfs:/' + os.path.realpath(folderPath) if importParentDir: finalImportString = folderPath else: finalImportString = folderPath + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) else: if bucket is not None and re.match("/", head): verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head) head = head.lstrip('/') # strip leading / in head if present if bucket and head!="": folderOffset = bucket + "/" + head elif bucket: folderOffset = bucket else: folderOffset = head if h2o_args.abort_after_import: raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()") n = h2o_nodes.nodes[0] if schema=='s3' or node.redirect_import_folder_to_s3_path: # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n? folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset) folderURI = "s3://" + folderOffset if not n.aws_credentials: print "aws_credentials: %s" % n.aws_credentials # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built" if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) elif schema=='s3n' or node.redirect_import_folder_to_s3n_path: # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o # should probably deal with this up in the bucket resolution # this may change other cases, but smalldata should only exist as a "bucket" for us? 
folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset) if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)): print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node) if n.hdfs_config: print "hdfs_config: %s" % n.hdfs_config # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built" folderURI = "s3n://" + folderOffset if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) elif schema=='maprfs': if not n.use_maprfs: print "use_maprfs: %s" % n.use_maprfs # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built" # if I use the /// and default, the key names that get created by h2o only have 1 slash # so the parse doesn't find the key name if n.hdfs_name_node: folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset else: # this is different than maprfs? 
normally we specify the name though # folderURI = "maprfs:///" + folderOffset folderURI = "maprfs:/" + folderOffset if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) elif schema=='hdfs': # check that some state from the cloud building time was right # the requirements for this may change and require updating if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)): print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node) if n.hdfs_config: print "hdfs_config: %s" % n.hdfs_config # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built" if n.hdfs_name_node: folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset else: # this is different than maprfs? normally we specify the name though folderURI = "hdfs://" + folderOffset if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) else: raise Exception("schema not understood: %s" % schema) print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString importPattern = folderURI + "/" + pattern return (importResult, importPattern)
print "stress the 1000 fixed binning based on (max-min)/1000" a = [ -1.0000002e10, -1.0000001e10, -1.0000000e10, -1.0000002e9, -1.0000001e9, -1.0000000e9, -1.0000002e6, -1.0000001e6, -1.0000000e6, -1.0000002e3, -1.0000001e3, -1.0000000e3, -1.0, 0.0000000, 1.0, 1.0000002e3, 1.0000001e3, 1.0000000e3, 1.0000002e6, 1.0000001e6, 1.0000000e6, 1.0000002e9, 1.0000001e9, 1.0000000e9, 1.0000002e10, 1.0000001e10, 1.0000000e10 ] initList = ["ddd = c(%s)" % ",".join(map(str, a))] # get expected result a.sort() expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear') print "expectedP:", expectedP h2p.blue_print("sort result, expectedP:", expectedP) exprList = [ ("abc = quantile(ddd[,1], c(%s))" % QUANTILE, 1), ] class Basic(unittest.TestCase): def tearDown(self): h2o.check_sandbox_for_errors() @classmethod def setUpClass(cls): global SEED SEED = h2o.setup_random_seed() h2o.init(1, java_heap_GB=1)
def test_summary2_small(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 100, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 1000, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else .999 q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg= "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?" ) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? 
mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, numRows / len(hcnt), delta=1 + .01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, interpolate='linear', quantile=0.50): SCIPY_INSTALLED = True try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = false target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype=='float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP= map(float, target) # targetFP= np.array(tFP, np.float) if datatype=='int': targetFP= map(int, target) # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 if SCIPY_INSTALLED: p = np.percentile(targetFP, quantile*100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile*100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1==0: # type 6 alphap=0 betap=0 # type 5 okay but not perfect alphap=0.5 betap=0.5 # type 8 alphap=1/3.0 betap=1/3.0 if interpolate=='mean': # an approx? 
(was good when comparing to h2o type 2) alphap=0.4 betap=0.4 if interpolate=='linear': # this is type 7 alphap=1 betap=1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) if SCIPY_INSTALLED: h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.5, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) h2o_util.assertApproxEqual(h2oSummary2, b, rel=0.5, msg='h2o summary2 is not approx. 
same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesApprox: h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile') # give us some slack compared to the scipy use of median (instead of desired mean) if h2oQuantilesExact: if interpolate=='mean': h2o_util.assertApproxEqual(h2oQuantilesExact, s2, rel=0.01, msg='h2o quantile multipass is not approx. same as scipy stats.mstats.mquantiles') else: h2o_util.assertApproxEqual(h2oQuantilesExact, s2, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.mstats.mquantiles') # see if scipy changes. nope. it doesn't if 1==0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
def test_summary2_uniform(self):
    """Generate uniform-random single-column CSVs, run h2o Summary2 and the
    Quantiles page on each, then check:
      - min/max match the actually-generated values (tight rel tolerance)
      - 25th/50th/75th percentiles are near the analytic uniform values
        (loose tolerance, since the random draw itself has variance)
      - histogram bins are roughly equal-count (uniform distribution)
      - h2o's answers agree with a scipy/sort reference via
        h2o_summ.quantile_comparisons.
    Relies on module globals ROWS, MAX_QBINS, DO_MEDIAN and helper
    write_syn_dataset defined elsewhere in this file.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax,
    #  [colname, min, 25th, 50th, 75th, max])
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
        (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
        (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
        (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
        (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
        (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
        (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
        (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
        (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
                                                   colCount, expectedMin,
                                                   expectedMax, SEEDPERFILE)
        # adjust the min/max depending on what the min/max actually was!
        # the expected 25%/50%/75% will still be off
        expected[1] = actualMin
        expected[5] = actualMax

        # max error = half the bin size?
        # use this for comparing to sklearn/sort
        expectedRange = expectedMax - expectedMin
        # because of floor and ceil effects due we potentially lose 2 bins (worst case)
        # the extra bin for the max value, is an extra bin..ignore
        expectedBin = expectedRange / (MAX_QBINS - 2)
        maxDelta = 1 * expectedBin

        # how much error do we get in the random distribution gen? pain. It's a probability issue
        # smaller error likely with larger # of values.
        # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
        # at actual data
        # this is way too coarse. can't get the distribution tight?
        maxDeltaPlusDistVariance = 10 * maxDelta
        # allow some fuzz in the comparison to scipy/sort
        maxDelta = 1.1 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put',
                                       hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        self.assertEqual(colname, expected[0])

        quantile = 0.5 if DO_MEDIAN else .999
        # get both answers since we feed both below for checking
        q = h2o.nodes[0].quantiles(source_key=hex_key,
                                   column=column['colname'],
                                   quantile=quantile,
                                   max_qbins=MAX_QBINS,
                                   multiple_pass=2,
                                   interpolation_type=7)  # linear
        qresult = q['result']
        qresult_single = q['result_single']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", q['iterations'])
        h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
        print h2o.dump_json(q)

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
            mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
            sd)

        zeros = stats['zeros']
        mins = stats['mins']
        # these should match exactly except for fp compare error?
        h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [
            0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
        ]

        pctile = stats['pctile']
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
            msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
            msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
            msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        # too hard to estimate when there are ints now, due to floor/ceil int alignment?
        # don't check the last two bins
        for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxDelta,
            )
        h2o.nodes[0].remove_all_keys()
def findQuantile(d, dmin, dmax, threshold): # return the value at the threshold, or the mean of the two rows that bound it. # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer maxIterations = 30 # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere) totalRows = len(d) # Used to have desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # initialize newValStart = dmin newValEnd = dmax newValRange = newValEnd - newValStart desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine. newValBinSize = newValRange / (desiredBinCnt + 0.0) newLowCount = 0 # count of rows below the bins # yes there is no newHighCount. Created during the pass, though. # state shared by each pass assert maxBinCnt > 0 hcnt2 = [None for b in range(maxBinCnt)] hcnt2_min = [None for b in range(maxBinCnt)] hcnt2_max = [None for b in range(maxBinCnt)] hcnt2_low = 0 hcnt2_high = 0 assert newValBinSize != 0 # can be negative assert newValEnd > newValStart assert newValRange > 0 # break out on stopping condition # reuse the histogram array hcnt2[] iteration = 0 done = False # append to a list of best guesses per pass best_result = [] def htot2(): return sum(hcnt2) + hcnt2_low + hcnt2_high while iteration <= maxIterations and not done: h2p.green_print("newValStart", newValStart) h2p.green_print("newValEnd", newValEnd) h2p.green_print("newValRange", newValRange) h2p.green_print("newValBinSize", newValBinSize) h2p.green_print("newLowCount", newLowCount) h2p.green_print("threshold", threshold) valStart = newValStart valEnd = newValEnd valRange = newValRange valBinSize = newValBinSize lowCount = newLowCount desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # playing with creating relative NUDGE values to make sure bin range # is always inclusive of target. # ratio it down from valBinSize. # It doesn't need to be as big as valBinSize. 
# implicitly, it shouldn't need to be as large as valBinSize # can't seem to make it work yet. leave NUDGE=0 NUDGE = 0 # init to zero for each pass for b in range(maxBinCnt): hcnt2[b] = 0.0 # Init counts outside of the bins hcnt2_low = 0 hcnt2_high = 0 # minimum value for higher than the bin. Needed for interpolation hcnt2_high_min = None for val in d: # Need to count the stuff outside the bin-gathering, # since threshold compare is based on total row compare # on first pass, shouldn't see anything exceed the start/end bounds # since those are min/max for the column? (shouldn't be any fp precision issue? or ??) # oh wait, this valOffset math creates possible precision issue? # maybe we should address it with the NUDGE value below? but what about first pass? valOffset = val - valStart # where are we zeroing in? (start) binIdx2 = int(math.floor(valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide? # do some close looking for possible fp arith issues cA = valOffset < 0 cB = binIdx2 < 0 t = {True: 1, False: 0} # we get the 10 case if ((cA and not cB) or (not cA and cB)): h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \ "binIdx2", binIdx2) cC = val > valEnd cD = binIdx2 >= (maxBinCnt-1) # tighten the compare for printing if ((cC and not cD) or (not cC and cD)): h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \ "binIdx2", binIdx2, "maxBinCnt", maxBinCnt) # example hits this case..i.e. the max value # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3 if valOffset < 0 or binIdx2<0: # if valOffset < 0: # if binIdx2<0: hcnt2_low += 1 # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure! 
# have to use both compares, since can wrap the index (due to start/end shift) # elif val > valEnd or binIdx2>=(maxBinCnt-1): # should this really be a valOffset compare? elif val > valEnd or binIdx2 >= maxBinCnt: # elif val > valEnd: # elif binIdx2>=(maxBinCnt-1): if (hcnt2_high==0) or (val < hcnt2_high_min): hcnt2_high_min = val; print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd hcnt2_high += 1 else: # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \ (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize) if hcnt2[binIdx2]==0 or (val < hcnt2_min[binIdx2]): hcnt2_min[binIdx2] = val; if hcnt2[binIdx2]==0 or (val > hcnt2_max[binIdx2]): hcnt2_max[binIdx2] = val; hcnt2[binIdx2] += 1 # check if we went into the magic extra bin if binIdx2 == (maxBinCnt-1): print "\nFP! val went into the extra maxBinCnt bin:", \ binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n" # check the legal states for these two # we don't have None for checking hcnt2_high_min in java assert hcnt2_high==0 or (hcnt2_high_min is not None) assert (hcnt2_high_min is None) or hcnt2_high!=0 # everything should either be in low, the bins, or high totalBinnedRows = htot2() print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. 
hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) # now walk thru and find out what bin to look inside currentCnt = hcnt2_low targetCntFull = threshold * (totalRows-1) # zero based indexing targetCntInt = int(math.floor(threshold * (totalRows-1))) targetCntFract = targetCntFull - targetCntInt assert targetCntFract>=0 and targetCntFract<=1 print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract k = 0 while ((currentCnt + hcnt2[k]) <= targetCntInt): # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] currentCnt += hcnt2[k] # ugly but have to break out if we'd cycle along with == adding h0's until we go too far # are we supposed to advance to a none zero bin? k += 1 # goes over in the equal case? # if currentCnt >= targetCntInt: # break if k==maxBinCnt: break assert k<maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k-1]) # format string to match java Log.info() in Quantiles.java print "Found k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] assert hcnt2[k]!=1 or hcnt2_min[k]==hcnt2_max[k] # some possibily interpolating guesses first, in guess we have to iterate (best guess) done = False guess = (hcnt2_max[k] - hcnt2_min[k]) / 2 if currentCnt==targetCntInt: if hcnt2[k]>2 and (hcnt2_min[k]==hcnt2_max[k]): guess = hcnt2_min[k] print "Guess A", guess, k, hcnt2[k] if hcnt2[k]==2: print "\nTwo values in this bin but we could be aligned to the 2nd. 
so can't stop" # no mattter what size the fraction it would be on this number guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 # no mattter what size the fraction it would be on this number if INTERPOLATION_TYPE==2: # type 2 (mean) guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 else: # default to type 7 (linear interpolation) # Unlike mean, which just depends on two adjacent values, this adjustment # adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[k] # two adjacent..as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_min[k] + (pctDiff * dDiff) done = False print "Guess B", guess if hcnt2[k]==1 and targetCntFract==0: assert hcnt2_min[k]==hcnt2_max[k] guess = hcnt2_min[k] done = True print "k", k print "Guess C", guess if hcnt2[k]==1 and targetCntFract!=0: assert hcnt2_min[k]==hcnt2_max[k] print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero" if k<maxBinCnt: nextK = k + 1 # could put it over maxBinCnt else: nextK = k while nextK<maxBinCnt and hcnt2[nextK]==0: nextK += 1 # have the "extra bin" for this if nextK >= maxBinCnt: assert hcnt2_high!=0 print "Using hcnt2_high_min for interpolate:", hcnt2_high_min nextVal = hcnt2_high_min else: print "Using nextK for interpolate:", nextK assert hcnt2[nextK]!=0 nextVal = hcnt2_min[nextK] guess = (hcnt2_max[k] + nextVal) / 2.0 # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK if INTERPOLATION_TYPE==2: # type 2 (mean) guess = (hcnt2_max[k] + nextVal) / 2.0 pctDiff = 0.5 else: # default to type 7 (linear interpolation) dDiff = nextVal - hcnt2_max[k] # two adjacent, as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_max[k] + (pctDiff * dDiff) done = True # has to be one above us when needed. 
(or we're at end) print 'k', 'hcnt2_max[k]', 'nextVal' print "hello3:", k, hcnt2_max[k], nextVal print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal) print "Guess D", guess if not done: print "Not done, setting new range",\ "k: ", k,\ "currentCnt: ", currentCnt,\ "hcnt2_min[k]: ", hcnt2_min[k],\ "hcnt2_max[k]: ", hcnt2_max[k] # possible bin leakage at start/end edges due to fp arith. # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you. # Just need to check the one bin below and above k, if they exist. if k > 0 and hcnt2[k-1]>0 and (hcnt2_max[k-1]<hcnt2_min[k]): newValStart = hcnt2_max[k-1] else: newValStart = hcnt2_min[k] # subtle. we do put stuff in the extra end bin (see the print above that happens) # k might be pointing to one less than that (like k=0 for 1 bin case) if k < maxBinCnt and hcnt2[k+1]>0 and (hcnt2_min[k+1]>hcnt2_max[k]): print "hello" newValEnd = hcnt2_min[k+1] else: newValEnd = hcnt2_max[k] newValRange = newValEnd - newValStart # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues? newValBinSize = newValRange / (desiredBinCnt + 0.0) # the start/end should never change if we're just using one bin # this is a bin leakage test, if you use one bin. (we should never resolve exactly stop at max iterations # assumes NUDGE is 0 if NUDGE == 0.0: assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\ "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd) newLowCount = currentCnt if newValBinSize==0: # assert done or newValBinSize!=0 and live with current guess print "Assuming done because newValBinSize is 0." 
print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\ (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k]) guess = newValStart print "Guess E", guess done = True # if we have to interpolate # if it falls into this bin, interpolate to this bin means one answer? # cover the case above with multiple entris in a bin, all the same value # will be zero on the last pass? # assert newValBinSize != 0 or done # need the count up to but not including newValStart best_result.append(guess) iteration += 1 h2p.blue_print("Ending Pass", iteration) h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k]) print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high print "was", valStart, valEnd, valRange, valBinSize print "next", newValStart, newValEnd, newValRange, newValBinSize return best_result[-1]
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]), (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2550.0, -1250.0, 0.0]), (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]), (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]), (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]), (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]), (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]), (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]), (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # adjust the min/max depending on what the min/max actually was! # the expected 25%/50%/75% will still be off expected[1] = actualMin expected[5] = actualMax # max error = half the bin size? # use this for comparing to sklearn/sort expectedRange = expectedMax - expectedMin # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxDelta = 0.5 * expectedBin # how much error do we get in the random distribution gen? pain. 
It's a probability issue # smaller error likely with larger # of values. # the maxDelta used for the scipy/sort compare can be tighter, since it's looking # at actual data # this is way too coarse. can't get the distribution tight? maxDeltaPlusDistVariance = 10 * maxDelta # allow some fuzz in the comparison to scipy/sort maxDelta = 1.1 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] # these should match exactly except for fp compare error? h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, msg='25th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, msg='50th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, msg='75th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxDelta, ) h2o.nodes[0].remove_all_keys()
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False,
    importParentDir=True, **kwargs):
    """Import (but do not parse) a file/folder into h2o via the requested schema.

    schema selects the transport: 'put' (upload a single local file),
    'local' (nfs import of a local folder), 's3', 's3n', 'maprfs', 'hdfs'.
    bucket/path name the data; importParentDir=True imports the containing
    folder (so `pattern` may match several files), False imports the exact path.

    Returns (importResult, importPattern) where importPattern is the
    folderURI + "/" + pattern h2o will use to find the key(s); for
    schema='put' returns (None, key) with the uploaded key name instead.

    Raises on: src_key with a non-'put' schema, missing path, regex
    characters where they aren't allowed, unknown schema, or when the global
    h2o.abort_after_import flag (-aai) is set.
    NOTE: many params (retryDelaySecs, noise, noPoll, doSummary, ...) are
    accepted for signature compatibility with callers but unused here.
    """
    if src_key and schema != 'put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    # split into the containing folder (head) and the basename/regex (pattern)
    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema == 'put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # upload and return early: 'put' has no importResult/importPattern
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    if schema == 'local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = 'nfs:/' + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')  # strip leading / in head if present

        # build the path offset within the remote filesystem from bucket + head
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema == 's3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema == 's3n' or node.redirect_import_folder_to_s3n_path:
            # s3n goes through the hdfs import path, so hdfs cloud config must be present
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == 'maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == 'hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ( 'cars.csv', 'c.hex', [ (None, None, None, None, None, None), ('economy (mpg)', None, None, None, None, None), ('cylinders', None, None, None, None, None), ], ), ( 'runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ( 'runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ( 'runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100, 00), ], ), ( 'runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles( source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype != 'Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals( sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual( mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual( maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype != 'Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
1.0000001e9, 1.0000000e9, 1.0000002e10, 1.0000001e10, 1.0000000e10 ] initList = [ "ddd = c(%s)" % ",".join(map(str,a)) ] # get expected result a.sort() expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear') print "expectedP:", expectedP h2p.blue_print("sort result, expectedP:", expectedP) exprList = [ ("abc = quantile(ddd[,1], c(%s))" % QUANTILE, 1), ] class Basic(unittest.TestCase): def tearDown(self): h2o.check_sandbox_for_errors() @classmethod def setUpClass(cls): global SEED SEED = h2o.setup_random_seed() h2o.init(1, java_heap_GB=1)
def test_summary2_unifiles2(self):
    """Parse a handful of known csv files (non-beta parse API), run Summary2 and
    the Quantiles page per column, and sanity-check percentiles against the
    expected values and a scipy/sort reference (h2o_summ.quantile_comparisons).

    NOTE(review): this file contains a second method with the same name further
    down; only the last definition in a class body takes effect at runtime —
    confirm which one is intended.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # new with 1000 bins. copy expected from R
    # (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix)
    # expectedCols entries: (colname, min, 25th, 50th, 75th, max); None => don't check
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        ('breadth.csv', 'b.hex', False, [('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
        # ('wonkysummary.csv', 'b.hex', False, [('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
        ('wonkysummary.csv', 'b.hex', True, [('X1', None, None, None, None, None)], 'smalldata', None),
        ('covtype.data', 'c.hex', False, [('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60

    for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:
        # parse with the old (non-beta) API; switched back to beta below for Summary2
        h2o.beta_features = False
        if pathPrefix:
            csvPathname = pathPrefix + "/" + csvFilename
        else:
            csvPathname = csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        if skipHeader:
            header = 1
        else:
            header = 0
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        # 'response'/'time' only exist on the non-beta parse result
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        # non-beta inspect keys (beta uses "numRows"/"numCols")
        numRows = inspect["num_rows"]
        numCols = inspect["num_cols"]

        # Summary2/Quantiles are beta-only pages
        h2o.beta_features = True
        # okay to get more cols than we want
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        summaries = summaryResult['summaries']

        scipyCol = 0
        for expected, column in zip(expectedCols, summaries):
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else OTHER_Q
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)
            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

            # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            print stattype

            # FIX! we should compare mean and sd to expected?
            # enums don't have mean or sd?
            if stattype != 'Enum':
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)
                pct = stats['pct']
                print "pct:", pct
                print ""
                # the thresholds h2o used, should match what we expected
                pctile = stats['pctile']
                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']
            for b in hcnt:
                # should we be able to check for a uniform distribution in the files?
                e = .1 * numRows
                # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            if stattype != 'Enum':
                pt = h2o_util.twoDecimals(pctile)
                print "colname:", colname, "pctile (2 places):", pt
                mx = h2o_util.twoDecimals(maxs)
                mn = h2o_util.twoDecimals(mins)
                print "colname:", colname, "maxs: (2 places):", mx
                print "colname:", colname, "mins: (2 places):", mn

                # FIX! we should do an exec and compare using the exec quantile too
                actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                print "min/25/50/75/max colname:", colname, "(2 places):", actual
                print "maxs colname:", colname, "(2 places):", mx
                print "mins colname:", colname, "(2 places):", mn

                ## ignore for blank colnames, issues with quoted numbers
                # covtype is too big to do in scipy
                if colname != '' and expected[scipyCol] and csvFilename != 'covtype.data':
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=skipHeader,  # important!!
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else OTHER_Q,
                        h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                        h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                    )
            scipyCol += 1
        trial += 1
def test_summary2_unifiles2(self):
    """Variant of test_summary2_unifiles2 (same name — at runtime this later
    definition shadows the earlier one): beta-API parse/inspect, wonkysummary
    gets concrete min/max expectations, and a per-column maxErr bound
    (derived from the expected range and MAX_QBINS) is passed to
    h2o_summ.quantile_comparisons.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # new with 1000 bins. copy expected from R
    # (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix)
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        ('breadth.csv', 'b.hex', False, [('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
        # ('wonkysummary.csv', 'b.hex', False, [('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
        ('wonkysummary.csv', 'b.hex', True, [('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
        ('covtype.data', 'c.hex', False, [('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60

    for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:
        if pathPrefix:
            csvPathname = pathPrefix + "/" + csvFilename
        else:
            csvPathname = csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        if skipHeader:
            header = 1
        else:
            header = 0
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        # beta-API inspect keys (camelCase, unlike the old "num_rows"/"num_cols")
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        # okay to get more cols than we want
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        summaries = summaryResult['summaries']

        scipyCol = 0
        for expected, column in zip(expectedCols, summaries):
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else OTHER_Q
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)
            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

            # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            print stattype

            # FIX! we should compare mean and sd to expected?
            # enums don't have mean or sd?
            if stattype != 'Enum':
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)
                pct = stats['pct']
                print "pct:", pct
                print ""
                # the thresholds h2o used, should match what we expected
                pctile = stats['pctile']
                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?
                    # hack?
                    maxErr = maxErr * 2
                    print "maxErr:", maxErr
                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']
            for b in hcnt:
                # should we be able to check for a uniform distribution in the files?
                e = .1 * numRows
                # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            if stattype != 'Enum':
                pt = h2o_util.twoDecimals(pctile)
                print "colname:", colname, "pctile (2 places):", pt
                mx = h2o_util.twoDecimals(maxs)
                mn = h2o_util.twoDecimals(mins)
                print "colname:", colname, "maxs: (2 places):", mx
                print "colname:", colname, "mins: (2 places):", mn

                # FIX! we should do an exec and compare using the exec quantile too
                actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                print "min/25/50/75/max colname:", colname, "(2 places):", actual
                print "maxs colname:", colname, "(2 places):", mx
                print "mins colname:", colname, "(2 places):", mn

                ## ignore for blank colnames, issues with quoted numbers
                # covtype is too big to do in scipy
                if colname != '' and expected[scipyCol] and csvFilename != 'covtype.data':
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=skipHeader,  # important!!
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else OTHER_Q,
                        h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                        h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        h2oSummary2MaxErr=maxErr,
                    )
            scipyCol += 1
        trial += 1
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oSummary2MaxErr=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, h2oExecQuantiles=None, interpolate='linear', quantile=0.50, use_genfromtxt=False): SCIPY_INSTALLED = True try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = False if not SCIPY_INSTALLED: return if use_genfromtxt: print "Using numpy.genfromtxt. Better handling of null bytes" target = np.genfromtxt(open(csvPathname, 'r'), delimiter=',', skip_header=1 if skipHeader else 0, dtype=None) # guess! # print "shape:", target.shape() else: print "Using python csv reader" target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype == 'float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP = map(float, target) # targetFP= np.array(tFP, np.float) if datatype == 'int': targetFP = map(int, target) # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. 
numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 p = np.percentile(targetFP, quantile * 100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile * 100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1 == 0: # type 6 alphap = 0 betap = 0 # type 5 okay but not perfect alphap = 0.5 betap = 0.5 # type 8 alphap = 1 / 3.0 betap = 1 / 3.0 if interpolate == 'mean': # an approx? (was good when comparing to h2o type 2) alphap = 0.4 betap = 0.4 if interpolate == 'linear': # this is type 7 alphap = 1 betap = 1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"from h2o exec:", h2oExecQuantiles) # they should be identical. 
keep a tight absolute tolerance # Note the comparisons have different tolerances, some are relative, some are absolute if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual( h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oQuantilesApprox: # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN if math.isnan(float(h2oQuantilesApprox)): raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) if h2oSummary2MaxErr: h2o_util.assertApproxEqual( h2oQuantilesApprox, b, tol=h2oSummary2MaxErr, msg='h2o quantile singlepass is not approx. same as sort algo') else: h2o_util.assertApproxEqual( h2oQuantilesApprox, b, rel=0.1, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) if h2oSummary2MaxErr: # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2)) h2o_util.assertApproxEqual( h2oSummary2, b, tol=h2oSummary2MaxErr, msg= 'h2o summary2 is not approx. same as sort algo (calculated expected max error)' ) else: # bounds are way off, since it depends on the min/max of the col, not the expected value h2o_util.assertApproxEqual( h2oSummary2, b, rel=1.0, msg= 'h2o summary2 is not approx. same as sort algo (sloppy compare)' ) if h2oQuantilesApprox and h2oSummary2: # they should both get the same answer. Currently they have different code, but same algo # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases. # not sure why..maybe some subtle algo diff. h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04, msg='h2o summary2 is not approx. 
same as h2o singlepass.'+\ ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation') if h2oExecQuantiles: if math.isnan(float(h2oExecQuantiles)): raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles) # bounds are way off h2o_util.assertApproxEqual( h2oExecQuantiles, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesExact: h2o_util.assertApproxEqual( h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual( h2oQuantilesExact, s1, tol=0.0000002, msg= 'h2o quantile multipass is not same as scipy stats.scoreatpercentile' ) # give us some slack compared to the scipy use of median (instead of desired mean) # since we don't have bounds here like above, just stop this test for now if h2oQuantilesApprox and 1 == 0: if interpolate == 'mean': h2o_util.assertApproxEqual( h2oQuantilesApprox, s2, rel=0.5, msg= 'h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles' ) else: h2o_util.assertApproxEqual( h2oQuantilesApprox, s2, rel=0.5, msg= 'h2o quantile singlepass is not same as scipy stats.mstats.mquantiles' ) # see if scipy changes. nope. it doesn't if 1 == 0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (5000000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (5000000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (1000000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (1000000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (1000000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (1000000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (1000000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (1000000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (1000000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (1000000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = twoDecimals(pctile) mx = twoDecimals(maxs) mn = twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? 
# execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, trial in enumerate(thresholds): execExpr = "quantile(%s[,1], c(%s));" % (hex_key, trial) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) ex = twoDecimals(result) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (trial, ex, pt[i])) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='percentile: % is not expected: %s' % (result, pctile[i])) if DO_TRY_SCIPY: generate_scipy_comparison(csvPathnameFull)
def generate_scipy_comparison(csvPathname, col=0, h2oMedian=None, h2oMedian2=None): # this is some hack code for reading the csv and doing some percentile stuff in scipy # from numpy import loadtxt, genfromtxt, savetxt import numpy as np import scipy as sp dataset = np.genfromtxt( open(csvPathname, 'r'), delimiter=',', # skip_header=1, dtype=None); # guess! print "csv read for training, done" # we're going to strip just the last column for percentile work # used below NUMCLASSES = 10 print "csv read for training, done" # data is last column # drop the output print dataset.shape if len(dataset.shape) > 1: target = [x[col] for x in dataset] else: target = dataset # we may have read it in as a string. coerce to number targetFP = np.array(target, np.float) if 1==0: n_features = len(dataset[0]) - 1; print "n_features:", n_features # get the end # target = [x[-1] for x in dataset] # get the 2nd col print "histogram of target" print target print sp.histogram(target, bins=NUMCLASSES) print target[0] print target[1] thresholds = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] print "scipy per:", thresholds from scipy import stats # a = stats.scoreatpercentile(target, per=per) a = stats.mstats.mquantiles(targetFP, prob=thresholds) a2 = ["%.2f" % v for v in a] h2p.red_print("scipy stats.mstats.mquantiles:", a2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') label = '50%' if DO_MEDIAN else '99.9%' h2p.blue_print(label, "from sort:", b) s = a[5 if DO_MEDIAN else 10] h2p.blue_print(label, "from scipy:", s) h2p.blue_print(label, "from h2o summary2:", h2oMedian) h2p.blue_print(label, "from h2o quantile multipass:"******"%.2f" % v for v in a] h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", a2)
def import_only( node=None, schema="local", bucket=None, path=None, timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, importParentDir=True, **kwargs ): if src_key and schema != "put": raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key)) # no bucket is sometimes legal (fixed path) if not node: node = h2o.nodes[0] if path is None: raise Exception("import_only: path parameter needs to be specified") if "/" in path: (head, pattern) = os.path.split(path) else: (head, pattern) = ("", path) h2o.verboseprint("head:", head) h2o.verboseprint("pattern:", pattern) # to train users / okay here # normally we import the folder above, but if we import exactly, the path can't have regex # the folder can't have regex in any case if importParentDir: if re.search(r"[\*<>{}[\]~`]", head): raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path)) else: if re.search(r"[\*<>{}[\]~`]", path): raise Exception("h2o path %s can't be regex. path= was %s" % (head, path)) if schema == "put": # to train users if re.search(r"[/\*<>{}[\]~`]", pattern): raise Exception("h2o putfile basename %s can't be regex. 
path= was %s" % (pattern, path)) if not path: raise Exception("path= didn't say what file to put") (folderPath, filename) = find_folder_and_filename(bucket, path, schema) filePath = os.path.join(folderPath, filename) h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath) if not noPrint: h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath) h2p.green_print("Local path to file that will be uploaded: %s" % filePath) h2p.blue_print("That path resolves as:", os.path.realpath(filePath)) if h2o.abort_after_import: raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()") key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs) # hmm.. what should importResult be in the put case # set it to None. No import is done, and shouldn't be used if you're doing schema='put' importResult = None return (None, key) if schema == "local" and not (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path): (folderPath, pattern) = find_folder_and_filename(bucket, path, schema) filePath = os.path.join(folderPath, pattern) h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath) h2p.green_print("Path h2o will be told to use: %s" % filePath) h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath)) if h2o.abort_after_import: raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()") folderURI = "nfs:/" + folderPath if importParentDir: importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs) else: importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs) else: if bucket is not None and re.match("/", head): h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head) head = head.lstrip("/") # strip leading / in head if present if bucket and head != "": folderOffset = bucket + "/" + head 
elif bucket: folderOffset = bucket else: folderOffset = head print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern if h2o.abort_after_import: raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()") n = h2o.nodes[0] if schema == "s3" or node.redirect_import_folder_to_s3_path: # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o # should probably deal with this up in the bucket resolution # this may change other cases, but smalldata should only exist as a "bucket" for us? folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset) folderURI = "s3://" + folderOffset if not n.aws_credentials: print "aws_credentials: %s" % n.aws_credentials # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built" if importParentDir: importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs) else: importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs) elif schema == "s3n" or node.redirect_import_folder_to_s3n_path: # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o # should probably deal with this up in the bucket resolution # this may change other cases, but smalldata should only exist as a "bucket" for us? 
folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset) if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)): print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % ( n.use_hdfs, n.hdfs_version, n.hdfs_name_node, ) if n.hdfs_config: print "hdfs_config: %s" % n.hdfs_config # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built" folderURI = "s3n://" + folderOffset if importParentDir: importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs) else: importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs) elif schema == "maprfs": if not n.use_maprfs: print "use_maprfs: %s" % n.use_maprfs # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built" # if I use the /// and default, the key names that get created by h2o only have 1 slash # so the parse doesn't find the key name if n.hdfs_name_node: folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset else: # this is different than maprfs? 
normally we specify the name though # folderURI = "maprfs:///" + folderOffset folderURI = "maprfs:/" + folderOffset if importParentDir: importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs) else: importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs) elif schema == "hdfs": # check that some state from the cloud building time was right # the requirements for this may change and require updating if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)): print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % ( n.use_hdfs, n.hdfs_version, n.hdfs_name_node, ) if n.hdfs_config: print "hdfs_config: %s" % n.hdfs_config # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built" if n.hdfs_name_node: folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset else: # this is different than maprfs? normally we specify the name though folderURI = "hdfs://" + folderOffset if importParentDir: importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs) else: importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs) else: raise Exception("schema not understood: %s" % schema) importPattern = folderURI + "/" + pattern return (importResult, importPattern)
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i != 0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % ( hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print( "\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception( "exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual( result, pctile[i], tol=maxDelta, msg= 'exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1 == 0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % ( hex_key, ",".join(map(str, thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % ( hex_key, ",".join(map(str, thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols, 1) self.assertEqual(numRows, len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
# this is type 7 alphap = 1 betap = 1 from scipy import stats a1 = stats.scoreatpercentile(target, per=100 * OTHER_T, interpolation_method='fraction') h2p.red_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.red_print("scipy stats.mstats.mquantiles:", a2) targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.red_print("sort algo:", b) h2p.red_print("from h2o (multi):", quantiles[0]) print "Now looking at the sorted list..same thing" h2p.blue_print("stats.scoreatpercentile:", a1) a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap) h2p.blue_print("scipy stats.mstats.mquantiles:", a2) b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear') h2p.blue_print("sort algo:", b) h2p.blue_print("from h2o (multi):", quantiles[0])
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oSummary2MaxErr=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, h2oExecQuantiles=None, interpolate='linear', quantile=0.50, use_genfromtxt=False): SCIPY_INSTALLED = False try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = False if use_genfromtxt and SCIPY_INSTALLED: print "Using numpy.genfromtxt. Better handling of null bytes" target = np.genfromtxt( open(csvPathname, 'r'), delimiter=',', skip_header=1 if skipHeader else 0, dtype=None) # guess! # print "shape:", target.shape() else: print "Using python csv reader" target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype=='float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP = map(float, target) # targetFP= np.array(tFP, np.float) if datatype=='int': targetFP = map(int, target) if SCIPY_INSTALLED: # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. 
numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 p = np.percentile(targetFP, quantile*100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile*100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1==0: # type 6 alphap=0 betap=0 # type 5 okay but not perfect alphap=0.5 betap=0.5 # type 8 alphap=1/3.0 betap=1/3.0 if interpolate=='mean': # an approx? (was good when comparing to h2o type 2) alphap=0.4 betap=0.4 if interpolate=='linear': # this is type 7 alphap=1 betap=1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) if SCIPY_INSTALLED: h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"from h2o exec:", h2oExecQuantiles) # they should be identical. 
keep a tight absolute tolerance # Note the comparisons have different tolerances, some are relative, some are absolute if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oQuantilesApprox: # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN if math.isnan(float(h2oQuantilesApprox)): raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) if h2oSummary2MaxErr: h2o_util.assertApproxEqual(h2oQuantilesApprox, b, tol=h2oSummary2MaxErr, msg='h2o quantile singlepass is not approx. same as sort algo') else: h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.1, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) if h2oSummary2MaxErr: # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2)) h2o_util.assertApproxEqual(h2oSummary2, b, tol=h2oSummary2MaxErr, msg='h2o summary2 is not approx. same as sort algo (calculated expected max error)') else: # bounds are way off, since it depends on the min/max of the col, not the expected value h2o_util.assertApproxEqual(h2oSummary2, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo (sloppy compare)') if h2oQuantilesApprox and h2oSummary2: # they should both get the same answer. Currently they have different code, but same algo # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases. # not sure why..maybe some subtle algo diff. h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04, msg='h2o summary2 is not approx. 
same as h2o singlepass.'+\ ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation') if h2oExecQuantiles: if math.isnan(float(h2oExecQuantiles)): raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles) # bounds are way off h2o_util.assertApproxEqual(h2oExecQuantiles, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesExact: h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile') # give us some slack compared to the scipy use of median (instead of desired mean) # since we don't have bounds here like above, just stop this test for now if h2oQuantilesApprox and 1==0: if interpolate=='mean': h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5, msg='h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles') else: h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5, msg='h2o quantile singlepass is not same as scipy stats.mstats.mquantiles') # see if scipy changes. nope. it doesn't if 1==0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
def findQuantile(d, dmin, dmax, threshold): # return the value at the threshold, or the mean of the two rows that bound it. # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer maxIterations = 30 # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere) totalRows = len(d) # Used to have desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # initialize newValStart = dmin newValEnd = dmax newValRange = newValEnd - newValStart desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine. newValBinSize = newValRange / (desiredBinCnt + 0.0) newLowCount = 0 # count of rows below the bins # yes there is no newHighCount. Created during the pass, though. # state shared by each pass assert maxBinCnt > 0 hcnt2 = [None for b in range(maxBinCnt)] hcnt2_min = [None for b in range(maxBinCnt)] hcnt2_max = [None for b in range(maxBinCnt)] hcnt2_low = 0 hcnt2_high = 0 assert newValBinSize != 0 # can be negative assert newValEnd > newValStart assert newValRange > 0 # break out on stopping condition # reuse the histogram array hcnt2[] iteration = 0 done = False # append to a list of best guesses per pass best_result = [] def htot2(): return sum(hcnt2) + hcnt2_low + hcnt2_high while iteration <= maxIterations and not done: h2p.green_print("newValStart", newValStart) h2p.green_print("newValEnd", newValEnd) h2p.green_print("newValRange", newValRange) h2p.green_print("newValBinSize", newValBinSize) h2p.green_print("newLowCount", newLowCount) h2p.green_print("threshold", threshold) valStart = newValStart valEnd = newValEnd valRange = newValRange valBinSize = newValBinSize lowCount = newLowCount desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # playing with creating relative NUDGE values to make sure bin range # is always inclusive of target. # ratio it down from valBinSize. # It doesn't need to be as big as valBinSize. 
# implicitly, it shouldn't need to be as large as valBinSize # can't seem to make it work yet. leave NUDGE=0 NUDGE = 0 # init to zero for each pass for b in range(maxBinCnt): hcnt2[b] = 0.0 # Init counts outside of the bins hcnt2_low = 0 hcnt2_high = 0 # minimum value for higher than the bin. Needed for interpolation hcnt2_high_min = None for val in d: # Need to count the stuff outside the bin-gathering, # since threshold compare is based on total row compare # on first pass, shouldn't see anything exceed the start/end bounds # since those are min/max for the column? (shouldn't be any fp precision issue? or ??) # oh wait, this valOffset math creates possible precision issue? # maybe we should address it with the NUDGE value below? but what about first pass? valOffset = val - valStart # where are we zeroing in? (start) binIdx2 = int(math.floor( valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide? # do some close looking for possible fp arith issues cA = valOffset < 0 cB = binIdx2 < 0 t = {True: 1, False: 0} # we get the 10 case if ((cA and not cB) or (not cA and cB)): h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \ "binIdx2", binIdx2) cC = val > valEnd cD = binIdx2 >= (maxBinCnt - 1) # tighten the compare for printing if ((cC and not cD) or (not cC and cD)): h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \ "binIdx2", binIdx2, "maxBinCnt", maxBinCnt) # example hits this case..i.e. the max value # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3 if valOffset < 0 or binIdx2 < 0: # if valOffset < 0: # if binIdx2<0: hcnt2_low += 1 # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure! 
# have to use both compares, since can wrap the index (due to start/end shift) # elif val > valEnd or binIdx2>=(maxBinCnt-1): # should this really be a valOffset compare? elif val > valEnd or binIdx2 >= maxBinCnt: # elif val > valEnd: # elif binIdx2>=(maxBinCnt-1): if (hcnt2_high == 0) or (val < hcnt2_high_min): hcnt2_high_min = val print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd hcnt2_high += 1 else: # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \ (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize) if hcnt2[binIdx2] == 0 or (val < hcnt2_min[binIdx2]): hcnt2_min[binIdx2] = val if hcnt2[binIdx2] == 0 or (val > hcnt2_max[binIdx2]): hcnt2_max[binIdx2] = val hcnt2[binIdx2] += 1 # check if we went into the magic extra bin if binIdx2 == (maxBinCnt - 1): print "\nFP! val went into the extra maxBinCnt bin:", \ binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n" # check the legal states for these two # we don't have None for checking hcnt2_high_min in java assert hcnt2_high == 0 or (hcnt2_high_min is not None) assert (hcnt2_high_min is None) or hcnt2_high != 0 # everything should either be in low, the bins, or high totalBinnedRows = htot2() print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. 
hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) # now walk thru and find out what bin to look inside currentCnt = hcnt2_low targetCntFull = threshold * (totalRows - 1) # zero based indexing targetCntInt = int(math.floor(threshold * (totalRows - 1))) targetCntFract = targetCntFull - targetCntInt assert targetCntFract >= 0 and targetCntFract <= 1 print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract k = 0 while ((currentCnt + hcnt2[k]) <= targetCntInt): # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] currentCnt += hcnt2[k] # ugly but have to break out if we'd cycle along with == adding h0's until we go too far # are we supposed to advance to a none zero bin? k += 1 # goes over in the equal case? # if currentCnt >= targetCntInt: # break if k == maxBinCnt: break assert k < maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % ( k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k - 1]) # format string to match java Log.info() in Quantiles.java print "Found k (multi): ", k, " ", currentCnt, " ", targetCntInt, " ", totalRows, " ", hcnt2[ k], " ", hcnt2_min[k], " ", hcnt2_max[k] assert hcnt2[k] != 1 or hcnt2_min[k] == hcnt2_max[k] # some possibily interpolating guesses first, in guess we have to iterate (best guess) done = False guess = (hcnt2_max[k] - hcnt2_min[k]) / 2 # we maight not have gottent all the way if currentCnt == targetCntInt: if hcnt2[k] > 2 and (hcnt2_min[k] == hcnt2_max[k]): guess = hcnt2_min[k] print "Guess A", guess, k, hcnt2[k] if hcnt2[k] == 2: print "hello" print "\nTwo values in this bin but we could be aligned to the 2nd. 
so can't stop" # no mattter what size the fraction it would be on this number guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 # no mattter what size the fraction it would be on this number if INTERPOLATION_TYPE == 2: # type 2 (mean) guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 else: # default to type 7 (linear interpolation) # Unlike mean, which just depends on two adjacent values, this adjustment # adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[ k] # two adjacent..as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_min[k] + (pctDiff * dDiff) done = False print "Guess B", guess if hcnt2[k] == 1 and targetCntFract == 0: assert hcnt2_min[k] == hcnt2_max[k] guess = hcnt2_min[k] done = True print "k", k print "Guess C", guess if hcnt2[k] == 1 and targetCntFract != 0: assert hcnt2_min[k] == hcnt2_max[k] print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero" if k < maxBinCnt: nextK = k + 1 # could put it over maxBinCnt else: nextK = k while nextK < maxBinCnt and hcnt2[nextK] == 0: nextK += 1 # have the "extra bin" for this if nextK >= maxBinCnt: assert hcnt2_high != 0 print "Using hcnt2_high_min for interpolate:", hcnt2_high_min nextVal = hcnt2_high_min else: print "Using nextK for interpolate:", nextK assert hcnt2[nextK] != 0 nextVal = hcnt2_min[nextK] guess = (hcnt2_max[k] + nextVal) / 2.0 # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK if INTERPOLATION_TYPE == 2: # type 2 (mean) guess = (hcnt2_max[k] + nextVal) / 2.0 pctDiff = 0.5 else: # default to type 7 (linear interpolation) dDiff = nextVal - hcnt2_max[ k] # two adjacent, as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_max[k] + (pctDiff * dDiff) done = True # has to be one above us when needed. 
(or we're at end) print 'k', 'hcnt2_max[k]', 'nextVal' print "hello3:", k, hcnt2_max[k], nextVal print "\nInterpolating result using nextK: %s nextVal: %s" % ( nextK, nextVal) print "Guess D", guess if not done: print "%s %s %s %s Not done, setting new range" % (hcnt2[k], currentCnt, targetCntInt, targetCntFract),\ "k: ", k,\ "currentCnt: ", currentCnt,\ "hcnt2_min[k]: ", hcnt2_min[k],\ "hcnt2_max[k]: ", hcnt2_max[k] # possible bin leakage at start/end edges due to fp arith. # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you. # Just need to check the one bin below and above k, if they exist. if k > 0 and hcnt2[k - 1] > 0 and (hcnt2_max[k - 1] < hcnt2_min[k]): print "1" newValStart = hcnt2_max[k - 1] else: print "2" newValStart = hcnt2_min[k] # subtle. we do put stuff in the extra end bin (see the print above that happens) # k might be pointing to one less than that (like k=0 for 1 bin case) if k < maxBinCnt and hcnt2[k + 1] > 0 and (hcnt2_min[k + 1] > hcnt2_max[k]): print "3" newValEnd = hcnt2_min[k + 1] else: print "4" newValEnd = hcnt2_max[k] newValRange = newValEnd - newValStart # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues? newValBinSize = newValRange / (desiredBinCnt + 0.0) # the start/end should never change if we're just using one bin # this is a bin leakage test, if you use one bin. (we should never resolve exactly stop at max iterations # assumes NUDGE is 0 if NUDGE == 0.0: assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\ "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd) newLowCount = currentCnt if newValBinSize == 0: # assert done or newValBinSize!=0 and live with current guess print "Assuming done because newValBinSize is 0." 
print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\ (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k]) guess = newValStart print "Guess E", guess # was done = True 3/20/14 done = True # if we have to interpolate # if it falls into this bin, interpolate to this bin means one answer? # cover the case above with multiple entries in a bin, all the same value # will be zero on the last pass? # assert newValBinSize != 0 or done # need the count up to but not including newValStart best_result.append(guess) iteration += 1 h2p.blue_print("Ending Pass", iteration) h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k]) print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high print "was", valStart, valEnd, valRange, valBinSize print "next", newValStart, newValEnd, newValRange, newValBinSize return best_result[-1]
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else 0.999 q = h2o.nodes[0].quantiles( source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, ) qresult = q["result"] qresult_single = q["result_single"] qresult_iterations = q["iterations"] qresult_interpolated = q["interpolated"] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?", ) # only one column column = summaryResult["summaries"][0] colname = column["colname"] coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? 
mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] pct = stats["pct"] # the thresholds h2o used, should match what we expected expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected") hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual( b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e) ) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != "": # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def import_only(node=None, schema='local', bucket=None, path=None, timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, importParentDir=True, **kwargs): # FIX! hack all put to local, since h2o-dev doesn't have put yet? # multi-machine put will fail as a result. # if schema=='put': # h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." + # "\nMeans multi-machine with 'put' will fail") # schema = 'local' if src_key and schema != 'put': raise Exception( "can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key)) # no bucket is sometimes legal (fixed path) if not node: node = h2o_nodes.nodes[0] if path is None: raise Exception("import_only: path parameter needs to be specified") if "/" in path: (head, pattern) = os.path.split(path) else: (head, pattern) = ("", path) verboseprint("head:", head) verboseprint("pattern:", pattern) # to train users / okay here # normally we import the folder above, but if we import exactly, the path can't have regex # the folder can't have regex in any case if importParentDir: if re.search(r"[\*<>{}[\]~`]", head): raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path)) else: if re.search(r"[\*<>{}[\]~`]", path): raise Exception("h2o path %s can't be regex. path= was %s" % (head, path)) if schema == 'put': # to train users if re.search(r"[/\*<>{}[\]~`]", pattern): raise Exception( "h2o putfile basename %s can't be regex. 
path= was %s" % (pattern, path)) if not path: raise Exception("path= didn't say what file to put") (folderPath, filename) = find_folder_and_filename(bucket, path, schema) filePath = os.path.join(folderPath, filename) verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath) if not noPrint: h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath) h2p.green_print("Local path to file that will be uploaded: %s" % filePath) h2p.blue_print("That path resolves as:", os.path.realpath(filePath)) if h2o_args.abort_after_import: raise Exception( "Aborting due to abort_after_import (-aai) argument's effect in import_only()" ) # h2o-dev: it always wants a key name if src_key is None: src_key = filename key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs) # hmm.. what should importResult be in the put case # set it to None. No import is done, and shouldn't be used if you're doing schema='put' # ..make it look like an import files result..This is just for test consistency importResult = json.loads('{\ "dels": [],\ "fails": [],\ "files": ["%s"],\ "keys": ["%s"],\ "path": "%s",\ "schema_name": null, "schema_type": null, "schema_version": null\ }' % (filename, src_key, filePath)) return (importResult, key) if schema=='local' and not \ (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path): (folderPath, pattern) = find_folder_and_filename(bucket, path, schema) filePath = os.path.join(folderPath, pattern) h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath) h2p.green_print("Path h2o will be told to use: %s" % filePath) h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath)) if h2o_args.abort_after_import: raise Exception( "Aborting due to abort_after_import (-aai) argument's effect in import_only()" ) # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder? 
# is it used for key matching by others? # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets # importPattern = folderURI + "/" + pattern # could include this on the entire importPattern if we no longer have regex basename in h2o-dev? folderURI = 'nfs:/' + folderPath # folderURI = 'nfs:/' + os.path.realpath(folderPath) if importParentDir: finalImportString = folderPath else: finalImportString = folderPath + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) else: if bucket is not None and re.match("/", head): verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head) head = head.lstrip('/') # strip leading / in head if present if bucket and head != "": folderOffset = bucket + "/" + head elif bucket: folderOffset = bucket else: folderOffset = head if h2o_args.abort_after_import: raise Exception( "Aborting due to abort_after_import (-aai) argument's effect in import_only()" ) n = h2o_nodes.nodes[0] if schema == 's3' or node.redirect_import_folder_to_s3_path: # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n? folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset) folderURI = "s3://" + folderOffset if not n.aws_credentials: print "aws_credentials: %s" % n.aws_credentials # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built" if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) elif schema == 's3n' or node.redirect_import_folder_to_s3n_path: # FIX! 
hack for now...when we change import folder to import s3, point to unique bucket name for h2o # should probably deal with this up in the bucket resolution # this may change other cases, but smalldata should only exist as a "bucket" for us? folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset) if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)): print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % ( n.use_hdfs, n.hdfs_version, n.hdfs_name_node) if n.hdfs_config: print "hdfs_config: %s" % n.hdfs_config # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built" folderURI = "s3n://" + folderOffset if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) elif schema == 'maprfs': if not n.use_maprfs: print "use_maprfs: %s" % n.use_maprfs # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built" # if I use the /// and default, the key names that get created by h2o only have 1 slash # so the parse doesn't find the key name if n.hdfs_name_node: folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset else: # this is different than maprfs? 
normally we specify the name though # folderURI = "maprfs:///" + folderOffset folderURI = "maprfs:/" + folderOffset if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) elif schema == 'hdfs': # check that some state from the cloud building time was right # the requirements for this may change and require updating if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)): print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % ( n.use_hdfs, n.hdfs_version, n.hdfs_name_node) if n.hdfs_config: print "hdfs_config: %s" % n.hdfs_config # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built") print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built" if n.hdfs_name_node: folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset else: # this is different than maprfs? normally we specify the name though folderURI = "hdfs://" + folderOffset if importParentDir: finalImportString = folderURI else: finalImportString = folderURI + "/" + pattern importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs) else: raise Exception("schema not understood: %s" % schema) print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString importPattern = folderURI + "/" + pattern return (importResult, importPattern)