def test_summary2_exp(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (10, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)), (100, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)), (1000, 1, "x.hex", -5000, 0, ("C1", None, None, None, None, None)), (10000, 1, "x.hex", -100000, 100000, ("C1", None, None, None, None, None)), (100000, 1, "x.hex", -1, 1, ("C1", None, None, None, None, None)), (1000000, 1, "A.hex", 1, 100, ("C1", None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset( csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE ) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult["summaries"][0] colname = column["colname"] coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] pct = stats["pct"] expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected") hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname != "" and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, )
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5*ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5*ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1*ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1*ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1*ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1*ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1*ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100,00]), (1*ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1*ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) numRows = pA.numRows numCols = pA.numCols parse_key = pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) #*************************** # Summary co = h2o_cmd.runSummary(key=parse_key) default_pctiles = co.default_pctiles coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str,probsList)) parameters = { 'model_id': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model( algo='quantile', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iterations']", model.output['iterations'] print "model.output:['names']", model.output['names'] quantiles = model.output['quantiles'][0] # why is this a double array iterations = model.output['iterations'] assert iterations == 11, iterations print "quantiles: ", quantiles print "iterations: ", iterations # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def test_summary2_percentile2(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (500000, 2, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname legalValues = {} for x in range(expectedMin, expectedMax): legalValues[x] = x write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1) if h2o.verbose: print "summaryResult:", h2o.dump_json(summaryResult) summaries = summaryResult['summaries'] scipyCol = 0 for column in summaries: colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] pctile = stats['pctile'] hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: e = .1 * rowCount self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) print "pctile:", pctile print "maxs:", maxs self.assertEqual(maxs[0], expectedMax) print "mins:", mins self.assertEqual(mins[0], expectedMin) for v in pctile: self.assertTrue(v >= expectedMin, "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin)) self.assertTrue(v <= expectedMax, "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax)) eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0] if expectedMin==1: eV = eV1 elif expectedMin==0: eV = [e-1 for e in eV1] elif expectedMin==2: eV = [e+1 for e in eV1] else: raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin) trial += 1 # if colname!='' and expected[scipyCol]: if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) scipyCol += 1
def test_summary2_exp(self): SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # co.label, (min, 25th, 50th, 75th, max) # parse setup error ? supposedly fixed now # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxErr = 1.05 * maxErr expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols # column 0 not used here assert len(expected) == 6 co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr) trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 print "maxErr", maxErr if co.label != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.99, h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ('cars.csv', 'c.hex', [ (None, None,None,None,None,None), ('economy (mpg)', None,None,None,None,None), ('cylinders', None,None,None,None,None), ], ), ('runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ('runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ('runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100,00), ], ), ('runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
def test_summary2_uniform_int_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() M = 100 tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, "B.hex", 1, 1000 * M, ("C1", 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)), (ROWS, 1, "B.hex", 1, 1000, ("C1", 1.0, 250.0, 500.0, 750.0, 1000.0)), (ROWS, 1, "x.hex", 1, 20000, ("C1", 1.0, 5000.0, 10000.0, 15000.0, 20000.0)), (ROWS, 1, "x.hex", -5000, 0, ("C1", -5000.00, -3750.0, -2500.0, -1250.0, 0)), (ROWS, 1, "x.hex", -100000, 100000, ("C1", -100000.0, -50000.0, 0, 50000.0, 100000.0)), # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)), # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)), (ROWS, 1, "B.hex", 1, 10000, ("C1", 1.0, 2501.0, 5001.0, 7501.0, 10000.0)), (ROWS, 1, "B.hex", -100, 100, ("C1", -100.0, -50.0, 0.0, 50.0, 100.0)), (ROWS, 1, "C.hex", 1, 100000, ("C1", 1.0, 25001.0, 50001.0, 75001.0, 100000.0)), # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)), ] if not DO_REAL: # only 3 integer values! tryList.append((1000000, 1, "x.hex", -1, 1, ("C1", -1.0, -1, 0.000, 1, 1.00))) timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = (expectedMax - expectedMin) / (MAX_QBINS + 0.0) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? # maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * 0.01 maxDelta = maxDelta + maxShift SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=60, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult["summaries"][0] colname = column["colname"] self.assertEqual(colname, expected[0]) coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected") pct = stats["pct"] # the thresholds h2o used, should match what we expected expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual( b, rowCount / len(hcnt), delta=0.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e) ) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != "": # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def test_summary2_exp(self): SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] expectedPct= [0.001, 0.001, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def test_summary2_percentile2(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (500000, 2, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname legalValues = {} for x in range(expectedMin, expectedMax): legalValues[x] = x write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1) if h2o.verbose: print "summaryResult:", h2o.dump_json(summaryResult) summaries = summaryResult['summaries'] scipyCol = 0 for column in summaries: colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] pctile = stats['pctile'] hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: e = .1 * rowCount self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) print "pctile:", pctile print "maxs:", maxs self.assertEqual(maxs[0], expectedMax) print "mins:", mins self.assertEqual(mins[0], expectedMin) for v in pctile: self.assertTrue(v >= expectedMin, "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin)) self.assertTrue(v <= expectedMax, "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax)) eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0] if expectedMin==1: eV = eV1 elif expectedMin==0: eV = [e-1 for e in eV1] elif expectedMin==2: eV = [e+1 for e in eV1] else: raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin) trial += 1 # if colname!='' and expected[scipyCol]: if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) scipyCol += 1
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]), (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]), (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]), (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]), (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]), (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]), (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]), (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]), (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # adjust the min/max depending on what the min/max actually was! # the expected 25%/50%/75% will still be off expected[1] = actualMin expected[5] = actualMax # max error = half the bin size? # use this for comparing to sklearn/sort expectedRange = expectedMax - expectedMin # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxDelta = 1 * expectedBin # how much error do we get in the random distribution gen? pain. It's a probability issue # smaller error likely with larger # of values. # the maxDelta used for the scipy/sort compare can be tighter, since it's looking # at actual data # this is way too coarse. can't get the distribution tight? maxDeltaPlusDistVariance = 10 * maxDelta # allow some fuzz in the comparison to scipy/sort maxDelta = 1.1 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] # these should match exactly except for fp compare error? h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, msg='25th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, msg='50th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, msg='75th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxDelta, ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform_int_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() M = 100 tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'B.hex', 1, 1000 * M, ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)), (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0, 1000.0)), (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0, 20000.0)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0, -1250.0, 0)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0, 50000.0, 100000.0)), # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)), # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0, 100.0)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0)), # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)), ] if not DO_REAL: # only 3 integer values! tryList.append(\ (1000000, 1, 'x.hex', -1, 1, ('C1', -1.0, -1, 0.000, 1, 1.00)) \ ) timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0)) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? # maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * .01 maxDelta = maxDelta + maxShift SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def test_summary2_percentile(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 1, 'cD', 300), (100000, 2, 'cE', 300), ] timeoutSecs = 10 trial = 1 for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print 'Trial:', trial SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname legalValues = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} # set. http://docs.python.org/2/library/stdtypes.html#set expectedMin = min(legalValues) expectedMax = max(legalValues) expectedUnique = (expectedMax - expectedMin) + 1 mode = 0.5 # rounding to nearest int will shift us from this for expected mean expectedMean = 0.5 expectedSigma = 0.5 write_syn_dataset(csvPathname, rowCount, colCount, low=expectedMin, high=expectedMax, mode=mode, SEED=SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename('.', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename summaryResult = h2o_cmd.runSummary(key=hex_key) if h2o.verbose: print "summaryResult:", h2o.dump_json(summaryResult) summaries = summaryResult['summaries'] scipyCol = 0 for column in summaries: colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] pctile = stats['pctile'] hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hbrk: self.assertIn(int(b), legalValues) self.assertEqual(len(hbrk), len(legalValues)) # self.assertAlmostEqual(hcnt[0], 0.5 * rowCount, delta=.01*rowCount) # self.assertAlmostEqual(hcnt[1], 0.5 * rowCount, delta=.01*rowCount) print "pctile:", pctile print "maxs:", maxs # we round to int, so we may introduce up to 0.5 rounding error? compared to "mode" target self.assertAlmostEqual(maxs[0], expectedMax, delta=0.01) print "mins:", mins self.assertAlmostEqual(mins[0], expectedMin, delta=0.01) for v in pctile: self.assertTrue(v >= expectedMin, "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin)) self.assertTrue(v <= expectedMax, "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax)) eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0] if expectedMin==1: eV = eV1 elif expectedMin==0: eV = [e-1 for e in eV1] elif expectedMin==2: eV = [e+1 for e in eV1] else: raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin) if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) scipyCol += 1
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]), (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2550.0, -1250.0, 0.0]), (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]), (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]), (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]), (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]), (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]), (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]), (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # adjust the min/max depending on what the min/max actually was! # the expected 25%/50%/75% will still be off expected[1] = actualMin expected[5] = actualMax # max error = half the bin size? # use this for comparing to sklearn/sort expectedRange = expectedMax - expectedMin # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxDelta = 0.5 * expectedBin # how much error do we get in the random distribution gen? pain. It's a probability issue # smaller error likely with larger # of values. # the maxDelta used for the scipy/sort compare can be tighter, since it's looking # at actual data # this is way too coarse. can't get the distribution tight? maxDeltaPlusDistVariance = 10 * maxDelta # allow some fuzz in the comparison to scipy/sort maxDelta = 1.1 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] # these should match exactly except for fp compare error? h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, msg='25th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, msg='50th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, msg='75th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxDelta, ) h2o.nodes[0].remove_all_keys()
def test_summary_stepping(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # co.label, (min, 25th, 50th, 75th, max) # parse setup error # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', .4900, .5000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -.5000, -.4900, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 490, 500, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -500, -490, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 49000, 50000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -50000, -49000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 4900, 5000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -5000, -4900, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, rangeMin, rangeMax, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax # add 5% for fp errors? maxErr = ((expectedMax - expectedMin)/1000) * 1.05 expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols # column 0 not used here assert len(expected) == 6 co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr) trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 print "maxErr", maxErr if co.label!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.99, h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def test_summary2_exp(self): SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # co.label, (min, 25th, 50th, 75th, max) # parse setup error # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), # (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), # (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), # (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 class Column(object): def __init__(self, column): assert isinstance(column, dict) for k,v in column.iteritems(): setattr(self, k, v) # achieves self.k = v for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvFilename # column 0? summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1') h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # default_pctiles # isText # rows # off # key # checksum # only one column columns = summaryResult['frames'][0]['columns'] default_pctiles = summaryResult['frames'][0]['default_pctiles'] co = Column(columns[0]) # how are enums binned. Stride of 1? (what about domain values) coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros, ] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" pctiles = [0] + co.pctiles + [0] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(co.label, expected[0]) if expected[1]: h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) print "co.label:", co.label, "co.maxs (2 places):", mx print "co.label:", co.label, "co.mins (2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 print "h2oSummary2MaxErr", maxErr if co.label!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctiles[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ( 'cars.csv', 'c.hex', [ (None, None, None, None, None, None), ('economy (mpg)', None, None, None, None, None), ('cylinders', None, None, None, None, None), ], ), ( 'runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ( 'runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ( 'runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100, 00), ], ), ( 'runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles( source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype != 'Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual( mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual( maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype != 'Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser() == 'kevin': tryList = [ (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not oColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range(0, numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i) # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) # print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col levels = h2o.nodes[0].levels(source=hex_key) l = levels['levels'] for column in range(iColCount, iColCount + oColCount): if l[column]: print "Skipping", column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() if 1 == 1: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_summary2_exp(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)), (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)), (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)), (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None, None)), (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)), (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, )
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5 * ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5 * ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1 * ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1 * ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1 * ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1 * ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1 * ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100, 00]), (1 * ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1 * ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) numRows = pA.numRows numCols = pA.numCols parse_key = pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) #*************************** # Summary co = h2o_cmd.runSummary(key=parse_key) default_pctiles = co.default_pctiles coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros ] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals( co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals( co.sigma) print "FIX! hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str, probsList)) parameters = { 'model_id': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model(algo='quantile', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iterations']", model.output['iterations'] print "model.output:['names']", model.output['names'] quantiles = model.output['quantiles'][ 0] # why is this a double array iterations = model.output['iterations'] assert iterations == 11, iterations print "quantiles: ", quantiles print "iterations: ", iterations # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()
def test_summary2_unifiles2(self): SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ # colname, (min, 25th, 50th, 75th, max) ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'), # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None), ('wonkysummary.csv', 'b.hex', True, [ ('X1', None, None, None, None, None)], 'smalldata', None), ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList: h2o.beta_features = False if pathPrefix: csvPathname = pathPrefix + "/" + csvFilename else: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if skipHeader: header = 1 else: header = 0 parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True # okay to get more cols than we want summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else OTHER_Q q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected pctile = stats['pctile'] # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn ## ignore for blank colnames, issues with quoted numbers # covtype is too big to do in scipy if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=skipHeader, # important!! col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else OTHER_Q, h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) scipyCol += 1 trial += 1
def test_summary2_small(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 100, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 1000, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else .999 q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg= "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?" ) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, numRows / len(hcnt), delta=1 + .01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! do two of the same?..use same one for the 2nd if i != 0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % ( hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print( "\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception( "exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual( result, pctile[i], tol=maxDelta, msg= 'exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1 == 0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % ( hex_key, ",".join(map(str, thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % ( hex_key, ",".join(map(str, thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols, 1) self.assertEqual(numRows, len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser()=='kevin': tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not oColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range (0,numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i); # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col levels = h2o.nodes[0].levels(source=hex_key); l = levels['levels'] for column in range(iColCount, iColCount+oColCount): if l[column]: print "Skipping", column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() if 1==0: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! do two of the same?..use same one for the 2nd if i!=0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1==0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols,1) self.assertEqual(numRows,len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_summary2_unifiles2(self): SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ # colname, (min, 25th, 50th, 75th, max) ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'), # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None), ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None), ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList: if pathPrefix: csvPathname = pathPrefix + "/" + csvFilename else: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if skipHeader: header = 1 else: header = 0 parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else OTHER_Q q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected pctile = stats['pctile'] # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? # hack? maxErr = maxErr * 2 print "maxErr:", maxErr else: print "Test won't calculate max expected error" maxErr = 0 hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn ## ignore for blank colnames, issues with quoted numbers # covtype is too big to do in scipy if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=skipHeader, # important!! col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else OTHER_Q, h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) scipyCol += 1 trial += 1
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else 0.999 q = h2o.nodes[0].quantiles( source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, ) qresult = q["result"] qresult_single = q["result_single"] qresult_iterations = q["iterations"] qresult_interpolated = q["interpolated"] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?", ) # only one column column = summaryResult["summaries"][0] colname = column["colname"] coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] pct = stats["pct"] # the thresholds h2o used, should match what we expected expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected") hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual( b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e) ) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != "": # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def test_summary2_exp(self): SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # co.label, (min, 25th, 50th, 75th, max) # parse setup error # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), # (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), # (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), # (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxErr = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxErr = 1.05 * maxErr expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols # column 0 not used here assert len(expected) == 6 co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr) print co.label, co.type, co.missing, co.domain, sum(co.bins) # default_pctiles # isText # rows # off # key # checksum # touch all that should be there coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros] for k,v in co: print k, v trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 print "maxErr", maxErr if co.label!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=co.pctiles[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )