def test_expr_rpy2(self): for k in range(20): a = random.randint(1,10) # b = random.randint(49,50) b = random.randint(1,10) c = random.randint(0,3) for k in range(50): execExpr = "a=" + str(h2o_eqns.Expression(a, b, c)) + ";" (resultExec, hResult) = h2e.exec_expr(execExpr=execExpr) print "h2o:", hResult rResult = robjects.r(execExpr)[0] print "R:", rResult if math.isinf(rResult): # covers pos/neg inf? if not 'Infinity' in str(hResult): raise Exception("h2o: %s R: %s not equal" % (hResult, rResult)) elif math.isnan(rResult): if not 'NaN' in str(hResult): raise Exception("h2o: %s R: %s not equal" % (hResult, rResult)) elif 'Infinity' in str(hResult) or'NaN' in str(hResult): raise Exception("h2o: %s R: %s not equal" % (hResult, rResult)) else: # skip Inf # don't do logicals..h2o 1/0, R True/False h2o_util.assertApproxEqual(rResult, hResult, tol=1e-12, msg='mismatch h2o/R expression result')
def runScore(node=None, dataKey=None, modelKey=None, predictKey='Predict.hex', vactual='C1', vpredict=1, expectedAuc=None, expectedAucTol=0.15, doAUC=True, timeoutSecs=200): # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = runPredict(data_key=dataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # inspect = runInspect(key=dataKey) # print dataKey, dump_json(inspect) # just get a predict and AUC on the same data. has to be binomial result if doAUC: resultAUC = h2o_nodes.nodes[0].generate_auc(thresholds=None, actual=dataKey, predict='Predict.hex', vactual=vactual, vpredict=vpredict) auc = resultAUC['aucdata']['AUC'] if expectedAuc: h2o_util.assertApproxEqual( auc, expectedAuc, tol=expectedAucTol, msg="actual auc: %s not close enough to %s" % (auc, expectedAuc)) # don't do this unless binomial predictCMResult = h2o_nodes.nodes[0].predict_confusion_matrix( actual=dataKey, predict=predictKey, vactual=vactual, vpredict='predict', ) # print "cm", dump_json(predictCMResult) # These will move into the h2o_gbm.py # if doAUC=False, means we're not binomial, and the cm is not what we expect if doAUC: cm = predictCMResult['cm'] pctWrong = h2o_gbm.pp_cm_summary(cm) print h2o_gbm.pp_cm(cm) return predictCMResult
def runScore(node=None, dataKey=None, modelKey=None, predictKey='Predict.hex', vactual='C1', vpredict=1, expectedAuc=None, doAUC=True, timeoutSecs=200): # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = runPredict( data_key=dataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # inspect = runInspect(key=dataKey) # print dataKey, dump_json(inspect) # just get a predict and AUC on the same data. has to be binomial result if doAUC: resultAUC = h2o_nodes.nodes[0].generate_auc( thresholds=None, actual=dataKey, predict='Predict.hex', vactual=vactual, vpredict=vpredict) auc = resultAUC['aucdata']['AUC'] if expectedAuc: h2o_util.assertApproxEqual(auc, expectedAuc, tol=0.15, msg="actual auc: %s not close enough to %s" % (auc, expectedAuc)) # don't do this unless binomial predictCMResult = h2o_nodes.nodes[0].predict_confusion_matrix( actual=dataKey, predict=predictKey, vactual=vactual, vpredict='predict', ) # print "cm", dump_json(predictCMResult) # These will move into the h2o_gbm.py # if doAUC=False, means we're not binomial, and the cm is not what we expect if doAUC: cm = predictCMResult['cm'] pctWrong = h2o_gbm.pp_cm_summary(cm); print h2o_gbm.pp_cm(cm) return predictCMResult
def compareResultsToExpected(tupleResultList, expected=None, allowedDelta=None, allowError=False, allowRowError=False): # the expected/tupleResultlist should be sorted already by center sum, but just in case... tupleResultList.sort(key=lambda tup: sum(tup[1])) if expected is not None: # sort expected, just in case, for the comparison expected.sort(key=lambda tup: sum(tup[1])) print "\nExpected:" for e in expected: print e # now compare to expected, with some delta allowed print "\nActual:" for t in tupleResultList: print t, "," # so can cut and paste and put results in an expected = [..] list if expected is not None and not allowError: # allowedDelta must exist if expected exists for i, (expCid, expCenter, expRows, expError) in enumerate(expected): (actCid, actCenter, actRows, actError) = tupleResultList[i] for (a, b) in zip(expCenter, actCenter): # compare list of floats absAllowedDelta = abs(allowedDelta[0] * a) absAllowedDelta = max(absAllowedDelta, allowedDelta[0]) # comparing to 0? h2o_util.assertApproxEqual( a, b, tol=absAllowedDelta, msg="Center value expected: %s actual: %s delta > %s" % (a, b, absAllowedDelta)) if not allowRowError and expRows: # allow error in row count? absAllowedDelta = abs(allowedDelta[1] * expRows) absAllowedDelta = max(absAllowedDelta, allowedDelta[1]) # comparing to 0? h2o_util.assertApproxEqual( expRows, actRows, tol=absAllowedDelta, msg="Rows expected: %s actual: %s delta > %s" % (expRows, actRows, absAllowedDelta)) if not allowRowError and expError: # allow error in row count? absAllowedDelta = abs(allowedDelta[2] * expError) absAllowedDelta = max(absAllowedDelta, allowedDelta[2]) # comparing to 0? h2o_util.assertApproxEqual( expRows, actRows, tol=absAllowedDelta, msg="Error expected: %s actual: %s delta > %s" % (expError, actError, absAllowedDelta))
def compareResultsToExpected(tupleResultList, expected=None, allowedDelta=None, allowError=False, allowRowError=False): # the expected/tupleResultlist should be sorted already by center sum, but just in case... tupleResultList.sort(key=lambda tup: sum(tup[1])) if expected is not None: # sort expected, just in case, for the comparison expected.sort(key=lambda tup: sum(tup[1])) print "\nExpected:" for e in expected: print e # now compare to expected, with some delta allowed print "\nActual:" for t in tupleResultList: print t, "," # so can cut and paste and put results in an expected = [..] list if expected is not None and not allowError: # allowedDelta must exist if expected exists for i, (expCid, expCenter, expRows, expError) in enumerate(expected): (actCid, actCenter, actRows, actError) = tupleResultList[i] for (a,b) in zip(expCenter, actCenter): # compare list of floats absAllowedDelta = abs(allowedDelta[0] * a) absAllowedDelta = max(absAllowedDelta, allowedDelta[0]) # comparing to 0? h2o_util.assertApproxEqual(a, b, tol=absAllowedDelta, msg="Center value expected: %s actual: %s delta > %s" % (a, b, absAllowedDelta)) if not allowRowError and expRows: # allow error in row count? absAllowedDelta = abs(allowedDelta[1] * expRows) absAllowedDelta = max(absAllowedDelta, allowedDelta[1]) # comparing to 0? h2o_util.assertApproxEqual(expRows, actRows, tol=absAllowedDelta, msg="Rows expected: %s actual: %s delta > %s" % (expRows, actRows, absAllowedDelta)) if not allowRowError and expError: # allow error in row count? absAllowedDelta = abs(allowedDelta[2] * expError) absAllowedDelta = max(absAllowedDelta, allowedDelta[2]) # comparing to 0? h2o_util.assertApproxEqual(expRows, actRows, tol=absAllowedDelta, msg="Error expected: %s actual: %s delta > %s" % (expError, actError, absAllowedDelta))
def test_summary2_uniform_w_NA(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "numRows:", numRows, "rowCount: ", rowCount self.assertEqual((1 + NA_ROW_RATIO) * rowCount, numRows, msg="numRows %s should be %s" % (numRows, (1 + NA_ROW_RATIO) * rowCount)) # don't check the last bin # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = rowCount / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins # NA rows should be ignored self.assertAlmostEqual(b, e, delta=2 * e, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 1 h2i.delete_keys_at_all_nodes()
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ('cars.csv', 'c.hex', [ (None, None,None,None,None,None), ('economy (mpg)', None,None,None,None,None), ('cylinders', None,None,None,None,None), ], ), ('runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ('runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ('runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100,00), ], ), ('runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
def test_summary2_int2B(self):
    # Writes a synthetic one-column csv with write_syn_dataset (called with
    # bounds 2533255332 and 2633256000), parses it, runs summary on it and
    # prints mean, sd, min/max and the percentiles.
    # Every expected statistic in tryList is None, so none of the numeric
    # comparisons below execute; only the column name ('C1') is asserted.
    # NOTE(review): presumably this guards summary against values beyond the
    # 32-bit int range - confirm against write_syn_dataset.
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # rowCount, colCount, hex_key, expectedMin, expectedMax, expected
        # colname, (min, 25th, 50th, 75th, max)
        (100000, 1, 'B.hex', 2533255332, 2633256000, ('C1', None, None, None, None, None)),
    ]

    # overwritten with 60 a few lines below
    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0))
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # also need to add some variance due to random distribution?
        # maybe a percentage of the mean
        distMean = (expectedMax - expectedMin) / 2
        maxShift = distMean * .01
        maxDelta = maxDelta + maxShift

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        if expected[0]:
            self.assertEqual(colname, expected[0])

        # several of the values read below are not used afterwards; the
        # lookups still raise KeyError if the summary lacks the field
        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        # skipped entirely for the current tryList, where expected[1] is None;
        # indices 3/5/7 line up with 0.25/0.5/0.75 in expectedPct
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        # (the loop only computes e; the assertion on the bins is commented out)
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # apparently we can't estimate any more
            # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
            #     msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        scipyCol = 0
def test_summary2_uniform(self):
    # For every tryList entry: write a synthetic one-column csv with
    # write_syn_dataset, parse it, run summary and quantiles on it, then
    #   - check min/max against the extremes write_syn_dataset returned,
    #   - check the 25th/50th/75th percentiles loosely (10x half a bin),
    #   - check the interior histogram bins against rowCount/len(hcnt),
    #   - hand the result to h2o_summ.quantile_comparisons with the tighter
    #     tolerance (1.1x half a bin).
    # The 'expected' entries are lists, not tuples, because min and max are
    # overwritten in place inside the loop.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # rowCount, colCount, hex_key, expectedMin, expectedMax, expected
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
        # NOTE(review): -2550.0 is not the midpoint of [-5000, 0] (that is
        # -2500.0) - possibly a typo; confirm
        (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2550.0, -1250.0, 0.0]),
        (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
        (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),

        (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
        (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),

        (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
        (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),

        (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
        (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
    ]

    # overwritten with 60 a few lines below
    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        # NOTE(review): unpacked as (max, min) - confirm this matches the
        # return order of write_syn_dataset
        (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        # adjust the min/max depending on what the min/max actually was!
        # the expected 25%/50%/75% will still be off
        expected[1] = actualMin
        expected[5] = actualMax

        # max error = half the bin size?
        # use this for comparing to sklearn/sort
        expectedRange = expectedMax - expectedMin
        # because of floor and ceil effects due we potentially lose 2 bins (worst case)
        # the extra bin for the max value, is an extra bin..ignore
        expectedBin = expectedRange/(MAX_QBINS-2)
        maxDelta = 0.5 * expectedBin

        # how much error do we get in the random distribution gen? pain. It's a probability issue
        # smaller error likely with larger # of values.
        # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
        # at actual data
        # this is way too coarse. can't get the distribution tight?
        maxDeltaPlusDistVariance = 10 * maxDelta
        # allow some fuzz in the comparison to scipy/sort
        maxDelta = 1.1 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        self.assertEqual(colname, expected[0])

        quantile = 0.5 if DO_MEDIAN else .999
        # get both answers since we feed both below for checking
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
        qresult = q['result']
        qresult_single = q['result_single']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", q['iterations'])
        h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
        print h2o.dump_json(q)

        # several of the values read below are not used afterwards; the
        # lookups still raise KeyError if the summary lacks the field
        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        # these should match exactly except for fp compare error?
        h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        # indices 3/5/7 line up with 0.25/0.5/0.75 in expectedPct
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
            msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
            msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
            msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                (expectedMin, expectedMax))

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        # too hard to estimate when there are ints now, due to floor/ceil int alignment?
        # don't check the last two bins
        for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
            # should we be able to check for a uniform distribution in the files?
            # e (from numRows) is only used in the message; the assertion
            # itself compares against rowCount/len(hcnt)
            e = numRows/len(hcnt)
            self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname!='':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0, # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                # index 5 / 10 line up with 0.5 / 0.99 in expectedPct
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxDelta,
                )

        h2o.nodes[0].remove_all_keys()
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. 
expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i!=0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1==0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols,1) self.assertEqual(numRows,len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, interpolate='linear', quantile=0.50): SCIPY_INSTALLED = True try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = false target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype=='float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP= map(float, target) # targetFP= np.array(tFP, np.float) if datatype=='int': targetFP= map(int, target) # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 if SCIPY_INSTALLED: p = np.percentile(targetFP, quantile*100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile*100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1==0: # type 6 alphap=0 betap=0 # type 5 okay but not perfect alphap=0.5 betap=0.5 # type 8 alphap=1/3.0 betap=1/3.0 if interpolate=='mean': # an approx? 
(was good when comparing to h2o type 2) alphap=0.4 betap=0.4 if interpolate=='linear': # this is type 7 alphap=1 betap=1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) if SCIPY_INSTALLED: h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.5, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) h2o_util.assertApproxEqual(h2oSummary2, b, rel=0.5, msg='h2o summary2 is not approx. 
same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesApprox: h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile') # give us some slack compared to the scipy use of median (instead of desired mean) if h2oQuantilesExact: if interpolate=='mean': h2o_util.assertApproxEqual(h2oQuantilesExact, s2, rel=0.01, msg='h2o quantile multipass is not approx. same as scipy stats.mstats.mquantiles') else: h2o_util.assertApproxEqual(h2oQuantilesExact, s2, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.mstats.mquantiles') # see if scipy changes. nope. it doesn't if 1==0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]), (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]), (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]), (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]), (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]), (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]), (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]), (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]), (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # adjust the min/max depending on what the min/max actually was! # the expected 25%/50%/75% will still be off expected[1] = actualMin expected[5] = actualMax # max error = half the bin size? # use this for comparing to sklearn/sort expectedRange = expectedMax - expectedMin # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxDelta = 1 * expectedBin # how much error do we get in the random distribution gen? pain. 
It's a probability issue # smaller error likely with larger # of values. # the maxDelta used for the scipy/sort compare can be tighter, since it's looking # at actual data # this is way too coarse. can't get the distribution tight? maxDeltaPlusDistVariance = 10 * maxDelta # allow some fuzz in the comparison to scipy/sort maxDelta = 1.1 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] # these should match exactly except for fp compare error? h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, msg='25th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, msg='50th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, msg='75th percentile is not approx. expected for generated uniform range %s %s' %\ (expectedMin, expectedMax)) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxDelta, ) h2o.nodes[0].remove_all_keys()
def test_parse_libsvm(self): SYNDATASETS_DIR = h2o.make_syn_dir() # just do the import folder once importFolderPath = "libsvm" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("mnist_train.svm", "cM", 30, 0, 9.0, False, False), ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True), # multi-label target like 1,2,5 ..not sure what that means # ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False), # illegal non-ascending cols # ("syn_6_1000_10.svm", "cK", 30, -36, 36, True, False), # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False), # fails csvDownload ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False), ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False), ("news20.svm", "cH", 30, 1, 20.0, False, False), ("connect4.svm", "cB", 30, -1, 1.0, False, False), # too many features? 150K inspect timeout? # ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False) ("gisette_scale.svm", "cF", 30, -1, 1.0, False, False), ("mushrooms.svm", "cG", 30, 1, 2.0, False, False), ] ### csvFilenameList = random.sample(csvFilenameAll,1) ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, hex_key, timeoutSecs, expectedCol0Min, expectedCol0Max, enableDownloadReparse, enableSizeChecks) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvPathname, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] # INSPECT****************************************** start = time.time() inspectFirst = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - 
start, "seconds" h2o_cmd.infoFromInspect(inspectFirst, csvFilename) # look at the min/max for the target col (0) and compare to expected for the dataset imin = float(inspectFirst['cols'][0]['min']) # print h2o.dump_json(inspectFirst['cols'][0]) imax = float(inspectFirst['cols'][0]['max']) if expectedCol0Min: self.assertEqual( imin, expectedCol0Min, msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min)) if expectedCol0Max: h2o_util.assertApproxEqual( imax, expectedCol0Max, tol=0.00000001, msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max)) print "\nmin/max for col0:", imin, imax # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone if DO_SUMMARY: goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) if DO_DOWNLOAD_REPARSE and enableDownloadReparse: missingValuesListA = h2o_cmd.infoFromInspect( inspectFirst, csvPathname) num_colsA = inspectFirst['num_cols'] num_rowsA = inspectFirst['num_rows'] row_sizeA = inspectFirst['row_size'] value_size_bytesA = inspectFirst['value_size_bytes'] # do a little testing of saving the key as a csv csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv" print "Trying csvDownload of", csvDownloadPathname h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname) # remove the original parsed key. source was already removed by h2o # don't have to now. 
we use a new name for hex_keyB # h2o.nodes[0].remove_key(hex_key) start = time.time() hex_keyB = hex_key + "_B" parseResultB = h2o_cmd.parseResult = h2i.import_parse( path=csvDownloadPathname, schema='put', hex_key=hex_keyB) print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \ csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_keyB) missingValuesListB = h2o_cmd.infoFromInspect( inspect, csvPathname) num_colsB = inspect['num_cols'] num_rowsB = inspect['num_rows'] row_sizeB = inspect['row_size'] value_size_bytesB = inspect['value_size_bytes'] df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True) print "df.difference:", h2o.dump_json(df.difference) for i, d in enumerate(df.difference): # ignore mismatches in these # "variance" # "response.time" # "key" if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d: pass else: raise Exception( "testing %s, found unexpected mismatch in df.difference[%d]: %s" % (csvPathname, i, d)) if DO_SIZE_CHECKS and enableSizeChecks: # if we're allowed to do size checks. ccompare the full json response! print "Comparing original inspect to the inspect after parsing the downloaded csv" # vice_versa=True # ignore the variance diffs. reals mismatch when they're not? filtered = [ v for v in df.difference if not 'variance' in v ] self.assertLess(len(filtered), 3, msg="Want < 3, not %d differences between the two rfView json responses. 
%s" % \ (len(filtered), h2o.dump_json(filtered))) # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen # make the check conditional based on the dataset self.assertEqual( row_sizeA, row_sizeB, "row_size mismatches after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB)) h2o_util.assertApproxEqual( value_size_bytesA, value_size_bytesB, tol=0.00000001, msg= "value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB)) print "missingValuesListA:", missingValuesListA print "missingValuesListB:", missingValuesListB self.assertEqual( missingValuesListA, missingValuesListB, "missingValuesList mismatches after re-parse of downloadCsv result" ) self.assertEqual( num_colsA, num_colsB, "num_cols mismatches after re-parse of downloadCsv result %d %d" % (num_colsA, num_colsB)) self.assertEqual( num_rowsA, num_rowsB, "num_rows mismatches after re-parse of downloadCsv result %d %d" % (num_rowsA, num_rowsB)) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000000, 5, 'cD', 0, 10, 30), (1000000, 5, 'cD', 0, 20, 30), (1000000, 5, 'cD', 0, 30, 30), (1000000, 5, 'cD', 0, 40, 30), (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, 'cD', 0, 70, 30), (1000000, 5, 'cD', 0, 100, 30), (1000000, 5, 'cD', 0, 130, 30), (1000000, 5, 'cD', 0, 160, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) # PARSE train**************************************** hexKey = 'r.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60) # do it twice..to get the optimal cached delay for time? 
execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) groups = execResult['num_rows'] maxExpectedGroups = ((maxInt - minInt) + 1)**2 h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) # should be same answer in both cases execExpr = "d=sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) print "execResult", h2o.dump_json(execResult) self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = 'groups' eLabel = 'ddplyElapsed' fLabel = 'ddplyElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_summary2_uniform_int_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() M = 100 tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'B.hex', 1, 1000 * M, ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)), (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0, 1000.0)), (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0, 20000.0)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0, -1250.0, 0)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0, 50000.0, 100000.0)), # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)), # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0, 100.0)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0)), # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)), ] if not DO_REAL: # only 3 integer values! tryList.append(\ (1000000, 1, 'x.hex', -1, 1, ('C1', -1.0, -1, 0.000, 1, 1.00)) \ ) timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0)) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? 
# maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * .01 maxDelta = maxDelta + maxShift SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? 
if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def test_summary2_uniform_int_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() M = 100 tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, "B.hex", 1, 1000 * M, ("C1", 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)), (ROWS, 1, "B.hex", 1, 1000, ("C1", 1.0, 250.0, 500.0, 750.0, 1000.0)), (ROWS, 1, "x.hex", 1, 20000, ("C1", 1.0, 5000.0, 10000.0, 15000.0, 20000.0)), (ROWS, 1, "x.hex", -5000, 0, ("C1", -5000.00, -3750.0, -2500.0, -1250.0, 0)), (ROWS, 1, "x.hex", -100000, 100000, ("C1", -100000.0, -50000.0, 0, 50000.0, 100000.0)), # (ROWS, 1, 'A.hex', 1, 101, ('C1', 1.0, 26.00, 51.00, 76.00, 101.0)), # (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -49.0, 0, 49.00, 99)), (ROWS, 1, "B.hex", 1, 10000, ("C1", 1.0, 2501.0, 5001.0, 7501.0, 10000.0)), (ROWS, 1, "B.hex", -100, 100, ("C1", -100.0, -50.0, 0.0, 50.0, 100.0)), (ROWS, 1, "C.hex", 1, 100000, ("C1", 1.0, 25001.0, 50001.0, 75001.0, 100000.0)), # (ROWS, 1, 'C.hex', -101, 101, ('C1', -101, -51, -1, 49.0, 100.0)), ] if not DO_REAL: # only 3 integer values! tryList.append((1000000, 1, "x.hex", -1, 1, ("C1", -1.0, -1, 0.000, 1, 1.00))) timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = (expectedMax - expectedMin) / (MAX_QBINS + 0.0) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? 
# maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * 0.01 maxDelta = maxDelta + maxShift SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=60, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult["summaries"][0] colname = column["colname"] self.assertEqual(colname, expected[0]) coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. 
expected") pct = stats["pct"] # the thresholds h2o used, should match what we expected expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual( b, rowCount / len(hcnt), delta=0.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e) ) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? 
if colname != "": # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def generate_scipy_comparison(csvPathname, col=0, h2oMedian=None, h2oMedian2=None): # this is some hack code for reading the csv and doing some percentile stuff in scipy # from numpy import loadtxt, genfromtxt, savetxt import numpy as np import scipy as sp dataset = np.genfromtxt( open(csvPathname, 'r'), delimiter=',', # skip_header=1, dtype=None); # guess! print "csv read for training, done" # we're going to strip just the last column for percentile work # used below NUMCLASSES = 10 print "csv read for training, done" # data is last column # drop the output print dataset.shape if len(dataset.shape) > 1: target = [x[col] for x in dataset] else: target = dataset # we may have read it in as a string. coerce to number targetFP = np.array(target, np.float) if 1==0: n_features = len(dataset[0]) - 1; print "n_features:", n_features # get the end # target = [x[-1] for x in dataset] # get the 2nd col print "histogram of target" print target print sp.histogram(target, bins=NUMCLASSES) print target[0] print target[1] thresholds = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] print "scipy per:", thresholds from scipy import stats # a = stats.scoreatpercentile(target, per=per) a = stats.mstats.mquantiles(targetFP, prob=thresholds) a2 = ["%.2f" % v for v in a] h2p.red_print("scipy stats.mstats.mquantiles:", a2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') label = '50%' if DO_MEDIAN else '99.9%' h2p.blue_print(label, "from sort:", b) s = a[5 if DO_MEDIAN else 10] h2p.blue_print(label, "from scipy:", s) h2p.blue_print(label, "from h2o summary2:", h2oMedian) h2p.blue_print(label, "from h2o quantile multipass:"******"%.2f" % v for v in a] h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", a2)
def test_summary2_exp(self): SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # co.label, (min, 25th, 50th, 75th, max) # parse setup error # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), # (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), # (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), # (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), # (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 class Column(object): def __init__(self, column): assert isinstance(column, dict) for k,v in column.iteritems(): setattr(self, k, v) # achieves self.k = v for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvFilename # column 0? summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1') h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # default_pctiles # isText # rows # off # key # checksum # only one column columns = summaryResult['frames'][0]['columns'] default_pctiles = summaryResult['frames'][0]['default_pctiles'] co = Column(columns[0]) # how are enums binned. Stride of 1? (what about domain values) coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros, ] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" pctiles = [0] + co.pctiles + [0] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(co.label, expected[0]) if expected[1]: h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. 
expected') if expected[4]: h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) print "co.label:", co.label, "co.maxs (2 places):", mx print "co.label:", co.label, "co.mins (2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 print "h2oSummary2MaxErr", maxErr if co.label!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctiles[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oSummary2MaxErr=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, h2oExecQuantiles=None, interpolate='linear', quantile=0.50, use_genfromtxt=False): SCIPY_INSTALLED = False try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = False if use_genfromtxt and SCIPY_INSTALLED: print "Using numpy.genfromtxt. Better handling of null bytes" target = np.genfromtxt( open(csvPathname, 'r'), delimiter=',', skip_header=1 if skipHeader else 0, dtype=None) # guess! # print "shape:", target.shape() else: print "Using python csv reader" target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype=='float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP = map(float, target) # targetFP= np.array(tFP, np.float) if datatype=='int': targetFP = map(int, target) if SCIPY_INSTALLED: # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. 
numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 p = np.percentile(targetFP, quantile*100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile*100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1==0: # type 6 alphap=0 betap=0 # type 5 okay but not perfect alphap=0.5 betap=0.5 # type 8 alphap=1/3.0 betap=1/3.0 if interpolate=='mean': # an approx? (was good when comparing to h2o type 2) alphap=0.4 betap=0.4 if interpolate=='linear': # this is type 7 alphap=1 betap=1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) if SCIPY_INSTALLED: h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"from h2o exec:", h2oExecQuantiles) # they should be identical. 
keep a tight absolute tolerance # Note the comparisons have different tolerances, some are relative, some are absolute if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oQuantilesApprox: # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN if math.isnan(float(h2oQuantilesApprox)): raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) if h2oSummary2MaxErr: h2o_util.assertApproxEqual(h2oQuantilesApprox, b, tol=h2oSummary2MaxErr, msg='h2o quantile singlepass is not approx. same as sort algo') else: h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.1, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) if h2oSummary2MaxErr: # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2)) h2o_util.assertApproxEqual(h2oSummary2, b, tol=h2oSummary2MaxErr, msg='h2o summary2 is not approx. same as sort algo (calculated expected max error)') else: # bounds are way off, since it depends on the min/max of the col, not the expected value h2o_util.assertApproxEqual(h2oSummary2, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo (sloppy compare)') if h2oQuantilesApprox and h2oSummary2: # they should both get the same answer. Currently they have different code, but same algo # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases. # not sure why..maybe some subtle algo diff. h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04, msg='h2o summary2 is not approx. 
same as h2o singlepass.'+\ ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation') if h2oExecQuantiles: if math.isnan(float(h2oExecQuantiles)): raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles) # bounds are way off h2o_util.assertApproxEqual(h2oExecQuantiles, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesExact: h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile') # give us some slack compared to the scipy use of median (instead of desired mean) # since we don't have bounds here like above, just stop this test for now if h2oQuantilesApprox and 1==0: if interpolate=='mean': h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5, msg='h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles') else: h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5, msg='h2o quantile singlepass is not same as scipy stats.mstats.mquantiles') # see if scipy changes. nope. it doesn't if 1==0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ( 'cars.csv', 'c.hex', [ (None, None, None, None, None, None), ('economy (mpg)', None, None, None, None, None), ('cylinders', None, None, None, None, None), ], ), ( 'runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ( 'runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ( 'runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100, 00), ], ), ( 'runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles( source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype != 'Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals( sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual( mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual( maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype != 'Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
def test_summary2_exp(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (10, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)), (100, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)), (1000, 1, "x.hex", -5000, 0, ("C1", None, None, None, None, None)), (10000, 1, "x.hex", -100000, 100000, ("C1", None, None, None, None, None)), (100000, 1, "x.hex", -1, 1, ("C1", None, None, None, None, None)), (1000000, 1, "A.hex", 1, 100, ("C1", None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset( csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE ) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult["summaries"][0] colname = column["colname"] coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] pct = stats["pct"] expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. 
expected") hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname != "" and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, )
def test_summary2_exp(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)), (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)), (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)), (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None, None)), (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)), (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, )
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult['destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult['destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount+1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 h2o_util.assertApproxEqual(oobeTrainPctRight, expectTrainPctRight, rel=.1, msg="OOBE: pct. right for training not close enough %6.2f %6.2f" % (oobeTrainPctRight, expectTrainPctRight)) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) h2o_util.assertApproxEqual(classification_error, 6.0, rel=.2, msg="Classification error %s too big" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 h2o_util.assertApproxEqual(fullScorePctRight, expectScorePctRight, rel=.1, msg="Full: pct. right for scoring not close enough %6.2f %6.2f" % (fullScorePctRight, expectScorePctRight))
def test_impute_with_na(self): h2b.browseTheCloud() csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just insert some NAs and see what happens" inspect = h2o_cmd.runInspect(key=hex_key) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] missing_fraction = 0.5 # NOT ALLOWED TO SET AN ENUM COL? if 1 == 0: # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec? # just one in row 1 for enumCol in enumColList: print "hack: Putting NA in row 0 of col %s" % enumCol execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after exec:", missingValuesList if len(missingValuesList) != len(enumColList): raise Exception( "Didn't get missing values in expected number of cols: %s %s" % (enumColList, missingValuesList)) for trial in range(1): # copy the dataset hex_key2 = 'c.hex' execExpr = '%s = %s' % (hex_key2, hex_key) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) imvResult = h2o.nodes[0].insert_missing_values( key=hex_key2, missing_fraction=missing_fraction, seed=SEED) print "imvResult", h2o.dump_json(imvResult) # maybe make the output col a factor column # maybe one of the 0,1 cols too? # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns. # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? 
(like Exec3) print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before" expectedMissing = missing_fraction * origNumRows # per col enumColList = [49, 50, 51, 52, 53, 54] for e in enumColList: enumResult = h2o.nodes[0].to_enum(src_key=hex_key2, column_index=(e + 1)) inspect = h2o_cmd.runInspect(key=hex_key2) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(origNumRows, numRows) self.assertEqual(origNumCols, numCols) missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList", missingValuesList # this is an approximation because we can't force an exact # of missing using insert_missing_values if len(missingValuesList) != numCols: raise Exception( "Why is missingValuesList not right afer ToEnum2?: %s %s" % (enumColList, missingValuesList)) for mv in missingValuesList: h2o_util.assertApproxEqual( mv, expectedMissing, rel=0.1 * mv, msg='mv %s is not approx. expected %s' % (mv, expectedMissing)) summaryResult = h2o_cmd.runSummary(key=hex_key2) h2o_cmd.infoFromSummary(summaryResult) # h2o_cmd.infoFromSummary(summaryResult) print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect" print "trial", trial print "expectedMissing:", expectedMissing print "Now get rid of all the missing values, by imputing means. We know all columns should have NAs from above" print "Do the columns in random order" # don't do the enum cols ..impute doesn't support right? 
if AVOID_BUG: shuffledColList = range(0, 49) # 0 to 48 execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) # summaryResult = h2o_cmd.runSummary(key=hex_key2) # h2o_cmd.infoFromSummary(summaryResult) inspect = h2o_cmd.runInspect(key=hex_key2) numCols = inspect['numCols'] missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after impute:", missingValuesList if len(missingValuesList) != 49: raise Exception( "expected missing values in all cols after pruning enum cols: %s" % missingValuesList) else: shuffledColList = range(0, 55) # 0 to 54 origInspect = inspect random.shuffle(shuffledColList) for column in shuffledColList: # get a random set of column. no duplicate. random order? 0 is okay? will be [] groupBy = random.sample(range(55), random.randint(0, 54)) # header names start with 1, not 0. Empty string if [] groupByNames = ",".join( map(lambda x: "C" + str(x + 1), groupBy)) # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap columnName = "C%s" % (column + 1) print "don't use mode if col isn't enum" badChoices = True while badChoices: method = random.choice(["mean", "median", "mode"]) badChoices = column not in enumColList and method == "mode" NEWSEED = random.randint(0, sys.maxint) print "does impute modify the source key?" # we get h2o error (argument exception) if no NAs impResult = h2o.nodes[0].impute(source=hex_key2, column=column, method=method) print "Now check that there are no missing values" print "FIX! 
broken..insert missing values doesn't insert NAs in enum cols" inspect = h2o_cmd.runInspect(key=hex_key2) numRows2 = inspect['numRows'] numCols2 = inspect['numCols'] self.assertEqual( numRows, numRows2, "imput shouldn't have changed frame numRows: %s %s" % (numRows, numRows2)) self.assertEqual( numCols, numCols2, "imput shouldn't have changed frame numCols: %s %s" % (numCols, numCols2)) # check that the mean didn't change for the col # the enum cols with mode, we'll have to think of something else missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after impute:", missingValuesList if missingValuesList: raise Exception( "Not expecting any missing values after imputing all cols: %s" % missingValuesList) cols = inspect['cols'] origCols = origInspect['cols'] print "\nFIX! ignoring these errors. have to figure out why." for i, (c, oc) in enumerate(zip(cols, origCols)): # I suppose since we impute to either median or mean, we can't assume the mean stays the same # but for this tolerance it's okay (maybe a different dataset, that wouldn't be true ### h2o_util.assertApproxEqual(c['mean'], oc['mean'], tol=0.000000001, ### msg="col %i original mean: %s not equal to mean after impute: %s" % (i, c['mean'], oc['mean'])) if not h2o_util.approxEqual( oc['mean'], c['mean'], tol=0.000000001): msg = "col %i original mean: %s not equal to mean after impute: %s" % ( i, oc['mean'], c['mean']) print msg
def test_summary2_exp(self): SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] expectedPct= [0.001, 0.001, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def runSummary(node=None, key=None, column=None, expected=None, maxDelta=None, noPrint=False, **kwargs): if not key: raise Exception('No key for Summary') if not node: node = h2o_nodes.nodes[0] # return node.summary(key, **kwargs) i = InspectObj(key=key) # just so I don't have to change names below missingList = i.missingList labelList = i.labelList numRows = i.numRows numCols = i.numCols print "labelList:", labelList assert labelList is not None # doesn't take indices? only column labels? # return first column, unless specified if not (column is None or isinstance(column, (basestring, int))): raise Exception( "column param should be string or integer index or None %s %s" % (type(column), column)) # either return the first col, or the col indentified by label. the column identifed could be string or index? if column is None: # means the summary json when we ask for col 0, will be what we return (do all though) colNameToDo = labelList colIndexToDo = range(len(labelList)) elif isinstance(column, int): colNameToDo = [labelList[column]] colIndexToDo = [column] elif isinstance(column, basestring): colNameToDo = [column] if column not in labelList: raise Exception("% not in labellist: %s" % (column, labellist)) colIndexToDo = [labelList.index(column)] else: raise Exception("wrong type %s for column %s" % (type(column), column)) # we get the first column as result after walking across all, if no column parameter desiredResult = None for (colIndex, colName) in zip(colIndexToDo, colNameToDo): print "doing summary on %s %s" % (colIndex, colName) # ugly looking up the colIndex co = SummaryObj(key=key, colIndex=colIndex, colName=colName) if not desiredResult: desiredResult = co if not noPrint: for k, v in co: # only print [0] of mins and maxs because of the e308 values when they don't have dataset values if k == 'mins' or k == 'maxs': print "%s[0]" % k, v[0] else: print k, v if expected is not None: print "len(co.histogram_bins):", len(co.histogram_bins) print "co.label:", 
co.label, "mean (2 places):", h2o_util.twoDecimals( co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals( co.sigma) # print "FIX! hacking the co.percentiles because it's short by two" # if co.percentiles: # percentiles = [0] + co.percentiles + [0] # else: # percentiles = None percentiles = co.percentiles assert len(co.percentiles) == len(co.default_percentiles) # the thresholds h2o used, should match what we expected # expected = [0] * 5 # Fix. doesn't check for expected = 0? # max of one bin if maxDelta is None: maxDelta = (co.maxs[0] - co.mins[0]) / 1000 if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, msg='min is not approx. expected') if expected[1]: h2o_util.assertApproxEqual( percentiles[2], expected[1], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( percentiles[4], expected[2], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( percentiles[6], expected[3], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort MAX_QBINS = 1000 if expected[0] and expected[4]: expectedRange = expected[4] - expected[0] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange / (MAX_QBINS - 2) maxErr = expectedBin # should we have some fuzz for fp? 
else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(percentiles) # only look at [0] for now...bit e308 numbers if unpopulated due to not enough unique values in dataset column mx = h2o_util.twoDecimals(co.maxs[0]) mn = h2o_util.twoDecimals(co.mins[0]) print "co.label:", co.label, "co.percentiles (2 places):", pt print "co.default_percentiles:", co.default_percentiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! why would percentiles be None? enums? if pt is None: compareActual = mn, [None] * 3, mx else: compareActual = mn, pt[2], pt[4], pt[6], mx h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected) return desiredResult
def test_summary2_unifiles2(self): SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ # colname, (min, 25th, 50th, 75th, max) ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'), # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None), ('wonkysummary.csv', 'b.hex', True, [ ('X1', None, None, None, None, None)], 'smalldata', None), ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList: h2o.beta_features = False if pathPrefix: csvPathname = pathPrefix + "/" + csvFilename else: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if skipHeader: header = 1 else: header = 0 parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True # okay to get more cols than we want summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else OTHER_Q q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected pctile = stats['pctile'] # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn ## ignore for blank colnames, issues with quoted numbers # covtype is too big to do in scipy if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=skipHeader, # important!! col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else OTHER_Q, h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) scipyCol += 1 trial += 1
def test_summary2_small(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 100, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), (None, 1000, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else .999 q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg= "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?" ) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? 
mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, numRows / len(hcnt), delta=1 + .01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def test_summary2_int2B(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (100000, 1, 'B.hex', 2533255332, 2633256000, ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0)) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? # maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * .01 maxDelta = maxDelta + maxShift h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? 
mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # apparently we can't estimate any more # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0
def runSummary(node=None, key=None, expected=None, column=None, **kwargs): if not key: raise Exception('No key for Summary') if not node: node = h2o_nodes.nodes[0] # return node.summary(key, **kwargs) class Column(object): def __init__(self, column): assert isinstance(column, dict) for k,v in column.iteritems(): setattr(self, k, v) # achieves self.k = v def __iter__(self): for attr, value in self.__dict__.iteritems(): yield attr, value inspect = runInspect(key=key) # change missingList definition: None if all empty, otherwise align to cols. 0 if 0? missingList, labelList, numRows, numCols = infoFromInspect(inspect) # doesn't take indices? only column labels? lastChecksum = None # return first column, unless specified desiredResult = None for label in labelList: print "doing summary on %s" % label summaryResult = node.summary(key=key, column=label) if not desiredResult or (column and column==label): desiredResult = summaryResult verboseprint("column", column, "summaryResult:", dump_json(summaryResult)) # this should be the same for all the cols? Or does the checksum change? frame = summaryResult['frames'][0] default_pctiles = frame['default_pctiles'] checksum = frame['checksum'] rows = frame['rows'] columns = frame['columns'] # assert len(columns) == numCols assert rows == numRows assert checksum !=0 and checksum is not None assert rows!=0 and rows is not None assert not frame['isText'] # FIX! why is frame['key'] = None here? # assert frame['key'] == key, "%s %s" % (frame['key'], key) # it changes? # assert not lastChecksum or lastChecksum == checksum lastChecksum = checksum # only one column co = Column(columns[0]) # how are enums binned. Stride of 1? 
(what about domain values) coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros] # for c in coList: # print c for k,v in co: print k, v print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" if co.pctiles: pctiles = [0] + co.pctiles + [0] else: pctiles = None # the thresholds h2o used, should match what we expected if expected ==None: expected = [0] * 5 # Fix. doesn't check for expected = 0? if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, msg='min is not approx. expected') if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort MAX_QBINS = 1000 if expected[0] and expected[4]: expectedRange = expected[4] - expected[0] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? 
else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! why would pctiles be None? enums? if pt is None: compareActual = mn[0], [None] * 3, mx[0] else: compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected) return desiredResult
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if DO_KNOWN_FAIL: tryList = [ (1000000, 5, 'cD', 0, 320, 30), ] else: tryList = [ (1000000, 5, 'cD', 0, 10, 30), (1000000, 5, 'cD', 0, 20, 30), (1000000, 5, 'cD', 0, 40, 30), (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, 'cD', 0, 80, 30), # (1000000, 5, 'cD', 0, 160, 30), # fails..don't do # (1000000, 5, 'cD', 0, 320, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] if DO_APPEND_KNOWN_FAIL2: tryList.append((1000000, 5, 'cD', 0, 160, 30), ) #tryList.append( # (1000000, 5, 'cD', 0, 320, 30), #) ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' if DO_KNOWN_FAIL: # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails # csvFilename = 'a1' # fails csvFilename = "syn_ddply_1Mx5_0_320.gz" bucket = "home-0xdiag-datasets" csvPathname = "standard/" + csvFilename minInt = 0 maxInt = 320 else: bucket = None csvFilename = 'syn_' + "binary" + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "with range", ( maxInt - minInt) + 1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) for lll in range(1): # PARSE train**************************************** hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) inspect = h2o_cmd.runInspect(key=hexKey) missingValuesList = h2o_cmd.infoFromInspect( inspect, csvFilename) self.assertEqual( missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, 
resultKey=resultKey, timeoutSecs=60) #***************************************************************************************** # two columns. so worse case every combination of each possible value # only true if enough rows (more than the range?) maxExpectedGroups = ((maxInt - minInt) + 1)**2 # do it twice..to get the optimal cached delay for time? execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) groups = execResult['num_rows'] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg= "groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a1dump = h2o_cmd.runInspect(key="a1") print "a1", h2o.dump_json(a1dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1") self.assertEqual( missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial)) #***************************************************************************************** execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) groups = execResult['num_rows'] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? 
h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg= "groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a2dump = h2o_cmd.runInspect(key="a2") print "a2", h2o.dump_json(a2dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2") self.assertEqual( missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial)) #***************************************************************************************** # should be same answer in both cases execExpr = "sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) execExpr = "s=c(0); s=(a1!=a2)" (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) print "execResult", h2o.dump_json(execResult) #***************************************************************************************** # should never have any NAs in this result sdump = h2o_cmd.runInspect(key="s") print "s", h2o.dump_json(sdump) self.assertEqual( result, 1, "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult))) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = 'groups' eLabel = 'ddplyElapsed' fLabel = 'ddplyElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def runSummary(node=None, key=None, column=None, expected=None, maxDelta=None, noPrint=False, **kwargs): if not key: raise Exception('No key for Summary') if not node: node = h2o_nodes.nodes[0] # return node.summary(key, **kwargs) i = InspectObj(key=key) # just so I don't have to change names below missingList = i.missingList labelList = i.labelList numRows = i.numRows numCols = i.numCols # doesn't take indices? only column labels? # return first column, unless specified if not (column is None or isinstance(column, (basestring, int))): raise Exception("column param should be string or integer index or None %s %s" % (type(column), column)) # either return the first col, or the col indentified by label. the column identifed could be string or index? if column is None: # means the summary json when we ask for col 0, will be what we return (do all though) colNameToDo = labelList colIndexToDo = range(len(labelList)) elif isinstance(column, int): colNameToDo = [labelList[column]] colIndexToDo = [column] elif isinstance(column, basestring): colNameToDo = [column] colIndexToDo = [labelList.index[column]] else: raise Exception("wrong type %s for column %s" % (type(column), column)) # we get the first column as result after walking across all, if no column parameter desiredResult = None for (colIndex, colName) in zip(colIndexToDo, colNameToDo): print "doing summary on %s %s" % (colIndex, colName) # ugly looking up the colIndex co = SummaryObj(key=key, colIndex=colIndex, colName=colName) if not desiredResult: desiredResult = co if not noPrint: for k,v in co: # only print [0] of mins and maxs because of the e308 values when they don't have dataset values if k=='mins' or k=='maxs': print "%s[0]" % k, v[0] else: print k, v if expected is not None: print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! 
hacking the co.pctiles because it's short by two" if co.pctiles: pctiles = [0] + co.pctiles + [0] else: pctiles = None # the thresholds h2o used, should match what we expected # expected = [0] * 5 # Fix. doesn't check for expected = 0? # max of one bin if maxDelta is None: maxDelta = (co.maxs[0] - co.mins[0])/1000 if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, msg='min is not approx. expected') if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort MAX_QBINS = 1000 if expected[0] and expected[4]: expectedRange = expected[4] - expected[0] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 pt = h2o_util.twoDecimals(pctiles) # only look at [0] for now...bit e308 numbers if unpopulated due to not enough unique values in dataset column mx = h2o_util.twoDecimals(co.maxs[0]) mn = h2o_util.twoDecimals(co.mins[0]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "co.default_pctiles:", co.default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! why would pctiles be None? enums? 
if pt is None: compareActual = mn, [None] * 3, mx else: compareActual = mn, pt[3], pt[5], pt[7], mx h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual) h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected) return desiredResult
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oSummary2MaxErr=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, h2oExecQuantiles=None, interpolate='linear', quantile=0.50, use_genfromtxt=False): SCIPY_INSTALLED = True try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = False if not SCIPY_INSTALLED: return if use_genfromtxt: print "Using numpy.genfromtxt. Better handling of null bytes" target = np.genfromtxt(open(csvPathname, 'r'), delimiter=',', skip_header=1 if skipHeader else 0, dtype=None) # guess! # print "shape:", target.shape() else: print "Using python csv reader" target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype == 'float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP = map(float, target) # targetFP= np.array(tFP, np.float) if datatype == 'int': targetFP = map(int, target) # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. 
numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 p = np.percentile(targetFP, quantile * 100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile * 100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1 == 0: # type 6 alphap = 0 betap = 0 # type 5 okay but not perfect alphap = 0.5 betap = 0.5 # type 8 alphap = 1 / 3.0 betap = 1 / 3.0 if interpolate == 'mean': # an approx? (was good when comparing to h2o type 2) alphap = 0.4 betap = 0.4 if interpolate == 'linear': # this is type 7 alphap = 1 betap = 1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"from h2o exec:", h2oExecQuantiles) # they should be identical. 
keep a tight absolute tolerance # Note the comparisons have different tolerances, some are relative, some are absolute if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual( h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oQuantilesApprox: # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN if math.isnan(float(h2oQuantilesApprox)): raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) if h2oSummary2MaxErr: h2o_util.assertApproxEqual( h2oQuantilesApprox, b, tol=h2oSummary2MaxErr, msg='h2o quantile singlepass is not approx. same as sort algo') else: h2o_util.assertApproxEqual( h2oQuantilesApprox, b, rel=0.1, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) if h2oSummary2MaxErr: # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2)) h2o_util.assertApproxEqual( h2oSummary2, b, tol=h2oSummary2MaxErr, msg= 'h2o summary2 is not approx. same as sort algo (calculated expected max error)' ) else: # bounds are way off, since it depends on the min/max of the col, not the expected value h2o_util.assertApproxEqual( h2oSummary2, b, rel=1.0, msg= 'h2o summary2 is not approx. same as sort algo (sloppy compare)' ) if h2oQuantilesApprox and h2oSummary2: # they should both get the same answer. Currently they have different code, but same algo # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases. # not sure why..maybe some subtle algo diff. h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04, msg='h2o summary2 is not approx. 
same as h2o singlepass.'+\ ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation') if h2oExecQuantiles: if math.isnan(float(h2oExecQuantiles)): raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles) # bounds are way off h2o_util.assertApproxEqual( h2oExecQuantiles, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesExact: h2o_util.assertApproxEqual( h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual( h2oQuantilesExact, s1, tol=0.0000002, msg= 'h2o quantile multipass is not same as scipy stats.scoreatpercentile' ) # give us some slack compared to the scipy use of median (instead of desired mean) # since we don't have bounds here like above, just stop this test for now if h2oQuantilesApprox and 1 == 0: if interpolate == 'mean': h2o_util.assertApproxEqual( h2oQuantilesApprox, s2, rel=0.5, msg= 'h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles' ) else: h2o_util.assertApproxEqual( h2oQuantilesApprox, s2, rel=0.5, msg= 'h2o quantile singlepass is not same as scipy stats.mstats.mquantiles' ) # see if scipy changes. nope. it doesn't if 1 == 0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if DO_KNOWN_FAIL: tryList = [(1000000, 5, "cD", 0, 320, 30)] else: tryList = [ # (1000000, 5, 'cD', 0, 10, 30), # (1000000, 5, 'cD', 0, 20, 30), # (1000000, 5, 'cD', 0, 40, 30), # (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, "cD", 0, 80, 30), (1000000, 5, "cD", 0, 160, 30), # fails..don't do # (1000000, 5, 'cD', 0, 320, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] if DO_APPEND_KNOWN_FAIL2: tryList.append((1000000, 5, "cD", 0, 160, 30)) tryList.append((1000000, 5, "cD", 0, 320, 30)) ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' if DO_KNOWN_FAIL: # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails # csvFilename = 'a1' # fails csvFilename = "syn_ddply_1Mx5_0_320.gz" bucket = "home-0xdiag-datasets" csvPathname = "standard/" + csvFilename minInt = 0 maxInt = 320 else: bucket = None csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) for lll in range(5): # PARSE train**************************************** hexKey = "r.hex" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="local", hex_key=hexKey) inspect = h2o_cmd.runInspect(key=hexKey) missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename) self.assertEqual( missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList ) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, 
resultKey=resultKey, timeoutSecs=60) # ***************************************************************************************** # two columns. so worse case every combination of each possible value # only true if enough rows (more than the range?) maxExpectedGroups = ((maxInt - minInt) + 1) ** 2 # do it twice..to get the optimal cached delay for time? execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) groups = execResult["num_rows"] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt), ) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a1dump = h2o_cmd.runInspect(key="a1") print "a1", h2o.dump_json(a1dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1") self.assertEqual( missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial) ) # ***************************************************************************************** execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) groups = execResult["num_rows"] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? 
h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt), ) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a2dump = h2o_cmd.runInspect(key="a2") print "a2", h2o.dump_json(a2dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2") self.assertEqual( missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial) ) # ***************************************************************************************** # should be same answer in both cases execExpr = "sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) execExpr = "s=c(0); s=(a1!=a2)" (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) print "execResult", h2o.dump_json(execResult) # ***************************************************************************************** # should never have any NAs in this result sdump = h2o_cmd.runInspect(key="s") print "s", h2o.dump_json(sdump) self.assertEqual( result, 1, "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult)), ) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = "groups" eLabel = "ddplyElapsed" fLabel = "ddplyElapsed" eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 2, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 10, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 100, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 1000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else .999 q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? 
mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = twoDecimals(pctile) mx = twoDecimals(maxs) mn = twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if DO_TRY_SCIPY and colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() print scipyCol, pctile[10] generate_scipy_comparison(csvPathnameFull, col=scipyCol, # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single) h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult) h2i.delete_keys_at_all_nodes()
def test_summary2_unifiles2(self): SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ # colname, (min, 25th, 50th, 75th, max) ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'), # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None), ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None), ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList: if pathPrefix: csvPathname = pathPrefix + "/" + csvFilename else: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if skipHeader: header = 1 else: header = 0 parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else OTHER_Q q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2) qresult = q['result'] qresult_single = q['result_single'] qresult_iterations = q['iterations'] qresult_interpolated = q['interpolated'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess(qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?") # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected pctile = stats['pctile'] # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected') # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? # hack? maxErr = maxErr * 2 print "maxErr:", maxErr else: print "Test won't calculate max expected error" maxErr = 0 hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn ## ignore for blank colnames, issues with quoted numbers # covtype is too big to do in scipy if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=skipHeader, # important!! col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else OTHER_Q, h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) scipyCol += 1 trial += 1
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i != 0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % ( hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print( "\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception( "exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual( result, pctile[i], tol=maxDelta, msg= 'exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1 == 0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % ( hex_key, ",".join(map(str, thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % ( hex_key, ",".join(map(str, thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols, 1) self.assertEqual(numRows, len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult['destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult['destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount+1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 h2o_util.assertApproxEqual(oobeTrainPctRight, expectTrainPctRight, rel=.1, msg="OOBE: pct. right for training not close enough %6.2f %6.2f" % (oobeTrainPctRight, expectTrainPctRight)) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) h2o_util.assertApproxEqual(classification_error, 6.0, tol=3, msg="Classification error %s too big" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 h2o_util.assertApproxEqual(fullScorePctRight, expectScorePctRight, rel=.1, msg="Full: pct. right for scoring not close enough %6.2f %6.2f" % (fullScorePctRight, expectScorePctRight))
def test_summary2_uniform_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "numRows:", numRows, "rowCount: ", rowCount self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount)) # don't check the last bin # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins # NA rows should be ignored self.assertAlmostEqual(b, e, delta=2*e, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 1 h2i.delete_keys_at_all_nodes()
def test_parse_libsvm(self): SYNDATASETS_DIR = h2o.make_syn_dir() # just do the import folder once importFolderPath = "libsvm" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("mnist_train.svm", "cM", 30, 0, 9.0, False, False), ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True), # multi-label target like 1,2,5 ..not sure what that means # ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False), # illegal non-ascending cols # ("syn_6_1000_10.svm", "cK", 30, -36, 36, True, False), # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False), # fails csvDownload ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False), ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False), ("news20.svm", "cH", 30, 1, 20.0, False, False), ("connect4.svm", "cB", 30, -1, 1.0, False, False), # too many features? 150K inspect timeout? # ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False) ("gisette_scale.svm", "cF", 30, -1, 1.0, False, False), ("mushrooms.svm", "cG", 30, 1, 2.0, False, False), ] ### csvFilenameList = random.sample(csvFilenameAll,1) ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, hex_key, timeoutSecs, expectedCol0Min, expectedCol0Max, enableDownloadReparse, enableSizeChecks) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvPathname, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspectFirst = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - 
start, "seconds" h2o_cmd.infoFromInspect(inspectFirst, csvFilename) # look at the min/max for the target col (0) and compare to expected for the dataset imin = float(inspectFirst['cols'][0]['min']) # print h2o.dump_json(inspectFirst['cols'][0]) imax = float(inspectFirst['cols'][0]['max']) if expectedCol0Min: self.assertEqual(imin, expectedCol0Min, msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min)) if expectedCol0Max: h2o_util.assertApproxEqual(imax, expectedCol0Max, tol=0.00000001, msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max)) print "\nmin/max for col0:", imin, imax # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone if DO_SUMMARY: goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) if DO_DOWNLOAD_REPARSE and enableDownloadReparse: missingValuesListA = h2o_cmd.infoFromInspect(inspectFirst, csvPathname) num_colsA = inspectFirst['num_cols'] num_rowsA = inspectFirst['num_rows'] row_sizeA = inspectFirst['row_size'] value_size_bytesA = inspectFirst['value_size_bytes'] # do a little testing of saving the key as a csv csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv" print "Trying csvDownload of", csvDownloadPathname h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname) # remove the original parsed key. source was already removed by h2o # don't have to now. 
we use a new name for hex_keyB # h2o.nodes[0].remove_key(hex_key) start = time.time() hex_keyB = hex_key + "_B" parseResultB = h2o_cmd.parseResult = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_keyB) print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \ csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_keyB) missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname) num_colsB = inspect['num_cols'] num_rowsB = inspect['num_rows'] row_sizeB = inspect['row_size'] value_size_bytesB = inspect['value_size_bytes'] df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True) print "df.difference:", h2o.dump_json(df.difference) for i,d in enumerate(df.difference): # ignore mismatches in these # "variance" # "response.time" # "key" if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d: pass else: raise Exception ("testing %s, found unexpected mismatch in df.difference[%d]: %s" % (csvPathname, i, d)) if DO_SIZE_CHECKS and enableSizeChecks: # if we're allowed to do size checks. ccompare the full json response! print "Comparing original inspect to the inspect after parsing the downloaded csv" # vice_versa=True # ignore the variance diffs. reals mismatch when they're not? filtered = [v for v in df.difference if not 'variance' in v] self.assertLess(len(filtered), 3, msg="Want < 3, not %d differences between the two rfView json responses. 
%s" % \ (len(filtered), h2o.dump_json(filtered))) # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen # make the check conditional based on the dataset self.assertEqual(row_sizeA, row_sizeB, "row_size mismatches after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB)) h2o_util.assertApproxEqual(value_size_bytesA, value_size_bytesB, tol=0.00000001, msg="value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB)) print "missingValuesListA:", missingValuesListA print "missingValuesListB:", missingValuesListB self.assertEqual(missingValuesListA, missingValuesListB, "missingValuesList mismatches after re-parse of downloadCsv result") self.assertEqual(num_colsA, num_colsB, "num_cols mismatches after re-parse of downloadCsv result %d %d" % (num_colsA, num_colsB)) self.assertEqual(num_rowsA, num_rowsB, "num_rows mismatches after re-parse of downloadCsv result %d %d" % (num_rowsA, num_rowsB)) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000000, 5, 'cD', 0, 10, 30), (1000000, 5, 'cD', 0, 20, 30), (1000000, 5, 'cD', 0, 30, 30), (1000000, 5, 'cD', 0, 40, 30), (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, 'cD', 0, 70, 30), (1000000, 5, 'cD', 0, 100, 30), (1000000, 5, 'cD', 0, 130, 30), (1000000, 5, 'cD', 0, 160, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "with range", (maxInt-minInt)+1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) # PARSE train**************************************** hexKey = 'r.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60) # do it twice..to get the optimal cached delay for time? 
execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) groups = execResult['num_rows'] maxExpectedGroups = ((maxInt - minInt) + 1) ** 2 h2o_util.assertApproxEqual(groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) # should be same answer in both cases execExpr = "d=sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) print "execResult", h2o.dump_json(execResult) self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = 'groups' eLabel = 'ddplyElapsed' fLabel = 'ddplyElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (5000000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (5000000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (1000000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (1000000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (1000000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (1000000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (1000000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (1000000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (1000000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (1000000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = twoDecimals(pctile) mx = twoDecimals(maxs) mn = twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? 
# execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, trial in enumerate(thresholds): execExpr = "quantile(%s[,1], c(%s));" % (hex_key, trial) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) ex = twoDecimals(result) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (trial, ex, pt[i])) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='percentile: % is not expected: %s' % (result, pctile[i])) if DO_TRY_SCIPY: generate_scipy_comparison(csvPathnameFull)
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else 0.999 q = h2o.nodes[0].quantiles( source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, ) qresult = q["result"] qresult_single = q["result_single"] qresult_iterations = q["iterations"] qresult_interpolated = q["interpolated"] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?", ) # only one column column = summaryResult["summaries"][0] colname = column["colname"] coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? 
mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] pct = stats["pct"] # the thresholds h2o used, should match what we expected expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected") hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual( b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e) ) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != "": # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def simpleCheckRFView(node=None, rfv=None, checkScoringOnly=False, noPrint=False, **kwargs): if not node: node = h2o_nodes.nodes[0] if 'warnings' in rfv: warnings = rfv['warnings'] # catch the 'Failed to converge" for now for w in warnings: if not noPrint: print "\nwarning:", w if ('Failed' in w) or ('failed' in w): raise Exception(w) #**************************** # if we are checking after confusion_matrix for predict, the jsonschema is different if 'cm' in rfv: cm = rfv['cm'] # only one else: if 'drf_model' in rfv: rf_model = rfv['drf_model'] elif 'speedrf_model' in rfv: rf_model = rfv['speedrf_model'] elif 'rf_model' in rfv: rf_model = rfv['rf_model'] else: raise Exception("no rf_model in rfv? %s" % dump_json(rfv)) cms = rf_model['cms'] print "number of cms:", len(cms) print "FIX! need to add reporting of h2o's _perr per class error" # FIX! what if regression. is rf only classification? print "cms[-1]['_arr']:", cms[-1]['_arr'] print "cms[-1]['_predErr']:", cms[-1]['_predErr'] print "cms[-1]['_classErr']:", cms[-1]['_classErr'] ## print "cms[-1]:", dump_json(cms[-1]) ## for i,c in enumerate(cms): ## print "cm %s: %s" % (i, c['_arr']) cm = cms[-1]['_arr'] # take the last one scoresList = cm if not checkScoringOnly: used_trees = rf_model['N'] errs = rf_model['errs'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs # if we got the ntree for comparison. Not always there in kwargs though! param_ntrees = kwargs.get('ntrees', None) if (param_ntrees is not None and used_trees != param_ntrees): raise Exception("used_trees should == param_ntree. 
used_trees: %s" % used_trees) if (used_trees+1)!=len(cms) or (used_trees+1)!=len(errs): raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees)) #**************************** totalScores = 0 totalRight = 0 # individual scores can be all 0 if nothing for that output class # due to sampling classErrorPctList = [] predictedClassDict = {} # may be missing some? so need a dict? for classIndex,s in enumerate(scoresList): classSum = sum(s) if classSum == 0 : # why would the number of scores for a class be 0? does RF CM have entries for non-existent classes # in a range??..in any case, tolerate. (it shows up in test.py on poker100) if not noPrint: print "class:", classIndex, "classSum", classSum, "<- why 0?" else: # H2O should really give me this since it's in the browser, but it doesn't classRightPct = ((s[classIndex] + 0.0)/classSum) * 100 totalRight += s[classIndex] classErrorPct = round(100 - classRightPct, 2) classErrorPctList.append(classErrorPct) ### print "s:", s, "classIndex:", classIndex if not noPrint: print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct # gather info for prediction summary for pIndex,p in enumerate(s): if pIndex not in predictedClassDict: predictedClassDict[pIndex] = p else: predictedClassDict[pIndex] += p totalScores += classSum #**************************** if not noPrint: print "Predicted summary:" # FIX! Not sure why we weren't working with a list..hack with dict for now for predictedClass,p in predictedClassDict.items(): print str(predictedClass)+":", p # this should equal the num rows in the dataset if full scoring? 
(minus any NAs) print "totalScores:", totalScores print "totalRight:", totalRight if totalScores != 0: pctRight = 100.0 * totalRight/totalScores else: pctRight = 0.0 pctWrong = 100 - pctRight print "pctRight:", "%5.2f" % pctRight print "pctWrong:", "%5.2f" % pctWrong if checkScoringOnly: check_sandbox_for_errors() classification_error = pctWrong return (round(classification_error,2), classErrorPctList, totalScores) # it's legal to get 0's for oobe error # if sample_rate = 1 sample_rate = kwargs.get('sample_rate', None) validation = kwargs.get('validation', None) print "kevin:", sample_rate, validation if (sample_rate==1 and not validation): pass elif (totalScores<=0 or totalScores>5e9): raise Exception("scores in RFView seems wrong. scores:", scoresList) varimp = rf_model['varimp'] if 'importance' in kwargs and kwargs['importance']: max_var = varimp['max_var'] variables = varimp['variables'] varimpSD = varimp['varimpSD'] varimp2 = varimp['varimp'] # what is max_var? it's 100 while the length of the others is 54 for covtype if not max_var: raise Exception("varimp.max_var is None? %s" % max_var) # if not variables: # raise Exception("varimp.variables is None? %s" % variables) if not varimpSD: raise Exception("varimp.varimpSD is None? %s" % varimpSD) if not varimp2: raise Exception("varimp.varimp is None? %s" % varimp2) # check that they all have the same length and that the importance is not all zero # if len(varimpSD)!=max_var or len(varimp2)!=max_var or len(variables)!=max_var: # raise Exception("varimp lists seem to be wrong length: %s %s %s" % \ # (max_var, len(varimpSD), len(varimp2), len(variables))) # not checking maxvar or variables. 
Don't know what they should be if len(varimpSD) != len(varimp2): raise Exception("varimp lists seem to be wrong length: %s %s" % \ (len(varimpSD), len(varimp2))) h2o_util.assertApproxEqual(sum(varimp2), 0.0, tol=1e-5, msg="Shouldn't have all 0's in varimp %s" % varimp2) treeStats = rf_model['treeStats'] if not treeStats: raise Exception("treeStats not right?: %s" % dump_json(treeStats)) # print "json:", dump_json(rfv) data_key = rf_model['_dataKey'] model_key = rf_model['_key'] classification_error = pctWrong if not noPrint: if 'minLeaves' not in treeStats or not treeStats['minLeaves']: raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats)) print """ Leaves: {0} / {1} / {2} Depth: {3} / {4} / {5} Err: {6:0.2f} % """.format( treeStats['minLeaves'], treeStats['meanLeaves'], treeStats['maxLeaves'], treeStats['minDepth'], treeStats['meanDepth'], treeStats['maxDepth'], classification_error, ) ### modelInspect = node.inspect(model_key) dataInspect = h2o_cmd.runInspect(key=data_key) check_sandbox_for_errors() return (round(classification_error,2), classErrorPctList, totalScores)