def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    """Run GLM on the hastie dataset at 1x, shuffled 2x, and 4x replication.

    Gunzips the source and cats it to create 2x and 4x replications in
    SYNDATASETS_DIR.  This test also adds file shuffling, to see that row
    order doesn't matter to the GLM result.
    """
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    bucket = 'datasets'
    csvPathname = 'logreg' + '/' + csvFilename
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    # 1x run, straight from the bucket
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # shuffle the 1x copy before replicating it
    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)
    # 2x = shuffled 1x + shuffled 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)
    # shuffle again so the 2x file isn't two identical halves
    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)
    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf,pathname2xShuf,pathname4x)
    glm_doit(self,filename4x, None, pathname4x, timeoutSecs=120)
def test_GLM_hastie_shuffle(self):
    """Run GLM on hastie at 1x, shuffled 2x, and 4x replication (standard bucket).

    Gunzips the source and cats it to create 2x and 4x replications in
    SYNDATASETS_DIR.  File shuffling is added to see that row order
    doesn't matter.
    """
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard' + '/' + csvFilename
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    # 1x run, straight from the bucket
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # shuffle the 1x copy before replicating it
    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)
    # 2x = shuffled 1x + shuffled 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)
    # shuffle again so the 2x file isn't two identical halves
    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)
    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf,pathname2xShuf,pathname4x)
    glm_doit(self,filename4x, None, pathname4x, timeoutSecs=120)
def test_GLM2_score_same(self):
    """Build a GLM2 model on hastie 1x, then score it on 2x and 4x replications.

    The gzipped source is expanded and concatenated with itself to create
    2x and 4x replications in SYNDATASETS_DIR; the same model key is
    scored against each.
    """
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    y = "10"
    kwargs = {'response': y, 'alpha': 0, 'family': 'gaussian'}
    # train once on the 1x data; keep the model key for scoring below
    (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
    print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # 2x = 1x + 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
    # scoring inputs come from the local syn dir, not a bucket
    bucket = None
    h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
    glm_score(self,filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
    # 4x = 2x + 2x
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_score(self,filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
def test_KMeans_hastie_shuffle_fvec(self):
    """Run KMeans on hastie at 1x, shuffled 2x, and 4x replication.

    Gunzips the source and cats it to create 2x and 4x replications in
    SYNDATASETS_DIR.  File shuffling is added to see that row order
    doesn't matter.
    """
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard/' + csvFilename
    bucket = 'home-0xdiag-datasets'
    # 1x run, straight from the bucket (1M rows expected)
    kmeans_doit(self, csvFilename, bucket, csvPathname, numRows=1000000, timeoutSecs=60)
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # shuffle the 1x copy before replicating it
    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)
    # 2x = shuffled 1x + shuffled 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)
    # shuffle again so the 2x file isn't two identical halves
    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    kmeans_doit(self, filename2xShuf, None, pathname2xShuf, numRows=2000000, timeoutSecs=90)
    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    kmeans_doit(self, filename4x, None, pathname4x, numRows=4000000, timeoutSecs=120)
def test_GLM2_score_same(self): h2o.beta_features = True # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR bucket = 'home-0xdiag-datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'standard' + '/' + csvFilename y = "10" kwargs = {'response': y, 'alpha': 0, 'family': 'gaussian'} (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs) print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x" filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x modelPathname = SYNDATASETS_DIR + '/model_' + filename2x bucket = None h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_score(self,filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x modelPathname = SYNDATASETS_DIR + '/model_' + filename4x h2o_util.file_cat(pathname2x, pathname2x, pathname4x) print "Iterating 3 times on this last one" for i in range(3): print "\nTrial #", i, "of", filename4x glm_score(self,filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
def test_GLM_hastie(self):
    """Run GLM on hastie at 1x, 2x and 4x replication; iterate the 4x for perf.

    Gunzips the source and cats it to create 2x and 4x replications in
    SYNDATASETS_DIR.
    """
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    # 1x run, straight from the bucket
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # 2x = 1x + 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75)
    # 4x = 2x + 2x
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one for perf compare"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
def test_GLM_hastie(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) bucket = 'home-0xdiag-datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'standard' + '/' + csvFilename glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75) fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_doit(self,filename2x, None, pathname2x, timeoutSecs=75) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
def test_B_randomdata2_1_lineend(self): csvPathname = 'datagen1.csv' # change lineend, case 1 csvPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending" csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv' infile = open(csvPathname1, 'r') outfile = open(csvPathname2, 'w') # existing file gets erased # assume all the test files are unix lineend. # I guess there shouldn't be any "in-between" ones # okay if they change I guess. for line in infile.readlines(): outfile.write(line.strip("\n") + "\r") infile.close() outfile.close() parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=10, header=0, separator=44) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] h2o_cmd.runRF(parseResult=parseResult, trees=1, response='C' + str(numCols), timeoutSecs=20)
def test_B_randomdata2_1_lineend(self): csvPathname = 'datagen1.csv' # change lineend, case 1 csvPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending" csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv' infile = open(csvPathname1, 'r') outfile = open(csvPathname2,'w') # existing file gets erased # assume all the test files are unix lineend. # I guess there shouldn't be any "in-between" ones # okay if they change I guess. for line in infile.readlines(): outfile.write(line.strip("\n") + "\r") infile.close() outfile.close() parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=10, header=0, separator=44) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] h2o_cmd.runRF(parseResult=parseResult, trees=1, response_variable='C'+str(numCols), timeoutSecs=20)
def test_GLM2_princeton(self):
    """Run gaussian GLM2 over several small Princeton logreg datasets.

    Each file is copied to SYNDATASETS_DIR with trailing spaces stripped
    before parsing, then GLM is run and sanity-checked.
    """
    # filename, y, timeoutSecs
    # these are all counts? using gaussian?
    csvFilenameList = [
        ('cuse.dat', 'gaussian', 3, 10),  # notUsing
        ('cuse.dat', 'gaussian', 4, 10),  # using
        ('copen.dat', 'gaussian', 4, 10),
        ('housing.raw', 'gaussian', 4, 10),
    ]
    trial = 0
    for (csvFilename, family, y, timeoutSecs) in csvFilenameList:
        csvPathname1 = 'logreg/princeton/' + csvFilename
        fullPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)
        # clean copy: strip trailing spaces so the parser sees tidy columns
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_stripped.csv'
        h2o_util.file_strip_trailing_spaces(fullPathname1, csvPathname2)
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=timeoutSecs)
        start = time.time()
        kwargs = {'n_folds': 0, 'family': family, 'response': y}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end (w/check) on ", csvPathname2, 'took', time.time() - start, 'seconds'
        trial += 1
        print "\nTrial #", trial
def file_to_put(): # kbn fails 10/15/12 # return 'smalldata/poker/poker-hand-testing.data' a = h2i.find_folder_and_filename('smalldata', 'poker/poker1000', schema='put', returnFullPath=True) print "\nfind_folder_and_filename:", a return a
def test_A_inspect_poker1000(self):
    """Parse poker1000 and check Inspect's row/col counts against the raw file."""
    csvPathname = "poker/poker1000"
    res = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    ary = h2o_cmd.runInspect(key=res['destination_key'])
    # count lines in input file - there is no header for poker 1000
    fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
    rows = wcl(fullPathname)
    self.assertEqual(rows, ary['numRows'])
    # poker hand data: 10 features + 1 class column
    self.assertEqual(11, ary['numCols'])
def test_exec2_xorsum(self):
    """Exercise exec2 xorsum on a synthetic real-valued dataset.

    Writes a random csv, parses it, runs each expression in exprList, and
    prints the fp results alongside their raw 64-bit patterns so xorsum
    output can be compared bit-for-bit with the expected value computed
    while writing the dataset.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (10000, 1, 'r1', 0, 10, None),
    ]
    ullResultList = []
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # dynamic range of the data may be useful for estimating error
        maxDelta = expectedMax - expectedMin
        csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        print "Creating random", csvPathname
        # writer returns the expected xorsum bit pattern and fp sum
        (expectedUll, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)
        # looking at the 8 bytes of bits for the h2o doubles
        # xorsum will zero out the sign and exponent
        for execExpr in exprList:
            start = time.time()
            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            print 'exec took', time.time() - start, 'seconds'
            print "execResult:", h2o.dump_json(execResult)
            print ""
            print "%30s" % "fpResult:", "%.15f" % fpResult
            # reinterpret the double's bits as an unsigned 64-bit int
            ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
            print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult
            print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll
            # print "%30s" % "hex(bitResult):", hex(ullResult)
            ullResultList.append((ullResult, fpResult))
            h2o.check_sandbox_for_errors()
        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
        expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll)
        print "%30s" % "expectedUll (0.16x):", "0x%0.16x %s" % (expectedUll, expectedUllAsDouble)
        expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_A_inspect_poker1000(self):
    """Parse poker1000 and verify Inspect's row/col counts against the source file."""
    h2o.beta_features = True
    dataPath = "poker/poker1000"
    parseResult = h2i.import_parse(bucket='smalldata', path=dataPath, schema='put')
    inspected = h2o_cmd.runInspect(key=parseResult['destination_key'])
    # count lines in input file - there is no header for poker 1000
    localPath = h2i.find_folder_and_filename('smalldata', dataPath, returnFullPath=True)
    expectedRows = wcl(localPath)
    self.assertEqual(expectedRows, inspected['numRows'])
    self.assertEqual(11, inspected['numCols'])
def test_nulls_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() # we're going to insert <NUL> (0x0) in between every byte! # and then use it. move to a large file. I suppose # we could compare the results to a non-munged file with the same algo # I suppose the <NUL> are thrown away by parse, so doesn't change # chunk boundary stuff. (i.e. not interesting test for RF) csvFilename = 'poker1000' csvPathname = 'poker/' + csvFilename fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) nulFilename = "syn_nul.data" nulPathname = SYNDATASETS_DIR + '/' + nulFilename piece_size = 4096 # 4 KiB with open(fullPathname, "rb") as in_file: with open(nulPathname, "wb") as out_file: while True: piece = in_file.read(103) if piece == "": break # end of file # we could just extend piece? # start with a null withNuls = bytearray(piece) # FIX! we'll eventually stick a <NUL> after every byte! withNuls.extend(bytearray.fromhex('00')) out_file.write(withNuls) for trials in xrange(1, 2): trees = 6 for x in xrange(161, 240, 40): y = 10000 * x print "\nTrial:", trials, ", y:", y timeoutSecs = 20 + 5 * (len(h2o.nodes)) model_key = csvFilename + "_" + str(trials) parseResult = h2i.import_parse(path=nulPathname, schema='put') h2o_cmd.runRF(parseResult=parseResult, trees=trees, destination_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) sys.stdout.write('.') sys.stdout.flush() # partial clean, so we can look at tree builds from this run if hang h2o.clean_sandbox_stdout_stderr()
def test_exec2_sum(self):
    """Sum covtype columns via exec on 1x and 2x replications; 2x must be 2 * 1x."""
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    # (pathname, hex_key, timeoutSecs, expected multiple of the 1x sums)
    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname
        h2o_exec.exec_zero_list(zeroList)
        # one sum result per column (covtype has 54 predictors)
        colResultList = h2o_exec.exec_expr_list_across_cols(
            lenNodes, exprList, hex_key, maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList
        if not firstDone:
            # first dataset establishes the baseline sums
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(
                good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_file_with_nul_chars_inserted(self): SYNDATASETS_DIR = h2o.make_syn_dir() # we're going to insert <NUL> (0x0) in between every byte! # and then use it. move to a large file. I suppose # we could compare the results to a non-munged file with the same algo # I suppose the <NUL> are thrown away by parse, so doesn't change # chunk boundary stuff. (i.e. not interesting test for RF) csvFilename = "poker1000" csvPathname = "poker/" + csvFilename fullPathname = h2i.find_folder_and_filename("smalldata", csvPathname, returnFullPath=True) nulFilename = "syn_nul.data" nulPathname = SYNDATASETS_DIR + "/" + nulFilename piece_size = 4096 # 4 KiB with open(fullPathname, "rb") as in_file: with open(nulPathname, "wb") as out_file: while True: piece = in_file.read(103) if piece == "": break # end of file # we could just extend piece? # start with a null withNuls = bytearray(piece) # FIX! we'll eventually stick a <NUL> after every byte! withNuls.extend(bytearray.fromhex("00")) out_file.write(withNuls) for trials in xrange(1, 2): trees = 6 for x in xrange(161, 240, 40): y = 10000 * x print "\nTrial:", trials, ", y:", y timeoutSecs = 20 + 5 * (len(h2o.nodes)) model_key = csvFilename + "_" + str(trials) parseResult = h2i.import_parse(path=nulPathname, schema="put") h2o_cmd.runRF( parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1 ) sys.stdout.write(".") sys.stdout.flush() # partial clean, so we can look at tree builds from this run if hang h2o.clean_sandbox_stdout_stderr()
def test_exec2_sum(self):
    """Sum covtype columns (Expr/Fcn API) on 1x and 2x replications; 2x must be 2 * 1x."""
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    # (pathname, hex_key, timeoutSecs, expected multiple of the 1x sums)
    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]
    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        iA = h2o_cmd.InspectObj(pA.parse_key)
        k = Key(hex_key)
        # sum each column through the exec2 'sum' builtin
        colResultList = []
        for i in range(pA.numCols):
            result = Expr(Fcn('sum', k[:, i], True)).result
            colResultList.append(result)
        print "\ncolResultList", colResultList
        if not firstDone:
            # first dataset establishes the baseline sums
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(
                good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_GLM2grid_hastie(self):
    """Run GLM2 grid on hastie at 1x, then on a 2x file replication.

    Gunzips the source and cats it with itself to create the 2x copy in
    SYNDATASETS_DIR.
    """
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = "home-0xdiag-datasets"
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = "standard" + "/" + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)
    # consistency: use the bucket variable instead of repeating the literal
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + "/" + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # 2x = 1x + 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + "/" + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
def test_A_1mx10_hastie_10_2(self):
    """Run GLM on hastie at 1x, then on a 2x file replication.

    Gunzips the source and cats it with itself to create the 2x copy in
    SYNDATASETS_DIR.
    """
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)
    # consistency: use the bucket variable instead of repeating the literal
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # 2x = 1x + 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
def test_exec2_sum(self): print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll: parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) pA = h2o_cmd.ParseObj(parseResultA) print pA.numRows print pA.numCols print pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key) k = Key(hex_key) colResultList = [] for i in range(pA.numCols): result = Expr(Fcn('sum', k[:,i], True)).result colResultList.append(result) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x)/resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_exec2_sum(self):
    """Sum covtype columns via exec on 1x and 2x replications ('datasets' bucket)."""
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('datasets', 'UCI/UCI-large/covtype/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    # (pathname, hex_key, timeoutSecs, expected multiple of the 1x sums)
    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname
        h2o_exec.exec_zero_list(zeroList)
        # one sum result per column (covtype has 54 predictors)
        colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList
        if not firstDone:
            # first dataset establishes the baseline sums
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x)/resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_GLM_score_same(self):
    """Build a GLM model on hastie 1x, then score it on 2x and 4x replications.

    The gzipped source is expanded and concatenated with itself to create
    2x and 4x replications in SYNDATASETS_DIR; the same model key is
    scored against each with a 0.5 threshold.
    """
    bucket = 'datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'logreg' + '/' + csvFilename
    y = "10"
    x = ""
    kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5}
    # train once on the 1x data; keep the model key for scoring below
    (modelKey, validations1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
    print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    h2o_util.file_gunzip(fullPathname, pathname1x)
    # 2x = 1x + 1x
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    # scoring inputs come from the local syn dir, not a bucket
    bucket = None
    h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
    glm_score(self,filename2x, bucket, pathname2x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60)
    # 4x = 2x + 2x
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_score(self,filename4x, bucket, pathname4x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60)
def test_summary2_small(self):
    """Check Summary2 stats and quantiles on tiny synthetic {-1,0,1} datasets.

    Writes small random csvs drawn from a fixed value set, runs Summary
    and the quantiles endpoint, asserts percentiles/min/max against the
    expected tuple, sanity-checks the histogram bins for uniformity, and
    cross-checks the median (or .999 quantile) against a scipy-based
    comparison on the raw file.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        # if rowCount is None, we'll just use the data values
        # None in expected values means no compare
        (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 100, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        (None, 1000, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
        # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)),
        # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)),
    ]
    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, values, expected) in tryList:
        # max error = half the bin size?
        expectedMax = max(values)
        expectedMin = min(values)
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # hmm...say we should be 100% accurate for these tests?
        maxDelta = 0
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        # with rowCount None, one row per value in the value list
        if not rowCount:
            rowFile = len(values)
        else:
            rowFile = rowCount
        csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        # cross-check with the quantiles endpoint on column 0
        quantile = 0.5 if DO_MEDIAN else .999
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=0,
            interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)
        self.assertLess(qresult_iterations, 16,
            msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")
        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']
        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)
        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats['pctile']
        print "pctile:", pctile
        # compare only the positions the expected tuple fills in
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')
        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']
        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)
        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            self.assertAlmostEqual(b, numRows / len(hcnt), delta=1 + .01 * numRows,
                msg="Bins not right. b: %s e: %s" % (b, e))
        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn
        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn
        trial += 1
        h2o.nodes[0].remove_all_keys()
        scipyCol = 0
        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=scipyCol,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
            )
def test_GLM_both(self): if (1==1): csvFilenameList = [ ('logreg', 'benign.csv', 'binomial', 3, 10), # col is zero based # FIX! what's wrong here? index error ## ('uis.dat', 'binomial', 8, 5, False), ## ('pros.dat', 'binomial', 1, 10, False), ## ('chdage.dat', 'binomial', 2, 5, True), ## ('icu.dat', 'binomial', 1, 10, False), # how to ignore 6? '1,2,3,4,5', False), ## ('clslowbwt.dat', 'binomial', 7, 10, False), # ('cgd.dat', 'gaussian', 12, 5, False), # ('meexp.dat', 'gaussian', 3, 10, None), ] else: csvFilenameList = [ # leave out ID and birth weight ('logreg', 'benign.csv', 'gaussian', 3, 10), (None, 'icu.dat', 'binomial', 1, 10), # need to exclude col 0 (ID) and col 10 (bwt) # but -x doesn't work..so do 2:9...range doesn't work? FIX! (None, 'nhanes3.dat', 'binomial', 15, 10), (None, 'lowbwt.dat', 'binomial', 1, 10), (None, 'lowbwtm11.dat', 'binomial', 1, 10), (None, 'meexp.dat', 'gaussian', 3, 10), # FIX! does this one hang in R? (None, 'nhanes3.dat', 'binomial', 15, 10), (None, 'pbc.dat', 'gaussian', 1, 10), (None, 'pharynx.dat', 'gaussian', 12, 10), (None, 'uis.dat', 'binomial', 8, 10), ] trial = 0 for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList: # FIX! 
do something about this file munging if offset: csvPathname1 = offset + "/" + csvFilename else: csvPathname1 = 'logreg/umass_statdata/' + csvFilename fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True) csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv' h2o_util.file_clean_for_R(fullPathname, csvPathname2) # we can inspect this to get the number of cols in the dataset (trust H2O here) parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10) # we could specify key2 above but this is fine destination_key = parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, destination_key) num_cols = inspect['numCols'] num_rows = inspect['numRows'] print "num_cols", num_cols, "num_rows", num_rows ## print h2o.dump_json(inspect) # create formula and the x for H2O GLM formula = "V" + str(y+1) + " ~ " x = None col_names = "" for c in range(0,num_cols): if csvFilename=='clslowbwt.dat' and c==6: print "Not including col 6 for this dataset from x" if csvFilename=='benign.csv' and (c==0 or c==1): print "Not including col 0,1 for this dataset from x" else: # don't add the output col to the RHS of formula if x is None: col_names += "V" + str(c+1) else: col_names += ",V" + str(c+1) if c!=y: if x is None: x = str(c) formula += "V" + str(c+1) else: x += "," + str(c) formula += "+V" + str(c+1) print 'formula:', formula print 'col_names:', col_names print 'x:', x kwargs = { 'n_folds': 0, 'response': y, # what about x? 
'family': family, 'alpha': 0, 'lambda': 0, 'beta_epsilon': 1.0E-4, 'max_iter': 50 } if csvFilename=='benign.csv': kwargs['ignored_cols'] = '0,1' if csvFilename=='clslowbwt.dat': kwargs['ignored_cols'] = '6' start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm end (w/check) on ", csvPathname2, 'took', time.time()-start, 'seconds' h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs) # now do it thru R and compare (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults) trial += 1 print "\nTrial #", trial
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): start = time.time() execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30) fpResult = execResult['scalar'] # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) print r, 'exec took', time.time() - start, 'seconds' print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum 
(0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i != 0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % ( hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print( "\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception( "exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual( result, pctile[i], tol=maxDelta, msg= 'exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1 == 0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % ( hex_key, ",".join(map(str, thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % ( hex_key, ",".join(map(str, thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols, 1) self.assertEqual(numRows, len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_exec2_xorsum2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(20): ullResultList = [] NUM_FORMAT_CASES = h2o_util.fp_format() for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname sel = random.randint(0, NUM_FORMAT_CASES-1) (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: start = time.time() (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print 'exec took', time.time() - start, 'seconds' print "execResult:", h2o.dump_json(execResult) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum 
(0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way. needed when integers are parsed # okay for a couple of lsbs to be wrong, due to conversion from stringk # ullResult (0.16x): 0x02c1a21f923cee96 2.15698793923e-295 # expectedUllSum (0.16x): 0x02c1a21f923cee97 2.15698793923e-295 # expectedFpSum (0.16x): 0x42f054af32b3c408 2.87294442126e+14 # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them. # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues ALLOWED_BIT_ERR = 0x1f # seeing this amount of error! if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR): raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum) # print "%30s" % "hex(bitResult):", hex(ullResult) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_GLM1_GLM2_predict(self):
    """Train the same model via the GLM1 and GLM2 APIs, compare their
    coefficients/intercept/auc, then predict with the GLM2 model on the
    training data and check the error rate against expectedPctWrong.

    Relies on module-level tunables: LSM_SOLVER, STANDARDIZE, FAMILY,
    MAX_ITER, BETA_EPSILON, TRY_ALPHA, TRY_LAMBDA.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    trees = 15
    timeoutSecs = 120
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'

    # dataset selection: the 'if 1 == 0' blocks are manually-toggled
    # alternatives (covtype variants); only the iris block is active.
    if 1 == 0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        y = 54
        expectedPctWrong = 0

    if 1 == 0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        y = 54
        expectedPctWrong = 0

    if 1 == 1:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'smalldata'
        # no header
        csvPathname = 'iris/iris.csv'
        hexKey = 'iris.hex'
        y = 4
        expectedPctWrong = 26

    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    h2o_cmd.runSummary(key=hexKey)

    # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
    trainKey = parseResult['destination_key']

    # just to check. are there any NA/constant cols?
    ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

    #**************************************************************************
    # first glm1
    CLASS = 1
    # try ignoring the constant col to see if it makes a diff
    kwargs = {
        'lsm_solver': LSM_SOLVER,
        'standardize': STANDARDIZE,
        'y': 'C' + str(y + 1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON,
        'case': CLASS,
        'case_mode': '=',
    }

    timeoutSecs = 120
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
    glm['GLMModel']['GLMParams']['family'] = FAMILY
    print "glm1 end on ", csvPathname, 'took', time.time() - start, 'seconds'
    (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    iterations1 = glm['GLMModel']['iterations']
    err1 = glm['GLMModel']['validations'][0]['err']
    nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
    resDev1 = glm['GLMModel']['validations'][0]['resDev']

    if FAMILY == 'binomial':
        classErr1 = glm['GLMModel']['validations'][0]['classErr']
        auc1 = glm['GLMModel']['validations'][0]['auc']

    #**************************************************************************
    # then glm2
    kwargs = {
        # 'ignored_cols': 'C29',
        'standardize': STANDARDIZE,
        'response': 'C' + str(y + 1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON,
    }

    timeoutSecs = 120

    # class 1=1, all else 0 — GLM2 has no 'case' param, so binarize the
    # response column into a copy (B.hex) with exec instead
    if FAMILY == 'binomial':
        execExpr = "B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (trainKey, y + 1, trainKey, y + 1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        bHack = {'destination_key': 'B.hex'}
    else:
        bHack = parseResult

    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    # kwargs.update({'alpha': 0.0, 'lambda': 0})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
    print "glm2 end on ", csvPathname, 'took', time.time() - start, 'seconds'
    (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

    #**************************************************************************
    modelKey = glm['glm_model']['_key']
    submodels = glm['glm_model']['submodels']
    # hackery to make it work when there's just one
    validation = submodels[-1]['validation']
    iteration = submodels[-1]['iteration']
    resDev = validation['residual_deviance']
    nullDev = validation['null_deviance']
    if FAMILY == 'binomial':
        auc = validation['auc']

    self.assertLess(iterations1, MAX_ITER - 1,
        msg="GLM1: Too many iterations, didn't converge %s" % iterations1)
    self.assertLess(iteration, MAX_ITER - 1,
        msg="GLM2: Too many iterations, didn't converge %s" % iteration)

    nullDevExpected = nullDev1
    # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
    #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))

    iterationExpected = iterations1
    # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
    #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))

    # coefficients is a list. Compare magnitudes only (abs) between GLM1/GLM2,
    # within 50% relative tolerance — loose by design.
    coeff0 = coefficients[0]
    coeff0Expected = coefficients1[0]
    print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
    self.assertTrue(h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
        msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected))

    coeff2 = coefficients[2]
    coeff2Expected = coefficients1[2]
    print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
    self.assertTrue(h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
        msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected))

    # compare to known values GLM1 got for class 1 case, with these parameters
    # aucExpected = 0.8428
    if FAMILY == 'binomial':
        aucExpected = auc1
        # NOTE(review): delta=10 on an auc in [0,1] can never fail — looks like
        # a disabled check; confirm intent.
        self.assertAlmostEqual(auc, aucExpected, delta=10,
            msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected))

    interceptExpected = intercept1
    print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
    self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5),
        msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected))

    # avg_errExpected = 0.2463
    avg_errExpected = err1
    # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
    #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))

    # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
    #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))

    #********************
    # Print comparison
    #********************
    interceptDelta = abs(abs(intercept1) - abs(intercept))
    cDelta = [abs(abs(a) - abs(b)) for a, b in zip(coefficients1, coefficients)]

    # pretty-print one GLM2 value with its pct/abs diff vs GLM1
    def printit(self, a, b, c, d):
        pctDiff = abs(d / c) * 100
        print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
            ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
        # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")

    printit(self, "intercept", "", intercept1, interceptDelta)
    print "compare lengths coefficients1, coefficients, cDelta:", len(coefficients1), len(coefficients), len(cDelta)
    print "GLM1:", coefficients1
    print "GLM2:", coefficients
    print "cDelta:", cDelta

    for i, cValue in enumerate(coefficients):
        printit(self, "coefficient", "C" + str(i), cValue, cDelta[i])

    # predict back on the (binarized) training frame and score vs actuals
    hexKey = 'B.hex'
    pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey,
        csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader,
        translate=None, y=y)

    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    self.assertAlmostEqual(pctWrong, expectedPctWrong, delta=2.0,
        msg="predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
def test_exec2_xorsum(self):
    """Check that H2O's exec xorsum/sum over a synthetic real-valued column
    matches the bit-exact values computed in python by write_syn_dataset().

    Results are compared as raw 64-bit patterns (unsigned long long) because
    xorsum is defined on the double's bit representation, not its value.
    An exact bit match is required here (no lsb tolerance).
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(10):
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(
                None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            # write_syn_dataset returns the python-side expected xorsum (ull) and fp sum
            (expectedUllSum,
             expectedFpSum) = write_syn_dataset(csvPathname, rowCount,
                                                colCount, expectedMin,
                                                expectedMax, SEEDPERFILE)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=3000,
                                           retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                # repeat each expression 10x to catch non-determinism
                for r in range(10):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0],
                                                           execExpr,
                                                           resultKey='h',
                                                           timeoutSecs=300)
                    print r, 'exec took', time.time() - start, 'seconds'
                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                        ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
                        expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
                        expectedFpSumAsLongLong, expectedFpSum)
                    # allow diff of the lsb..either way
                    # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                    if ullResult != expectedUllSum:
                        raise Exception(
                            "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x"
                            % (ullResult, expectedUllSum))
                        # NOTE(review): this print is unreachable after the raise
                        # (leftover from toggling raise/report)
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                            ullResult, expectedUllSum)

        h2o.check_sandbox_for_errors()
        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (
                ullResult, fpResult)
        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (
            expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (
            expectedFpSumAsLongLong, expectedFpSum)
def test_enums_with_0_NA(self):
    """Parse synthetic enum columns mixed with NA/0 tokens and verify the
    Summary2 result: per-column NA count, Enum stat type, histogram break
    labels (whitespace-stripped), and that histogram counts + NAs = rowCount.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # each tuple: (enumA, enumB, filler-token) written into the csv by
    # write_syn_dataset; leading spaces test H2O's whitespace stripping
    choicesList = [
        (' a', ' b', 'NA'),
        (' a', ' b', '"NA"'),
        # only one enum?
        # the NA count has to get flipped if just one enum and 0
        (' a', ' b', ''),
        (' a', ' a', ''),
        # (' a', 'a', '0'), # doesn't match my "single enum' check above
        (' a', ' b', ' 0'),
        # what about mixed NA and 0? doesn't happen?
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        (' N', ' Y', ' 0'),
        (' n', ' y', ' 0'),
        (' F', ' T', ' 0'),
        (' f', ' t', ' 0'),
    ]

    # white space is stripped
    # expectedList[i] is what the summary should report for choicesList[i]
    expectedList = [
        # only one enum?
        (' a', ' b', ''),
        (' a', ' b', ''),
        ('a', 'b', ''),
        ('a', 'a', ''),
        # ('a', 'a', '0'),
        ('a', 'b', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
    ]

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (1000, 5, 'x.hex', choicesList[4], expectedList[4]),
        (1000, 5, 'x.hex', choicesList[5], expectedList[5]),
        (1000, 5, 'x.hex', choicesList[6], expectedList[6]),
        (1000, 5, 'x.hex', choicesList[7], expectedList[7]),
        (1000, 5, 'x.hex', choicesList[3], expectedList[3]),
        (1000, 5, 'x.hex', choicesList[2], expectedList[2]),
        (1000, 5, 'x.hex', choicesList[1], expectedList[1]),
        (1000, 5, 'x.hex', choicesList[0], expectedList[0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, choices, expected) in tryList:
        # max error = half the bin size?
        print "choices:", choices
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        # returns the per-column expected NA counts
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)
        # force header=0 so the T/F strings don't get deduced to be headers
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0,
            timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, numRows=numRows, numCols=numCols)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        for i in range(colCount):
            column = summaryResult['summaries'][i]

            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            # if it's just 0's with a single enum, the enums become NA, so the count is flipped
            self.assertEqual(nacnt, expectedNaCnt[i],
                "Column %s Expected %s. nacnt %s incorrect. choices: %s" % (i, expectedNaCnt[i], nacnt, choices))

            stats = column['stats']
            stattype = stats['type']
            self.assertEqual(stattype, 'Enum')

            # FIX! we should compare mean and sd to expected?
            cardinality = stats['cardinality']

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            # cover the hacky two equal expected values
            hcnt = column['hcnt']
            if expected[0] == expected[1]:
                # both choices are the same token -> a single histogram bucket
                self.assertEqual(hbrk, [expected[0]])
                hcntTotal = hcnt[0]
            else:
                self.assertEqual(hbrk, [expected[0], expected[1]])
                hcntTotal = hcnt[0] + hcnt[1]

            # every non-NA row must land in one of the enum buckets
            self.assertEqual(hcntTotal, rowCount - expectedNaCnt[i])

            self.assertEqual(rowCount, numRows,
                msg="numRows %s should be %s" % (numRows, rowCount))

        trial += 1
def test_summary2_unifiles2(self):
    """Check Summary2 percentiles/histograms for a few known files against
    expected values, and cross-check the h2o quantiles endpoint against a
    scipy/sort-based median (or OTHER_Q quantile) computed from the raw csv.

    Each tryList entry: (csvFilename, hex_key, skipHeader,
    [(colname, min, 25th, 50th, 75th, max), ...], bucket, pathPrefix).
    A None in an expected slot means "don't check that statistic".
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # new with 1000 bins. copy expected from R
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        ('breadth.csv', 'b.hex', False, [('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
        # ('wonkysummary.csv', 'b.hex', False, [('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
        ('wonkysummary.csv', 'b.hex', True, [('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
        ('covtype.data', 'c.hex', False, [('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:
        if pathPrefix:
            csvPathname = pathPrefix + "/" + csvFilename
        else:
            csvPathname = csvFilename

        # full local path is needed later when python/scipy re-reads the csv
        csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        if skipHeader:
            header = 1
        else:
            header = 0
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', header=header,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)

        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        # okay to get more cols than we want
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        summaries = summaryResult['summaries']

        scipyCol = 0
        for expected, column in zip(expectedCols, summaries):
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            # independent check: ask the quantiles endpoint for the same column
            quantile = 0.5 if DO_MEDIAN else OTHER_Q
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)
            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

            # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']
            print stattype

            # FIX! we should compare mean and sd to expected?
            # enums don't have mean or sd?
            if stattype != 'Enum':
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                pct = stats['pct']
                print "pct:", pct
                print ""

                # the thresholds h2o used, should match what we expected
                pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02,
                        msg='max is not approx. expected')

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?
                    # hack?
                    maxErr = maxErr * 2
                    print "maxErr:", maxErr
                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            for b in hcnt:
                # should we be able to check for a uniform distribution in the files?
                e = .1 * numRows
                # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            if stattype != 'Enum':
                pt = h2o_util.twoDecimals(pctile)
                print "colname:", colname, "pctile (2 places):", pt
                mx = h2o_util.twoDecimals(maxs)
                mn = h2o_util.twoDecimals(mins)
                print "colname:", colname, "maxs: (2 places):", mx
                print "colname:", colname, "mins: (2 places):", mn

                # FIX! we should do an exec and compare using the exec quantile too
                actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                print "min/25/50/75/max colname:", colname, "(2 places):", actual
                print "maxs colname:", colname, "(2 places):", mx
                print "mins colname:", colname, "(2 places):", mn

                ## ignore for blank colnames, issues with quoted numbers
                # covtype is too big to do in scipy
                if colname != '' and expected[scipyCol] and csvFilename != 'covtype.data':
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=skipHeader,  # important!!
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else OTHER_Q,
                        h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                        h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        h2oSummary2MaxErr=maxErr,
                        )

            scipyCol += 1

        trial += 1
def test_summary2_percentile2(self):
    """Generate synthetic integer datasets with a known min/max range, then
    check that Summary2 histogram bins are ~uniform, that min/max match the
    generated range, that every reported percentile lies inside that range,
    and cross-check the median (or .999 quantile) against a python-side sort.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (500000, 2, 'cD', 300, 0, 9),   # expectedMin/Max must cause 10 values
        (500000, 2, 'cE', 300, 1, 10),  # expectedMin/Max must cause 10 values
        (500000, 2, 'cF', 300, 2, 11),  # expectedMin/Max must cause 10 values
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        # NOTE(review): legalValues is built but never used below — confirm intent
        legalValues = {}
        for x in range(expectedMin, expectedMax):
            legalValues[x] = x

        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        h2o.beta_features = False
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1)
        if h2o.verbose:
            print "summaryResult:", h2o.dump_json(summaryResult)

        summaries = summaryResult['summaries']
        scipyCol = 0
        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            mean = stats['mean']
            sd = stats['sd']
            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            pctile = stats['pctile']

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            # each of the 10 generated values should fill ~10% of a bin
            for b in hcnt:
                e = .1 * rowCount
                self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            print "pctile:", pctile
            print "maxs:", maxs
            self.assertEqual(maxs[0], expectedMax)
            print "mins:", mins
            self.assertEqual(mins[0], expectedMin)

            # every percentile must lie within the generated data range
            for v in pctile:
                self.assertTrue(v >= expectedMin,
                    "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                self.assertTrue(v <= expectedMax,
                    "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))

            # NOTE(review): eV is computed but never asserted against pctile below
            eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
            if expectedMin == 1:
                eV = eV1
            elif expectedMin == 0:
                eV = [e-1 for e in eV1]
            elif expectedMin == 2:
                eV = [e+1 for e in eV1]
            else:
                raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            trial += 1

            # if colname!='' and expected[scipyCol]:
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    )
            scipyCol += 1
def test_summary2_unifiles(self):
    """Check Summary2 min/25/50/75/max against R-derived expected values for a
    set of smalldata files, with a tolerance derived from the Summary2 bin
    width, and cross-check against the quantiles endpoint plus a python-side
    sort of the raw csv.

    Each tryList entry: (csvFilename, hex_key, [(colname, min, 25th, 50th,
    75th, max), ...]); a None colname means "skip this column".
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # new with 1000 bins. copy expected from R
    tryList = [
        (
            'cars.csv', 'c.hex', [
                (None, None, None, None, None, None),
                ('economy (mpg)', None, None, None, None, None),
                ('cylinders', None, None, None, None, None),
            ],
        ),
        (
            'runifA.csv', 'A.hex', [
                (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
        ),
        # colname, (min, 25th, 50th, 75th, max)
        (
            'runif.csv', 'x.hex', [
                (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
        ),
        (
            'runifB.csv', 'B.hex', [
                (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                ('x', -100.00, -50.1, 0.974, 51.7, 100, 00),
            ],
        ),
        (
            'runifC.csv', 'C.hex', [
                (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
        ),
    ]

    timeoutSecs = 15
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    timeoutSecs = 60
    for (csvFilename, hex_key, expectedCols) in tryList:

        csvPathname = csvFilename
        # full local path is needed later when python/scipy re-reads the csv
        csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
            schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        # okay to get more cols than we want
        # okay to vary MAX_QBINS because we adjust the expected accuracy
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        summaries = summaryResult['summaries']

        scipyCol = 0
        for expected, column in zip(expectedCols, summaries):
            colname = column['colname']
            if expected[0]:
                # NOTE(review): the trailing ", colname, expected[0]" builds a
                # throwaway tuple with the assert's None return — likely a typo
                # for an assert message; harmless at runtime
                self.assertEqual(colname, expected[0]), colname, expected[0]
            else:
                # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                scipyCol += 1
                continue

            quantile = 0.5 if DO_MEDIAN else .999
            # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2,
                interpolation_type=7)  # for comparing to summary2

            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']
            print stattype

            # FIX! we should compare mean and sd to expected?
            # enums don't have mean or sd?
            if stattype != 'Enum':
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                pct = stats['pct']
                print "pct:", pct
                print ""

                # the thresholds h2o used, should match what we expected
                expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

                pctile = stats['pctile']

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?
                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr,
                        msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            for b in hcnt:
                # should we be able to check for a uniform distribution in the files?
                e = .1 * numRows
                # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            if stattype != 'Enum':
                pt = h2o_util.twoDecimals(pctile)
                print "colname:", colname, "pctile (2 places):", pt
                mx = h2o_util.twoDecimals(maxs)
                mn = h2o_util.twoDecimals(mins)
                print "colname:", colname, "maxs: (2 places):", mx
                print "colname:", colname, "mins: (2 places):", mn

                # FIX! we should do an exec and compare using the exec quantile too
                actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                print "min/25/50/75/max colname:", colname, "(2 places):", actual
                print "maxs colname:", colname, "(2 places):", mx
                print "mins colname:", colname, "(2 places):", mn

                # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                # need to ignore the car names
                if colname != '' and expected[scipyCol]:
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # FIX! ignore for now
                        h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        h2oSummary2MaxErr=maxErr,
                        )

                    # disabled debugging hook (False and ...) for stopping at a value
                    if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                        raise Exception("stopping to look")

            scipyCol += 1

        trial += 1
def test_rf_predict3_fvec(self):
    """Train a SpeeDRF model, predict on the same (training) data, download
    both the actual response column and the predictions as csv, and compare
    them row by row. Fails if the mismatch rate exceeds ~2% inside the
    comparison helper, and asserts the overall pct-wrong is near
    expectedPctWrong.

    Only the first `if 1 == 1` dataset branch (iris2) is live; the others are
    kept as switchable alternatives (covtype variants, mnist).
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    timeoutSecs = 600
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'

    if 1 == 1:
        y = 4  # last col
        response = 'response'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 40
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'iris2.csv.hex'
        # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
        # No translate because we're using an Exec to get the data out?, and that loses the encoding?
        translate = None
        # one wrong will be 0.66667. I guess with random, that can happen?
        expectedPctWrong = 0.7
    elif 1 == 0:
        y = 54  # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        # try smaller data set compared to covtype
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7
    elif 1 == 0:
        y = 54  # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 40
        # try smaller data set compared to covtype
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7
    elif 1 == 0:
        y = 54  # last col
        response = 'C55'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
        expectedPctWrong = 0.7
    else:
        y = 0  # first col
        response = 'C1'
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        trees = 6
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'mnist/mnist_training.csv.gz'
        hexKey = 'mnist_training.hex'
        translate = { \
            '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
            '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
        expectedPctWrong = 0.7

    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

    def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
        """Predict with model_key on hexKey, download actual + predicted
        columns as csv, and return the percent of mismatching rows.
        Closes over the dataset config chosen above. Raises if the row
        counts disagree or pctWrong > 2.0."""
        # have to slice out col 0 (the output) and feed result to predict
        # cols are 0:784 (1 output plus 784 input features
        # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
        dataKey = "P.hex"
        h2e.exec_expr(execExpr=dataKey + "=" + hex_key, timeoutSecs=30)  # unneeded but interesting
        if skipSrcOutputHeader:
            print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
            print "hack for now, can't chop out col 0 in Exec currently"
            dataKey = hex_key
        else:
            print "No header in dataset, can't chop out cols, since col numbers are used for names"
            dataKey = hex_key

        # +1 col index because R-like
        h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key,
            data_key=hexKey, destination_key=predictHexKey)
        print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(key=predictHexKey)
        h2o_cmd.infoFromInspect(inspect, 'predict.hex')

        h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
        h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
        h2o.check_sandbox_for_errors()

        print "Do a check of the original output col against predicted output"
        (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
            msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
        (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname,
            msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

        # no header on source; compare data row counts after header adjustment
        if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)):
            raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
                %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

        wrong = 0
        for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
            # if float(o)!=float(p):
            if str(o) != str(p):
                if wrong == 10:
                    print "Not printing any more mismatches\n"
                elif wrong < 10:
                    msg = "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s" % (rowNum, o, p)
                    print msg
                wrong += 1

        print "\nTotal wrong:", wrong
        print "Total:", len(originalOutput)
        pctWrong = (100.0 * wrong) / len(originalOutput)
        print "wrong/Total * 100 ", pctWrong
        # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
        if pctWrong > 2.0:
            raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data")
        return pctWrong

    #*****************************************************************************

    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    kwargs = {
        'destination_key': 'rf_model',
        'response': response,
        'ntrees': trees,
        'classification': 1,
    }

    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

    # rename the SpeeDRF model key so the generic DRF view checker accepts it
    rfResult["drf_model"] = rfResult.pop("speedrf_model")
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

    print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
    print "Does this work? (feeding in same data key)if you're predicting, "
    print "don't you need one less column (the last is output?)"
    print "WARNING: max_iter set to 8 for benchmark comparisons"
    print "y=", y

    pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y)

    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    # can be zero if memorized (iris is either 0 or 0.667?) just make delta 0.7 for now
    self.assertAlmostEqual(pctWrong, expectedPctWrong, delta=0.7,
        msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def test_quant_cmp_uniform(self):
    """Generate uniform synthetic datasets, run Summary (ParseObj/InspectObj
    style API), then build an h2o 'quantile' model and compare its quantile at
    CHECK_PCTILE against a python-side sort of the raw csv.

    Each tryList entry: (rowCount, colCount, hex_key, expectedMin,
    expectedMax, [colname, min, 25th, 50th, 75th, max]).
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (5 * ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
        (5 * ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
        (1 * ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
        (1 * ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
        (1 * ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
        (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
        (1 * ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
        (1 * ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100, 00]),
        (1 * ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
        (1 * ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        colname = expected[0]
        maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0

        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        # need the full pathname when python parses the csv for numpy/sort
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        #***************************
        # Parse
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        numRows = pA.numRows
        numCols = pA.numCols
        parse_key = pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        #***************************
        # Summary
        co = h2o_cmd.runSummary(key=parse_key)
        default_pctiles = co.default_pctiles

        coList = [co.base, len(co.bins), len(co.data), co.domain,
            co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
            co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]
        for c in coList:
            print c

        print "len(co.bins):", len(co.bins)
        print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
        print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

        # pad so indexes line up with the expected [min,25,50,75,max] slots
        print "FIX! hacking the co.pctiles because it's short by two"
        summ_pctiles = [0] + co.pctiles + [0]

        pt = h2o_util.twoDecimals(summ_pctiles)
        mx = h2o_util.twoDecimals(co.maxs)
        mn = h2o_util.twoDecimals(co.mins)
        exp = h2o_util.twoDecimals(expected[1:])

        print "co.label:", co.label, "co.pctiles (2 places):", pt
        print "default_pctiles:", default_pctiles
        print "co.label:", co.label, "co.maxs: (2 places):", mx
        print "co.label:", co.label, "co.mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
            mn[0], pt[3], pt[5], pt[7], mx[0])
        h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
            exp[0], exp[1], exp[2], exp[3], exp[4])

        #***************************
        # Quantile
        # the thresholds h2o used, should match what we expected

        # using + here seems to result in an odd tuple..doesn't look right to h2o param
        # so went with this. Could add '[' and ']' to the list first, before the join.
        probsStr = "[%s]" % ",".join(map(str, probsList))
        parameters = {
            'model_id': "a.hex",
            'training_frame': parse_key,
            'validation_frame': parse_key,
            'ignored_columns': None,
            'probs': probsStr,
        }

        model_key = 'qhex'
        bmResult = h2o.n0.build_model(algo='quantile', model_id=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        msec = bm.jobs[0]['msec']
        print "bm msec", msec

        # quantile result is just a job result to a key
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0], 'model')
        print "model.output:", model.output
        print "model.output:['quantiles']", model.output['quantiles']
        print "model.output:['iterations']", model.output['iterations']
        print "model.output:['names']", model.output['names']
        quantiles = model.output['quantiles'][0]  # why is this a double array
        iterations = model.output['iterations']
        assert iterations == 11, iterations
        print "quantiles: ", quantiles
        print "iterations: ", iterations

        # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        # cmm = OutputObj(cmmResult, 'cmm')

        # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        # mm = OutputObj(mmResult, 'mm')

        # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
        trial += 1

        # compare the last threshold
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=CHECK_PCTILE,
                # h2oSummary2=pctile[-1],
                # h2oQuantilesApprox=result, # from exec
                h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                )
        h2o.nodes[0].remove_all_keys()
def test_summary2_NY0(self):
    """Generate synthetic enum datasets from small choice tuples (Y/N, T/F
    variants with leading whitespace and '0'/'' values), then check that
    Summary2 reports the right NA counts, Enum stat type, histogram break
    labels (whitespace-stripped), and per-enum counts.

    choicesList holds the raw values written to the csv; expectedList holds
    the whitespace-stripped values Summary2 should report back.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    choicesList = [
        # only one enum?
        # the NA count has to get flipped if just one enum and 0
        (' a', ' b', ''),
        (' a', ' a', ''),
        # (' a', 'a', '0'), # doesn't match my "single enum' check above
        (' a', ' b', ' 0'),
        # what about mixed NA and 0? doesn't happen?
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        (' N', ' Y', ' 0'),
        (' n', ' y', ' 0'),
        (' F', ' T', ' 0'),
        (' f', ' t', ' 0'),
    ]

    # white space is stripped
    expectedList = [
        # only one enum?
        ('a', 'b', ''),
        ('a', 'a', ''),
        # ('a', 'a', '0'),
        ('a', 'b', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
    ]

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (1000, 5, 'x.hex', choicesList[4], expectedList[4]),
        (1000, 5, 'x.hex', choicesList[5], expectedList[5]),
        (1000, 5, 'x.hex', choicesList[6], expectedList[6]),
        (1000, 5, 'x.hex', choicesList[7], expectedList[7]),
        (1000, 5, 'x.hex', choicesList[3], expectedList[3]),
        (1000, 5, 'x.hex', choicesList[2], expectedList[2]),
        (1000, 5, 'x.hex', choicesList[1], expectedList[1]),
        (1000, 5, 'x.hex', choicesList[0], expectedList[0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, choices, expected) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)

        # force header=0 so the T/F strings don't get deduced to be headers
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0,
            timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False,
            numRows=numRows, numCols=numCols)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        for i in range(colCount):
            column = summaryResult['summaries'][i]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']
            # if it's just 0's with a single enum, the enums become NA, so the count is flipped
            self.assertEqual(nacnt, expectedNaCnt[i],
                "Column %s Expected %s. nacnt %s incorrect. choices: %s" % (i, expectedNaCnt[i], nacnt, choices))

            stats = column['stats']
            stattype = stats['type']
            self.assertEqual(stattype, 'Enum')

            # FIX! we should compare mean and sd to expected?
            cardinality = stats['cardinality']

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            # cover the hacky two equal expected values
            hcnt = column['hcnt']

            if expected[0] == expected[1]:
                # both enum choices are the same value: only one break expected
                self.assertEqual(hbrk, [expected[0]])
                hcntTotal = hcnt[0]
            else:
                self.assertEqual(hbrk, [expected[0], expected[1]])
                hcntTotal = hcnt[0] + hcnt[1]

            # non-NA rows must all land in the enum histogram bins
            self.assertEqual(hcntTotal, rowCount - expectedNaCnt[i])

            self.assertEqual(rowCount, numRows,
                msg="numRows %s should be %s" % (numRows, rowCount))

        trial += 1
def test_GLM1_GLM2_train_pred_fvec(self):
    """Train the same binomial GLM with the old (GLM1) and new (GLM2) H2O
    APIs on covtype data, assert the two models agree (iterations, null
    deviance, selected coefficients, intercept, avg error, AUC), then run
    predict with the GLM2 model and compare its CSV output row-by-row
    against the exec'ed source column.

    Relies on module-level config constants: LSM_SOLVER, STANDARDIZE,
    FAMILY, MAX_ITER, BETA_EPSILON, USE_EXEC, TRY_ALPHA, TRY_LAMBDA.
    """
    h2o.beta_features = False
    SYNDATASETS_DIR = h2o.make_syn_dir()
    trees = 15  # NOTE(review): unused here — looks like leftover from an RF variant of this test
    timeoutSecs = 120
    # The 1==0 / 1==1 pattern below is a manual dataset selector; only the
    # 1==1 branch is active (covtype 10pct shuffle).
    if 1 == 0:
        bucket = "home-0xdiag-datasets"
        csvPathname = "standard/covtype.data"
        hexKey = "covtype.data.hex"
        y = 54
    if 1 == 1:
        bucket = "home-0xdiag-datasets"
        csvPathname = "standard/covtype.shuffled.10pct.data"
        hexKey = "covtype.shuffled.10pct.data.hex"
        y = 54
    if 1 == 0:
        bucket = "smalldata"
        # no header
        csvPathname = "iris/iris.csv"
        y = 4

    predictHexKey = "predict.hex"
    predictCsv = "predict.csv"
    execHexKey = "A.hex"
    execCsv = "exec.csv"
    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvExecPathname = SYNDATASETS_DIR + "/" + execCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema="put", returnFullPath=True)

    def predict_and_compare_csvs(model_key):
        # Run predict, download both the exec'ed source and the prediction
        # as CSVs, and compare the output columns row by row. Raises if the
        # row counts differ or the mismatch percentage exceeds 16%.
        start = time.time()
        predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
        print "runPredict end on ", hexKey, " took", time.time() - start, "seconds"
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(key=predictHexKey)
        h2o_cmd.infoFromInspect(inspect, "predict.hex")
        h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
        h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname)
        h2o.check_sandbox_for_errors()
        print "Do a check of the original output col against predicted output"
        # NOTE(review): 'translate' is built but never used in this closure
        translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
        (rowNum1, originalOutput) = compare_csv_last_col(
            csvExecPathname, msg="Original, after being exec'ed", skipHeader=True
        )
        (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True)
        # no header on source
        if rowNum1 != rowNum2:
            raise Exception(
                "original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \
                %s" % (rowNum1, rowNum2)
            )
        wrong = 0
        wrong0 = 0
        wrong1 = 0
        for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
            o = float(o)
            p = float(p)
            if o != p:
                msg = (
                    "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s" % (rowNum, o, p)
                )
                # Print at most 10 mismatches per predicted class to keep logs readable
                if p == 0.0 and wrong0 == 10:
                    print "Not printing any more predicted=0 mismatches"
                elif p == 0.0 and wrong0 < 10:
                    print msg
                if p == 1.0 and wrong1 == 10:
                    print "Not printing any more predicted=1 mismatches"
                elif p == 1.0 and wrong1 < 10:
                    print msg
                if p == 0.0:
                    wrong0 += 1
                elif p == 1.0:
                    wrong1 += 1
                wrong += 1
        print "wrong0:", wrong0
        print "wrong1:", wrong1
        print "\nTotal wrong:", wrong
        print "Total:", len(originalOutput)
        pctWrong = (100.0 * wrong) / len(originalOutput)
        print "wrong/Total * 100 ", pctWrong
        # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
        if pctWrong > 16.0:
            raise Exception("pct wrong: %s too high. Expect < 16 pct error" % pctWrong)

    # *************************************************************************
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)
    h2o_cmd.runSummary(key=hexKey)
    # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
    trainKey = parseResult["destination_key"]
    CLASS = 1
    # just to check. are there any NA/constant cols?
    ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300)

    # **************************************************************************
    # first glm1
    h2o.beta_features = False
    # try ignoring the constant col to see if it makes a diff
    kwargs = {
        "lsm_solver": LSM_SOLVER,
        "standardize": STANDARDIZE,
        # 'y': 'C' + str(y),
        "y": "C" + str(y + 1),
        "family": FAMILY,
        "n_folds": 1,
        "max_iter": MAX_ITER,
        "beta_epsilon": BETA_EPSILON,
    }
    if USE_EXEC:
        # maybe go back to simpler exec here. this was from when Exec failed unless this was used
        execExpr = "A.hex=%s" % trainKey
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        # class 1=1, all else 0
        if FAMILY == "binomial":
            execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, CLASS)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        aHack = {"destination_key": "A.hex"}
    else:
        # since we're not using predict, we can use case_mode/val to get the binomial output class
        if FAMILY == "binomial":
            kwargs.update({"case_mode": "=", "case": 1})
        aHack = {"destination_key": hexKey}
    timeoutSecs = 120
    # NOTE(review): case_mode/case is set unconditionally here even when
    # USE_EXEC already rewrote the column above — presumably harmless; verify
    kwargs.update({"case_mode": "=", "case": 1})
    kwargs.update({"alpha": TRY_ALPHA, "lambda": TRY_LAMBDA})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs)
    # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
    glm["GLMModel"]["GLMParams"]["family"] = FAMILY
    print "glm1 end on ", csvPathname, "took", time.time() - start, "seconds"
    (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    # Capture GLM1 results to compare against GLM2 below
    iterations1 = glm["GLMModel"]["iterations"]
    err1 = glm["GLMModel"]["validations"][0]["err"]
    nullDev1 = glm["GLMModel"]["validations"][0]["nullDev"]
    resDev1 = glm["GLMModel"]["validations"][0]["resDev"]
    if FAMILY == "binomial":
        classErr1 = glm["GLMModel"]["validations"][0]["classErr"]
        auc1 = glm["GLMModel"]["validations"][0]["auc"]

    # **************************************************************************
    # then glm2
    h2o.beta_features = True
    kwargs = {
        # 'ignored_cols': 'C29',
        "standardize": STANDARDIZE,
        "classification": 1 if FAMILY == "binomial" else 0,
        # 'response': 'C' + str(y),
        "response": "C" + str(y + 1),
        "family": FAMILY,
        "n_folds": 1,
        "max_iter": MAX_ITER,
        "beta_epsilon": BETA_EPSILON,
    }
    timeoutSecs = 120
    if USE_EXEC:
        # maybe go back to simpler exec here. this was from when Exec failed unless this was used
        execExpr = "B.hex=%s" % trainKey
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        # class 1=1, all else 0
        if FAMILY == "binomial":
            execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, CLASS)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        bHack = {"destination_key": "B.hex"}
    else:
        # since we're not using predict, we can use case_mode/val to get the binomial output class
        if FAMILY == "binomial":
            kwargs.update({"case_mode": "=", "case_val": 1})
        bHack = {"destination_key": hexKey}
    kwargs.update({"alpha": TRY_ALPHA, "lambda": TRY_LAMBDA})
    # kwargs.update({'alpha': 0.0, 'lambda': 0})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
    print "glm2 end on ", csvPathname, "took", time.time() - start, "seconds"
    (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

    # **************************************************************************
    # Pull GLM2 results out of the new-style response json
    modelKey = glm["glm_model"]["_key"]
    avg_err = glm["glm_model"]["submodels"][0]["validation"]["avg_err"]
    best_threshold = glm["glm_model"]["submodels"][0]["validation"]["best_threshold"]
    iteration = glm["glm_model"]["submodels"][0]["iteration"]
    resDev = glm["glm_model"]["submodels"][0]["validation"]["residual_deviance"]
    nullDev = glm["glm_model"]["submodels"][0]["validation"]["null_deviance"]
    if FAMILY == "binomial":
        auc = glm["glm_model"]["submodels"][0]["validation"]["auc"]

    self.assertLess(iterations1, MAX_ITER - 1,
        msg="GLM1: Too many iterations, didn't converge %s" % iterations1)
    self.assertLess(iteration, MAX_ITER - 1,
        msg="GLM2: Too many iterations, didn't converge %s" % iteration)

    nullDevExpected = nullDev1
    self.assertAlmostEqual(
        nullDev,
        nullDevExpected,
        delta=2,
        msg="GLM2 nullDev %s is too different from GLM1 %s" % (nullDev, nullDevExpected),
    )
    iterationExpected = iterations1
    self.assertAlmostEqual(
        iteration,
        iterationExpected,
        delta=2,
        msg="GLM2 iteration %s is too different from GLM1 %s" % (iteration, iterationExpected),
    )

    # coefficients is a list.
    coeff0 = coefficients[0]
    coeff0Expected = coefficients1[0]
    print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
    self.assertTrue(
        h2o_util.approx_equal(coeff0, coeff0Expected, 0.01),
        msg="GLM2 coefficient 0 %s is too different from GLM1 %s" % (coeff0, coeff0Expected),
    )
    coeff2 = coefficients[2]
    coeff2Expected = coefficients1[2]
    print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
    self.assertTrue(
        h2o_util.approx_equal(coeff2, coeff2Expected, 0.01),
        msg="GLM2 coefficient 2 %s is too different from GLM1 %s" % (coeff2, coeff2Expected),
    )

    # compare to known values GLM1 got for class 1 case, with these parameters
    # aucExpected = 0.8428
    if FAMILY == "binomial":
        aucExpected = auc1
        # NOTE(review): delta=10 on an AUC (range [0,1]) means this assert can
        # never fail — presumably delta should be ~0.01; confirm before tightening
        self.assertAlmostEqual(
            auc, aucExpected, delta=10,
            msg="GLM2 auc %s is too different from GLM1 %s" % (auc, aucExpected)
        )
    interceptExpected = intercept1
    print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
    self.assertTrue(
        h2o_util.approx_equal(intercept, interceptExpected, 0.01),
        msg="GLM2 intercept %s is too different from GLM1 %s" % (intercept, interceptExpected),
    )
    # avg_errExpected = 0.2463
    avg_errExpected = err1
    self.assertAlmostEqual(
        avg_err,
        avg_errExpected,
        delta=0.05 * avg_errExpected,
        msg="GLM2 avg_err %s is too different from GLM1 %s" % (avg_err, avg_errExpected),
    )
    # NOTE(review): compares against a hard-coded 0.35, not a GLM1 value,
    # despite what the message says — dataset-specific expectation
    self.assertAlmostEqual(
        best_threshold,
        0.35,
        delta=0.01 * best_threshold,
        msg="GLM2 best_threshold %s is too different from GLM1 %s" % (best_threshold, 0.35),
    )

    predict_and_compare_csvs(model_key=modelKey)
def test_mixed_int_enum_many(self):
    """Generate synthetic columns that mix enum strings with a few int (or
    blank) values, parse them, and check that Summary reports the expected
    column type, cardinality, histogram breaks, and NA counts.

    The int values are expected to become NAs in an enum column. Checks are
    gated on the module-level ENABLE_ASSERTS flag and the per-case
    resultIsEnum expectation.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
    enumList = ['abc', 'def', 'ghi']
    # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
    intList = [0, 1, 2, '']
    expectedList = ['abc', 'def', 'ghi']

    tryList = [
        # not sure about this case
        # some of the cases interpret as ints now (not as enum)
        (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False),
        # colname, (min, COLS5th, 50th, 75th, max)
        (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True),
        # fails this case
        (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True),
        (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1], True),
        (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True),
        # this case seems to fail
        (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True),
        # this seems wrong also
        (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2], True),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                          SEEDPERFILE, enumChoices, intChoices)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                                       hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nTrial:", trial, csvFilename

        numRows = inspect["numRows"]
        numCols = inspect["numCols"]
        summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False,
                                           numRows=numRows, numCols=numCols)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        for i in range(colCount):
            column = summaryResult['summaries'][i]
            colname = column['colname']
            coltype = column['type']

            stats = column['stats']
            stattype = stats['type']
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(
                    stattype, 'Enum',
                    "trial %s: Expecting summaries/stats/type to be Enum for %s col colname %s"
                    % (trial, i, colname))

            # FIX! we should compare mean and sd to expected?
            # assume enough rows to hit all of the small # of choices
            if ENABLE_ASSERTS and resultIsEnum:
                # not always there
                cardinality = stats['cardinality']
                self.assertEqual(
                    cardinality, len(enumChoices),
                    msg="trial %s: cardinality %s should be %s" %
                    (trial, cardinality, len(enumChoices)))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            # assume I create the list above in the same order that h2o will show the order. sorted?
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(hbrk, enumChoices)

            hcnt = column['hcnt']
            hcntTotal = sum(hcnt)
            # each int choice adds one extra row beyond rowCount
            numRowsCreated = rowCount + len(intChoices)
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])

            self.assertEqual(numRows, numRowsCreated,
                             msg="trial %s: numRows %s should be %s" %
                             (trial, numRows, numRowsCreated))

            nacnt = column['nacnt']
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(
                    nacnt, expectedNaCnt[i],
                    "trial %s: Column %s Expected %s. nacnt %s incorrect" %
                    (trial, i, expectedNaCnt[i], nacnt))

            # FIX! no checks for the case where it got parsed as int column!
        trial += 1
def test_GLM_both(self):
    """For each dataset in the list: clean the file for R, parse it in H2O,
    run GLM (old or new API depending on h2o.beta_features), build the
    equivalent R formula, then run the same GLM through R and compare the
    results via glm_R_and_compare().
    """
    h2o.beta_features = True
    if (1==1):
        csvFilenameList = [
            ('logreg', 'benign.csv', 'binomial', 3, 10),
            # col is zero based
            # FIX! what's wrong here? index error
            ## ('uis.dat', 'binomial', 8, 5, False),
            ## ('pros.dat', 'binomial', 1, 10, False),
            ## ('chdage.dat', 'binomial', 2, 5, True),
            ## ('icu.dat', 'binomial', 1, 10, False),
            # how to ignore 6? '1,2,3,4,5', False),
            ## ('clslowbwt.dat', 'binomial', 7, 10, False),
            # ('cgd.dat', 'gaussian', 12, 5, False),
            # ('meexp.dat', 'gaussian', 3, 10, None),
        ]
    else:
        csvFilenameList = [
            # leave out ID and birth weight
            ('logreg', 'benign.csv', 'gaussian', 3, 10),
            (None, 'icu.dat', 'binomial', 1, 10),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            (None, 'nhanes3.dat', 'binomial', 15, 10),
            (None, 'lowbwt.dat', 'binomial', 1, 10),
            (None, 'lowbwtm11.dat', 'binomial', 1, 10),
            (None, 'meexp.dat', 'gaussian', 3, 10),
            # FIX! does this one hang in R?
            (None, 'nhanes3.dat', 'binomial', 15, 10),
            (None, 'pbc.dat', 'gaussian', 1, 10),
            (None, 'pharynx.dat', 'gaussian', 12, 10),
            (None, 'uis.dat', 'binomial', 8, 10),
        ]

    trial = 0
    for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:
        # FIX! do something about this file munging
        if offset:
            csvPathname1 = offset + "/" + csvFilename
        else:
            csvPathname1 = 'logreg/umass_statdata/' + csvFilename

        fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
        h2o_util.file_clean_for_R(fullPathname, csvPathname2)

        # we can inspect this to get the number of cols in the dataset (trust H2O here)
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
        # we could specify key2 above but this is fine
        destination_key = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, destination_key)

        if h2o.beta_features:
            num_cols = inspect['numCols']
            num_rows = inspect['numRows']
        else:
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']

        print "num_cols", num_cols, "num_rows", num_rows
        ## print h2o.dump_json(inspect)

        # create formula and the x for H2O GLM
        formula = "V" + str(y+1) + " ~ "
        x = None
        col_names = ""
        for c in range(0,num_cols):
            # NOTE(review): this first 'if' is NOT chained (no elif), so for
            # clslowbwt.dat col 6 the print fires but the column is still
            # considered by the 'if/else' below — confirm intent
            if csvFilename=='clslowbwt.dat' and c==6:
                print "Not including col 6 for this dataset from x"
            if csvFilename=='benign.csv' and (c==0 or c==1):
                print "Not including col 0,1 for this dataset from x"
            else:
                # don't add the output col to the RHS of formula
                if x is None:
                    col_names += "V" + str(c+1)
                else:
                    col_names += ",V" + str(c+1)
                if c!=y:
                    if x is None:
                        x = str(c)
                        formula += "V" + str(c+1)
                    else:
                        x += "," + str(c)
                        formula += "+V" + str(c+1)

        print 'formula:', formula
        print 'col_names:', col_names
        print 'x:', x

        if h2o.beta_features:
            kwargs = {
                'n_folds': 0,
                'response': y,
                # what about x?
                'family': family,
                'alpha': 0,
                'lambda': 0,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50
            }
        else:
            kwargs = {
                'n_folds': 0,
                'y': y,
                'x': x,
                'family': family,
                'alpha': 0,
                'lambda': 1e-4,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50
            }

        if csvFilename=='benign.csv':
            kwargs['ignored_cols'] = '0,1'
        if csvFilename=='clslowbwt.dat':
            kwargs['ignored_cols'] = '6'

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end (w/check) on ", csvPathname2, 'took', time.time()-start, 'seconds'
        h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
        # now do it thru R and compare
        (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults)

        trial += 1
        print "\nTrial #", trial
def test_summary2_exp(self):
    """Generate single-column datasets of exponentially distributed values
    (random lambda), parse them, run Summary with expected min/max bounds,
    and cross-check the reported median/quantile against a scipy-based
    computation in h2o_summ.quantile_comparisons().
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    tryList = [
        # co.label, (min, 25th, 50th, 75th, max)
        # parse setup error ? supposedly fixed now
        # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
        (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
        (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
        (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
                                                       lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        # max error = half a 20th of the range (20 bins)
        maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxErr = 1.05 * maxErr

        # patch the actual observed min/max into the expected tuple
        expected[1] = expectedMin
        expected[5] = expectedMax

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                                       timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount,
                                expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        # column 0 not used here
        assert len(expected) == 6
        co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr)

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        print "maxErr", maxErr
        if co.label != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=False,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.99,
                h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxErr,
            )
def test_summary2_small(self):
    """Build tiny single/multi-row datasets from a fixed value list
    ([-1,0,1]), parse them, run Summary and Quantiles, and check colname,
    min/max/percentiles and histogram bins against the expected tuple.
    Optionally cross-checks the median against scipy.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        # if rowCount is None, we'll just use the data values
        # None in expected values means no compare
        (None, 1, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)),
        (None, 2, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)),
        (None, 10, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)),
        (None, 100, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)),
        (None, 1000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)),
        (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, -1, None, None)),
        # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, values, expected) in tryList:
        # max error = half the bin size?
        expectedMax = max(values)
        expectedMin = min(values)
        maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        # hmm...say we should be 100% accurate for these tests?
        maxDelta = 0

        h2o.beta_features = False
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        if not rowCount:
            rowFile = len(values)
        else:
            rowFile = rowCount
        csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

        h2o.beta_features = False
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                                       timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        numRows = inspect["num_rows"]
        numCols = inspect["num_cols"]

        h2o.beta_features = True
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        quantile = 0.5 if DO_MEDIAN else .999
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
                                   quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)

        self.assertLess(qresult_iterations, 16,
            msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            # NOTE(review): uses rowCount here (None for every row in tryList)
            # while 'e' above uses numRows — if this loop body ever executes,
            # .01*rowCount would TypeError; presumably hcnt[1:-1] is empty for
            # these tiny datasets. Confirm.
            self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = twoDecimals(pctile)
        mx = twoDecimals(maxs)
        mn = twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        if DO_TRY_SCIPY and colname!='':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            print scipyCol, pctile[10]
            generate_scipy_comparison(csvPathnameFull, col=scipyCol,
                # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single)
                h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult)

        h2i.delete_keys_at_all_nodes()
def test_summary2_int2B(self):
    """Generate a single column of very large ints (~2.5B, beyond 32-bit
    range), parse it, run Summary with MAX_QBINS bins, and sanity-check the
    reported min/max/percentiles against a tolerance derived from the bin
    width plus 1% of the distribution mean.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100000, 1, 'B.hex', 2533255332, 2633256000, ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0))
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # also need to add some variance due to random distribution?
        # maybe a percentage of the mean
        distMean = (expectedMax - expectedMin) / 2
        maxShift = distMean * .01
        maxDelta = maxDelta + maxShift

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                                       timeoutSecs=60, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        if expected[0]:
            self.assertEqual(colname, expected[0])

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype= stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        # NOTE(review): only expected[1] gates all five compares below — for
        # this tryList every expected value is None, so none of these fire
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # apparently we can't estimate any more
            # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
            # msg="Bins not right. b: %s e: %s" % (b, e))
            pass

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        scipyCol = 0
def test_summary2_NY0(self): SYNDATASETS_DIR = h2o.make_syn_dir() choicesList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), (' N', ' Y', ' 0'), (' n', ' y', ' 0'), (' F', ' T', ' 0'), (' f', ' t', ' 0'), ] # white space is stripped expectedList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ] tryList = [ # colname, (min, 25th, 50th, 75th, max) (100, 200, 'x.hex', choicesList[4], expectedList[4]), (100, 200, 'x.hex', choicesList[5], expectedList[5]), (100, 200, 'x.hex', choicesList[6], expectedList[6]), (100, 200, 'x.hex', choicesList[7], expectedList[7]), (100, 200, 'x.hex', choicesList[3], expectedList[3]), (1000, 200, 'x.hex', choicesList[2], expectedList[2]), (10000, 200, 'x.hex', choicesList[1], expectedList[1]), (100000, 200, 'x.hex', choicesList[0], expectedList[0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, choices, expected) in tryList: # max error = half the bin size? 
SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(colCount): # walks across the columns triggering a summary on the col desired # runSummary returns a column object now. inspect and parse don't. They return json. # maybe eventually will make them return object? But I also pass expected stuff to them # should I pass expected to summary? no, more complex? co = h2o_cmd.runSummary(key=hex_key, column=i) print co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) print "\nComparing column %s to expected" % i self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \ (i, expectedNaCnt[i], co.missing_count)) self.assertEqual(rowCount - expectedNaCnt[i], sum(co.histogram_bins)) h2p.green_print("\nDone with trial", trial) trial += 1 h2i.delete_keys_at_all_nodes()
def test_GLM1_GLM2_predict(self):
    """Run the same dataset through GLM1 (legacy API) and GLM2 (beta API) and
    compare coefficients, intercept, auc and iteration counts, then predict
    with the GLM2 model on the training data and check the error rate.

    Dataset selection is done with the 1==0 / 1==1 constant-if idiom below;
    only the iris branch is currently active.
    """
    # h2b.browseTheCloud()
    h2o.beta_features = False
    SYNDATASETS_DIR = h2o.make_syn_dir()
    trees = 15
    timeoutSecs = 120
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'
    # disabled dataset choice: full covtype
    if 1==0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        y = 54
        expectedPctWrong = 0
    # disabled dataset choice: 10% shuffled covtype
    if 1==0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        y = 54
        expectedPctWrong = 0
    # active dataset choice: iris
    if 1==1:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'smalldata'
        # no header
        csvPathname = 'iris/iris.csv'
        hexKey = 'iris.hex'
        y = 4
        expectedPctWrong = 26
    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname,
        schema='put', returnFullPath=True)
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname,
        schema='put', hex_key=hexKey)
    h2o_cmd.runSummary(key=hexKey)
    # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
    trainKey = parseResult['destination_key']
    # just to check. are there any NA/constant cols?
    ignore_x = h2o_glm.goodXFromColumnInfo(y,
        key=parseResult['destination_key'], timeoutSecs=300)
    #**************************************************************************
    # first glm1 (legacy API: beta_features off)
    h2o.beta_features = False
    CLASS = 1
    # try ignoring the constant col to see if it makes a diff
    kwargs = {
        'lsm_solver': LSM_SOLVER,
        'standardize': STANDARDIZE,
        'y': 'C' + str(y+1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON,
        'case': CLASS,
        'case_mode': '=',
    }
    timeoutSecs = 120
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
    glm['GLMModel']['GLMParams']['family'] = FAMILY
    print "glm1 end on ", csvPathname, 'took', time.time() - start, 'seconds'
    (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    # capture GLM1 results for comparison against GLM2 below
    iterations1 = glm['GLMModel']['iterations']
    err1 = glm['GLMModel']['validations'][0]['err']
    nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
    resDev1 = glm['GLMModel']['validations'][0]['resDev']
    if FAMILY == 'binomial':
        classErr1 = glm['GLMModel']['validations'][0]['classErr']
        auc1 = glm['GLMModel']['validations'][0]['auc']
    #**************************************************************************
    # then glm2 (beta API)
    h2o.beta_features = True
    kwargs = {
        # 'ignored_cols': 'C29',
        'standardize': STANDARDIZE,
        'response': 'C' + str(y+1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON}
    timeoutSecs = 120
    # class 1=1, all else 0 (GLM2 has no case/case_mode, so binarize with Exec)
    if FAMILY == 'binomial':
        execExpr="B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (trainKey, y+1, trainKey, y+1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        bHack = {'destination_key': 'B.hex'}
    else:
        bHack = parseResult
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    # kwargs.update({'alpha': 0.0, 'lambda': 0})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
    print "glm2 end on ", csvPathname, 'took', time.time() - start, 'seconds'
    (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    #**************************************************************************
    modelKey = glm['glm_model']['_key']
    submodels = glm['glm_model']['submodels']
    # hackery to make it work when there's just one
    validation = submodels[-1]['validation']
    iteration = submodels[-1]['iteration']
    resDev = validation['residual_deviance']
    nullDev = validation['null_deviance']
    if FAMILY == 'binomial':
        auc = validation['auc']
    # both runs must have converged before hitting the iteration cap
    self.assertLess(iterations1, MAX_ITER-1,
        msg="GLM1: Too many iterations, didn't converge %s" % iterations1)
    self.assertLess(iteration, MAX_ITER-1,
        msg="GLM2: Too many iterations, didn't converge %s" % iteration)
    nullDevExpected = nullDev1
    # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
    #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))
    iterationExpected = iterations1
    # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
    #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))
    # coefficients is a list. Spot-check coefficients 0 and 2 within 50% relative
    coeff0 = coefficients[0]
    coeff0Expected = coefficients1[0]
    print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected))/abs(coeff0Expected))
    self.assertTrue(h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
        msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected))
    coeff2 = coefficients[2]
    coeff2Expected = coefficients1[2]
    print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected))/abs(coeff2Expected))
    self.assertTrue(h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
        msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected))
    # compare to known values GLM1 got for class 1 case, with these parameters
    # aucExpected = 0.8428
    # NOTE(review): delta=10 makes this auc check effectively a no-op — confirm intended
    if FAMILY == 'binomial':
        aucExpected = auc1
        self.assertAlmostEqual(auc, aucExpected, delta=10,
            msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected))
    interceptExpected = intercept1
    print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected))/abs(interceptExpected)
    self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5),
        msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected))
    # avg_errExpected = 0.2463
    avg_errExpected = err1
    # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
    #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))
    # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
    #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))
    #********************
    # Print comparison
    #********************
    interceptDelta = abs(abs(intercept1) - abs(intercept))
    cDelta = [abs(abs(a) - abs(b)) for a,b in zip(coefficients1, coefficients)]
    def printit(self, a, b, c, d):
        # one formatted line: label, GLM1 value, pct diff, abs diff
        pctDiff = abs(d/c)*100
        print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
            ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
        # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")
    printit(self, "intercept", "", intercept1, interceptDelta)
    print "compare lengths coefficients1, coefficients, cDelta:", len(coefficients1), len(coefficients), len(cDelta)
    print "GLM1:", coefficients1
    print "GLM2:", coefficients
    print "cDelta:", cDelta
    for i,cValue in enumerate(coefficients):
        printit(self , "coefficient", "C"+str(i), cValue, cDelta[i])
    # predict with the GLM2 model on the binarized training frame
    hexKey = 'B.hex'
    pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey,
        csvSrcOutputPathname, csvPredictPathname,
        skipSrcOutputHeader, skipPredictHeader,
        translate=None, y=y)
    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 2.0,
        msg="predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
def do_scipy_glm(self, bucket, csvPathname, L, family='binomial'):
    """Fit sklearn LogisticRegression (L2 then L1 penalty) on a csv as a
    reference for comparing against h2o GLM.

    L is the h2o-style regularization strength; sklearn's C is its inverse.
    Only family='binomial' is supported.

    NOTE(review): the column slicing below (target = col 1, features = cols 2:)
    assumes a prostate-like layout of ID, CAPSULE, features... — confirm the
    csv passed in matches that layout.
    """
    h2p.red_print("Now doing sklearn")
    h2p.red_print("\nsee http://scikit-learn.org/0.11/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression")
    import numpy as np
    import scipy as sp
    from sklearn.linear_model import LogisticRegression
    from numpy import loadtxt
    csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    # make sure it does fp divide
    C = 1/(L+0.0)
    print "C regularization:", C
    dataset = np.loadtxt(
        open(csvPathnameFull,'r'),
        skiprows=1, # skip the header
        delimiter=',',
        dtype='float');
    print "\ncsv read for training, done"
    n_features = len(dataset[0]) - 1;
    print "n_features:", n_features
    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train = np.array ( [x[2:] for x in dataset] )
    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:", n_features
    print "histogram of target"
    print sp.histogram(target,3)
    print "len(train):", len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape
    if family!='binomial':
        raise Exception("Only have binomial logistic for scipy")
    # L2-penalized fit
    print "\nTrying l2"
    clf2 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l2',
        tol=0.0001);
    # train the classifier
    start = time.time()
    clf2.fit(train, target)
    print "L2 fit took", time.time() - start, "seconds"
    # print "coefficients:", clf2.coef_
    cstring = "".join([("%.5e " % c) for c in clf2.coef_[0]])
    h2p.green_print("sklearn L2 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf2.intercept_[0])
    h2p.green_print("sklearn score:", clf2.score(train,target))
    # L1-penalized fit with the same C
    print "\nTrying l1"
    clf1 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l1',
        tol=0.0001);
    # train the classifier
    start = time.time()
    clf1.fit(train, target)
    print "L1 fit took", time.time() - start, "seconds"
    # print "coefficients:", clf1.coef_
    cstring = "".join([("%.5e " % c) for c in clf1.coef_[0]])
    h2p.green_print("sklearn L1 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf1.intercept_[0])
    h2p.green_print("sklearn score:", clf1.score(train,target))
    # attributes are accessed in the normal python way
    dx = clf1.__dict__
    dx.keys()
def test_rf_predict3_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() timeoutSecs = 600 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1==1: y = 4 # last col response = 'response' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'iris2.csv.hex' # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0} # No translate because we're using an Exec to get the data out?, and that loses the encoding? translate = None # one wrong will be 0.66667. I guess with random, that can happen? expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 else: y = 0 # first col response = 'C1' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist_training.hex' translate = { \ '0': 0, '1': 1, '2': 2, '3': 3, '4': 
4, \ '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 } expectedPctWrong = 0.7 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on 
source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o)!=str(p): if wrong==10: print "Not printing any more mismatches\n" elif wrong<10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 2.0: raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data") return pctWrong #***************************************************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) kwargs = { 'destination_key': 'rf_model', 'response': response, 'ntrees': trees, 'classification': 1, } rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) rfResult["drf_model"] = rfResult.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" print "y=", y pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) # can be zero if memorized (iris is either 0 or 0.667?) # just make delta 0.7 for now self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 0.7, msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def test_summary2_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) quantile = 0.5 if DO_MEDIAN else .999 # get both answers since we feed both below for checking q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin # too hard to estimate when there are ints now, due to floor/ceil int alignment? # don't check the last two bins for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, ) h2o.nodes[0].remove_all_keys()
def test_summary2_unifiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # new with 1000 bins. copy expected from R tryList = [ ('cars.csv', 'c.hex', [ (None, None,None,None,None,None), ('economy (mpg)', None,None,None,None,None), ('cylinders', None,None,None,None,None), ], ), ('runifA.csv', 'A.hex', [ (None, 1.00, 25.00, 50.00, 75.00, 100.0), ('x', -99.9, -44.7, 8.26, 58.00, 91.7), ], ), # colname, (min, 25th, 50th, 75th, max) ('runif.csv', 'x.hex', [ (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00), ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8), ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0), ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00), ], ), ('runifB.csv', 'B.hex', [ (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00), ('x', -100.00, -50.1, 0.974, 51.7, 100,00), ], ), ('runifC.csv', 'C.hex', [ (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00), ('x', -100.00, -50.45, -1.135, 49.28, 100.00), ], ), ] timeoutSecs = 15 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) timeoutSecs = 60 for (csvFilename, hex_key, expectedCols) in tryList: csvPathname = csvFilename csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True) parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? 
inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] # okay to get more cols than we want # okay to vary MAX_QBINS because we adjust the expected accuracy summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) summaries = summaryResult['summaries'] scipyCol = 0 for expected, column in zip(expectedCols, summaries): colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]), colname, expected[0] else: # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page scipyCol += 1 continue quantile = 0.5 if DO_MEDIAN else .999 # h2o has problem if a list of columns (or dictionary) is passed to 'column' param q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'], quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2 qresult = q['result'] qresult_single = q['result_single'] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", q['iterations']) h2p.blue_print("h2o quantiles interpolated:", q['interpolated']) print h2o.dump_json(q) # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'), coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] print stattype # FIX! we should compare mean and sd to expected? # enums don't have mean or sd? if stattype!='Enum': mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. 
(2 places):", h2o_util.twoDecimals(sd) pct = stats['pct'] print "pct:", pct print "" # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] # figure out the expected max error # use this for comparing to sklearn/sort if expected[1] and expected[5]: expectedRange = expected[5] - expected[1] # because of floor and ceil effects due we potentially lose 2 bins (worst case) # the extra bin for the max value, is an extra bin..ignore expectedBin = expectedRange/(MAX_QBINS-2) maxErr = 0.5 * expectedBin # should we have some fuzz for fp? else: print "Test won't calculate max expected error" maxErr = 0 # hack..assume just one None is enough to ignore for cars.csv if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] for b in hcnt: # should we be able to check for a uniform distribution in the files? e = .1 * numRows # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) if stattype!='Enum': pt = h2o_util.twoDecimals(pctile) print "colname:", colname, "pctile (2 places):", pt mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too actual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", actual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn # don't check if colname is empty..means it's a string and scipy doesn't parse right? # need to ignore the car names if colname!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # FIX! ignore for now h2oSummary2=pctile[5 if DO_MEDIAN else 10], h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, ) if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'): raise Exception("stopping to look") scipyCol += 1 trial += 1
def test_summary2_uniform_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "numRows:", numRows, "rowCount: ", rowCount self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount)) # don't check the last bin # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins # NA rows should be ignored self.assertAlmostEqual(b, e, delta=2*e, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 1 h2i.delete_keys_at_all_nodes()
def test_summary2_uniform_w_NA(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)), (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "numRows:", numRows, "rowCount: ", rowCount self.assertEqual((1 + NA_ROW_RATIO) * rowCount, numRows, msg="numRows %s should be %s" % (numRows, (1 + NA_ROW_RATIO) * rowCount)) # don't check the last bin # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = rowCount / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins # NA rows should be ignored self.assertAlmostEqual(b, e, delta=2 * e, msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 1 h2i.delete_keys_at_all_nodes()
def test_exec2_quant_cmp_uniform(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. 
expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i!=0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key if 1==0: execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds))) else: # does this way work (column getting)j execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols,1) self.assertEqual(numRows,len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def do_statsmodels_glm(self, bucket, csvPathname, L, family='gaussian'): h2p.red_print("Now doing statsmodels") h2p.red_print("http://statsmodels.sourceforge.net/devel/glm.html#module-reference") h2p.red_print("http://statsmodels.sourceforge.net/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html") import numpy as np import scipy as sp from numpy import loadtxt import statsmodels as sm csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if 1==1: dataset = np.loadtxt( open(csvPathnameFull,'r'), skiprows=1, # skip the header delimiter=',', dtype='float'); # skipping cols from the begining... (ID is col 1) # In newer versions of Numpy, np.genfromtxt can take an iterable argument, # so you can wrap the file you're reading in a generator that generates lines, # skipping the first N columns. If your numbers are comma-separated, that's something like if 1==0: f = open(csvPathnameFull,'r'), np.genfromtxt( (",".join(ln.split()[1:]) for ln in f), skiprows=1, # skip the header delimiter=',', dtype='float'); print "\ncsv read for training, done" # data is last column # drop the output n_features = len(dataset[0]) - 1; print "n_features:", n_features # don't want ID (col 0) or CAPSULE (col 1) # get CAPSULE target = [x[1] for x in dataset] # slice off the first 2 train = np.array ( [x[2:] for x in dataset] ) n_samples, n_features = train.shape print "n_samples:", n_samples, "n_features:", n_features print "histogram of target" print sp.histogram(target,3) print "len(train):", len(train) print "len(target):", len(target) print "dataset shape:", dataset.shape if family!='gaussian': raise Exception("Only have gaussian logistic for scipy") # train the classifier gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log)) start = time.time() gauss_log_results = gauss_log.fit() print "sm_api.GLM took", time.time() - start, "seconds" print gauss_log_results.summary()
def test_glm_predict3(self): SYNDATASETS_DIR = h2o.make_syn_dir() trees = 15 timeoutSecs = 120 if 1==1: csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist.hex' else: # try smaller data set csvPathname = 'mnist/mnist_testing.csv.gz' hexKey = 'mnist.hex' predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' bucket = 'home-0xdiag-datasets' csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, translate): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if HAS_HEADER: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key h2e.exec_expr(execExpr="Z.hex="+hex_key+"[0]", timeoutSecs=30) start = time.time() # predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key="P.hex", predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() # depending what you use, may 
need to set these to 0 or 1 skipSrcOutputHeader = 1 skipPredictHeader= 1 print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 wrong0 = 0 wrong1 = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): o = float(o) p = float(p) if o!=p: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) if p==0.0 and wrong0==10: print "Not printing any more predicted=0 mismatches" elif p==0.0 and wrong0<10: print msg if p==1.0 and wrong1==10: print "Not printing any more predicted=1 mismatches" elif p==1.0 and wrong1<10: print msg if p==0.0: wrong0 += 1 elif p==1.0: wrong1 += 1 wrong += 1 print "wrong0:", wrong0 print "wrong1:", wrong1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # digit 3 with no regularization got around 5%. 0 gets < 2% if pctWrong > 6.0: raise Exception("pct wrong too high. Expect < 6% error") #************************************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" # excessive, trying each digit one at a time, but hey a good test # limit the ierations so it doesn't take so long. 4 minutes per digit? max_iter = 4 # for y in [0,1,2,3,4,5,6,7,8,9]: # for y in [0,3,7]: for case in [0,3,7]: translate = {} for i in range(10): if i == case: translate[i] = 1.0 else: translate[i] = 0.0 print "translate:", translate kwargs = { 'x': "", 'y': 0, 'family': 'binomial', 'link': 'logit', 'n_folds': 1, 'case_mode': '=', 'case': case, # zero should predict to 1, 2-9 should predict to 0 'max_iter': max_iter, 'beta_epsilon': 1e-3} timeoutSecs = 120 # L2 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) predict_and_compare_csvs(model_key=glm['destination_key'], hex_key=hexKey, translate=translate) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' # okay for some coefficients to go to zero! h2o_glm.simpleCheckGLM(self, glm, 13, allowZeroCoeff=True, **kwargs) predict_and_compare_csvs(model_key=glm['destination_key'], hex_key=hexKey, translate=translate) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' # okay for some coefficients to go to zero! 
h2o_glm.simpleCheckGLM(self, glm, 13, allowZeroCoeff=True, **kwargs) predict_and_compare_csvs(model_key=glm['destination_key'], hex_key=hexKey, translate=translate)
def do_statsmodels_glm(self, bucket, csvPathname, L, family='gaussian'): h2p.red_print("Now doing statsmodels") h2p.red_print( "http://statsmodels.sourceforge.net/devel/glm.html#module-reference") h2p.red_print( "http://statsmodels.sourceforge.net/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html" ) import numpy as np import scipy as sp from numpy import loadtxt import statsmodels as sm csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if 1 == 1: dataset = np.loadtxt( open(csvPathnameFull, 'r'), skiprows=1, # skip the header delimiter=',', dtype='float') # skipping cols from the begining... (ID is col 1) # In newer versions of Numpy, np.genfromtxt can take an iterable argument, # so you can wrap the file you're reading in a generator that generates lines, # skipping the first N columns. If your numbers are comma-separated, that's something like if 1 == 0: f = open(csvPathnameFull, 'r'), np.genfromtxt( (",".join(ln.split()[1:]) for ln in f), skiprows=1, # skip the header delimiter=',', dtype='float') print "\ncsv read for training, done" # data is last column # drop the output n_features = len(dataset[0]) - 1 print "n_features:", n_features # don't want ID (col 0) or CAPSULE (col 1) # get CAPSULE target = [x[1] for x in dataset] # slice off the first 2 train = np.array([x[2:] for x in dataset]) n_samples, n_features = train.shape print "n_samples:", n_samples, "n_features:", n_features print "histogram of target" print sp.histogram(target, 3) print "len(train):", len(train) print "len(target):", len(target) print "dataset shape:", dataset.shape if family != 'gaussian': raise Exception("Only have gaussian logistic for scipy") # train the classifier gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian( sm_api.families.links.log)) start = time.time() gauss_log_results = gauss_log.fit() print "sm_api.GLM took", time.time() - start, "seconds" print gauss_log_results.summary()
def test_summary2_small(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) # if rowCount is None, we'll just use the data values # None in expected values means no compare (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)), # (None, 10000, 'x.hex', [-1,0,1], ('C1', None, None, 0, None, None)), # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, values, expected) in tryList: # max error = half the bin size? expectedMax = max(values) expectedMin = min(values) maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta # hmm...say we should be 100% accurate for these tests? 
maxDelta = 0 SEEDPERFILE = random.randint(0, sys.maxint) x += 1 if not rowCount: rowFile = len(values) else: rowFile = rowCount csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) quantile = 0.5 if DO_MEDIAN else 0.999 q = h2o.nodes[0].quantiles( source_key=hex_key, column=0, interpolation_type=7, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, ) qresult = q["result"] qresult_single = q["result_single"] qresult_iterations = q["iterations"] qresult_interpolated = q["interpolated"] h2p.blue_print("h2o quantiles result:", qresult) h2p.blue_print("h2o quantiles result_single:", qresult_single) h2p.blue_print("h2o quantiles iterations:", qresult_iterations) h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated) print h2o.dump_json(q) self.assertLess( qresult_iterations, 16, msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?", ) # only one column column = summaryResult["summaries"][0] colname = column["colname"] coltype = column["type"] nacnt = column["nacnt"] stats = column["stats"] stattype = stats["type"] # FIX! we should compare mean and sd to expected? 
mean = stats["mean"] sd = stats["sd"] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats["zeros"] mins = stats["mins"] maxs = stats["maxs"] pct = stats["pct"] # the thresholds h2o used, should match what we expected expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99] pctile = stats["pctile"] print "pctile:", pctile if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected") if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected" ) if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected" ) if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected" ) if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected") hstart = column["hstart"] hstep = column["hstep"] hbrk = column["hbrk"] hcnt = column["hcnt"] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # don't check the edge bins self.assertAlmostEqual( b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e) ) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 # don't check if colname is empty..means it's a string and scipy doesn't parse right? if colname != "": # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=scipyCol, # what col to extract from the csv datatype="float", quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, )
def test_summary2_exp(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() LAMBD = random.uniform(0.005, 0.5) tryList = [ # colname, (min, 25th, 50th, 75th, max) (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)), (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)), (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)), (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None, None)), (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)), (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 # rangeMin and rangeMax are not used right now for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "lambd:", LAMBD (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] # the thresholds h2o used, should match what we expected if expected[0]: self.assertEqual(colname, expected[0]) if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') if expected[2]: h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') if expected[3]: h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') if expected[4]: h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') if expected[5]: h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. 
expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "" print "hcnt:", hcnt print "len(hcnt)", len(hcnt) print "Can't estimate the bin distribution" pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 if colname != '' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, )