def test_rapids_ifelse_nested(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(2): for execObj, expected in zip(objList, resultList): freshObj = copy(execObj) result = freshObj.do() # do some scalar result checking if expected is not None: # result is a string now?? print "result:", result print "expected:", expected # assert result==expected, "%s %s" (result,expected) # rows might be zero! print "freshObj:", dump_json(freshObj.execResult) if 'key' in freshObj.execResult and freshObj.execResult['key']: keys.append(freshObj.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_basic(self): bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'p' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for execExpr in exprList: r = re.match ('\(= \!([a-zA-Z0-9_]+) ', execExpr) resultKey = r.group(1) execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) if DO_ROLLUP: h2o_cmd.runInspect(key=resultKey) # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr) else: h2p.yellow_print("\nNo key created?\n", dump_json(execResult)) print "\nExpressions that created keys. Shouldn't all of these expressions create keys" for k in keys: print k h2o.check_sandbox_for_errors()
def test_xl_ast_assert_Z(self):
    # Exercise the h2o_xl DSL: init three DF keys, then check the Rapids AST
    # text generated by each kind of assignment via checkAst().
    a = DF('a1')
    checkAst(astForInit(a))
    b = DF('b1')
    checkAst(astForInit(b))
    c = DF('c1')
    checkAst(astForInit(c))
    # DF should still satisfy the Key interface
    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)
    # this just overwrite the a/b/c with python datatypes
    # (dead branch kept for history — plain python assignment replaces the DF
    # object, so these checkAst's would be checking stale ASTs)
    if 1==0:
        a = 0
        checkAst("(= !a1 #0)")
        b = 0
        checkAst("(= !b1 #0)")
        c = 0
        checkAst("(= !c1 #0)")
        a = [0]
        checkAst("(= !a1 (c {#0}))")
        b = [0,1]
        checkAst("(= !b1 (c {#0;#1}))")
        c = [0,1,2]
        checkAst("(= !c1 (c {#0;#1;#2}))")
        a = (0,) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        b = (0,1)
        checkAst("(= !b1 (c {#0;#1}))")
        c = (0,1,2)
        checkAst("(= !c1 (c {#0;#1;#2}))")
    # added to init the keys, to avoid AAIOBE at h2o
    a <<= [0] # comma isn't needed
    checkAst("(= !a1 (c {#0}))")
    b <<= [0,1]
    checkAst("(= !b1 (c {#0;#1}))")
    c <<= [0,1,2]
    checkAst("(= !c1 (c {#0;#1;#2}))")
    # these don't work (dead branch kept for history)
    if 1==0:
        c = a[0] + b[1] # no .do() needed because of types on rhs? or ?
        c.do()
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")
        c[0] = a[0] + b[1]
        c.do()
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")
    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k
    h2o.check_sandbox_for_errors()
def test_rapids_funs_basic(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(100): for execExpr in funsList: funs = '[%s]' % execExpr execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=4) execExpr2 = '(apply %r1 #2 %anon)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=4) # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_basic_with_funs_noinc(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for i in range(100): if i==0: # should never see v as a key from the function? execExpr1 = '(= !v1 (c {#0}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5) execExpr2 = '(= !v2 (cbind %v1 ))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5) else: # adding to v shouldn't hurt, but not required cause function output will update it # execExpr1 = '(= !v (+ %v #1))' # execExpr1 = '(+ %v #1)' # add to itself? execExpr1 = '(+ %v %v)' funs = '[(def anon {v} %s;;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True) # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))' # execExpr2 = '(= !v2 (anon %v2))' execExpr2 = '(= !v2 (+ %v2 #1))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr2) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # FIX! check if v is ever there. # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def tryThemAll(self, set, rows, enumsOnly=False):
    """Write every eol/token/separator permutation of 'rows' to a temp csv and parse it.

    NOTE(review): 'set' shadows the builtin; name kept for caller compatibility.
    """
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        # FIX: the branches were inverted — enumsOnly=True should select the
        # enums-only token-change dict, not the general one.
        if enumsOnly:
            tcd = self.tokenChangeDictEnumsOnly
        else:
            tcd = self.tokenChangeDict
        for tokenCase in range(len(tcd)):
            newRows1 = self.changeTokens(rows, tokenCase, tcd)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                # unique filename per permutation
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                # tell the parser about single-quote tokens if this case uses them
                if "'" in tcd[tokenCase][0]:
                    singleQuotes = 1
                else:
                    singleQuotes = 0
                parseResult = h2i.import_parse(path=csvPathname, schema='local',
                    singleQuotes=singleQuotes, noPrint=not h2o_args.verbose,
                    retryDelaySecs=0.1, doSummary=DO_SUMMARY,
                    intermediateResults=DO_INTERMEDIATE_RESULTS)
                if DO_RF:
                    h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10,
                        retryDelaySecs=0.1, noPrint=True, print_params=True)
                verboseprint("Set", set)
                h2o.check_sandbox_for_errors()
                sys.stdout.write('.')
                sys.stdout.flush()
def test_exec2_reduction(self): bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' if getpass.getuser()=='jenkins': csvPathname = 'standard/billion_rows.csv.gz' else: csvPathname = '1B/reals_1B_15f.data' csvPathname = '1B/reals_100000x1000_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) for execExpr in initList: result = execExpr.do(timeoutSecs=30) for execExpr in exprList: start = time.time() result = execExpr.do(timeoutSecs=30) execResult = execExpr.execResult print "exec took", time.time() - start, "seconds" print "exec result:", result print "exec result (full):", h2o.dump_json(execResult) h2o.check_sandbox_for_errors()
def test_rapids_vec_fail1(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6),int(100e6),int(10e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' execExpr = '(= !v (+ %v %v))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'vector length' eLabel = 'elapsed (create v)' fLabel = 'elapsed (v = v + v)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_funs_basic2(self): if 1 == 1: bucket = "smalldata" csvPathname = "iris/iris_wheader.csv" else: bucket = "home-0xdiag-datasets" csvPathname = "standard/covtype.data" hexKey = "r1" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey) keys = [] for trial in range(5): for execExpr in funsList: funs = "[%s]" % execExpr execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=4) execExpr2 = "(= !junk (apply %r1 #2 %anon))" execResult, result = h2e.exec_expr( h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15 ) # rows might be zero! if execResult["num_rows"] or execResult["num_cols"]: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def doAll(case): keys = [] trial = 0 for execExpr in exprList: # 4x4 cases per expression colons = [ # requires only 1 value on rhs '#0 #0', # '"null" #0', # '#0 "null"', # '"null" "null"', ] for colon in colons: # what if the destination doesn't exist?. Use unique name for each, to see t = "t%s" % trial cases = [ # no colon '(= !{} {})'.format(t, execExpr), # colon lhs # '(= ([ %%s %s) %s)' % (t, colon, execExpr), # colon rhs # '(= !%s ([ %s %s))' % (t, execExpr, colon), # colon lhs and rhs '(= ([ %{} {}) ([ {} {}))'.format(t, colon, execExpr, colon), ] for case in cases: # init the data frame first to 0 (1 row, 1 col) print "\nt:", t, "case:", case # can't init it to empty '(= !%s (c {#0})' % t execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4) # colonize it, to see if it blows up! # since they all are assigns, they all are wrapped by '(= !<lhs> ...) # unwrap the inner and wrap it with a colon then wrap it with the assign # change the lhs to be coloned (row and/or col) and change the rhs to be a colon # so four cases # make sure the lhs assign key exists first execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4) # rows/cols could be zero # if execResult['num_rows'] or execResult['num_cols']: # I think if key is not null, then that means a key got created # oh, but exec deletes ones with leading "_" immediately? those are temp keys # we'll put them in the list and see if we see them if execResult['key']: keys.append(execExpr) trial += 1 print "\nExpressions that created keys" for k in keys: print k if re.match('_', k): raise Exception("%s I didn't expect any keys with leading underscores." + "\nDoesn't spencer delete those so I can't read them?" % k) h2o.check_sandbox_for_errors()
def test_xl_seq_A(self):
    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1') # knon_* key
    b = DF('b1')
    c = DF('c1')
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)
    # look at our secret stash in the base class. Should see the DFInit?
    # DF does a kv store init. Key doesn't
    # DF inherits from Key. KeyIndexed inherits from Key
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)
    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)
    # scalar assigns produce #<n> ASTs
    Assign(a, 0)
    checkAst("(= !a1 #0)")
    Assign(b, 0)
    checkAst("(= !b1 #0)")
    Assign(c, 0)
    checkAst("(= !c1 #0)")
    # list assigns become (c {...}) vector ASTs
    Assign(a, [0])
    checkAst("(= !a1 (c {#0}))")
    Assign(b, [0,1])
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, [0,1,2])
    checkAst("(= !c1 (c {#0;#1;#2}))")
    # tuples behave like lists
    Assign(a, (0,)) # make sure it's a tuple with comma
    checkAst("(= !a1 (c {#0}))")
    Assign(b, (0,1))
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, (0,1,2))
    checkAst("(= !c1 (c {#0;#1;#2}))")
    # arithmetic on indexed elements on the rhs
    Assign(c, a[0] + b[1])
    checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")
    # indexed lhs assign
    Assign(c[0], (a[0] + b[1]))
    checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")
    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k
    h2o.check_sandbox_for_errors()
def test_rapids_ddply_with_funs(self): if 1==0: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' else: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # get rid of the enum response cole execExpr2 = '(= !r2 ([ %r1 "null" {#0;#1;#2;#3}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15) keys = [] for execExpr1 in initList: # ddply function can only return one row. Just use expressions above as nose # some of the expressions above use %v, but v won't be created as key outside any more with ddply funs = "[(def anon {v} " + "{};;(sum %v %TRUE);;;)]".format(execExpr1) execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5) execExpr2 = '(= !a h2o.ddply %r2 {#2;#3} %anon)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=120) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr1) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_basic_with_funs_pick5(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] while initList: if len(initList) >= 5: pick5 = [initList.pop(0) for i in range(5)] else: pick5 = initList global initList initList = [] pick6 = ['(= !v (c {#1;#4567;(: #9 #90);(: #9 #45);#450})'] + pick5 execExpr1 = ";;".join(pick6) # always do a v assign first, as they may reference %v funs = '[(def anon {x} (%s);;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5) execExpr2 = '(apply %r1 #2 %anon)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=25) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr1) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_build_for_clone(self):
    # Keep a built cloud alive for up to 4 hours so other tests can clone it
    # via h2o-nodes.json (h2o.build_cloud_with_json), optionally scanning
    # sandbox logs while sleeping.
    # python gets confused about which 'start' if I used start here
    # NOTE(review): 'beginning' is presumably a module-level timestamp — confirm.
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed
    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()
    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"
    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    # NOTE(review): these two prints are unconditional in the flattened source,
    # even though they contradict the CHECK_WHILE_SLEEPING message above.
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")
    while (totalTime<maxTime): # die after 4 hours
        time.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        ### h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime
    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_exec2_runif(self):
    # Exercise h2o.runif via the xl AssignObj/Fcn DSL on covtype columns,
    # then sanity-check the resulting keys with Summary.
    print "in h2o-dev, params are column, min, max, seed"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'r.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    # work up to the failing case incrementally
    execExprList = [
        # hack to make them keys? (not really needed but interesting)
        # params for h2o-dev runif are: column, min, max, seed
        AssignObj('r0.hex', KeyIndexed('r.hex',col=0) ),
        AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1) ),
        AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1) ),
        AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1) ),
    ]
    results = []
    for execExpr in execExprList:
        start = time.time()
        result = execExpr.do(timeoutSecs=30)
        results.append(result)
        execResult = execExpr.execResult
        print "exec took", time.time() - start, "seconds"
        print "exec result:", result
        print "exec result (full):", h2o.dump_json(execResult)
        h2o.check_sandbox_for_errors()
    rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)
    rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)
    sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)
    sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)
    # since there are no NAs in covtype, r.hex and s.hex should be identical?
    if 1==0:
        print "Comparing summary of r.hex to summary of s.hex"
        df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
        # time can be different
        print "df.difference:", h2o.dump_json(df.difference)
        self.assertLess(len(df.difference), 2)
    print "results from the individual exec expresssions (ignore last which was an apply)"
    print "results:", results
    # NOTE(review): 9 expected values but only 4 expressions run above —
    # presumably stale expectations from an earlier execExprList; confirm.
    self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
def test_xl_ast_assert_Y(self):
    # Exercise the h2o_xl '<<=' assignment operator: scalars, lists, tuples,
    # and indexed-element arithmetic, checking the generated Rapids AST each time.
    a = DF('a1')
    checkAst(astForInit(a))
    b = DF('b1')
    checkAst(astForInit(b))
    c = DF('c1')
    checkAst(astForInit(c))
    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)
    # scalar assigns
    a <<= 0
    checkAst("(= !a1 #0)")
    b <<= 0
    checkAst("(= !b1 #0)")
    c <<= 0
    checkAst("(= !c1 #0)")
    # list assigns become (c {...}) vectors
    a <<= [0]
    checkAst("(= !a1 (c {#0}))")
    b <<= [0,1]
    checkAst("(= !b1 (c {#0;#1}))")
    c <<= [0,1,2]
    checkAst("(= !c1 (c {#0;#1;#2}))")
    # tuples behave like lists
    a <<= (0,) # make sure it's a tuple with comma
    checkAst("(= !a1 (c {#0}))")
    b <<= (0,1)
    checkAst("(= !b1 (c {#0;#1}))")
    c <<= (0,1,2)
    checkAst("(= !c1 (c {#0;#1;#2}))")
    # arithmetic on indexed elements on the rhs
    c <<= a[0] + b[1]
    checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")
    # indexed lhs assign
    c[0] <<= a[0] + b[1]
    checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")
    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k
    h2o.check_sandbox_for_errors()
def test_rapids_basic(self): bucket = "smalldata" csvPathname = "iris/iris_wheader.csv" hexKey = "v" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey) keys = [] for execExpr in initList: execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4) # rows might be zero! if execResult["num_rows"] or execResult["num_cols"]: keys.append(execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_funs_basic3(self): DO_FAIL = False if DO_FAIL: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' else: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # works for 1 pass..why is execExpr set for 2nd pass? should be new instance? # if we reuse the same object in the list, it has state? # do we need to copy the object...hmm for trial in range(1): for execObj in funsList: freshObj = copy(execObj) result = freshObj.do() # rapids doesn't like complicated params right now? if DO_FAIL: a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0)), do=False) else: a = Assign('junk', Fcn('anon', 'r1'), do=False) result = a.do(timeoutSecs=60) # rows might be zero! if a.execResult['num_rows'] or a.execResult['num_cols']: keys.append(a.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_funs_1000_stmnt(self): DO_FAIL = False if DO_FAIL: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' else: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(3): for execObj in funsList: freshObj = copy(execObj) print "ast length:", len(str(freshObj)) result = freshObj.do() # rapids doesn't like complicated params right now? if DO_FAIL: a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0))) else: a = Assign('junk', Fcn('anon', 'r1')) result = a.do(timeoutSecs=60) # rows might be zero! if a.execResult['num_rows'] or a.execResult['num_cols']: keys.append(a.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_xl_oobe(self):
    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1') # knon_* key
    b = DF('b1')
    c = DF('c1')
    # look at our secret stash in the base class. Should see the DFInit?
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)
    # give each frame 5 rows so the indexed access below is legal
    Assign(a, range(5))
    Assign(b, range(5))
    Assign(c, range(5))
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)
    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)
    # print "Referring to non-existent rows causes a problem (AAIOBE)"
    # not any more..change it to legal case
    Assign(c[1], (a[2] + b[2]))
    # verify the exact Rapids AST that the indexed assign generated
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
    assert ast==astExpected, "Actual: %s Expected: %s" % (ast, astExpected)
    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k
    h2o.check_sandbox_for_errors()
def test_parse_time(self):
    # Round-trip test for time-format data: write a synthetic csv, parse it,
    # download it back from h2o as csv, re-parse, and compare the two Inspects.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = None
    colCount = COLS
    # rowCount = 1000
    rowCount = ROWS
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)
    for trial in range (20):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        # src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        print "A trial #", trial
        # optional. only needed to extract parse_key?
        pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(key=pA.parse_key, csvPathname=csvDownloadPathname)
        # do a little testing of saving the key as a csv
        # remove the original parsed key. source was already removed by h2o
        if 1==0:
            h2o.nodes[0].remove_key(pA.parse_key)
        # interesting. what happens when we do csv download with time data?
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_key)
        print "B trial #", trial
        pB = h2o_cmd.ParseObj(parseResultB, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pB.numRows
        print pB.numCols
        print pB.parse_key
        iB = h2o_cmd.InspectObj(pB.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        # these checks are redundant now
        self.assertEqual(iA.missingList, iB.missingList,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(iA.numCols, iB.numCols,
            "numCols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
        # so I guess that's okay. So allow for an extra row here.
        self.assertEqual(iA.numRows, iB.numRows,
            "pA.numRows: %s pB.numRows: %s mismatch after re-parse of downloadCsv result" % \
            (iA.numRows, iB.numRows) )
        print "H2O writes the internal format (number) out for time."
        # ==> syn_time.csv <==
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
        # ==> csvDownload.csv <==
        # "0","1","2","3","4","5"
        # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12
        h2o.check_sandbox_for_errors()
def tearDown(self):
    # Pull the node's logs into the sandbox, then scan the sandbox for errors.
    node = h2o.nodes[0]
    node.log_download()
    h2o.check_sandbox_for_errors()
def test_rapids_cbind_vec(self):
    # Build one long vector %v, then time cbind'ing 2**trial2 copies of it
    # into %v2 and plot elapsed time vs column count.
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    keys = []
    # stop if > 1G (fails memory cleaner assetion
    maxx = 10
    # for trial in range(maxx):
    # for trial in range(int(1e6),int(200e6),int(1e6)):
    for trial in [int(10e6)]:
        # length = (2 ** trial)
        # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        length = trial
        execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        elapsed1 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)
        # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
        # cols = 100
        xList = []
        eList = []
        fList = []
        # for trial2 in range(0, 10): # fails. Post size?
        # for trial2 in range(0, 16):
        for trial2 in range(0, 5):
            col = 2 ** trial2
            # assert col < 16384, "h2o can't take col == 16384 or more"
            # build '(cbind %v %v ... %v)' with 'col' copies of %v
            vString = ' '.join(['%v' for x in range(col)])
            execExpr = '(= !v2 (cbind %s))' % vString
            # FIX! check the colnames. 2 cols get C1 and C10? odd
            # try:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
            elapsed2 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)
            # except:
            #     elapsed2 = 0
            #     h2p.red_print("ERROR: col = %s failed" % col)
            if 1==0:
                start = time.time()
                execExpr = '(sum %v2 %TRUE)'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                elapsed1 = time.time() - start
            # xList.append(length)
            xList.append(col)
            eList.append(elapsed1)
            fList.append(elapsed2)
    if 1==1:
        xLabel = 'col'
        eLabel = 'elapsed (sum)'
        fLabel = 'elapsed (cbind cols)'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    print "\nExpressions that created keys"
    for k in keys:
        print k
    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)
    h2o.check_sandbox_for_errors()
def test_build_for_clone(self):
    """Keep the cloud alive for up to 4 hours so other tests can clone it.

    Sleeps in 60s increments; if CHECK_WHILE_SLEEPING is set, scans the
    sandbox logs on every wakeup (a cloud shutdown counts as an error),
    otherwise just prints a heartbeat. Intended for running companion
    tests against this cloud via h2o-nodes.json / build_cloud_with_json().
    """
    # python gets confused about which 'start' if I used start here
    # NOTE(review): 'beginning' is assumed to be a module-level timestamp
    # set at import/start time — confirm against the top of the file.
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4 * 3600      # total lifetime: 4 hours
    totalTime = 0
    incrTime = 60           # poll/sleep granularity in seconds
    h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("   http://{0}:{1}/Cloud.html".format(
        h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print(
        "Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print(
            "Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

    h2p.red_print(
        "No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print(
        "So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print(
        "ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")

    while (totalTime < maxTime):  # die after 4 hours
        time.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        ### h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(
                datetime.datetime.now()
            ), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1 == 0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def tearDown(self):
    # After every test: fail if H2O wrote anything that looks like an
    # error/stack trace into the sandbox log files.
    h2o.check_sandbox_for_errors()
def test_rapids_basic_with_funs_noinc(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for i in range(100): if i == 0: # should never see v as a key from the function? execExpr1 = '(= !v1 (c {#0}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5) execExpr2 = '(= !v2 (cbind %v1 ))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5) else: # adding to v shouldn't hurt, but not required cause function output will update it # execExpr1 = '(= !v (+ %v #1))' # execExpr1 = '(+ %v #1)' # add to itself? execExpr1 = '(+ %v %v)' funs = '[(def anon {v} %s;;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True) # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))' # execExpr2 = '(= !v2 (anon %v2))' execExpr2 = '(= !v2 (+ %v2 #1))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr2) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # FIX! check if v is ever there. # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1 == 0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect( inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_rbind(self):
    """Benchmark Rapids rbind of an identical vector at growing row counts.

    Creates a 100M-row vector 'v', then rbinds 1, 3, 5, 7 copies of it
    into 'v2', timing each rbind and a sum over the result, and plots
    elapsed times against total rows.
    """
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    # stop if > 1G (fails memory cleaner assertion)
    maxx = 29
    # for trial in range(maxx):
    # for trial in range(int(1e6),int(200e6),int(1e6)):
    ROWS = int(100e6)
    for trial in [ROWS]:
        # length = (2 ** trial)
        # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        length = trial
        # build the base vector 'v' with 'length' rows (sequence 0..length-1)
        execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        elapsed1 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)

        xList = []
        eList = []
        fList = []
        # gets out of memory error if we rbind too much
        for trial2 in range(1, 8, 2):
        # for trial2 in range(0, 10): # fails. Post size?
        # for trial2 in range(0, 16):
            rows = ROWS * trial2
            vString = ' '.join(['%v' for x in range(trial2)])
            execExpr = '(= !v2 (rbind %s))' % vString
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
            elapsed2 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # time a sum over the rbind result
            if 1==1:
                start = time.time()
                execExpr = '(sum %v2 %TRUE)'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                elapsed1 = time.time() - start

            # xList.append(length)
            xList.append(rows)
            eList.append(elapsed1)
            fList.append(elapsed2)

    if 1==1:
        xLabel = 'rows'
        eLabel = 'elapsed (sum)'
        fLabel = 'elapsed (rbind)'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def test_parse_multi_header_single(self):
    """Parse many synthetic data files plus a single separate header file.

    For each tryList case, writes 'fileNum' headerless data files and
    (when HEADER) one header file that also carries 0-3 data rows, puts
    them all into h2o, parses by pattern, and verifies row/col counts and
    column labels. Optionally runs RF with an ignored_columns param that
    only resolves if the header parsed correctly.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),
        # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON"T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files.
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?

        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
        # frames = h2o.nodes[0].frames()['frames']
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")

        # use regex. the only files in the dir will be the ones we just created with *fileN* match
        # NOTE(review): 'header' is only bound when HEADER is truthy — this
        # print raises NameError if HEADER is false; confirm HEADER is
        # always set in this test's config.
        print "Header Key = " + header
        start = time.time()

        # does h2o-dev take a regex? or do we need to glob
        parseResult = h2i.parse_only(pattern='*'+rowxcol+'*',
            hex_key=hex_key, timeoutSecs=timeoutSecs, check_header="1")  # header_from_file=header

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        expectedLabelList = headerData.split(",")
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols,
            expectedMissinglist=[], expectedLabelList=expectedLabelList)

        if DO_RF:
            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_columns': "['ID','CAPSULE']"}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        h2o.check_sandbox_for_errors()
def test_rapids_vec_fail(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6),int(8e6),int(1e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !vreal (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # change it to all 1s? v = v==0 execExpr = '(= !vint (N %vreal #0))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) # comparing the sum times for int vs real..maybe the other guy isn't real. at least: different compression # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' # recursively expand execExpr = '(= !v2 (+ %vint <patt>))' for j in range(3): execExpr = re.sub('<patt>', '(+ %vint <patt>)', execExpr) # last one execExpr = re.sub('<patt>', '(+ %vint %vint)', execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed2 = time.time() - start execExpr = '(= !v1 (+ %vreal %vreal))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start inspectResult = h2o_cmd.runInspect(key='vreal') h2o_cmd.infoFromInspect(inspectResult) inspectResult = h2o_cmd.runInspect(key='vint') h2o_cmd.infoFromInspect(inspectResult) summaryResult = h2o_cmd.runSummary(key='vreal') if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'vector length' eLabel = 'elapsed (v1 = vint + vint)' fLabel = 'elapsed 
(v2 = vreal + vreal)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): if 1==0: execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30) fpResult = execResult['scalar'] else: (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="h")) # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="r1")) print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, 
fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_xl_ast_assert_X(self):
    """Check the Rapids ASTs generated by h2o_xl Assign on DF objects.

    Creates three DFs, verifies their class hierarchy, then runs a series
    of Assign operations (scalars, unary ops, lists, tuples, indexed
    arithmetic) asserting the exact AST string emitted for each via
    checkAst.
    """
    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1')
    checkAst(astForInit(a))
    b = DF('b1')
    checkAst(astForInit(b))
    c = DF('c1')
    checkAst(astForInit(c))

    # look at our secret stash in the base class. Should see the DFInit?
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

    # DF does a kv store init. Key doesn't
    # DF inherits from Key. KeyIndexed inherits from Key
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)

    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)

    Assign(a, 2)
    checkAst("(= !a1 #2)")
    Assign(b, 2)
    checkAst("(= !b1 #2)")
    Assign(c, 2)
    checkAst("(= !c1 #2)")

    # - doesn't exist? multiply by -1?
    Assign(c, ~c)
    checkAst("(= !c1 (^ %c1 #1))")  # not right if more than 1 col?
    Assign(c, -c)
    checkAst("(= !c1 (_ %c1))")
    Assign(c, abs(c))
    checkAst("(= !c1 (abs %c1))")

    # this needs to be an h2o int? because it expects int return
    # Assign(c, int(c))
    # checkAst("(= !c1 (trunc c1 ))")

    Assign(a, [0])
    checkAst("(= !a1 (c {#0}))")
    Assign(b, [0,1])
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, [0,1,2])
    checkAst("(= !c1 (c {#0;#1;#2}))")

    Assign(a, (0,))  # make sure it's a tuple with comma
    checkAst("(= !a1 (c {#0}))")
    Assign(b, (0,1))
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, (0,1,2))
    checkAst("(= !c1 (c {#0;#1;#2}))")

    Assign(c, a[0] + b[1])
    checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

    Assign(c[0], (a[0] + b[1]))
    checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k

    h2o.check_sandbox_for_errors()
def test_xl_basic(self):
    """Smoke-test the h2o_xl DF/Assign/<<= sugar end to end.

    Builds DFs, assigns scalars/lists via Assign and '<<=', does indexed
    arithmetic (+, -, *, &, |) between frames, and checks the generated
    Rapids AST strings stashed in Xbase.lastExecResult.
    """
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexDF = 'v'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexDF)

    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1')  # knon_* key
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)

    # look at our secret stash in the base class. Should see the DFInit?
    print "Does the lastExecResult stash work?", dump_json(h2o_xl.Xbase.lastExecResult)

    # this should work if str(DF) returns DF.frame
    inspect = h2o_cmd.runInspect(key=a)
    # print "inspect a", dump_json(inspect)

    b = DF('b1')
    assert isinstance(b, DF)
    inspect = h2o_cmd.runInspect(key=b)
    # print "inspect b", dump_json(inspect)

    Assign(a, [0,0,0])
    assert isinstance(a, Key)
    b <<= [0,0,0]
    assert isinstance(b, Key)

    # FIX! how come I have to create c here first for python
    # see here
    # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
    # is it too much to require c to exist first?
    # c = DF()
    # c <<= a + b  # this will trigger ok?
    c = DF('c1')
    c <<= [0,0,0]
    assert isinstance(c, Key)

    # c[0] <<= a + b
    # Assign(lhs=c[0], rhs=(a + b))
    rhs = a + b
    Assign(c, rhs)
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= !c1 (+ %a1 %b1))"
    assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

    rhs = a[0] + b[0]
    Assign(c[0], rhs)
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))"
    assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

    Assign(c[1], (a[2] + b[2]))
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
    assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

    # assert ast = "(= !b1 (is.na (c {#0})))"
    assert isinstance(c, Key), type(c)
    inspect = h2o_cmd.runInspect(key=c)
    # # print "inspect c", dump_json(inspect)

    # DF inits the frame
    # if you just want an existing Key, say existing=True
    a = DF('a2')  # named data frame
    assert isinstance(a, DF)
    b = DF('b2')
    c = DF('c2')
    inspect = h2o_cmd.runInspect(key=c)
    # # print "inspect c", dump_json(inspect)

    a <<= 3
    b <<= 3
    c <<= 3
    c[0] <<= a[0] + b[0]
    assert isinstance(c, Key)
    inspect = h2o_cmd.runInspect(key=c)
    # print "inspect c", dump_json(inspect)

    a = DF('a3')  # named data frame
    b = DF('b3')
    c = DF('c3')
    a <<= 4
    b <<= 4
    c <<= 4
    c[0] <<= a[0] - b[0]
    assert isinstance(c, Key)
    c[0] <<= a[0] * b[0]
    assert isinstance(c, Key)

    a = DF('a4')  # named data frame
    b = DF('b4')
    c = DF('c4')
    a <<= 5
    b <<= 5
    c <<= 5
    c[0] <<= (a[0] - b[0])
    assert isinstance(c, Key)
    inspect = h2o_cmd.runInspect(key=c)
    # print "inspect c", dump_json(inspect)

    c[0] <<= (a[0] & b[0]) | a[0]
    assert isinstance(c, Key)
    inspect = h2o_cmd.runInspect(key=c)
    # print "inspect c", dump_json(inspect)

    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k

    h2o.check_sandbox_for_errors()