def tryThemAll(self, set, rows, enumsOnly=False):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        if enumsOnly:
            tcd = self.tokenChangeDictEnumsOnly
        else:
            tcd = self.tokenChangeDict
        for tokenCase in range(len(tcd)):
            newRows1 = self.changeTokens(rows, tokenCase, tcd)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                # if single quote is one of the swapped-in tokens, the parser has to be told
                if "'" in tcd[tokenCase]:
                    single_quotes = 1
                else:
                    single_quotes = 0
                parseResult = h2i.import_parse(path=csvPathname, schema='put',
                    single_quotes=single_quotes, noPrint=not h2o.verbose)
                h2o_cmd.runRF(parseResult=parseResult, trees=1,
                    timeoutSecs=10, retryDelaySecs=0.1, noPrint=True, print_params=True)
                h2o.verboseprint("Set", set)
                h2o.check_sandbox_for_errors()
                sys.stdout.write('.')
                sys.stdout.flush()
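# tryThemAll() relies on per-instance dicts and helpers defined elsewhere in the test
# class. A minimal sketch, assuming simple substitution dicts; the names match the
# calls above, but the contents here are hypothetical:
class SepChangeExample(object):
    eolDict = {0: "\n", 1: "\r\n"}                      # end-of-line variants
    sepChangeDict = {0: ",", 1: " ", 2: "\t"}           # separator variants
    tokenChangeDict = {0: ('0', '0.0'), 1: ('0', "'0'")}
    tokenChangeDictEnumsOnly = {0: ('a', "'a'"), 1: ('a', '"a"')}

    def changeTokens(self, rows, tokenCase, tcd):
        # swap every occurrence of the first token variant for the second
        (old, new) = tcd[tokenCase]
        return [r.replace(old, new) for r in rows]

    def changeSep(self, rows, sepCase):
        return [r.replace(",", self.sepChangeDict[sepCase]) for r in rows]

    def writeRows(self, csvPathname, rows, eol):
        with open(csvPathname, 'w') as f:
            for r in rows:
                f.write(r + eol)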
def test_C_hhp_107_01(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # build up the parameter string in X
    y = "106"
    x = ""

    # go right to the big X and iterate on that case
    for trial in range(2):
        print "\nTrial #", trial, "start"
        print "\nx:", x
        print "y:", y

        start = time.time()
        kwargs = {'y': y}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
        h2o.check_sandbox_for_errors()
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        print "\nTrial #", trial
def test_rapids_basic(self):
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'p'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for execExpr in exprList:
        # pull the result key out of the lhs assign in the rapids expression
        r = re.match('\(= \!([a-zA-Z0-9_]+) ', execExpr)
        resultKey = r.group(1)
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        if DO_ROLLUP:
            h2o_cmd.runInspect(key=resultKey)
        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr)
        else:
            h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

    print "\nExpressions that created keys. Shouldn't all of these expressions create keys?"
    for k in keys:
        print k

    h2o.check_sandbox_for_errors()
def test_sort_of_prostate_with_row_schmoo(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = "1,0,65,1,2,1,1.4,0,6"

    write_syn_dataset(csvPathname, 99860, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    print "Updating the key and key2 names for each trial"
    for trial in range(200):
        append_syn_dataset(csvPathname, rowData)
        ### start = time.time()
        # this was useful to cause failures early on. Not needed eventually
        ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
        ### print "Trial #", trial, "parse end on ", "prostate.csv", 'took', time.time() - start, 'seconds'

        start = time.time()
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
        print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        h2o_cmd.runInspect(key=key2)
        # only used this for debug to look at parse (red last row) on failure
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
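# write_syn_dataset() and append_syn_dataset() are defined per test file and aren't
# shown here. A minimal sketch of the simplest fixed-row behavior (the real helpers
# vary per test, e.g. some take a repeat count or generate their own random rows):
def write_syn_dataset(csvPathname, rowCount, headerData, rowData):
    dsf = open(csvPathname, "w+")
    dsf.write(headerData + "\n")
    for i in range(rowCount):
        dsf.write(rowData + "\n")
    dsf.close()

def append_syn_dataset(csvPathname, rowData, num=1):
    with open(csvPathname, "a") as dsf:
        for i in range(num):
            dsf.write(rowData + "\n")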
def test_rapids_ifelse_nested(self):
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for trial in range(2):
        for execObj, expected in zip(objList, resultList):
            freshObj = copy(execObj)
            result = freshObj.do()
            # do some scalar result checking
            if expected is not None:
                # result is a string now??
                print "result:", result
                print "expected:", expected
                assert float(result) == expected, "%s %s" % (result, expected)

            # rows might be zero!
            print "freshObj:", dump_json(freshObj.execResult)
            if 'key' in freshObj.execResult and freshObj.execResult['key']:
                keys.append(freshObj.execExpr)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def exec_list(exprList, lenNodes, csvFilename, key2):
    h2e.exec_zero_list(zeroList)
    # start with trial = 1 because trial-1 is used to point to Result0, which must be initted
    trial = 1
    while trial < 100:
        for exprTemplate in exprList:
            # do each expression at a random node, to facilitate key movement
            nodeX = random.randint(0, lenNodes-1)
            colX = random.randint(1, 54)
            # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
            row = str(random.randint(1, 400000))

            execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2)
            execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                resultKey="Result"+str(trial)+".hex", timeoutSecs=60)

            eri0 = execResultInspect[0]
            eri1 = execResultInspect[1]
            columns = eri0.pop('cols')
            columnsDict = columns[0]
            print "\nexecResult columns[0]:", h2o.dump_json(columnsDict)
            print "\nexecResult [0]:", h2o.dump_json(eri0)
            print "\nexecResult [1]:", h2o.dump_json(eri1)

            min = columnsDict["min"]
            h2o.verboseprint("min: ", min, "trial:", trial)
            ### self.assertEqual(float(min), float(trial), "what can we check here")

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            h2o.check_sandbox_for_errors()
            print "Trial #", trial, "completed\n"
            trial += 1
def test_GLM_params_rand2(self):
    # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k")

    # for determinism, I guess we should spit out the seed?
    # random.seed(SEED)
    SEED = random.randint(0, sys.maxint)
    # if you have to force to redo a test
    # SEED =
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    paramDict = define_params()
    for trial in range(20):
        # params is mutable. This is default.
        params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        start = time.time()
        glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def predict_and_compare_csvs(model_key, hex_key, predictHexKey,
    csvSrcOutputPathname, csvPredictPathname,
    skipSrcOutputHeader, skipPredictHeader, translate=None, y=0):

    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)

    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30)  # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    h2o.check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if (rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
            (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o) != float(p):
        if str(o) != str(p):
            if wrong == 10:
                print "Not printing any more mismatches\n"
            elif wrong < 10:
                msg = "Comparing original output col vs predicted. row %s differs. original: %s predicted: %s" % \
                    (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong) / len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
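# compare_csv_at_one_col() isn't shown here. A minimal sketch of the contract the
# caller above relies on: it returns the total row count and the list of values from
# one column (the real helper also uses 'msg' for reporting; 'translate' is assumed
# here to be a value-remapping dict):
import csv

def compare_csv_at_one_col(csvPathname, msg=None, colIndex=0, translate=None, skipHeader=0):
    output = []
    rowNum = 0
    with open(csvPathname, 'rb') as f:
        for rowNum, row in enumerate(csv.reader(f), start=1):
            if rowNum <= skipHeader:
                continue
            value = row[colIndex]
            if translate:
                value = translate.get(value, value)
            output.append(value)
    return (rowNum, output)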
def test_exec2_cbind_like_R(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    SEEDPERFILE = random.randint(0, sys.maxint)

    rowCount = 30000
    colCount = 150
    timeoutSecs = 60
    hex_key = "df"
    csvPathname = SYNDATASETS_DIR + "/" + "df.csv"
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
    parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
        timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

    colCount = 1
    hex_key = "indx"
    csvPathname = SYNDATASETS_DIR + "/" + "indx.csv"
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
    parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
        timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

    inspect = h2o_cmd.runInspect(key=hex_key)
    print "numRows:", inspect['numRows']
    print "numCols:", inspect['numCols']

    for trial in range(10):
        for execExpr in exprList:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            execTime = time.time() - start
            print 'exec took', execTime, 'seconds'

    h2o.check_sandbox_for_errors()
def test_exec2_covtype_cols(self):
    h2o.beta_features = True
    csvPathname = 'standard/covtype.data'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='put', hex_key='c.hex', timeoutSecs=30)
    print "\nParse key is:", parseResult['destination_key']

    ### h2b.browseTheCloud()
    start = time.time()
    # passes with suffix, fails without?
    # suffix = ""
    suffix = ".hex"
    for k in range(54):
        # try the funky c(6) thing like R, instead of just 6
        execExpr = "Result" + str(k) + suffix + " = c.hex[,c(" + str(k+1) + ")]"
        print "execExpr:", execExpr
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(k) + suffix, timeoutSecs=4)

        for node in h2o.nodes:
            storeView = h2o_cmd.runStoreView(node=node, noPrint=True)
            numKeys = len(storeView['keys'])
            # number of keys should = k + 2? (on each node)
            self.assertEqual(k + 2, numKeys, "# of keys: %s on %s doesn't match expected: %s" % \
                (numKeys, node, k + 2))
                # (numKeys, node, k+2, h2o.dump_json(storeView)))

    h2o.check_sandbox_for_errors()
    print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
def test_parse_fs_schmoo_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    # rowData = "1,0,65,1,2,1,1.4,0,6"
    rowData = "1,0,65,1,2,1,1,0,6"

    totalRows = 99860
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    print "Updating the key and hex_key names for each trial"
    for trial in range(200):
        append_syn_dataset(csvPathname, rowData)
        totalRows += 1

        start = time.time()
        key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        print "trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        h2o_cmd.runInspect(key=hex_key)
        # only used this for debug to look at parse (red last row) on failure
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_exec2_sum(self):
    bucket = 'home-0xdiag-datasets'
    # csvPathname = 'airlines/year2013.csv'
    if getpass.getuser() == 'jenkins':
        csvPathname = 'standard/billion_rows.csv.gz'
    else:
        csvPathname = '1B/reals_100000x1000_15f.data'
        csvPathname = '1B/reals_1B_15f.data'
        csvPathname = '1B/reals_1000000x1000_15f.data'

    hex_key = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
    inspect = h2o_cmd.runInspect(key=hex_key)
    print "numRows:", inspect['numRows']
    print "numCols:", inspect['numCols']
    inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
    print "inspect offset = -1:", h2o.dump_json(inspect)

    for execExpr in exprList:
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
        print 'exec took', time.time() - start, 'seconds'
        print "result:", result

    h2o.check_sandbox_for_errors()
def test_exec2_operators(self):
    bucket = 'home-0xdiag-datasets'
    # csvPathname = 'airlines/year2013.csv'
    csvPathname = 'standard/covtype.data'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

    # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
    for (execExpr, num) in exprList:
        start = time.time()
        resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
        print h2o.dump_json(resultExec)
        print 'exec end took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key='a.hex')
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        print "numCols:", numCols
        print "numRows:", numRows
        self.assertEqual(numCols, 1)
        self.assertEqual(numRows, num)

    h2o.check_sandbox_for_errors()
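# exprList isn't shown here. Each entry pairs an exec expression with the row count
# the test expects in the resulting 'a.hex'. A hypothetical entry, assuming covtype's
# 581012 rows (the real list lives at module scope in the test file):
exprList = [
    ('a.hex = i.hex[,1]', 581012),
]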
def test_sort_of_prostate_with_row_schmoo(self):
    SEED = random.randint(0, sys.maxint)
    # if you have to force to redo a test
    # SEED =
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = rand_rowData()
    write_syn_dataset(csvPathname, 1, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    for trial in range(100):
        rowData = rand_rowData()
        num = random.randint(1, 10096)
        append_syn_dataset(csvPathname, rowData, num)
        start = time.time()

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2,
            timeoutSecs=70, pollTimeoutSecs=60)
        print "trial #", trial, "with num rows:", num, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        ### h2o_cmd.runInspect(key=key2)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
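# rand_rowData() is also defined per test file and not shown. A minimal sketch,
# assuming it just emits 9 random comma-separated values matching the prostate
# header above (the real helper may constrain ranges per column):
def rand_rowData():
    rowData = [random.randint(0, 7) for i in range(9)]
    return ",".join(map(str, rowData))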
def test_B_benign(self):
    print "\nStarting benign.csv"
    csvFilename = "benign.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    # columns start at 0
    y = "3"
    # cols 0-13. 3 is output
    # no member id in this one
    for maxx in range(4, 14):
        x = range(maxx)
        x.remove(3)  # 3 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        # solver can be ADMM
        kwargs = {'x': x, 'y': y,
            'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
        # no longer look at STR?
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        sys.stdout.write('.')
        sys.stdout.flush()
def test_C_prostate(self):
    print "\nStarting prostate.csv"
    # columns start at 0
    y = "1"
    csvFilename = "prostate.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    for maxx in range(2, 9):
        x = range(maxx)
        x.remove(0)  # 0 is member ID. not used
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        # solver can be ADMM. standardize normalizes the data.
        kwargs = {'x': x, 'y': y, 'n_folds': 5,
            'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
        # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
        h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
        h2o.check_sandbox_for_errors()
        sys.stdout.write('.')
        sys.stdout.flush()
def test_rapids_basic_with_funs_noinc(self):
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for i in range(100):
        if i == 0:
            # should never see v as a key from the function?
            execExpr1 = '(= !v1 (c {#0}))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
            execExpr2 = '(= !v2 (cbind $v1 ))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
        else:
            # adding to v shouldn't hurt, but not required cause function output will update it
            # execExpr1 = '(= !v (+ $v #1))'
            # execExpr1 = '(+ $v #1)'
            # add to itself?
            execExpr1 = '(+ $v $v)'
            funs = '[(def anon {v} %s;;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
            # execExpr2 = '(= !v2 (anon ([ $v2 "null" #0)))'
            # execExpr2 = '(= !v2 (anon $v2))'
            execExpr2 = '(= !v2 (+ $v2 #1))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)

        # see if the execExpr had a lhs assign. If so, it better be in the storeview
        r = re.search('![a-zA-Z0-9]+', execExpr2)
        if r:
            lhs = r.group(0)[1:]
            print "Found key lhs assign", lhs

            # FIX! check if v is ever there.
            # KeyIndexeds gets too many rollup stats problems. Don't use for now
            if 1 == 0:
                inspect = h2o_cmd.runInspect(key=lhs)
                missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                storeview = h2o_cmd.runStoreView()
                print "\nstoreview:", dump_json(storeview)
                if lhs not in storeview['keys']:
                    raise Exception("Expected to find %s in %s" % (lhs, storeview['keys']))
        else:
            print "No key lhs assign"

        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr2)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def test_exec2_operators4(self):
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

    start = time.time()
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

    # now run them just concatenating each time. We don't do any template substitutes,
    # so don't need exec_expr_list_rand()
    bigExecExpr = ""
    expCnt = 0
    for t in range(200):
        execExpr = random.choice(exprList)
        bigExecExpr += execExpr + ";"
        h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
        expCnt += 1
        # limit to 2 expressions.
        # Also: functions must be solitary
        # Also: ifelse() must be solitary
        # Also: ternary operators must be solitary
        if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr:
            bigExecExpr = ""
            expCnt = 0

    h2o.check_sandbox_for_errors()
    print "exec end on ", "operators", 'took', time.time() - start, 'seconds'
def setUpClass(cls):
    start = time.time()
    h2o_hosts.build_cloud_with_hosts(node_count, base_port=base_port,
        use_flatfile=True, java_heap_GB=1)
    print "Cloud of", len(h2o.nodes), "built in", time.time()-start, "seconds"
    h2o.verify_cloud_size()
    h2o.check_sandbox_for_errors()
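# setUpClass is normally paired with a teardown in these test files; a minimal sketch
# of the usual counterpart (h2o.tear_down_cloud() appears elsewhere in this harness,
# e.g. commented out in test_benchmark_import below):
def tearDownClass(cls):
    h2o.tear_down_cloud()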
def test_sort_of_prostate_with_row_schmoo(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = rand_rowData()
    totalRows = 1000000
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    # used to fail around 50 iterations..python memory problem
    for trial in range(40):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, rowData, num)
        totalRows += num
        start = time.time()

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=150, pollTimeoutSecs=150)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_NOPASS_GLM2_weight_nan_fail(self):
    h2o.beta_features = True
    csvPathname = 'covtype/covtype.20k.data'
    hex_key = 'covtype.20k.hex'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
    kwargs = {
        'destination_key': 'GLM_model_python_0_default_0',
        'family': 'tweedie',
        'tweedie_variance_power': 1.9999999,
        'max_iter': 10,
        'alpha': 0,
        'lambda': 0,
        'response': 54,
    }

    for trial in range(3):
        # params is mutable. This is default.
        start = time.time()
        glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs)
        h2o.check_sandbox_for_errors()
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def test_rapids_funs_basic2(self):
    if 1 == 1:
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
    else:
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'

    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for trial in range(5):
        for execExpr in funsList:
            funs = '[%s]' % execExpr
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True,
                resultKey=None, timeoutSecs=4)
            execExpr2 = '(= !junk (apply %r1 #2 %anon))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False,
                resultKey=None, timeoutSecs=15)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def test_GLM_params_rand2_4082088627997819015(self):
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
    paramDict = define_params()
    for trial in range(40):
        # params is mutable. This is default.
        params = {
            'y': 54,
            'n_folds': 3,
            'family': 'binomial',
            'max_iter': 5,
            'case': 1,
            'alpha': 0,
            'lambda': 0
        }
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        start = time.time()
        timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10)
        glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
        elapsed = time.time() - start
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        # FIX! I suppose we have the problem of stdout/stderr not having flushed?
        # should hook in some way of flushing the remote node stdout/stderr
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Trial #", trial, "completed\n"
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, **kwargs):

    ## if h2o.beta_features:
    ##     print "HACK: temporarily disabling Summary always in v2 import_parse"
    ##     doSummary = False

    if not node:
        node = h2o.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, doSummary, src_key, **kwargs)

    h2o.verboseprint("importPattern:", importPattern)
    h2o.verboseprint("importResult", h2o.dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, **kwargs)
    h2o.verboseprint("parseResult:", h2o.dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    if doSummary:
        # if parse blows up, we want error isolation ..i.e. find stack traces here,
        # rather than the next guy blowing up
        h2o.check_sandbox_for_errors()
        node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        # for now, don't worry about error isolating summary
    else:
        # isolate a parse from the next thing
        h2o.check_sandbox_for_errors()

    return parseResult
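# typical usage of import_parse(), matching the call sites throughout these tests:
# a bucket-relative path with schema='put' uploads the file to the cloud and parses
# it into the named hex key
parseResult = h2i.import_parse(bucket='smalldata', path='iris/iris_wheader.csv',
    schema='put', hex_key='r1')
print "parsed to:", parseResult['destination_key']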
def test_rf_float_rand2_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 10000
    write_syn_dataset(csvPathname, totalRows, headerData)

    for trial in range(5):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, num)
        totalRows += num
        start = time.time()

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        kwargs = {'ntrees': 5, 'max_depth': 5}
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_prostate_then_prostate_long_parse(self):
    print "\nput and parse of same file, but both key and key2 are the h2o defaults..always different"
    for trial in range(10):
        start = time.time()
        key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate_long.csv.gz"))
        print "trial #", trial, "parse end on ", "prostate_long.csv.gz", \
            "took", time.time() - start, "seconds"
        h2o.check_sandbox_for_errors()
def test_C_prostate(self):
    print "\nStarting prostate.csv"
    # columns start at 0
    y = "1"
    csvFilename = "prostate.csv"
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
        hex_key=csvFilename + ".hex", schema='put')

    for maxx in range(2, 9):
        x = range(maxx)
        x.remove(0)  # 0 is member ID. not used
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        # solver can be ADMM. standardize normalizes the data.
        kwargs = {'x': x, 'y': y, 'n_folds': 5,
            'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize': 1}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs)
        # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
        h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
        h2o.check_sandbox_for_errors()
        sys.stdout.write('.')
        sys.stdout.flush()
def test_B_benign(self):
    print "\nStarting benign.csv"
    csvFilename = "benign.csv"
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
        hex_key=csvFilename + ".hex", schema='put')
    # columns start at 0
    y = "3"
    # cols 0-13. 3 is output
    # no member id in this one
    for maxx in range(4, 14):
        x = range(maxx)
        x.remove(3)  # 3 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        # solver can be ADMM
        kwargs = {'x': x, 'y': y,
            'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs)
        # no longer look at STR?
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        sys.stdout.write('.')
        sys.stdout.flush()
def test_rapids_vec_fail1(self):
    start = time.time()
    xList = []
    eList = []
    fList = []

    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    # stop if > 1G (fails memory cleaner assertion)
    maxx = 29
    # for trial in range(maxx):
    for trial in range(int(1e6), int(100e6), int(10e6)):
        # length = (2 ** trial)
        # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        length = trial
        execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        elapsed1 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)

        # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
        execExpr = '(= !v (+ %v %v))'
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30)
        elapsed2 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)

        xList.append(length)
        eList.append(elapsed1)
        fList.append(elapsed2)

    if 1 == 1:
        xLabel = 'vector length'
        eLabel = 'elapsed (create v)'
        fLabel = 'elapsed (v = v + v)'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def test_1ktrees_job_cancel_many_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + \
            str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=30)

    print "kick off jobs, then cancel them"
    for trial in range(1, 5):
        # random 0 or 1 delay
        delay = random.uniform(0, 1)
        time.sleep(delay)

        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False,
            noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'
        ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
        h2o.check_sandbox_for_errors()

    # do one last good one
    rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50,
        timeoutSecs=600, retryDelaySecs=3)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_parse_time(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = None
    colCount = COLS
    # rowCount = 1000
    rowCount = ROWS
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)

    for trial in range(20):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        # src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        print "A trial #", trial
        # optional. only needed to extract parse_key?
        pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(key=pA.parse_key, csvPathname=csvDownloadPathname)

        # do a little testing of saving the key as a csv
        # remove the original parsed key. source was already removed by h2o
        if 1 == 0:
            h2o.nodes[0].remove_key(pA.parse_key)

        # interesting. what happens when we do csv download with time data?
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_key)
        print "B trial #", trial
        pB = h2o_cmd.ParseObj(parseResultB, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pB.numRows
        print pB.numCols
        print pB.parse_key
        iB = h2o_cmd.InspectObj(pB.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        # these checks are redundant now
        self.assertEqual(iA.missingList, iB.missingList,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(iA.numCols, iB.numCols,
            "numCols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
        # so I guess that's okay. So allow for an extra row here.
        self.assertEqual(iA.numRows, iB.numRows,
            "pA.numRows: %s pB.numRows: %s mismatch after re-parse of downloadCsv result" % \
            (iA.numRows, iB.numRows))

        print "H2O writes the internal format (number) out for time."
        # ==> syn_time.csv <==
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
        # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

        # ==> csvDownload.csv <==
        # "0","1","2","3","4","5"
        # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12
        h2o.check_sandbox_for_errors()
def tearDown(self):
    h2o.check_sandbox_for_errors()
def test_csv_download_libsvm(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (5000, 10000, 'cK', 120),
        (10000, 10000, 'cL', 120),
        (50000, 10000, 'cM', 300),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    trial = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        trial += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        start = time.time()
        # Summary is kind of slow. should I do it separately?
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        print "\nA Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
        missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
        numColsA = inspect['numCols']
        numRowsA = inspect['numRows']
        byteSizeA = inspect['byteSize']

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        print "\nStarting csv download to", csvDownloadPathname, \
            "rowCount:", rowCount, "colCount:", colCount
        start = time.time()
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)
        print "csv_download end.", 'took', time.time() - start, \
            'seconds. Originally from:', csvFilename

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs)
        print "\nB Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
        missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
        numColsB = inspect['numCols']
        numRowsB = inspect['numRows']
        byteSizeB = inspect['byteSize']

        self.assertEqual(missingValuesListA, missingValuesListB,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(numColsA, numColsB,
            "numCols mismatches after re-parse of downloadCsv result %d %d" % (numColsA, numColsB))
        self.assertEqual(numRowsA, numRowsB,
            "numRows mismatches after re-parse of downloadCsv result %d %d" % (numRowsA, numRowsB))
        if DO_BYTESIZE_COMPARE:
            self.assertEqual(byteSizeA, byteSizeB,
                "byteSize mismatches after re-parse of downloadCsv result %d %d" % (byteSizeA, byteSizeB))

        h2o.check_sandbox_for_errors()
def test_parse_csv_download(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = rand_rowData()
    totalRows = 1000000
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)

    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    # failed around 50 trials..python memory problem
    for trial in range(20):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, rowData, num)
        totalRows += num

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put',
            src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_colsA = inspect['num_cols']
        num_rowsA = inspect['num_rows']
        row_sizeA = inspect['row_size']
        value_size_bytesA = inspect['value_size_bytes']

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put',
            src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_colsB = inspect['num_cols']
        num_rowsB = inspect['num_rows']
        row_sizeB = inspect['row_size']
        value_size_bytesB = inspect['value_size_bytes']

        self.assertEqual(missingValuesListA, missingValuesListB,
            "missingValuesList mismatches after re-parse of downloadCsv result")
        self.assertEqual(num_colsA, num_colsB,
            "num_cols mismatches after re-parse of downloadCsv result")
        self.assertEqual(num_rowsA, num_rowsB,
            "num_rows mismatches after re-parse of downloadCsv result")
        self.assertEqual(row_sizeA, row_sizeB,
            "row_size mismatches after re-parse of downloadCsv result")
        self.assertEqual(value_size_bytesA, value_size_bytesB,
            "value_size_bytes mismatches after re-parse of downloadCsv result")

        h2o.check_sandbox_for_errors()
def test_rapids_basic_with_funs_inc(self):
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for i in range(2):
        if i == 0:
            # should never see v as a key from the function?
            execExpr1 = '(= !v1 (c {#0;#0}))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
            execExpr2 = '(= !v2 (cbind %v1 %v1 ))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
        else:
            # adding to v shouldn't hurt, but not required cause function output will update it
            # execExpr1 = '(= !v (+ %v #1))'
            # execExpr1 = '(+ %v #1)'
            # add to itself?
            execExpr1 = '(+ %v %v)'
            funs = '[(def anon { v } %s;;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
            # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
            # execExpr2 = '(= !v2 (anon %v2))'
            execExpr2 = '(= !v1 (anon %v1))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey=None, timeoutSecs=5)
            print "result:", result

        # see if the execExpr had a lhs assign. If so, it better be in the storeview
        r = re.search('![a-zA-Z0-9]+', execExpr2)
        if r:
            lhs = r.group(0)[1:]
            print "Found key lhs assign", lhs

            # FIX! check if v is ever there.
            # KeyIndexeds gets too many rollup stats problems. Don't use for now
            if 1 == 0:
                inspect = h2o_cmd.runInspect(key=lhs)
                missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                storeview = h2o_cmd.runStoreView()
                print "\nstoreview:", dump_json(storeview)
                if lhs not in storeview['keys']:
                    raise Exception("Expected to find %s in %s" % (lhs, storeview['keys']))
        else:
            print "No key lhs assign"

        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr2)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def sub_c3_nongz_fvec_long(self, csvFilenameList):
    h2o.beta_features = True
    # a kludge
    h2o.setup_benchmark_log()

    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    print "Using nongz'ed files in", importFolderPath

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        if DO_DOUBLE_IMPORT:
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key="A.hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs,
            benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        if totalBytes is not None:
            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # these are all the columns that are enums in the dataset...too many for GLM!
            x = range(542)  # don't include the output column
            # remove the output too! (378)
            ignore_x = []
            for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]:
                x.remove(i)
                ignore_x.append(i)
            x.remove(378)

            # add one since we are no longer 0 based offset
            x = ",".join(map(lambda x: "C" + str(x+1), x))
            ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

            GLMkwargs = {
                'ignored_cols': ignore_x,
                'response': 'C379',
                'max_iter': 4,
                'n_folds': 1,
                'family': 'binomial',
                'alpha': 0.2,
                'lambda': 1e-5
            }

            # convert to binomial
            # execExpr="A.hex=%s" % parseResult['destination_key']
            # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            aHack = {'destination_key': "A.hex"}

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
def test_exec2_xorsum(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(10):
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                for r in range(10):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                        resultKey='h', timeoutSecs=300)
                    print r, 'exec took', time.time() - start, 'seconds'
                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()

                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way
                    # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                    if ullResult != expectedUllSum:
                        raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" %
                            (ullResult, expectedUllSum))

            h2o.check_sandbox_for_errors()

        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
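# the h2o_util bit-reinterpretation helpers used above just view the same 8 bytes as
# either an IEEE-754 double or a 64-bit unsigned int. A minimal sketch via struct
# (assuming that's how h2o_util implements them):
import struct

def doubleToUnsignedLongLong(d):
    # pack as little-endian double, unpack the same bytes as an unsigned 64-bit int
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def unsignedLongLongToDouble(u):
    return struct.unpack('<d', struct.pack('<Q', u))[0]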
def test_parse_libsvm(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/libsvm"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        ("covtype.binary.svm", "cC", 30, 1, 2, True, True),
        ("mnist_train.svm", "cM", 30, 0, 9, False, False),
        # multi-label target like 1,2,5 ..not sure what that means
        # ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False),
        # illegal non-ascending cols
        # ("syn_6_1000_10.svm", "cK", 30, -36, 36, True, False),
        # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
        # fails csvDownload
        ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
        ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
        ("news20.svm", "cH", 30, 1, 20, False, False),
        ("connect4.svm", "cB", 30, -1, 1, False, False),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
        ("gisette_scale.svm", "cF", 30, -1, 1, False, False),
        ("mushrooms.svm", "cG", 30, 1, 2, False, False),
    ]

    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, expectedCol0Min, expectedCol0Max,
            enableDownloadReparse, enableSizeChecks) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)
        csvPathname = importFolderPath + "/" + csvFilename

        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvPathname, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspectFirst = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360)
        print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspectFirst, csvFilename)

        # look at the min/max for the target col (0) and compare to expected for the dataset
        imin = inspectFirst['cols'][0]['min']
        imax = inspectFirst['cols'][0]['max']

        if expectedCol0Min:
            self.assertEqual(imin, expectedCol0Min,
                msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min))
        if expectedCol0Max:
            self.assertEqual(imax, expectedCol0Max,
                msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max))

        print "\nmin/max for col0:", imin, imax

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseKey['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        if DO_SUMMARY:
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseKey['destination_key'], timeoutSecs=300, noPrint=True)
            summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
            missingValuesListA = h2o_cmd.infoFromInspect(inspectFirst, csvPathname)
            num_colsA = inspectFirst['num_cols']
            num_rowsA = inspectFirst['num_rows']
            row_sizeA = inspectFirst['row_size']
            value_size_bytesA = inspectFirst['value_size_bytes']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
            print "Trying csvDownload of", csvDownloadPathname
            h2o.nodes[0].csv_download(key=key2, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            # don't have to now. we use a new name for key2B
            # h2o.nodes[0].remove_key(key2)
            start = time.time()
            key2B = key2 + "_B"
            parseKeyB = h2o_cmd.parseFile(csvPathname=csvDownloadPathname, key2=key2B)
            print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=key2B)

            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            for i, d in enumerate(df.difference):
                # ignore mismatches in these:
                #   "variance"
                #   "response.time"
                #   "key"
                if "variance" in d or "response.time" in d or "key" in d \
                        or "value_size_bytes" in d or "row_size" in d:
                    pass
                else:
                    raise Exception("testing %s, found unexpected mismatch in df.difference[%d]: %s" %
                        (csvPathname, i, d))

            if DO_SIZE_CHECKS and enableSizeChecks:
                # if we're allowed to do size checks, compare the full json response!
                print "Comparing original inspect to the inspect after parsing the downloaded csv"
                # vice_versa=True
                self.assertGreater(len(df.difference), 29,
                    msg="Want >=30, not %d differences between the two rfView json responses. %s" % \
                    (len(df.difference), h2o.dump_json(df.difference)))

                # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp,
                # even if col is all zeroes. only in the case where the libsvm dataset specified
                # vals = 0, which shouldn't happen. make the check conditional based on the dataset
                self.assertEqual(row_sizeA, row_sizeB,
                    "row_size mismatches after re-parse of downloadCsv result %d %d" %
                    (row_sizeA, row_sizeB))
                self.assertEqual(value_size_bytesA, value_size_bytesB,
                    "value_size_bytes mismatches after re-parse of downloadCsv result %d %d" %
                    (value_size_bytesA, value_size_bytesB))

            print "missingValuesListA:", missingValuesListA
            print "missingValuesListB:", missingValuesListB
            self.assertEqual(missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result")
            self.assertEqual(num_colsA, num_colsB,
                "num_cols mismatches after re-parse of downloadCsv result %d %d" %
                (num_colsA, num_colsB))
            self.assertEqual(num_rowsA, num_rowsB,
                "num_rows mismatches after re-parse of downloadCsv result %d %d" %
                (num_rowsA, num_rowsB))

        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1 == 1:
        # importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        # importFolderPathFull = '/home/0xdiag/datasets/manyfiles-nflx-gz'
        # importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        # this pattern from browser correctly does 100 files, 1M rows
        # source_key=*/home/0xdiag/datasets/manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz
        csvFilenameAll = [
            ("file_1.dat.gz", "file_1_A.dat.gz", 1 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            # ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            # ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            # ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            # ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]

    # csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    # rebuild the cloud for each file
    base_port = 54321
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False
    # benchmarkLogging = ['cpu', 'disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 180
        retryDelaySecs = 10

        localhost = h2o.decide_if_localhost()
        if localhost:
            tryHeap = 4
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            tryHeap = 28
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)

        for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2
            h2o.beta_features = True

            for trial in range(trialMax):
                # (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
                if DO_IMPORT_CHECK:
                    # don't reuse 'i' here; it's the index into csvFilenameList above
                    for importTrial in range(2):
                        csvPathname = importFolderPath + "/" + csvFilepattern
                        (importResult, importPattern) = h2i.import_only(
                            bucket='home-0xdiag-datasets', path=csvPathname,
                            schema='local', timeoutSecs=timeoutSecs)
                        importFullList = importResult['files']
                        importFailList = importResult['fails']
                        print "\n Problem if this is not empty: importFailList:", \
                            h2o.dump_json(importFailList)

                # creates csvFilename.hex from file in importFolder dir
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                csvPathname = importFolderPath + "/" + csvFilepattern
                start = time.time()
                parseResult = h2i.import_parse(
                    bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
                    "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    # NOTE: importFolderPathFull is only defined in the commented-out
                    # paths above; define it before enabling noPoll
                    if (i + 1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(
                            path=csvPathname, hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
                            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                    if (i + 2) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        parseResult = h2i.import_parse(
                            path=csvPathname, hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
                            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    pass
                    # We should be able to see the parse result?
                    # h2o_cmd.check_enums_from_inspect(parseResult)
                    # the nflx data doesn't have a small enough # of classes in any col

                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter(' + origKey + ',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                # h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)

                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}
                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for col in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                                424, 425, 426, 540, 541, 378]:
                        x.remove(col)
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5,
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                #**********************************************************************************

                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                # NOTE: importResult is only set when DO_IMPORT_CHECK is enabled
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
                h2o.nodes[0].remove_all_keys()

            ### time.sleep(3600)
            ### h2o.tear_down_cloud()
            if not localhost:
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                ### time.sleep(30)
            sys.stdout.write('.')
            sys.stdout.flush()
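# --- A hedged reference sketch (not from the original test): the MB/sec figure
# logged above is compressed-bytes-per-second, since totalBytes comes from the
# avgMichalSize estimates. Factored out, with a hypothetical worked example
# (the elapsed time is assumed for illustration only):
def file_mbs(totalBytes, elapsed):
    """Throughput over compressed bytes.

    e.g. 100 files at avgMichalSize (116561140) bytes, parsed in an assumed
    100.0 secs: file_mbs(100 * 116561140, 100.0) -> ~116.56 MB/sec
    """
    return (totalBytes / 1e6) / elapsed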
def test_parse_time_rand_fvec_NOPASS(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    colCount = 6
    rowCount = 10
    headerData = rand_header(colCount)
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)

    for trial in range(1):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put',
            src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "parse end on", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        numRowsA = inspect['numRows']
        numColsA = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
            numCols=numColsA, numRows=numRowsA, noPrint=True)
        print summaryResult
        h2o_cmd.infoFromSummary(summaryResult)

        (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
            h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
        if constantValuesDictA or enumSizeDictA:
            raise Exception("Should be empty? constantValuesDictA %s enumSizeDictA %s" %
                (constantValuesDictA, enumSizeDictA))
        print "missingValuesDictA", missingValuesDictA
        # self.assertEqual(missingValuesDictA, {}, "missingValuesDict should be empty")

        self.assertEqual(numColsA, colCount)
        self.assertEqual(numRowsA, rowCount)

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        # interesting. what happens when we do csv download with time data?
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put',
            src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "parse end on", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        numRowsB = inspect['numRows']
        numColsB = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
            numCols=numColsB, numRows=numRowsB, noPrint=True)
        (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
            h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
        if constantValuesDictB or enumSizeDictB:
            raise Exception("Should be empty? constantValuesDictB %s enumSizeDictB %s" %
                (constantValuesDictB, enumSizeDictB))
        print "missingValuesDictB", missingValuesDictB

        self.assertEqual(missingValuesDictA, missingValuesDictB,
            "missingValuesDict mismatches after re-parse of downloadCsv result")
        self.assertEqual(numColsA, numColsB,
            "numCols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv it creates. It puts quotes around the col numbers
        # if there was no header, but this dataset has a header too, so the row counts
        # should be equal. if not, maybe the parse of our dataset didn't detect a row
        self.assertEqual(numRowsA, numRowsB,
            "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" %
            (numRowsA, numRowsB))

        # FIX! should do some comparison of values?
        # maybe can use exec to checksum the columns and compare the column lists.
        # or compare to expected values? (what are the expected values for the numbers
        # h2o uses internally for time?)

        # FIX! should compare the results of the two parses. The infoFromInspect result?
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
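# --- A hedged sketch for the FIX! comments above (assumption: comparing the two
# csv files locally is an acceptable value check; csv_data_rows is a hypothetical
# helper, not an h2o API). Time formats may legitimately differ between what we
# wrote and what h2o writes back, so treat this as a starting point rather than
# a definitive comparison.
import csv

def csv_data_rows(pathname):
    # read all rows, dropping the header row (h2o adds one on csv download)
    with open(pathname, 'rb') as f:
        return list(csv.reader(f))[1:]

# usage sketch:
# self.assertEqual(csv_data_rows(csvPathname), csv_data_rows(csvDownloadPathname))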
def test_exec2_xorsum2(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(3):
        ullResultList = []
        NUM_FORMAT_CASES = h2o_util.fp_format()
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            sel = random.randint(0, NUM_FORMAT_CASES - 1)
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)

            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                for repeat in range(3):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                        resultKey=None, timeoutSecs=300)
                    print 'exec took', time.time() - start, 'seconds'
                    print "execResult:", h2o.dump_json(execResult)

                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % \
                        (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % \
                        (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way. needed when integers are parsed.
                    # okay for a couple of lsbs to be wrong, due to conversion from string
                    # ullResult (0.16x): 0x02c1a21f923cee96 2.15698793923e-295
                    # expectedUllSum (0.16x): 0x02c1a21f923cee97 2.15698793923e-295
                    # expectedFpSum (0.16x): 0x42f054af32b3c408 2.87294442126e+14
                    # ullResult and expectedUllSum are Q ints (64-bit), so we can subtract them.
                    # I guess we don't even care about sign, since we zero the first 4 bits
                    # (xorsum) to avoid nan/inf issues
                    if ullResult != expectedUllSum and \
                            (abs(ullResult - expectedUllSum) > ALLOWED_DELTA):
                        emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                            (ullResult, expectedUllSum)
                        if STOP_ON_ERROR:
                            raise Exception(emsg)
                        else:
                            print emsg

                    # print "%30s" % "hex(bitResult):", hex(ullResult)
                    h2o.check_sandbox_for_errors()

        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % \
            (expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % \
            (expectedFpSumAsLongLong, expectedFpSum)
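# --- A hedged sketch of what the h2o_util helpers used above presumably do
# (an assumption about internals; the real implementations live in h2o_util):
# the double<->uint64 conversions just repack the same 8 IEEE-754 bytes, and
# xorsum folds a column into one 64-bit value. xor is order-independent, which
# is why it tolerates h2o's nondeterministic reduction order where ordinary
# fp summation would not.
import struct

def doubleToUnsignedLongLong(d):
    # reread the 8 bytes of an IEEE-754 double as an unsigned 64-bit int
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def unsignedLongLongToDouble(u):
    # the inverse: reread an unsigned 64-bit int's bytes as a double
    return struct.unpack('<d', struct.pack('<Q', u))[0]

def xorsum(values):
    # order-independent 64-bit fold of a list of doubles
    acc = 0
    for v in values:
        acc ^= doubleToUnsignedLongLong(v)
    return acc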