def createTestTrain(srcKey, trainDstKey, testDstKey, percent, outputClass, numCols):
    """Split srcKey into train/test frames via runif() row selection.

    The train slice takes rows with runif <= 0.9; the test slice takes the
    complement (> 0.9) reusing the same a.hex runif column. If the module
    global DO_MULTINOMIAL is false, the response column (numCols) is rewritten
    to a 0/1 indicator for outputClass and factor()'ed.

    NOTE(review): `percent` is not used by the body; the 0.9/0.1 split is
    hard-coded in the exec strings — confirm with callers.
    """
    # will have to live with random extract. will create variance
    print "train: get random %. change class 4 to 1, everything else to 0. factor() to turn real to int (for rf)"
    # Create complexity for no good reason!. Do the same thing 5 times in the single exec expressions
    execExpr = ""
    STUPID_REPEAT = 20
    # Deliberately repeat the same munge STUPID_REPEAT times inside one exec
    # string (stress test); the final state is the same as doing it once.
    for i in range(STUPID_REPEAT):
        execExpr += "a.hex=runif(%s);" % srcKey
        execExpr += "%s=%s[a.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
        if not DO_MULTINOMIAL:
            # binomial conversion: outputClass -> 1, everything else -> 0
            execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, numCols, trainDstKey, numCols, outputClass)
            execExpr += "factor(%s[, %s]);" % (trainDstKey, numCols)
    # timeout scales with the repeat count since the exec string is huge
    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=STUPID_REPEAT * 15)
    inspect = h2o_cmd.runInspect(key=trainDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

    print "test: same, but use the same runif() random result, complement"
    # a.hex still holds the runif column from above, so '>0.9' is the complement
    execExpr = "a.hex=runif(%s);" % srcKey
    execExpr += "%s=%s[a.hex%s,];" % (testDstKey, srcKey, '>0.9')
    if not DO_MULTINOMIAL:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, numCols, testDstKey, numCols, outputClass)
        execExpr += "factor(%s[, %s])" % (testDstKey, numCols)
    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=10)
    inspect = h2o_cmd.runInspect(key=testDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
def test_GLM2_covtype_single_cols(self):
    """Parse covtype, binomialize the response in place, then run GLM
    repeatedly, each time passing a single ignored column (C1..C52)."""
    h2o.beta_features = True
    timeoutSecs = 120
    csvPathname = "standard/covtype.data"
    print "\n" + csvPathname
    # columns start at 0
    y = 54
    ignore_x = ""
    parseResult = h2i.import_parse(
        bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key="A.hex", timeoutSecs=15
    )
    case = 2
    # in-place binomial conversion of the response column (+1: exec is R-like, 1-based)
    execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, case)
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
    print "GLM binomial ignoring 1 X column at a time"
    print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
    for colX in xrange(1, 53):
        # NOTE(review): both branches assign the same single-column value; the
        # commented-out line suggests accumulation was once intended — the
        # current behavior ignores exactly one column per GLM run.
        if ignore_x == "":
            ignore_x = "C" + str(colX)
        else:
            # x = x + "," + str(colX)
            ignore_x = "C" + str(colX)
        sys.stdout.write(".")
        sys.stdout.flush()
        print "y:", y
        start = time.time()
        kwargs = {"ignored_cols": ignore_x, "response": y, "n_folds": 6}
        glm = h2o_cmd.runGLM(parseResult={"destination_key": "A.hex"}, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
def test_exec2_covtype_cols(self):
    """Slice each of covtype's 54 columns into its own ResultN key and check
    the per-node key count grows by exactly one per slice."""
    h2o.beta_features = True
    csvPathname = 'standard/covtype.data'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=30)
    print "\nParse key is:", parseResult['destination_key']
    ### h2b.browseTheCloud()
    start = time.time()
    # passes with suffix, fails without?
    # suffix = ""
    suffix = ".hex"
    for k in range(54):
        # try the funky c(6) thing like R, instead of just 6
        execExpr = "Result" + str(k) + suffix + " = c.hex[,c(" + str(k+1) + ")]"
        print "execExpr:", execExpr
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(k) + suffix, timeoutSecs=4)
        for node in h2o.nodes:
            storeView = h2o_cmd.runStoreView(node=node, noPrint=True)
            numKeys = len(storeView['keys'])
            # number of keys should = k + 2? (on each node)
            # (k+1 Result keys so far, plus the parsed c.hex — presumably; confirm)
            self.assertEqual(k + 2, numKeys, "# of keys: %s on %s doesn't match expected: %s" % \
                (numKeys, node, k + 2))
                # (numKeys, node, k+2, h2o.dump_json(storeView)))
    h2o.check_sandbox_for_errors()
    print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=hex_key, destination_key=predictHexKey) print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, 
skipPredictHeader)) wrong = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o)!=str(p): if wrong==10: print "Not printing any more mismatches\n" elif wrong<10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong return pctWrong
def test_exec2_operators(self): bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' csvPathname = 'standard/covtype.data' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10) for (execExpr, num) in exprList: start = time.time() resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180) print h2o.dump_json(resultExec) print 'exec end took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key='a.hex') numCols = inspect['numCols'] numRows = inspect['numRows'] print "numCols:", numCols print "numRows:", numRows self.assertEqual(numCols, 1) self.assertEqual(numRows, num) h2o.check_sandbox_for_errors()
def test_exec2_reduction(self): bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' if getpass.getuser()=='jenkins': csvPathname = 'standard/billion_rows.csv.gz' else: csvPathname = '1B/reals_100000x1000_15f.data' csvPathname = '1B/reals_1B_15f.data' csvPathname = '1B/reals_1000000x1000_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) for execExpr in initList: execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print "result:", result for execExpr in exprList: start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print 'exec took', time.time() - start, 'seconds' print "result:", result assert result==1 h2o.check_sandbox_for_errors()
def test_exec2_na_chop(self):
    """Run randomized NA-chopping exec expressions against airlines data and
    print before/after row counts of i.hex vs s.hex."""
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'airlines/year2013.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    inspect = h2o_cmd.runInspect(key='i.hex')
    print "\nr.hex" \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numRows1 = inspect['numRows']
    numCols = inspect['numCols']
    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
    start = time.time()
    # randomized trials; template column substitution bounded by maxCol
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, keyX='s.hex',
        maxTrials=200, timeoutSecs=30, maxCol=numCols-1)
    inspect = h2o_cmd.runInspect(key='s.hex')
    print "\ns.hex" \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numRows2 = inspect['numRows']
    # no assertion here — counts are only printed for manual comparison
    print numRows1, numRows2
    h2o.check_sandbox_for_errors()
    print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec2_result_race(self):
    """Race test: hammer Result.hex with exprList expressions, first 100
    trials on node 0, then on random nodes to force key movement."""
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    # zero the list of Results using node[0]
    # FIX! is the zerolist not eing seen correctl? is it not initializing to non-zero?
    for execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=20)
        ### print "\nexecResult:", execResult
    trial = 0
    while (trial < 200):
        for execExpr in exprList:
            # for the first 100 trials: do each expression at node 0,
            # for the second 100 trials: do each expression at a random node, to facilate key movement
            # FIX! there's some problem with the initList not taking if rotated amongst nodes?
            if (trial < 100):
                nodeX = 0
            else:
                nodeX = random.randint(0,lenNodes-1)
            resultKey = "Result.hex"
            execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                resultKey=resultKey, timeoutSecs=20)
            print min_value, execExpr
            h2o.verboseprint("min_value: ", min_value, "trial:", trial)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # trial advances per expression, so the while bound is approximate
            trial += 1
def test_rapids_funs_basic2(self):
    """Define each rapids function in funsList, then apply it column-wise to
    r1 via '(apply %r1 #2 %anon)'; record expressions that produced keys."""
    if 1==1:
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
    else:
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    keys = []
    for trial in range(5):
        for execExpr in funsList:
            # define the function (doFuns=True), then apply it by name (%anon)
            funs = '[%s]' % execExpr
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=4)
            execExpr2 = '(= !junk (apply %r1 #2 %anon))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)
    print "\nExpressions that created keys"
    for k in keys:
        print k
    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)
    h2o.check_sandbox_for_errors()
def test_GLM2_params_rand2(self):
    """20 GLM trials with randomized params; binomial trials use the B.hex
    0/1 copy of covtype.20k, others use the raw parse result."""
    csvPathname = 'covtype/covtype.20k.data'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k")
    CLASS = 1
    # make a binomial version
    execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS)
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
    paramDict = define_params()
    for trial in range(20):
        # params is mutable. This is default.
        params = {
            'response': 54,
            'alpha': 0.1,
            # 'lambda': 1e-4,
            'lambda': 0,
            'n_folds': 1,
        }
        # pickRandGlmParams mutates params in place with random choices
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        if 'family' not in kwargs or kwargs['family']=='binomial':
            bHack = {'destination_key': 'B.hex'}
        else:
            bHack = parseResult
        start = time.time()
        glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def test_exec2_fast_locks(self):
    """Stress src-key locking: import/parse iris repeatedly (per-trial when
    AVOID_BUG), each parse to a unique dest key, then exec on the result."""
    csvPathname = 'iris/iris2.csv'
    src_key='iris.csv'
    if not AVOID_BUG:
        # need the key name (pattern) to feed to parse)
        (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname,
            schema='put', src_key=src_key, timeoutSecs=10)
        # just as a reminder of what these returns look like
        print "importResult:", h2o.dump_json(importResult)
        print "importPattern:", h2o.dump_json(importPattern)
    y = 4
    for trial in range (1, 100):
        if AVOID_BUG:
            # need the key name (pattern) to feed to parse)
            (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname,
                schema='put', src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        # make sure each parse is unique dest key (not in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
            delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
        execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
    # just show the jobs still going, if any. maybe none, because short (iris)
    a = h2o.nodes[0].jobs_admin()
    h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_exec2_operators4(self):
    """Random exec stress: run exprList randomized, then run randomly-chosen
    expressions concatenated into growing multi-statement exec strings."""
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
    start = time.time()
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)
    # now run them just concatenating each time. We don't do any template substitutes, so don't need
    # exec_expr_list_rand()
    bigExecExpr = ""
    expCnt = 0
    for t in range(200):
        execExpr = random.choice(exprList)
        bigExecExpr += execExpr + ";"
        h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
        expCnt += 1
        # limit to 2 expressions.
        # Also: functions must be solitary
        # Also: ifelse() must be solitary
        # Also: ternary operators must be solitary
        if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr:
            # reset the accumulated expression string
            bigExecExpr = ""
            expCnt = 0
    h2o.check_sandbox_for_errors()
    print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec2_poppush_fail(self):
    """Build 20 random exec expressions from `phrases` (prepending a mean2()
    definition if a phrase uses mean2 before defining it), append one known
    failing expression, then execute them all."""
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    exprList = []
    while (len(exprList)!=20):
        exprs = [random.choice(phrases) for j in range(random.randint(1,2))]
        # check if we have mean2() before function defn
        functionFound = False
        for e in exprs:
            if 'function' in e:
                functionFound = True
            if 'mean2' in e and not functionFound:
                # add the function definition first
                exprs = ["mean2=function(x){apply(x,1,sum)/nrow(x)};"] + exprs
        exprList.append("".join(exprs))
    # add this one for good measure (known fail)
    # FIX (was a bug): `exprList += "<string>"` extended the list one
    # character at a time; wrap in a list (same form as the sibling test).
    exprList += ["r.hex-r.hex; mean2=function(x){apply(x,1,sum)/nrow(x)}; mean2(r.hex); r.hex[,ncol(r.hex)+1]=4;"]
    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
    for execExpr in exprList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
def test_rapids_basic_with_funs_noinc(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for i in range(100): if i==0: # should never see v as a key from the function? execExpr1 = '(= !v1 (c {#0}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5) execExpr2 = '(= !v2 (cbind %v1 ))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5) else: # adding to v shouldn't hurt, but not required cause function output will update it # execExpr1 = '(= !v (+ %v #1))' # execExpr1 = '(+ %v #1)' # add to itself? execExpr1 = '(+ %v %v)' funs = '[(def anon {v} %s;;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True) # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))' # execExpr2 = '(= !v2 (anon %v2))' execExpr2 = '(= !v2 (+ %v2 #1))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr2) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # FIX! check if v is ever there. # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_GLM2_model_key_unique(self):
    """Run 4 GLMs letting h2o pick the model key each time and assert no
    model key is ever reused across trials."""
    h2o.beta_features = True
    modelKeyDict = {}
    for trial in range (1,5):
        csvPathname = 'iris/iris2.csv'
        start = time.time()
        # make sure each parse is unique dest key (not in use
        hex_key = "iris2_" + str(trial) + ".hex"
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=10)
        y = 4
        # binomial conversion of the response column in place
        execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        # h2o.py now sets destination_key for a fixed default model name,
        # we want h2o to create model names for this test, so use none here
        kwargs = {'destination_key': None, 'response':4, 'family': 'gaussian'}
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, noPoll=True, **kwargs )
        print "GLM #%d" % trial, "started on ", csvPathname, 'took', time.time() - start, 'seconds'
        model_key = glmResult['destination_key']
        print "GLM model_key:", model_key
        if model_key in modelKeyDict:
            raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" %
                (trial, modelKeyDict[model_key]))
        modelKeyDict[model_key] = trial
    # just show the jobs still going, if any. maybe none, because short (iris)
    a = h2o.nodes[0].jobs_admin()
    h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_rapids_vec_fail1(self):
    """Create increasingly long rapids vectors (1e6..100e6 step 10e6), time
    the create and a v=v+v add, and plot the timings."""
    start = time.time()
    xList = []
    eList = []
    fList = []
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    keys = []
    # stop if > 1G (fails memory cleaner assetion
    maxx = 29
    # for trial in range(maxx):
    for trial in range(int(1e6),int(100e6),int(10e6)):
        # length = (2 ** trial)
        # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        length = trial
        execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        elapsed1 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)
        # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
        execExpr = '(= !v (+ %v %v))'
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30)
        elapsed2 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)
        # collect (length, create-time, add-time) for plotting below
        xList.append(length)
        eList.append(elapsed1)
        fList.append(elapsed2)
    if 1==1:
        xLabel = 'vector length'
        eLabel = 'elapsed (create v)'
        fLabel = 'elapsed (v = v + v)'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    print "\nExpressions that created keys"
    for k in keys:
        print k
    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)
    h2o.check_sandbox_for_errors()
def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent, outputClass=None, outputCol=None, changeToBinomial=False):
    """Split srcKey into train/test frames using one shared runif() column.

    Rows with cct.hex <= trainPercent/100 go to trainDstKey; the complement
    goes to testDstKey. If changeToBinomial, the outputCol (0-based; exec is
    1-based, hence +1) is rewritten to a 0/1 indicator for outputClass and
    factor()'ed so rf treats it as categorical.
    """
    # will have to live with random extract. will create variance
    print "train: get random", trainPercent
    print "test: get remaining", 100 - trainPercent
    if changeToBinomial:
        print "change class", outputClass, "to 1, everything else to 0. factor() to turn real to int (for rf)"

    boundary = (trainPercent + 0.0)/100

    execExpr = ""
    # runif with fixed seed arg -1; cct.hex is reused below for the complement
    execExpr += "cct.hex=runif(%s,-1);" % srcKey
    execExpr += "%s=%s[cct.hex<=%s,];" % (trainDstKey, srcKey, boundary)
    if changeToBinomial:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, outputCol+1, trainDstKey, outputCol+1, outputClass)
        execExpr += "factor(%s[, %s]);" % (trainDstKey, outputCol+1)
    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=30)
    inspect = runInspect(key=trainDstKey)
    infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

    print "test: same, but use the same runif() random result, complement comparison"
    execExpr = ""
    execExpr += "%s=%s[cct.hex>%s,];" % (testDstKey, srcKey, boundary)
    if changeToBinomial:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, outputCol+1, testDstKey, outputCol+1, outputClass)
        execExpr += "factor(%s[, %s])" % (testDstKey, outputCol+1)
    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=30)
    inspect = runInspect(key=testDstKey)
    infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
def test_GLM2_covtype_single_cols(self):
    """Parse covtype, binomialize the response in place, then run GLM
    repeatedly, each time passing a single ignored column (C1..C52)."""
    timeoutSecs = 120
    csvPathname = 'standard/covtype.data'
    print "\n" + csvPathname
    # columns start at 0
    y = 54
    ignore_x = ""
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key='A.hex', timeoutSecs=15)
    case = 2
    # in-place binomial conversion of the response column (+1: exec is R-like, 1-based)
    execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, case)
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
    print "GLM binomial ignoring 1 X column at a time"
    print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
    for colX in xrange(1,53):
        # NOTE(review): both branches assign the same single-column value; the
        # commented-out line suggests accumulation was once intended.
        if ignore_x == "":
            ignore_x = 'C' + str(colX)
        else:
            # x = x + "," + str(colX)
            ignore_x = 'C' + str(colX)
        sys.stdout.write('.')
        sys.stdout.flush()
        print "y:", y
        start = time.time()
        kwargs = {'ignored_cols': ignore_x, 'response': y, 'n_folds': 6 }
        glm = h2o_cmd.runGLM(parseResult={'destination_key': 'A.hex'}, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def execit(n, bucket, path, src_key, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    """Worker body for the multi-node exec test: node np1 either seeds r1
    (the n==0 case) or spins reading/flipping r<np> until it reads 1.

    Returns np1 (the node index used), rebinding the hex_key parameter.
    NOTE(review): bucket/path/src_key/retryDelaySecs/pollTimeoutSecs are
    unused here — presumably kept to match a shared worker signature.
    """
    np1 = (n+1) % len(h2o.nodes)
    np = (n) % len(h2o.nodes)
    # doesn't work cause we can't have racing writers
    # execExpr = "r2 = (r2==%s) ? %s+1 : %s" % (np1, np1)
    if np == 0:
        if READ_ONLY:
            execExpr = "(r%s==1) ? c(1) : c(0);" % np
        else:
            # seed the chain: first worker writes r1 = 1
            execExpr = "r%s = c(1)" % np1
        print "Sending request to node: %s" % h2o.nodes[np1],
        h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
    else:
        # flip to one if the prior value is 1 (unless you're the zero case
        if READ_ONLY:
            execExpr = "(r%s==1) ? c(1) : c(0);" % np
        else:
            execExpr = "r%s = (r%s==1) ? c(1) : c(0);" % (np1, np)
        print "Sending request to node: %s" % h2o.nodes[np1],
        (resultExec, fpResult) = h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
        # poll until the prior worker's 1 propagates to us
        while fpResult != 1:
            print "to node: %s" % h2o.nodes[np1]
            (resultExec, fpResult) = h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
    hex_key = np1
    return hex_key
def test_exec2_ddply_phrases(self):
    """For each column 1..9, define func1..func5 over that column and run
    every `phrases` entry through ddply grouped on column 2."""
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    # csvPathname = 'standard/covtype.data'
    csvPathname = "standard/covtype.shuffled.10pct.data"
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)
    for col in range(1,10):
        # redefine the helper functions for this column before each ddply batch
        initList = [
            ('r.hex', 'r.hex=i.hex'),
            (None, "func1=function(x){max(x[,%s])}" % col),
            (None, "func2=function(x){a=3;nrow(x[,%s])*a}" % col),
            (None, "func3=function(x){apply(x[,%s],2,sum)/nrow(x[,%s])}" % (col, col) ),
            # (None, "function(x) { cbind( mean(x[,1]), mean(x[,%s]) ) }" % col),
            (None, "func4=function(x) { mean( x[,%s]) }" % col),
            (None, "func5=function(x) { sd( x[,%s]) }" % col),
            # (None, "func6=function(x) { quantile(x[,%s] , c(0.9) ) }" % col),
        ]
        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)
        for p in phrases:
            execExpr = "ddply(r.hex, c(2), " + p + ")"
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
def test_exec2_operators2(self):
    """Run exprList randomized, then run the expressions concatenated into
    growing multi-statement exec strings, resetting every 3 expressions."""
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
    start = time.time()
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)
    # now run them just concatenating each time. We don't do any template substitutes, so don't need
    # exec_expr_list_rand()
    bigExecExpr = ""
    expCnt = 0
    for execExpr in exprList:
        bigExecExpr += execExpr + ";"
        h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
        expCnt += 1
        # limit to 5 expressions and see what happens
        # NOTE(review): the comment says 5 but the code resets after 3 (> 2)
        if expCnt > 2:
            bigExecExpr = ""
            expCnt = 0
    h2o.check_sandbox_for_errors()
    print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_50_nongz_fvec(self):
    """Import the 50-file nflx dataset (import repeated 5x, presumably to
    check import idempotency), parse it, copy to A.hex, and view the store."""
    avgMichalSize = 237270000
    bucket = "home-0xdiag-datasets"
    importFolderPath = "manyfiles-nflx-gz"
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
        ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
    ]
    pollTimeoutSecs = 120
    retryDelaySecs = 10
    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern
        hex_key = csvFilename + ".hex"
        # repeated imports of the same pattern; only the last result is inspected
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        importFullList = importResult["files"]
        importFailList = importResult["fails"]
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
        parseResult = h2i.import_parse(
            bucket=bucket, path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=600
        )
        execExpr = "A.hex=%s" % parseResult["destination_key"]
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        h2o_cmd.runStoreView(timeoutSecs=60)
def test_exec2_poppush2_fail(self):
    """Build 20 random expressions from `phrases`, append one known-fail
    anonymous-function expression, then execute everything."""
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    exprList = []
    while (len(exprList)!=20):
        exprs = [random.choice(phrases) for j in range(random.randint(1,2))]
        # check if we have mean2() before function defn
        functionFound = False
        for i, e in enumerate(exprs):
            if 'function' in e:
                functionFound = True
        # h2o has problems with assigns after functions
        # NOTE(review): both branches append identically — the filter this was
        # guarding is disabled (see the commented 'pass').
        if functionFound and len(exprs)> 1:
            # pass
            exprList.append("".join(exprs))
        else:
            exprList.append("".join(exprs))
    # add this one for good measure (known fail)
    # exprList += "crunk=function(x){x+98};r.hex[,3]=4;"
    exprList += ["function(x){x+98};r.hex[,3]=4;"]
    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
    for execExpr in exprList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
def test_exec2_multi_node(self):
    """Seed per-node r<N> keys, then fan out OUTSTANDING `execit` workers at
    a time via multiprocessing until TRIALMAX trials complete."""
    h2o.beta_features = True
    for n, node in enumerate(h2o.nodes):
        print "n:", n
        np1 = (n+1) % len(h2o.nodes)
        np = n % len(h2o.nodes)
        # get this key known to this node
        print "Init with independent targets. No shared target"
        execExpr = "r%s = c(0)" % np1
        print "Sending request to node: %s" % h2o.nodes[np1]
        h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
        # test the store expression
        execExpr = "(r%s==0)" % np1
        print "Sending request to node: %s" % h2o.nodes[np1]
        h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
    global OUTSTANDING
    if not OUTSTANDING:
        OUTSTANDING = min(10, len(h2o.nodes))
    execTrial = 0
    worker_resultq = multiprocessing.Queue()
    while execTrial <= TRIALMAX:
        start = time.time()
        workers = []
        for o in range(OUTSTANDING):
            np = execTrial % len(h2o.nodes)
            retryDelaySecs = 5
            timeoutSecs = 60
            bucket = None
            csvPathname = None
            src_key = None
            hex_key = 'a'
            tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                args=(worker_resultq, execit, np, bucket, csvPathname, src_key, hex_key,
                    timeoutSecs, retryDelaySecs))
            tmp.start()
            workers.append(tmp)
            execTrial += 1
        # Exec doesn't get tracked as a job. So can still have outstanding
        # now sync on them
        for worker in workers:
            try:
                # this should synchronize
                worker.join()
                print "worker joined:", worker
                # don't need him any more
                worker.terminate()
                hex_key = worker_resultq.get(timeout=2)
            except KeyboardInterrupt:
                print 'parent received ctrl-c'
                for worker in workers:
                    worker.terminate()
                    worker.join()
        elapsed = time.time() - start
        print "Group end at #", execTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def doAll(case): keys = [] trial = 0 for execExpr in exprList: # 4x4 cases per expression colons = [ # requires only 1 value on rhs '#0 #0', # '"null" #0', # '#0 "null"', # '"null" "null"', ] for colon in colons: # what if the destination doesn't exist?. Use unique name for each, to see t = "t%s" % trial cases = [ # no colon '(= !{} {})'.format(t, execExpr), # colon lhs # '(= ([ %%s %s) %s)' % (t, colon, execExpr), # colon rhs # '(= !%s ([ %s %s))' % (t, execExpr, colon), # colon lhs and rhs '(= ([ %{} {}) ([ {} {}))'.format(t, colon, execExpr, colon), ] for case in cases: # init the data frame first to 0 (1 row, 1 col) print "\nt:", t, "case:", case # can't init it to empty '(= !%s (c {#0})' % t execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4) # colonize it, to see if it blows up! # since they all are assigns, they all are wrapped by '(= !<lhs> ...) # unwrap the inner and wrap it with a colon then wrap it with the assign # change the lhs to be coloned (row and/or col) and change the rhs to be a colon # so four cases # make sure the lhs assign key exists first execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4) # rows/cols could be zero # if execResult['num_rows'] or execResult['num_cols']: # I think if key is not null, then that means a key got created # oh, but exec deletes ones with leading "_" immediately? those are temp keys # we'll put them in the list and see if we see them if execResult['key']: keys.append(execExpr) trial += 1 print "\nExpressions that created keys" for k in keys: print k if re.match('_', k): raise Exception("%s I didn't expect any keys with leading underscores." + "\nDoesn't spencer delete those so I can't read them?" % k) h2o.check_sandbox_for_errors()
def test_parse_cust(self):
    """Import everything under the 0xcustomer datasets folder, sanity-check
    the import result, then parse a random sample of the csv/tsv keys and
    exec-slice a scalar out of each parsed frame.
    """
    # run as user 0xcustomer to get access (with .json config and ssh key file specified)
    importFolderPath = '/mnt/0xcustomer-datasets'
    pollTimeoutSecs = 120
    retryDelaySecs = 30
    timeoutSecs = 300

    (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
    importFileList = importResult['files']
    importFailList = importResult['fails']
    importKeyList = importResult['keys']
    importDelList = importResult['dels']

    # MINFILES is a module-level threshold for how many files must import cleanly
    if len(importDelList)!=0:
        raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))
    if len(importFileList)<MINFILES:
        raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))
    if len(importKeyList)<MINFILES:
        raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))
    if len(importFailList)!=0:
        raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))

    # only parse files with .csv or .tsv in their name (no dirs like that?)
    goodKeyList = [key for key in importKeyList if ('.csv' in key or '.tsv' in key)]
    trial = 0
    # just do 1?
    for i, importKey in enumerate(random.sample(goodKeyList,3)):
        print "importKey:", importKey
        trial +=1
        start = time.time()

        # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
        # force header=0..should mean headers get treated as NAs
        parseResult = h2i.parse_only(pattern=importKey, header=0,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        origKey = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=origKey)
        h2o_cmd.infoFromInspect(inspect, origKey)

        # slice a single scalar out of the parsed frame (exec smoke test)
        execExpr = 'newKey = '+origKey+'[1,1]'
        h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
        newParseKey = {'destination_key': 'newKey'}

        h2o_cmd.checkKeyDistribution()
        h2o.nodes[0].remove_key(key=origKey)
        # a key isn't created for a scalar
        # h2o.nodes[0].remove_key(key='newKey')

    # MINDONE is a module-level threshold on how many parses must have completed
    self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
def test_B_claim_prediction_binomial(self):
    """Binomial GLM on the allstate claim-prediction set: exec-binarize the
    claim amount column (>100 counts as a claim), then run GLM against
    'A.hex' and sanity-check the model.
    """
    glmParams = {
        'family': 'binomial',
        'response': 'Claim_Amount',
        'alpha': 0,
        'lambda': 0.5,
        'max_iter': 15,
    }
    dataPath = 'allstate/claim_prediction_train_set_10000_int.csv.gz'
    parseResult = h2i.import_parse(bucket='smalldata', path=dataPath, schema='put', hex_key='A.hex')

    # turn the response column into 0/1 in place
    h2o_exec.exec_expr(execExpr='A.hex[,35] = A.hex[,35]>100')
    parseResult['destination_key'] = 'A.hex'

    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=150, **glmParams)
    h2o_glm.simpleCheckGLM(self, glm, None, **glmParams)
def test_exec2_cbind_fail2(self):
    """Repeatedly create two vectors and cbind them, then check the sandbox
    logs for errors (regression test for a cbind failure)."""
    for _ in range(5):
        # create the operand keys, then cbind them into h
        for expr in ("a=c(0,0,0); b=c(0,0,0)", "h <- cbind(a, b)"):
            h2e.exec_expr(execExpr=expr, timeoutSecs=30)
    h2o.check_sandbox_for_errors()
def test_parse_manyfiles_1(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirname = "manyfiles-nflx-gz" timeoutSecs = 600 trial = 0 for iteration in range(ITERATIONS): csvFilename = "file_1.dat.gz" csvPathname = csvDirname + "/" + csvFilename trialStart = time.time() # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA, hex_key=hex_key, delete_on_done=DELETE_ON_DONE, # importParentDir=IMPORT_PARENT_DIR, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False) elapsed = time.time() - start print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numCols, 542) self.assertEqual(numRows, 100000) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" for node in h2o.nodes: h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000) # convert to binomial if DO_EXEC: execExpr="A.hex=%s" % parseResult['destination_key'] 
h2e.exec_expr(execExpr=execExpr, timeoutSecs=20) # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)' # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20) if DO_DELETE_MYSELF: h2o_import.delete_keys_at_all_nodes() print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_exec2_multi_node3(self):
    """Variant of the multi-node exec stress test: every node initializes the
    same three shared result keys (r0..r2), optionally exercises a mux/store
    (ternary) expression, then groups of OUTSTANDING worker processes fire
    exec requests concurrently and are joined and timed per group.
    """
    for initTrial in range(1):
        for node in h2o.nodes:
            # get this key known to this node
            execExpr = "r0 = c(0,0); r1 = c(0,0); r2 = c(0,0);"
            print "Sending request to node: %s" % node
            h2e.exec_expr(node=node, execExpr=execExpr, timeoutSecs=30)

            if TEST_MUX_STORE:
                # test the store expression
                execExpr = "(r1==c(0,0)) ? c(0,0) : c(1,1)"
                print "Sending request to node: %s" % node
                h2e.exec_expr(node=node, execExpr=execExpr, timeoutSecs=30)

    # OUTSTANDING is a module-level knob; default to cloud size, capped at 10
    global OUTSTANDING
    if not OUTSTANDING:
        OUTSTANDING = min(10, len(h2o.nodes))

    execTrial = 0
    worker_resultq = multiprocessing.Queue()
    while execTrial <= TRIALMAX:
        start = time.time()
        workers = []
        for o in range(OUTSTANDING):
            # round-robin the target node across trials
            np = execTrial % len(h2o.nodes)
            retryDelaySecs = 5
            timeoutSecs = 60
            bucket = None
            csvPathname = None
            src_key = None
            hex_key = 'a'
            tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                args=(worker_resultq, execit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
            tmp.start()
            workers.append(tmp)
            execTrial += 1

        # Exec doesn't get tracked as a job. So can still have outstanding
        # now sync on them
        for worker in workers:
            try:
                # this should synchronize
                worker.join()
                print "worker joined:", worker
                # don't need him any more
                worker.terminate()
                hex_key = worker_resultq.get(timeout=2)
            except KeyboardInterrupt:
                print 'parent received ctrl-c'
                for worker in workers:
                    worker.terminate()
                    worker.join()

        elapsed = time.time() - start
        # timeoutSecs is the per-worker value set in the launch loop above
        print "Group end at #", execTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_GLM2_twovalues(self):
    """Write a synthetic two-class csv with mixed space/tab padding around the
    numbers, parse it, exec-binarize column 13, run binomial GLM, and check
    the coefficient count matches expectations (a wrong count means the
    parser mis-handled the whitespace and GLM expanded bogus categoricals).
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_twovalues.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    # H2O might not do whitespace stripping on numbers correctly, when , is {SEP}
    # GLM will auto expand categoricals..so if we have more coefficients than expected
    # that means it didn't parse right
    # mix in space/tab combos
    # just done like this for readability
    # (backslash continuations inside the literal keep it one string; the
    # continuation lines are at column 0 so no stray whitespace is embedded)
    rowDataTrueRaw = \
    "<sp>1,\
0<sp>,\
<tab>65,\
1<tab>,\
<sp><tab>2,\
1<sp><tab>,\
<tab><sp>1,\
4<tab><sp>,\
<tab><tab>1,\
4<tab><tab>,\
<sp><sp>1,\
4<sp><sp>"

    rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw)
    # NOTE(review): the replacement for <tab> appears here as a space; it may
    # originally have been a literal tab flattened by formatting — verify
    # against the repository copy before relying on tab-specific behavior.
    rowDataTrue = re.sub("<tab>"," ", rowDataTrue)

    rowDataFalse = \
    "0,\
1,\
0,\
-1,\
-2,\
-1,\
-1,\
-4,\
-1,\
-4,\
-1,\
-3"

    # (outputTrue, outputFalse, case value to binarize on, expected coefficient count)
    twoValueList = [
        # (0,1,0, 12),
        # (0,1,1, 12),
        # ('A','B',0, 12),
        # ('A','B',1, 12),
        (-1,1,-1, 12),
        (-1,1,1, 12),
        (-1e1,1e1,1e1, 12),
        (-1e1,1e1,-1e1, 12),
    ]

    trial = 0
    for (outputTrue, outputFalse, case, expectedCoeffNum) in twoValueList:
        write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

        hex_key = csvFilename + "_" + str(trial)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)

        # maybe go back to simpler exec here. this was from when Exec failed unless this was used
        execExpr="A.hex=%s" % hex_key
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # binarize the response: col 13 == case -> 1, else 0
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (13, 13, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        aHack = {'destination_key': 'A.hex'}

        start = time.time()
        kwargs = {
            'n_folds': 0,
            'response': 'C13',
            'family': 'binomial',
            'alpha': 0.0,
            'lambda': 0,
            'beta_epsilon': 0.0002
        }

        # default takes 39 iterations? play with alpha/beta
        print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)
        glm = h2o_cmd.runGLM(parseResult=aHack, **kwargs)
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # check that the number of entries in coefficients is right (12 with intercept)
        coefficients_names = glm['glm_model']['coefficients_names']
        print "coefficients_names:", coefficients_names

        # subtract one for intercept
        actualCoeffNum = len(glm['glm_model']['submodels'][0]['beta']) - 1
        if (actualCoeffNum!=expectedCoeffNum):
            raise Exception("Should be %s expected coefficients in result. actual: %s" % (expectedCoeffNum, actualCoeffNum))

        print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        trial += 1
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey + "=" + hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=hex_key, destination_key=predictHexKey) print "generate_predictions end on ", hex_key, " took", time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on source if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)): raise Exception( "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, 
skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o) != str(p): if wrong == 10: print "Not printing any more mismatches\n" elif wrong < 10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong) / len(originalOutput) print "wrong/Total * 100 ", pctWrong return pctWrong
def test_exec2_xorsum2(self):
    """Write synthetic real-valued datasets, parse them, and compare h2o's
    exec xorsum/sum results bit-for-bit (as unsigned 64-bit views of the
    doubles) against the expected values computed in python, tolerating a
    few lsbs of error from string conversion.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(3):
        ullResultList = []
        # number of float-formatting variants available for dataset generation
        NUM_FORMAT_CASES = h2o_util.fp_format()
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            # pick a random float formatting style for this file
            sel = random.randint(0, NUM_FORMAT_CASES-1)
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                for repeate in range(3):
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                        resultKey=None, timeoutSecs=300)
                    print 'exec took', time.time() - start, 'seconds'
                    print "execResult:", h2o.dump_json(execResult)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way. needed when integers are parsed
                    # okay for a couple of lsbs to be wrong, due to conversion from stringk
                    # ullResult (0.16x): 0x02c1a21f923cee96 2.15698793923e-295
                    # expectedUllSum (0.16x): 0x02c1a21f923cee97 2.15698793923e-295
                    # expectedFpSum (0.16x): 0x42f054af32b3c408 2.87294442126e+14
                    # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                    # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                    ALLOWED_BIT_ERR = 0x1f # seeing this amount of error!
                    if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR):
                        emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)
                        if STOP_ON_ERROR:
                            raise Exception(emsg)
                        else:
                            print emsg

                    # print "%30s" % "hex(bitResult):", hex(ullResult)

        h2o.check_sandbox_for_errors()

        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_exec2_quantile_na_scalar(self):
    """Cross-check the exec quantile result against the Quantiles page for
    data that includes NAs. initList/exprList/expectedP/QUANTILE are
    module-level inputs; the exec expressions build the keys 'ddd'/'abc'
    that the quantiles page and inspect then read.
    """
    h2o.beta_features = True
    # build the data keys the exec/quantile expressions below depend on
    for execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)

    for (execExpr, num) in exprList:
        start = time.time()
        resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
        print 'exec end took', time.time() - start, 'seconds'
        h2p.blue_print("h2o exec quantiles result:", result)
        self.assertEqual(result, expectedP,
            msg="Checking exec quantiles median, expectedP: %s result: %s" % (expectedP, result))
        print h2o.dump_json(resultExec)

        # do the quantiles page on the created key
        kwargs = {
            'column': 0,
            'quantile': QUANTILE,
            'multiple_pass': 2,
            'max_qbins': 1000,
        }
        q = h2o.nodes[0].quantiles(source_key='ddd', **kwargs)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)
        self.assertEqual(qresult_iterations, 3, msg="should take 3 iterations")
        # self.assertEqual(qresult_interpolated, True, msg="Should say it's interpolating")
        self.assertEqual(qresult, expectedP,
            msg="Checking quantilespage median, expectedP: %s result: %s" % (expectedP, qresult))

        # sanity-check the geometry of the exec-created key 'abc'
        inspect = h2o_cmd.runInspect(key='abc')
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        print "numCols:", numCols
        print "numRows:", numRows
        self.assertEqual(numCols, 1)
        self.assertEqual(numRows, num)

    h2o.check_sandbox_for_errors()
def test_benchmark_import(self):
    """Parse-throughput benchmark: for each (pattern, name, bytes, timeout)
    entry, rebuild the cloud, import+parse from the import folder, log MB/sec
    to the cloudPerfH2O benchmark log, exec-slice a small frame, then tear the
    cloud down before the next file.

    NOTE(review): the noPoll branch references 'i', 'URI' and 'protocol',
    which are never defined in this function — it would raise NameError if
    noPoll were True. noPoll is hard-coded False below, so the branch is
    currently dead; confirm before enabling it.
    """
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400

    # 'if 1 == 0' / 'if 1 == 1' are manual toggles between the uncompressed
    # and gzip'ed dataset configurations
    if 1 == 0:
        importFolderPath = '/home2/0xdiag/datasets'
        print "Using non-.gz'ed files in", importFolderPath
        csvFilenameAll = [
            # I use different files to avoid OS caching effects
            ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
        ]

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets'
        print "Using .gz'ed files in", importFolderPath
        # all exactly the same prior to gzip!
        # could use this, but remember import folder -> import folder s3 for jenkins?
        # how would it get it right?
        # os.path.getsize(f)
        csvFilenameAll = [
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
            # 100 files takes too long on two machines?
            # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
            # I use different files to avoid OS caching effects
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
            # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_*.dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1200),
            # do it twice
            # ("covtype.data", "covtype.data"),
            # ("covtype20x.data", "covtype20x.data"),
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 10
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    noPoll = False
    benchmarkLogging = ['cpu', 'disk', 'iostats', 'jstack']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in csvFilenameList:
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['succeeded']
            importFailList = importFolderResult['failed']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)

            # dead unless noPoll is flipped on; see NOTE(review) in docstring
            if noPoll:
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    parse2Key = h2o.nodes[0].parse(s3nKey, key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    parse3Key = h2o.nodes[0].parse(s3nKey, key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice(' + origKey + ',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            h2o_cmd.check_key_distribution()
            h2o_cmd.delete_csv_key(csvFilename, importFullList)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_rapids_cbind_vec(self):
    """Rapids stress test: build a 10M-element vector %v, then cbind it to
    itself with a doubling column count (1, 2, 4, 8, 16), timing each cbind
    and plotting elapsed times.
    """
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    # stop if > 1G (fails memory cleaner assetion
    maxx = 10
    # for trial in range(maxx):
    # for trial in range(int(1e6),int(200e6),int(1e6)):
    for trial in [int(10e6)]:
        # length = (2 ** trial)
        # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        length = trial
        # create the base vector %v: the sequence 0..length-1
        execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        elapsed1 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)

    # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
    # cols = 100
    xList = []
    eList = []
    fList = []
    for trial2 in range(0, 5):
    # for trial2 in range(0, 10): # fails. Post size?
    # for trial2 in range(0, 16):
        # double the cbind width each trial
        col = 2 ** trial2
        # assert col < 16384, "h2o can't take col == 16384 or more"
        vString = ' '.join(['%v' for x in range(col)])
        execExpr = '(= !v2 (cbind %s))' % vString

        # FIX! check the colnames. 2 cols get C1 and C10? odd
        # try:
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
        elapsed2 = time.time() - start
        if execResult['num_rows']:
            keys.append(execExpr)
        # except:
        #     elapsed2 = 0
        #     h2p.red_print("ERROR: col = %s failed" % col)

        # disabled sum timing; elapsed1 keeps the vector-creation time instead
        if 1==0:
            start = time.time()
            execExpr = '(sum %v2 %TRUE)'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
            elapsed1 = time.time() - start

        # xList.append(length)
        xList.append(col)
        eList.append(elapsed1)
        fList.append(elapsed2)

    if 1==1:
        xLabel = 'col'
        eLabel = 'elapsed (sum)'
        fLabel = 'elapsed (cbind cols)'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def test_rf_covtype_train_oobe3(self):
    """RF on covtype with increasing training fractions: randomFilter a 90%
    train / 10% test split, slice growing training subsets, train RF with
    OOBE, then score the held-out 10% and check both accuracies against
    hard-coded expected values (delta 0.2).
    """
    print "\nUse randomFilter to sample the dataset randomly. then slice it"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)

    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # index = trial number (10% steps); entry 0 is a placeholder
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # FIX! too many digits (10) in the 2nd param seems to cause stack trace
    # same seed (12345) for both filters, so test/train come from the same shuffle
    execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1,10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)

        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        kwargs['iterative_cm'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # report actual vs expected so the hard-coded lists can be refreshed
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def test_exec2_cbind_fail3(self):
    """Stress cbind with a growing number of operand keys, five rounds,
    then check the sandbox logs for errors (regression test for a cbind
    failure with many args)."""
    setupExprs = (
        "h <- cbind(c(0,0,0), c(1,1,1))",
        # have to make sure they're created as keys for reuse between execs
        "a=c(0,0,0); b=c(0,0,0); d=c(0,0,0); e=c(0,0,0); f=c(0,0,0); g= c(0,0,0);",
        "b=a; d=a; f=a; g=a;",
    )
    cbindExprs = (
        "h <- cbind(a, b)",
        "h <- cbind(a, b, d)",
        "h <- cbind(a, b, d, e)",
        "h <- cbind(a, b, d, e, f)",
        "h <- cbind(a, b, d, e, f, g)",
    )
    for _ in range(5):
        for expr in setupExprs + cbindExprs:
            h2e.exec_expr(execExpr=expr, timeoutSecs=30)
    h2o.check_sandbox_for_errors()
def test_GLM2_covtype_train_predict_all_all(self):
    """GLM binomial on covtype with train == test (same full dataset).

    Parses covtype.shuffled.data, rewrites the response column to binary
    (class 1 vs everything else), trains GLM on the full data, predicts on
    the same data, and asserts the training CM error equals the predict CM
    error (they must match since both use the identical dataset).
    """
    importFolderPath = "standard"
    csvFilename = 'covtype.shuffled.data'
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"

    # Parse and Exec************************************************
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

    # copy the parsed frame to a fixed key name so later exec exprs can use "A.hex"
    execExpr="A.hex=%s" % parseResult['destination_key']
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

    # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
    # will have to live with random extract. will create variance
    # class 4 = 1, everything else 0
    # NOTE(review): the comment above says class 4 but the expression below
    # compares against 1 ("class 1") — the inline comment is the one that
    # matches the code; confirm which class was intended.
    y = 54
    execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 1) # class 1
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

    inspect = h2o_cmd.runInspect(key="A.hex")
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    print "Use same data (full) for train and test"
    trainDataKey = "A.hex"
    testDataKey = "A.hex"
    # start at 90% rows + 1

    # GLM, predict, CM*******************************************************8
    kwargs = {
        'response': 'C' + str(y+1),
        'max_iter': 20,
        'n_folds': 0,
        # 'alpha': 0.1,
        # 'lambda': 1e-5,
        'alpha': 0.0,
        'lambda': None,
        'family': 'binomial',
    }
    timeoutSecs = 60

    for trial in range(1):
        # test/train split **********************************************8
        aHack = {'destination_key': trainDataKey}

        # GLM **********************************************8
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        modelKey = glm['glm_model']['_key']

        submodels = glm['glm_model']['submodels']
        # hackery to make it work when there's just one
        validation = submodels[-1]['validation']
        best_threshold = validation['best_threshold']
        thresholds = validation['thresholds']
        # have to look up the index for the cm, from the thresholds list
        # NOTE(review): if best_threshold is not found in thresholds (e.g. fp
        # mismatch), best_index stays None and cms[best_index] below raises —
        # consider a tolerance compare or an explicit failure message.
        best_index = None
        for i,t in enumerate(thresholds):
            if t == best_threshold:
                best_index = i
                break
        cms = validation['_cms']
        cm = cms[best_index]
        # pct. wrong on the training data, from the CM at the best threshold
        trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);

        # Score **********************************************
        predictKey = 'Predict.hex'
        start = time.time()

        predictResult = h2o_cmd.runPredict(
            data_key=testDataKey,
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=testDataKey,
            vactual='C' + str(y+1),
            predict=predictKey,
            vpredict='predict',
            )

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        # train and predict used the same dataset, so the error rates must agree
        self.assertEqual(pctWrong, trainPctWrong,"Should see the same error rate on train and predict? (same data set)")

        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_parse_covtype_loop_fvec(self):
    """Upload/parse the same dataset many times concurrently via subprocesses.

    Spawns OUTSTANDING uploader processes per group (each targeting a
    different node, round-robin), joins them, and repeats until trialMax
    trials have been launched. Optionally follows up with per-key exec
    quantile/xorsum/summary checks. Dataset selection is driven by the
    module-level DO_IRIS / DO_BIGFILE flags.
    """
    h2o.beta_features = True
    # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
    # don't raise exception if we find something bad in h2o stdout/stderr?
    # h2o.nodes[0].sandboxIgnoreErrors = True
    global OUTSTANDING
    if not OUTSTANDING:
        OUTSTANDING = min(10, len(h2o.nodes))

    # pick dataset + trial count based on module flags
    if DO_IRIS:
        global DO_BIGFILE
        DO_BIGFILE = False
        bucket = 'smalldata'
        importFolderPath = "iris"
        csvFilename = "iris2.csv"
        csvFilePattern = "iris2.csv"
        if localhost:
            trialMax = 20
        else:
            trialMax = 100
    elif DO_BIGFILE:
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "standard"
        csvFilename = "covtype20x.data"
        csvFilePattern = "covtype20x.data"
        trialMax = 2 * OUTSTANDING
    else:
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "standard"
        csvFilename = "covtype.data"
        csvFilePattern = "covtype.data"
        trialMax = 40 * OUTSTANDING

    # add one just to make it odd
    # OUTSTANDING = min(10, len(h2o.nodes) + 1)
    # don't have more than one source file per node OUTSTANDING? (think of the node increment rule)
    # okay to reuse the src_key name. h2o deletes? use unique hex to make sure it's not reused.
    # might go to unique src keys also ..oops have to, to prevent complaints about the key (lock)
    # can't repeatedly import the folder
    # only if not noPoll. otherwise parse isn't done
    # I guess I have to use 'put' so I can name the src key unique, to get overlap
    # I could tell h2o to not delete, but it's nice to get the keys in a new place?
    # maybe rebalance? FIX! todo

    parseTrial = 0
    summaryTrial = 0
    uploader_resultq = multiprocessing.Queue()
    while parseTrial <= trialMax:
        start = time.time()
        uploaders = []
        if not DO_IRIS:
            # the non-iris source files are hard-linked copies named _0 .. _9
            assert OUTSTANDING<=10 , "we only have 10 links with unique names to covtype.data"
        for o in range(OUTSTANDING):
            # unique src/hex keys per trial so h2o key locking doesn't collide
            src_key = csvFilename + "_" + str(parseTrial)
            hex_key = csvFilename + "_" + str(parseTrial) + ".hexxx"
            # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz",
            # hacked hard ln so source keys would have different names? was getting h2o locking issues
            if DO_IRIS:
                csvPathname = importFolderPath + "/" + csvFilePattern
            else:
                csvPathname = importFolderPath + "/" + csvFilePattern + "_" + str(o)
            start = time.time()
            # walk the nodes
            # if this rule is matched for exec/summary below, it should find the name okay? (npe with xorsum)
            # summary2 not seeing it?
            np = parseTrial % len(h2o.nodes)
            retryDelaySecs=5 if DO_BIGFILE else 1
            timeoutSecs=60 if DO_BIGFILE else 15
            tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                args=(uploader_resultq, uploadit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
            tmp.start()
            uploaders.append(tmp)
            parseTrial += 1

        # now sync on them
        for uploader in uploaders:
            try:
                uploader.join()
                # don't need him any more
                uploader.terminate()
                (importPattern, hex_key) = uploader_resultq.get(timeout=2)
            except KeyboardInterrupt:
                print 'parent received ctrl-c'
                for uploader in uploaders:
                    uploader.terminate()
                    uploader.join()
        elapsed = time.time() - start
        print "Parse group end at #", parseTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

    print "We might have parses that haven't completed. The join just says we can reuse some files (parse still going)"

    if PARSE_NOPOLL:
        h2o_jobs.pollWaitJobs(timeoutSecs=180)

    h2o_cmd.runStoreView()
    # h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=0.25)

    if DO_PARSE_ALSO: # only if we parsed
        print "These all go to node [0]"
        # getting a NPE if I do xorsum (any exec?) ..just do summary for now..doesn't seem to have the issue
        # suspect it's about the multi-node stuff above
        for summaryTrial in range(trialMax):
            # do last to first..to get race condition?
            # NOTE(review): firstXorUll/firstQuantileUll are reset to None on
            # every iteration, so the assertEqual branches below can never
            # execute — if cross-trial comparison was intended, these inits
            # should be hoisted above the loop. Confirm intent.
            firstXorUll = None
            firstQuantileUll = None
            hex_key = csvFilename + "_" + str(summaryTrial) + ".hexxx"

            if DO_EXEC_QUANT:
                execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, thresholds)
                (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                print "%30s" % "median ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                if firstQuantileUll:
                    self.assertEqual(ullResult, firstQuantileUll)
                else:
                    firstQuantileUll = ullResult

            if DO_XORSUM:
                execExpr = "r2=c(1); r2=xorsum(%s[,1], c(%s));" % (hex_key, thresholds)
                (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                print "%30s" % "xorsum ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                if firstXorUll:
                    self.assertEqual(ullResult, firstXorUll)
                else:
                    firstXorUll = ullResult

            if DO_SUMMARY:
                h2o_cmd.runSummary(key=hex_key)
def sub_c3_nongz_fvec_long(self, csvFilenameList):
    """Benchmark parse (and optionally GLM) over a list of non-gzipped files.

    csvFilenameList: iterable of (csvFilepattern, csvFilename, totalBytes,
    timeoutSecs) tuples. Parse timings (and MB/s when totalBytes is given)
    are appended to the shared benchmark log via h2o.cloudPerfH2O.
    """
    h2o.beta_features = True
    # a kludge
    h2o.setup_benchmark_log()

    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    print "Using nongz'ed files in", importFolderPath

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        if DO_DOUBLE_IMPORT:
            # import once before the import_parse below, to exercise double-import
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key="A.hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs,
            benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        if totalBytes is not None:
            # parse throughput for the benchmark log
            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # remove the output too! (378)
            ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
            # convert 0-based column indices to h2o's 1-based "C<n>" names
            ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

            GLMkwargs = {
                'ignored_cols': ignore_x,
                'response': 'C379',
                'max_iter': 4,
                'n_folds': 1,
                'family': 'binomial',
                'alpha': 0.2,
                'lambda': 1e-5}

            # convert to binomial
            # execExpr="A.hex=%s" % parseResult['destination_key']
            # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            # binarize the response: >15 -> 1, else 0
            execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            aHack = {'destination_key': "A.hex"}

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
def test_ddply_plot(self):
    """Time ddply() over synthetic datasets with an increasing value range.

    For each (rowCount, colCount, key, minInt, maxInt, timeout) tuple, writes
    a random dataset, runs ddply twice (the second run is the timed one),
    checks the resulting group count is within 20% of the theoretical max,
    and verifies both ddply runs produced identical results. Optionally
    plots elapsed time vs. group count.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (1000000, 5, 'cD', 0, 10, 30),
        (1000000, 5, 'cD', 0, 20, 30),
        (1000000, 5, 'cD', 0, 30, 30),
        (1000000, 5, 'cD', 0, 40, 30),
        (1000000, 5, 'cD', 0, 50, 30),
        (1000000, 5, 'cD', 0, 70, 30),
        (1000000, 5, 'cD', 0, 100, 30),
        (1000000, 5, 'cD', 0, 130, 30),
        (1000000, 5, 'cD', 0, 160, 30),
        # (1000000, 5, 'cD', 0, 320, 30),
        # starts to fail here. too many groups?
        # (1000000, 5, 'cD', 0, 640, 30),
        # (1000000, 5, 'cD', 0, 1280, 30),
    ]

    ### h2b.browseTheCloud()
    xList = []
    eList = []
    fList = []
    trial = 0
    for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1
        write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

        # PARSE train****************************************
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

        # do it twice..to get the optimal cached delay for time?
        # (first run warms caches; only the second run's timing is recorded)
        execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
        start = time.time()
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        ddplyElapsed = time.time() - start
        print "ddplyElapsed:", ddplyElapsed

        execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
        start = time.time()
        (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        # one ddply result row per group; worst case is every (col1,col2) combo
        groups = execResult['num_rows']
        maxExpectedGroups = ((maxInt - minInt) + 1)**2
        h2o_util.assertApproxEqual(
            groups,
            maxExpectedGroups,
            rel=0.2,
            msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups))

        ddplyElapsed = time.time() - start
        print "ddplyElapsed:", ddplyElapsed
        print "execResult", h2o.dump_json(execResult)

        # should be same answer in both cases
        execExpr = "d=sum(a1!=a2)==0"
        (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        print "execResult", h2o.dump_json(execResult)
        self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

        # xList.append(ntrees)
        trial += 1
        # this is the biggest it might be ..depends on the random combinations
        # groups = ((maxInt - minInt) + 1) ** 2
        xList.append(groups)
        eList.append(ddplyElapsed)
        fList.append(ddplyElapsed)

    if DO_PLOT:
        xLabel = 'groups'
        eLabel = 'ddplyElapsed'
        fLabel = 'ddplyElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_covtype_train(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? 
numRows = inspect['numRows'] pct10 = int(numRows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = numRows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" trainDataKey = "rTrain" testDataKey = "rTest" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y+1), 'max_iter': 20, 'n_folds': 0, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', } timeoutSecs = 180 for trial in range(10): # always slice from the beginning rowsToUse = rowsForPct[trial%10] # test/train split **********************************************8 h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90) aHack = {'destination_key': trainDataKey} parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", 
trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
def test_ddply_plot(self):
    """Time ddply() over synthetic (or known-fail) datasets and verify results.

    Runs ddply twice per dataset (a1 and a2), asserts the group count is
    within 20% of the theoretical max, asserts neither result contains NAs,
    and asserts a1 == a2. DO_KNOWN_FAIL / DO_APPEND_KNOWN_FAIL2 switch in
    datasets that historically triggered failures. Optionally plots timing.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    if DO_KNOWN_FAIL:
        tryList = [
            (1000000, 5, 'cD', 0, 320, 30),
        ]
    else:
        tryList = [
            # (1000000, 5, 'cD', 0, 10, 30),
            (1000000, 5, 'cD', 0, 20, 30),
            # (1000000, 5, 'cD', 0, 40, 30),
            (1000000, 5, 'cD', 0, 50, 30),
            # (1000000, 5, 'cD', 0, 80, 30),
            (1000000, 5, 'cD', 0, 160, 30),
            # fails..don't do
            # (1000000, 5, 'cD', 0, 320, 30),
            # (1000000, 5, 'cD', 0, 320, 30),
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30),
            # (1000000, 5, 'cD', 0, 1280, 30),
        ]

    if DO_APPEND_KNOWN_FAIL2:
        tryList.append( (1000000, 5, 'cD', 0, 160, 30), )
        tryList.append( (1000000, 5, 'cD', 0, 320, 30), )

    ### h2b.browseTheCloud()
    xList = []
    eList = []
    fList = []
    trial = 0
    for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        if DO_KNOWN_FAIL:
            # use a canned dataset known to reproduce the failure
            # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails
            # csvFilename = 'a1' # fails
            csvFilename = "syn_ddply_1Mx5_0_320.gz"
            bucket = "home-0xdiag-datasets"
            csvPathname = "standard/" + csvFilename
            minInt = 0
            maxInt = 320
        else:
            bucket = None
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt-minInt)+1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

        for lll in range(1):
            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)

            inspect = h2o_cmd.runInspect(key=hexKey)
            missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename)
            self.assertEqual(missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList)

            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

            #*****************************************************************************************
            # two columns. so worse case every combination of each possible value
            # only true if enough rows (more than the range?)
            maxExpectedGroups = ((maxInt - minInt) + 1) ** 2

            # do it twice..to get the optimal cached delay for time?
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            groups = execResult['num_rows']
            # this is a coarse comparision, statistically not valid for small rows, and certain ranges?
            h2o_util.assertApproxEqual(groups, maxExpectedGroups, rel=0.2,
                msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            a1dump = h2o_cmd.runInspect(key="a1")
            print "a1", h2o.dump_json(a1dump)

            # should never have any NAs in this result
            missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1")
            self.assertEqual(missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial))

            #*****************************************************************************************
            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            groups = execResult['num_rows']
            # this is a coarse comparision, statistically not valid for small rows, and certain ranges?
            h2o_util.assertApproxEqual(groups, maxExpectedGroups, rel=0.2,
                msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            a2dump = h2o_cmd.runInspect(key="a2")
            print "a2", h2o.dump_json(a2dump)

            # should never have any NAs in this result
            missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2")
            self.assertEqual(missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial))

            #*****************************************************************************************
            # should be same answer in both cases
            execExpr = "sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            execExpr = "s=c(0); s=(a1!=a2)"
            (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=120)
            print "execResult", h2o.dump_json(execResult)

            #*****************************************************************************************
            # should never have any NAs in this result
            sdump = h2o_cmd.runInspect(key="s")
            print "s", h2o.dump_json(sdump)

            # NOTE(review): message interpolates FUNC_PHRASE while the exec
            # expressions above use PHRASE — if FUNC_PHRASE isn't defined at
            # module level this line raises NameError instead of the intended
            # assertion message. Confirm which global is meant.
            self.assertEqual(result, 1, "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult)))

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)

    if DO_PLOT:
        xLabel = 'groups'
        eLabel = 'ddplyElapsed'
        fLabel = 'ddplyElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_multi_node(self):
    """Exercise exec2 concurrently against multiple nodes via subprocesses.

    First initializes an independent result key (r<n>) on each node and
    checks a conditional store expression against it. Then fans out
    OUTSTANDING worker processes per group (round-robin over nodes), each
    running the module-level execit() via function_no_keyboard_intr, and
    joins them, until TRIALMAX trials have been launched.
    """
    h2o.beta_features = True
    for n, node in enumerate(h2o.nodes):
        print "n:", n
        np1 = (n + 1) % len(h2o.nodes)
        np = n % len(h2o.nodes)
        # get this key known to this node
        print "Init with independent targets. No shared target"
        execExpr = "r%s = c(0)" % np1
        print "Sending request to node: %s" % h2o.nodes[np1]
        h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)

        # test the store expression
        execExpr = "(r%s==0) ? c(0) : c(1)" % np1
        print "Sending request to node: %s" % h2o.nodes[np1]
        h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)

    global OUTSTANDING
    if not OUTSTANDING:
        OUTSTANDING = min(10, len(h2o.nodes))

    execTrial = 0
    worker_resultq = multiprocessing.Queue()
    while execTrial <= TRIALMAX:
        start = time.time()
        workers = []
        for o in range(OUTSTANDING):
            # round-robin the target node over trials
            np = execTrial % len(h2o.nodes)
            retryDelaySecs = 5
            timeoutSecs = 60
            bucket = None
            csvPathname = None
            src_key = None
            hex_key = 'a'
            tmp = multiprocessing.Process(
                target=function_no_keyboard_intr,
                args=(worker_resultq, execit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
            tmp.start()
            workers.append(tmp)
            execTrial += 1

        # Exec doesn't get tracked as a job. So can still have outstanding
        # now sync on them
        for worker in workers:
            try:
                # this should synchronize
                worker.join()
                print "worker joined:", worker
                # don't need him any more
                worker.terminate()
                hex_key = worker_resultq.get(timeout=2)
            except KeyboardInterrupt:
                print 'parent received ctrl-c'
                for worker in workers:
                    worker.terminate()
                    worker.join()

        elapsed = time.time() - start
        print "Group end at #", execTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_exec2_quant_cmp_uniform(self):
    """Compare Summary2 percentiles against exec quantile() on uniform data.

    For each tryList tuple, writes a uniform random dataset, runs Summary,
    checks min/max/25th/50th/75th percentiles against the hand-computed
    'expected' tuple (within maxDelta, half a bin plus 5%), then re-computes
    each threshold via an exec quantile() and compares to the Summary2
    percentile. Finally cross-checks the last threshold against a local
    sort-based quantile (h2o_summ.quantile_comparisons).
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
        (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
        (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
        (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)),
        (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)),
        (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
        (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
        # NOTE(review): the trailing ", 00" makes this a 7-element tuple —
        # "100, 00" was probably meant to be "100.00". expected[5] is still
        # 100 so the max check works, but the extra element looks like a typo.
        (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100, 00)),
        (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
        (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        # full local path, used later for the sort-based quantile cross-check
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]

        colname = column['colname']
        self.assertEqual(colname, expected[0])

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
            mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
            sd)

        zeros = stats['zeros']
        mins = stats['mins']
        h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [
            0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
        ]

        pctile = stats['pctile']
        # indexes 3/5/7 correspond to the 0.25/0.5/0.75 thresholds above
        h2o_util.assertApproxEqual(
            pctile[3],
            expected[2],
            tol=maxDelta,
            msg='25th percentile is not approx. expected')
        h2o_util.assertApproxEqual(
            pctile[5],
            expected[3],
            tol=maxDelta,
            msg='50th percentile (median) is not approx. expected')
        h2o_util.assertApproxEqual(
            pctile[7],
            expected[4],
            tol=maxDelta,
            msg='75th percentile is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            # apparently we're not able to estimate for these datasets
            # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
            #     msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        h2p.blue_print("\nTrying exec quantile")
        # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
        # do the equivalent exec quantile?
        # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

        # 'thresholds' here is a module-level global (not the local summary pct)
        print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
        for i, threshold in enumerate(thresholds):
            # FIX! do two of the same?..use same one for the 2nd
            if i != 0:
                # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                    hex_key, threshold, threshold)
                (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                h2p.blue_print(
                    "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                    (threshold, result, pt[i]))
                if not result:
                    raise Exception(
                        "exec result: %s for quantile: %s is bad" %
                        (result, threshold))
                h2o_util.assertApproxEqual(
                    result,
                    pctile[i],
                    tol=maxDelta,
                    msg=
                    'exec percentile: %s too different from expected: %s' %
                    (result, pctile[i]))
            # for now, do one with all, but no checking
            else:
                # This seemed to "work" but how do I get the key name for the list of values returned
                # the browser result field seemed right, but nulls in the key
                if 1 == 0:
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                        hex_key, ",".join(map(str, thresholds)))
                else:
                    # does this way work (column getting)j
                    execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                        hex_key, ",".join(map(str, thresholds)))
                (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                inspect = h2o_cmd.runInspect(key='r2')
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                self.assertEqual(numCols, 1)
                self.assertEqual(numRows, len(thresholds))
                # FIX! should run thru the values in the col? how to get

        # compare the last one
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0, # what col to extract from the csv
                datatype='float',
                quantile=thresholds[-1],
                # h2oSummary2=pctile[-1],
                # h2oQuantilesApprox=result, # from exec
                h2oExecQuantiles=result,
                )

        h2o.nodes[0].remove_all_keys()
def test_rapids_vec_fail1(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6), int(100e6), int(10e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' execExpr = '(= !v (+ %v %v))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1 == 1: xLabel = 'vector length' eLabel = 'elapsed (create v)' fLabel = 'elapsed (v = v + v)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_basic_with_funs_noinc(self):
    """Repeatedly define a rapids anon function and exec assignments to v2.

    Iteration 0 initializes v1/v2; later iterations redefine the anon
    function (doFuns=True) and run an assignment to v2, then check that any
    '!key' lhs in the expression is tracked. Expressions whose result has
    rows or cols are collected and printed at the end.
    """
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexKey = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for i in range(100):
        if i == 0:
            # should never see v as a key from the function?
            execExpr1 = '(= !v1 (c {#0}))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
            execExpr2 = '(= !v2 (cbind %v1 ))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
        else:
            # adding to v shouldn't hurt, but not required cause function output will update it
            # execExpr1 = '(= !v (+ %v #1))'
            # execExpr1 = '(+ %v #1)'
            # add to itself?
            execExpr1 = '(+ %v %v)'
            funs = '[(def anon {v} %s;;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
            # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
            # execExpr2 = '(= !v2 (anon %v2))'
            execExpr2 = '(= !v2 (+ %v2 #1))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)

        # see if the execExpr had a lhs assign. If so, it better be in the storeview
        r = re.search('![a-zA-Z0-9]+', execExpr2)
        if r:
            lhs = r.group(0)[1:]
            print "Found key lhs assign", lhs

            # FIX! check if v is ever there.

            # KeyIndexeds gets too many rollup stats problems. Don't use for now
            # NOTE(review): this branch is dead (1 == 0). If re-enabled it
            # would raise NameError: 'storeView' (capital V) doesn't match the
            # 'storeview' assigned above, and 'k' isn't defined yet at this
            # point (the later loop variable). Fix before enabling.
            if 1 == 0:
                inspect = h2o_cmd.runInspect(key=lhs)
                missingList, labelList, numRows, numCols = infoFromInspect(
                    inspect)

                storeview = h2o_cmd.runStoreView()
                print "\nstoreview:", dump_json(storeview)
                if not k in storeView['keys']:
                    raise Exception("Expected to find %s in %s", (k, storeView['keys']))
        else:
            print "No key lhs assign"

        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr2)

    print "\nExpressions that created keys"
    for k in keys:
        print k

    # for execExpr in exprList:
    #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

    h2o.check_sandbox_for_errors()
def test_exec2_col_add(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' if localhost: # csvPathname = '1B/reals_100000x1000_15f.data' # csvPathname = '1B/reals_1000000x1000_15f.data' csvPathname = '1B/reals_1000000x1_15f.data' # csvPathname = '1B/reals_1B_15f.data' # csvPathname = '1B/reals_100M_15f.data' else: # csvPathname = '1B/reals_100000x1000_15f.data' # csvPathname = '1B/reals_1000000x1000_15f.data' csvPathname = '1B/reals_1000000x1_15f.data' # csvPathname = '1B/reals_1B_15f.data' # csvPathname = '1B/reals_100M_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) xList = [] eList = [] fList = [] for execExpr in initList: execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) for trial in range(1000): for execExpr in exprList: # put the trial number into the temp for uniqueness execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr) execExpr = re.sub(',1', ',%s' % trial, execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) execTime = time.time() - start print 'exec took', execTime, 'seconds' c = h2o.nodes[0].get_cloud() c = c['nodes'] # print (h2o.dump_json(c)) k = [i['num_keys'] for i in c] v = [i['value_size_bytes'] for i in c] print "keys: %s" % " ".join(map(str, k)) print "value_size_bytes: %s" % " ".join(map(str, v)) # print "result:", result if ('r1' in execExpr) and (not 'apply' in execExpr): xList.append(trial) eList.append(execTime) if ('apply' in execExpr): fList.append(execTime) h2o.check_sandbox_for_errors() # PLOTS. 
look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' eLabel = 'time: r1[,1] = Last.value = r2', fLabel = 'time: apply(r1, 2, sum)', eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def sub_c3_nongz_fvec_long(self, csvFilenameList):
    # Benchmark helper: for each (pattern, name, bytes, timeout) entry, parse
    # non-gz nflx files into A.hex, log parse MB/sec to the shared benchmark
    # log, then chain copies B.hex=A.hex, C.hex=B.hex, D.hex=C.hex via exec,
    # checking key distribution after each step.
    # csvFilenameList: list of (csvFilepattern, csvFilename, totalBytes, timeoutSecs)
    # a kludge
    h2o.setup_benchmark_log()

    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    print "Using nongz'ed files in", importFolderPath

    # LOG_MACHINE_STATS / DO_DOUBLE_IMPORT are module-level flags
    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        if DO_DOUBLE_IMPORT:
            # import once ahead of the parse (which imports again) to exercise re-import
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key="A.hex",
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        # record parse throughput in the benchmark log
        fileMBS = (totalBytes/1e6)/elapsed
        msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
        print msg
        h2o.cloudPerfH2O.message(msg)
        h2o_cmd.checkKeyDistribution()

        # are the unparsed keys slowing down exec?
        h2i.delete_keys_at_all_nodes(pattern="manyfile")

        # chain full-frame copies; each should redistribute keys sensibly
        execExpr = 'B.hex=A.hex'
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        h2o_cmd.checkKeyDistribution()

        execExpr = 'C.hex=B.hex'
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        h2o_cmd.checkKeyDistribution()

        execExpr = 'D.hex=C.hex'
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        h2o_cmd.checkKeyDistribution()
def test_GLM2_mnist(self): if not SCIPY_INSTALLED: pass else: h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilelist = [ (10000, 500, 'cA', 60), ] trial = 0 for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvFilename = 'syn_' + "binary" + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, rowCount, colCount) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** modelKey = 'GLM_model' y = colCount kwargs = { 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 1e-4, 'alpha': 0, 'max_iter': 15, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } # GLM wants the output col to be strictly 0,1 integer execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % ( hex_key, y + 1, y + 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'aHack'} timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? lambdaMax = glm['glm_model']['lambda_max'] print "lambdaMax:", lambdaMax best_threshold = glm['glm_model']['submodels'][0][ 'validation']['best_threshold'] print "best_threshold", best_threshold # pick the middle one? 
cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][ '_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above print "\nPredict\n==========\n" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='aHack', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='aHack', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 50, "Should see less than 50% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    # Train RF on covtype slices and compare OOBE + full-scoring accuracy
    # against hard-coded expected percentages (only valid for the shuffled
    # dataset). Returns the last training RFView result.
    # csvFilename: covtype csv file under /home/0xdiag/datasets/standard
    # checkExpectedResults: when True, assertAlmostEqual against the
    #   expect*PctRightList tables (delta=ALLOWED_DELTA, module-level)
    #
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on", csvFilename
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)

    inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    # rowsForPct[i] = row count for i*10% of the data
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] + 1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    # don't use the smaller samples..bad error rates, plus for sorted covtype, you can get just one class!
    # only trial 8 (80% of rows) is currently run
    for trial in range(8, 9):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # hack so the RF will use the sliced result
        # FIX! don't use the sliced bit..use the whole data for rf training below
        ### parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        # paramDict is a module-level RF parameter template
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20

        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
        # kwargs['model_key'] = "model"

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, "going into RF")

        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)

        data_key = rfv['data_key']
        kwargs.pop('data_key', None)

        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        kwargs['iterative_cm'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=dataKeyTest)
        h2o_cmd.infoFromInspect(inspect, "dataKeyTest")

        # score the trained model on the held-out last 10%
        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
            timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse * 100.0 / num_rows), "pct. of all rows"

    # dump actuals in copy/paste-able form so the expected tables can be refreshed
    actualDelta = [abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    # return the last rfv done during training
    return rfv
def test_exec2_xorsum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename( None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble( expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong( expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): start = time.time() (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print r, 'exec took', time.time() - start, 'seconds' print r, "execResult:", h2o.dump_json(execResult) ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % ( ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % ( expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % ( expectedFpSumAsLongLong, 
expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult != expectedUllSum: raise Exception( "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % ( ullResult, expectedUllSum) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % ( ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % ( expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % ( expectedFpSumAsLongLong, expectedFpSum)
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like # FIX! apparently we lose the enum mapping when we slice out, and then csv download? we just get the number? # OH NO..it looks like we actually preserve the enum..it's in the csv downloaded # the prediction is the one that doesn't have it, because it's realated to clusters, which have no # notion of output classes h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30) start = time.time() predictResult = h2o.nodes[0].generate_predictions( model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time( ) - start, 'seconds' print "predictResult:", h2o.dump_json(predictResult) h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col( csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col( csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header 
on source if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)): raise Exception( "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o) != str(p): if wrong == 10: print "Not printing any more mismatches\n" elif wrong < 10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong) / len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? # hack..need to fix this if 1 == 0: if pctWrong > 2.0: raise Exception( "pctWrong too high. Expect < 2% error because it's reusing training data" ) return pctWrong
def sub_c2_nongz_fvec_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath if len(h2o.nodes) == 1: csvFilenameList = [ ("*[1][0][0-9].dat", "file_10_A.dat", 10 * avgMichalSize, 600), ] else: csvFilenameList = [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json( importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult[ 'destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541 ]: x.remove(i) ignore_x.append(i) # plus 1 because we are no longer 0 offset x = ",".join(map(lambda x: "C" + str(x + 1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'family': 'binomial', 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # convert to binomial execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('C379', 'C379', 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % ( trainKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % ( testKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 synSize = 183 if 1 == 0: importFolderPath = '/home/0xdiag/datasets/10k_small_gz' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize, 700 ), ("[1][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize, 700), ("[1][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize, 700), ("[1][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize, 700), ("[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize, 700), ("[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz", 10000 * synSize, 700), ] if 1 == 0: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600), ] if 1 == 1: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? 
# ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", 
"file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ] if 1 == 0: importFolderPath = '/home/0xdiag/datasets/more1_300_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ] if 1 == 0: importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600), ] if 1 == 0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ] if 1 == 0: importFolderPath = '/home/0xdiag/datasets' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? 
# os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? # ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200), ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # 
"billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split out the pattern match and the filename used for the hex trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack'] # benchmarkLogging = None benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk' 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks' jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = ' -Dcom.sun.management.jmxremote.port=54330' + \ ' -Dcom.sun.management.jmxremote.authenticate=false' + \ ' -Dcom.sun.management.jmxremote.ssl=false' + \ ' -Dcom.sun.management.jmxremote' + \ ' -Dcom.sun.management.jmxremote.local.only=false' jea = ' -Dlog.printAll=true' for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud( 2, java_heap_GB=tryHeap, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts( 1, java_heap_GB=tryHeap / 2, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) # pop open a browser on the cloud ### h2b.browseTheCloud() # to avoid sticky ports? 
### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder( None, importFolderPath) importFullList = importFolderResult['succeeded'] importFailList = importFolderResult['failed'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json( importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message( "Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile( None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i + 1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1] parseKey = h2i.parseImportFolderFile( None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i + 2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2] parseKey = h2i.parseImportFolderFile( None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response'][ 'time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice(' + origKey + ',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378 ]: x.remove(i) x = ",".join(map(str, x)) GLMkwargs = { 'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.check_key_distribution() h2o_cmd.delete_csv_key(csvFilename, importFullList) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
def test_parse_200k_cols_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 10, 'cA', 200, 200), (10, 1000, 'cB', 200, 200), (10, 1000, 'cB', 200, 200), # we timeout/fail on 500k? stop at 200k # (10, 500000, 'cC', 200, 200), # (10, 1000000, 'cD', 200, 360), # (10, 1100000, 'cE', 60, 100), # (10, 1200000, 'cF', 60, 120), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # import it N times and compare the N hex keys REPEAT = 5 for i in range(REPEAT): hex_key_i = hex_key + "_" + str(i) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key_i, timeoutSecs=timeoutSecs, doSummary=False) print "Parse:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual( inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount)) # compare each to 0 for i in range(1, REPEAT): hex_key_i = hex_key + "_" + str(i) hex_key_0 = hex_key + "_0" print "\nComparing %s to %s" % (hex_key_i, hex_key_0) if 1 == 0: execExpr = "%s[1,]+%s[1,]" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s[,1]+%s[,1]" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s+%s" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s!=%s" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s==%s" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "sum(%s==%s)" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "s=sum(%s==%s)" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "s=c(1); s=sum(%s==%s)" % (hex_key_0, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "n=c(1); n=nrow(%s)*ncol(%s))" % (hex_key, hex_key_i) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "r=c(1); r=s==n" resultExec, result, h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) print "result:", result
def test_four_billion_rows_fvec(self):
    """Parse a 4-billion-row csv, sanity-check its dimensions/size, then run
    KMeans and a binomial GLM on it.

    Statement order matters throughout: timeoutSecs is rebound before KMeans
    and GLM, and the GLM runs on the exec-converted copy A.hex, not on the
    original parse result.
    """
    timeoutSecs = 1500

    importFolderPath = "billions"
    csvFilenameList = [
        "four_billion_rows.csv",
    ]
    for csvFilename in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()

        # Parse*********************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=3)
        elapsed = time.time() - start
        print "Parse result['destination_key']:", parseResult['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)

        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        byteSize = inspect['byteSize']
        print "\n" + csvFilename, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols), \
            " byteSize:", "{:,}".format(byteSize)

        expectedRowSize = numCols * 1  # plus output
        # expectedValueSize = expectedRowSize * numRows
        # hard-coded expected compressed size for this specific dataset
        expectedValueSize = 8001271520
        self.assertEqual(byteSize, expectedValueSize,
            msg='byteSize %s is not expected: %s' % \
            (byteSize, expectedValueSize))

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'],
            timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(2, numCols,
            msg="generated %s cols (including output). parsed to %s cols" % (2, numCols))
        self.assertEqual(4 * 1000000000, numRows,
            msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, numRows))

        # KMeans*********************************
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'max_iter': 10,
            'normalize': 0,
            'destination_key': 'junk.hex',
            # fixed seed for reproducible clustering
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        # result intentionally unused beyond completing without error
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=4, **kwargs)

        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'response': 'C1',
            'n_folds': 0,
            'family': 'binomial',
        }
        # one coefficient is checked a little more
        colX = 1

        # convert to binomial: copy to A.hex, then turn col 1 into (==1) indicator
        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('1', '1', 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        aHack = {'destination_key': "A.hex"}

        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
def test_GLM2_mnist_reals(self):
    """Binomial GLM on the MNIST 'reals' dataset: parse test then train csvs,
    convert the label column to a (==c) indicator via Exec2, fit GLM on the
    train copy, then score/confusion-matrix against the test copy.
    """
    importFolderPath = "mnist"
    csvFilelist = [
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path="mnist/" + testCsvFilename,
            schema='put', hex_key=testKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        # computed only to compare pruning behavior against the train pass below
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'],
            timeoutSecs=300)

        # PARSE train****************************************
        trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path="mnist/" + trainCsvFilename,
            schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GLM****************************************
        print "This is the pruned x GLM will use"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'],
            timeoutSecs=300)
        print "x:", x

        modelKey = "mnist"
        params = {
            'response': y,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.0,
            'max_iter': 10,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            'destination_key': modelKey
        }

        # single case value: model "is digit 5" vs. everything else
        for c in [5]:
            print "Trying binomial with case:", c
            # binomial conversion with Exec2: label col becomes (==c) indicator
            execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (
                trainKey, y + 1, y + 1, c)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            kwargs = params.copy()
            timeoutSecs = 1800

            start = time.time()
            aHack = {'destination_key': 'A.hex'}
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs,
                pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

            # Score **********************************************
            # same (==c) conversion on the test data before predicting
            execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (
                testKey, y + 1, y + 1, c)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            print "Problems with test data having different enums than train? just use train for now"
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key="B.hex",
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual="B.hex",
                vactual='C' + str(y + 1),
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)