def test_NOPASS_create_frame_fail(self): h2o.beta_features = True for trial in range(20): kwargs = {'integer_range': None, 'missing_fraction': 0.1, 'cols': 10, 'response_factors': 1, 'seed': 1234, 'randomize': 1, 'categorical_fraction': 0, 'rows': 1, 'factors': 0, 'real_range': 0, 'value': None, 'integer_fraction': 0} print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10) h2o_cmd.infoFromSummary(rSummary) print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): # "grid": { # "destination_keys": [ # "GLMGridResults__8222a49156af52532a34fb3ce4304308_0", # "GLMGridResults__8222a49156af52532a34fb3ce4304308_1", # "GLMGridResults__8222a49156af52532a34fb3ce4304308_2" # ] # }, if h2o.beta_features: destination_key = glmGridResult['grid']['destination_keys'][0] inspectGG = h2o.nodes[0].glm_view(destination_key) models = inspectGG['glm_model']['submodels'] h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0])) g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs) else: destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) models = glmGridResult['models'] for m, model in enumerate(models): alpha = model['alpha'] area_under_curve = model['area_under_curve'] # FIX! should check max error? error_0 = model['error_0'] error_1 = model['error_1'] model_key = model['key'] print "#%s GLM model key: %s" % (m, model_key) glm_lambda = model['lambda'] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key']) h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM)) g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs) return g
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs) print h2o.dump_json(importResult) storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30) # print h2o.dump_json(storeViewResult) hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print 
"\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def import_parse(node=None, schema='local', bucket=None, path=None,
                 src_key=None, hex_key=None,
                 timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180,
                 noise=None, benchmarkLogging=None, noPoll=False, doSummary=True, **kwargs):
    """Import a dataset then parse it, optionally running SummaryPage afterwards.

    Returns the parse result dict (which carries 'destination_key').
    """
    ## if h2o.beta_features:
    ## print "HACK: temporarily disabling Summary always in v2 import_parse"
    ## doSummary = False
    if not node:
        node = h2o.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs,
        noise, benchmarkLogging, noPoll, doSummary, src_key, **kwargs)

    h2o.verboseprint("importPattern:", importPattern)
    h2o.verboseprint("importResult", h2o.dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs,
        noise, benchmarkLogging, noPoll, **kwargs)
    h2o.verboseprint("parseResult:", h2o.dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    if doSummary:
        # if parse blows up, we want error isolation ..i.e. find stack traces here,
        # rather than the next guy blowing up
        h2o.check_sandbox_for_errors()
        # for now, don't worry about error isolating summary
        node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs)
    else:
        # isolate a parse from the next thing
        h2o.check_sandbox_for_errors()
    return parseResult
def test_rf_covtype_train_oobe_fvec(self): h2o.beta_features = True print "\nRun test iterations/compare with covtype.data" rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False) (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1) # since we created a binomial output class..look at the error rate for class 1 ce1pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.shuffled.data" rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True) (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2) ce2pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.sorted.data" rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False) (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3) ce3pct1 = classErrorPctList[1] print "rfv3, from covtype.sorted.data" print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv" print "rfv1:", h2o.dump_json(rfv1) print "rfv3:", h2o.dump_json(rfv3) # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True) df = h2o_util.JsonDiff(rfv1, rfv3) print "df.difference:", h2o.dump_json(df.difference) self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2)) self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3)) self.assertAlmostEqual(ce1pct1, ce2pct1, delta=1.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1)) self.assertAlmostEqual(ce1pct1, ce3pct1, delta=1.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
def test_exec2_operators(self): bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' csvPathname = 'standard/covtype.data' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10) for (execExpr, num) in exprList: start = time.time() resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180) print h2o.dump_json(resultExec) print 'exec end took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key='a.hex') numCols = inspect['numCols'] numRows = inspect['numRows'] print "numCols:", numCols print "numRows:", numRows self.assertEqual(numCols, 1) self.assertEqual(numRows, num) h2o.check_sandbox_for_errors()
def test_50_nongz_fvec(self): h2o.beta_features = True avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' importFolderPath = 'airlines' print "Using non-gz'ed files in", importFolderPath csvFilenameList= [ ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800), # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), ] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) h2o_cmd.runStoreView(timeoutSecs=60)
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult["destination_key"] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key, ":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! type = inspectGG["type"] if "unparsed" in type: print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now" print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG["response"] # dict ### rows = inspectGG['rows'] value_size_bytes = inspectGG["value_size_bytes"] model0 = glmGridResult["models"][0] alpha = model0["alpha"] area_under_curve = model0["area_under_curve"] error_0 = model0["error_0"] error_1 = model0["error_1"] key = model0["key"] print "best GLM model key:", key glm_lambda = model0["lambda"] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, key) h2o.verboseprint("GLMGrid inspectGLM:", h2o.dump_json(inspectGLM)) simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
def test_rf_big1_nopoll_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=30, schema='put') rfViewInitial = [] # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) kwargs['ntrees'] = 1 if OVERWRITE_RF_MODEL: print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" kwargs['ntrees'] += 1 kwargs['seed'] = random.randint(0, sys.maxint) # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs) print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected first = None print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['_dataKey'] model_key = rfView['_key'] ntree = rfView['ntree'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False) if first is None: # we'll use this to compare the others first = rfViewResult.copy() firstModelKey = model_key print "first", h2o.dump_json(first) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def test_RF(self): h2o.beta_features = True paramsTrainRF = {"ntrees": 2, "max_depth": 300, "nbins": 200, "timeoutSecs": 600, "response": "C55"} paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600} trainKey1 = self.loadData(trainDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsScoreRF.copy() scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) trainKey2 = self.loadData(trainDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsScoreRF.copy() scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def test_rf_covtype_train_oobe_fvec(self): h2o.beta_features = True print "\nRun test iterations/compare with covtype.data" rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False, expectedAuc=0.95) (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1) # since we created a binomial output class..look at the error rate for class 1 ce1pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.shuffled.data" rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True, expectedAuc=0.95) (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2) ce2pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.sorted.data" rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False, expectedAuc=0.95) (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3) ce3pct1 = classErrorPctList[1] print "rfv3, from covtype.sorted.data" print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv" print "rfv1:", h2o.dump_json(rfv1) print "rfv3:", h2o.dump_json(rfv3) # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True) df = h2o_util.JsonDiff(rfv1, rfv3) print "df.difference:", h2o.dump_json(df.difference) self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2)) self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3)) # we're doing separate test/train splits..so we're going to get variance # really should not do test/train split and use all the data? if we're comparing sorted or not? # but need the splits to be sorted or not. 
I think I have those files self.assertAlmostEqual(ce1pct1, ce2pct1, delta=7.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1)) self.assertAlmostEqual(ce1pct1, ce3pct1, delta=7.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
def test_exec2_quantile_na_scalar(self): h2o.beta_features = True for (execExpr, num) in exprList: start = time.time() resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180) print 'exec end took', time.time() - start, 'seconds' print h2o.dump_json(resultExec) # do the quantiles page on the created nah key kwargs = { 'column': 0, 'quantile': 0.4, 'multiple_pass': 2, } h2o.nodes[0].quantiles(source_key='nah', **kwargs) inspect = h2o_cmd.runInspect(key='abc') numCols = inspect['numCols'] numRows = inspect['numRows'] print "numCols:", numCols print "numRows:", numRows self.assertEqual(numCols, 1) self.assertEqual(numRows, num) h2o.check_sandbox_for_errors()
def test_RF_poker100(self): MISSING_RESPONSE = False DO_MODEL_INSPECT = False trees = ",".join(map(str,range(10,50,2))) timeoutSecs = 20 csvPathname = 'poker/poker100' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') jobs = [] for i in range(1): if MISSING_RESPONSE: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs) else: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs) job_key = rfResult['job_key'] model_key = rfResult['destination_key'] jobs.append( (job_key, model_key) ) h2o_jobs.pollWaitJobs(timeoutSecs=300) for job_key, model_key in jobs: gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key) print "speedrf grid result for %s:", h2o.dump_json(gridResult) print "speedrf grid result errors:", gridResult['prediction_errors'] for i,j in enumerate(gridResult['jobs']): if DO_MODEL_INSPECT: print "\nspeedrf result %s:" % i, h2o.dump_json(h2o_cmd.runInspect(key=j['destination_key'])) else: # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key']) model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key']) print "model:", h2o.dump_json(model)
def test_A_store_view(self): # size of H2O store store_size = 0 # import data to have more files in the system r = h2i.import_only(bucket='smalldata', path='iris/*') store_size += len(r[0]['files']) r = h2i.import_only(bucket='smalldata', path='covtype/*') store_size += len(r[0]['files']) # list all items r = h2o.nodes[0].store_view(view=store_size) self.assertEqual(store_size, len(r['keys'])) # list over views including only 3 items items_per_page = 3 # items per page pages = (store_size / items_per_page) # number of pages if (store_size % items_per_page != 0): pages += 1 offset = 0 # running offset cnt_items = 0 # counter of returned items for p in range(0,pages): r = h2o.nodes[0].store_view(offset=offset, view=items_per_page) print h2o.dump_json(r) cnt_items += len(r['keys']) offset += items_per_page self.assertEqual(store_size, cnt_items)
def test_RF(self): trainKey1 = self.loadData(trainDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsScoreRF.copy() scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1)) print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1)) trainKey2 = self.loadData(trainDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsScoreRF.copy() scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2)) print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2)) print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! #type = inspectGG['type'] #if 'unparsed' in type: # print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now" # print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG['response'] # dict ### rows = inspectGG['rows'] #value_size_bytes = inspectGG['value_size_bytes'] # FIX! does error_0/1 only exist for binomial? for m, model in enumerate(glmGridResult['models']): alpha = model['alpha'] area_under_curve = model['area_under_curve'] # FIX! should check max error? error_0 = model['error_0'] error_1 = model['error_1'] model_key = model['key'] print "#%s GLM model key: %s" % (m, model_key) glm_lambda = model['lambda'] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key']) h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM)) g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs) return g
def exec_list(exprList, lenNodes, csvFilename, key2): h2e.exec_zero_list(zeroList) # start with trial = 1 because trial-1 is used to point to Result0 which must be initted trial = 1 while (trial < 100): for exprTemplate in exprList: # do each expression at a random node, to facilate key movement nodeX = random.randint(0,lenNodes-1) colX = random.randint(1,54) # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now row = str(random.randint(1,400000)) execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2) execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey="Result"+str(trial)+".hex", timeoutSecs=60) eri0 = execResultInspect[0] eri1 = execResultInspect[1] columns = eri0.pop('cols') columnsDict = columns[0] print "\nexecResult columns[0]:", h2o.dump_json(columnsDict) print "\nexecResult [0]:", h2o.dump_json(eri0) print "\nexecResult [1] :", h2o.dump_json(eri1) min = columnsDict["min"] h2o.verboseprint("min: ", min, "trial:", trial) ### self.assertEqual(float(min), float(trial),"what can we check here") ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") # slows things down to check every iteration, but good for isolation h2o.check_sandbox_for_errors() print "Trial #", trial, "completed\n" trial += 1
def simpleCheckGBMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! #type = inspectGG['type'] #if 'unparsed' in type: # print "Warning: GBM Grid result destination_key is unparsed, can't interpret. Ignoring for now" # print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG['response'] # dict ### rows = inspectGG['rows'] #value_size_bytes = inspectGG['value_size_bytes'] model0 = glmGridResult['models'][0] alpha = model0['alpha'] area_under_curve = model0['area_under_curve'] error_0 = model0['error_0'] error_1 = model0['error_1'] model_key = model0['key'] print "best GBM model key:", model_key glm_lambda = model0['lambda'] # now indirect to the GBM result/model that's first in the list (best) inspectGBM = h2o_cmd.runInspect(None, model_key) h2o.verboseprint("GBMGrid inspectGBM:", h2o.dump_json(inspectGBM)) simpleCheckGBM(self, inspectGBM, colX, allowFailWarning=allowFailWarning, **kwargs)
def test_parse_small_many(self): SEED = 6204672511291494176 random.seed(SEED) print "\nUsing random seed:", SEED SYNDATASETS_DIR = h2o.make_syn_dir() # can try the other two possibilities also eol = "\n" row = "a,b,c,d,e,f,g" # need unique key name for upload and for parse, each time # maybe just upload it once? timeoutSecs = 10 node = h2o.nodes[0] # fail rate is one in 200? # need at least two rows (parser) for sizeTrial in range(10): size = random.randint(2,129) print "\nparsing with rows:", size csvFilename = "p" + "_" + str(size) csvPathname = SYNDATASETS_DIR + "/" + csvFilename writeRows(csvPathname,row,eol,size) key = csvFilename print h2o.dump_json(key) for trial in range(5): # data key is deleted after parse now, so have to put it again pkey = node.put_file(csvPathname, key=key, timeoutSecs=timeoutSecs) key2 = csvFilename + "_" + str(trial) + ".hex" # just parse node.parse(pkey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=0.00) sys.stdout.write('.') sys.stdout.flush()
def test_exec2_fast_locks(self): csvPathname = 'iris/iris2.csv' src_key='iris.csv' if not AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) y = 4 for trial in range (1, 100): if AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) # make sure each parse is unique dest key (not in use) hex_key = "iris2_" + str(trial) + ".hex" # what if we kicked off another parse without waiting for it? I think the src key gets locked # so we'd get lock issues on the src_key parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10) execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) # just show the jobs still going, if any. maybe none, because short (iris) a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_put_parse4(self): timeoutSecs = 10 trial = 1 n = h2o.nodes[0] for x in xrange (2): # csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") csvPathname = h2o.find_file('smalldata/iris/iris_wheader.csv.gz') key = n.put_file(csvPathname) key2 = key + "_" + str(x) + ".hex" parseKey = n.parse(key, key2) summaryResult = n.summary_page(key2) # remove bin_names because it's too big (256?) and bins # just touch all the stuff returned summary = summaryResult['summary'] print h2o.dump_json(summary) columnsList = summary['columns'] for columns in columnsList: N = columns['N'] name = columns['name'] stype = columns['type'] histogram = columns['histogram'] bin_size = histogram['bin_size'] bin_names = histogram['bin_names'] bins = histogram['bins'] nbins = histogram['bins'] if 1==1: print "\n\n************************" print "name:", name print "type:", stype print "N:", N print "bin_size:", bin_size print "len(bin_names):", len(bin_names) print "len(bins):", len(bins) print "len(nbins):", len(nbins) # not done if enum if stype != "enum": smax = columns['max'] smin = columns['min'] percentiles = columns['percentiles'] thresholds = percentiles['thresholds'] values = percentiles['values'] mean = columns['mean'] sigma = columns['sigma'] if 1==1: print "len(max):", len(smax) print "len(min):", len(smin) print "len(thresholds):", len(thresholds) print "len(values):", len(values) print "mean:", mean print "sigma:", sigma ### print 'Trial:', trial sys.stdout.write('.') sys.stdout.flush() trial += 1
def test_RF(self): h2o.beta_features = True paramsTrainRF = { 'seed': '1234567890', 'ntrees': 1, 'max_depth': 10, # 'sample_rate': 1.0, 'sample_rate': 1.0, 'nbins': 50, 'timeoutSecs': 600, 'response': 'C55', 'classification': 1, } paramsScoreRF = { 'vactual': 'C55', 'timeoutSecs': 600, } # train1 trainKey1 = self.loadData(trainDS1) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs) kwargs = paramsScoreRF.copy() h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True) scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) print "\nTrain1\n==========" h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs) print "\nScore1\n=========+" print h2o.dump_json(scoreResult1) h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs) # train2 trainKey2 = self.loadData(trainDS2) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs) kwargs = paramsScoreRF.copy() h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True) scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) print "\nTrain2\n==========" h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs) print "\nScore2\n==========" h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs) print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) # should only be two diffs if len(df.difference) > 2: raise Exception 
("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def test_RF(self):
    """Train RF on 90%/10% data splits, assert exact expected error stats,
    then JsonDiff the two train and two score responses.

    NOTE(review): duplicate name with the other test_RF in this file --
    presumably these came from different test files; confirm before merging
    into one class (the later def would shadow the earlier one).
    """
    h2o.beta_features = True
    paramsTrainRF = {
        'seed': '1234567890',
        # if I use 100, and just one tree, I should get same results for sorted/shuffled?
        # i.e. the bagging always sees everything. Means oobe will be messed up
        # so will specify validation = the 10pct holdout data (could reuse the training data?)
        'sample_rate': 1.0,
        'ntrees': 3,
        'max_depth': 300,
        'nbins': 200,
        'timeoutSecs': 600,
        'response': 'C55',
    }
    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # 90% data
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
    (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
    # expected values are pinned for the fixed seed above
    self.assertEqual(4.29, classification_error1)
    self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
    self.assertEqual(58101, totalScores1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    # 10% data
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
    (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
    self.assertEqual(4.29, classification_error2)
    self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
    self.assertEqual(58101, totalScores2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)
    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)
    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def test_parse_cust(self):
    """Import the 0xcustomer dataset folder, parse 3 random .csv/.tsv keys,
    run a trivial exec on each, and check key distribution/cleanup.
    """
    # run as user 0xcustomer to get access (with .json config and ssh key file specified)
    importFolderPath = '/mnt/0xcustomer-datasets'
    pollTimeoutSecs = 120
    retryDelaySecs = 30
    timeoutSecs = 300
    (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
    importFileList = importResult['files']
    importFailList = importResult['fails']
    importKeyList = importResult['keys']
    importDelList = importResult['dels']
    # sanity-check the import result lists before parsing anything
    if len(importDelList)!=0:
        raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))
    if len(importFileList)<MINFILES:
        raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))
    if len(importKeyList)<MINFILES:
        raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))
    if len(importFailList)!=0:
        raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))

    # only parse files with .csv or .tsv in their name (no dirs like that?)
    goodKeyList = [key for key in importKeyList if ('.csv' in key or '.tsv' in key)]
    trial = 0
    # just do 1?
    for i, importKey in enumerate(random.sample(goodKeyList,3)):
        print "importKey:", importKey
        trial +=1
        start = time.time()
        # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
        # force header=0..should mean headers get treated as NAs
        parseResult = h2i.parse_only(pattern=importKey, header=0,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']
        origKey = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=origKey)
        h2o_cmd.infoFromInspect(inspect, origKey)
        # trivial exec: grab one cell into newKey
        execExpr = 'newKey = '+origKey+'[1,1]'
        h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
        newParseKey = {'destination_key': 'newKey'}
        h2o_cmd.checkKeyDistribution()
        h2o.nodes[0].remove_key(key=origKey)
        # a key isn't created for a scalar
        # h2o.nodes[0].remove_key(key='newKey')
    self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
def checkScalarResult(resultExec, resultKey, allowEmptyResult=False): # make the common problems easier to debug h2o.verboseprint("checkScalarResult resultExec:", h2o.dump_json(resultExec)) if "funstr" not in resultExec: emsg = "checkScalarResult: 'funstr' missing" if "result" not in resultExec: emsg = "checkScalarResult: 'result' missing" if "scalar" not in resultExec: emsg = "checkScalarResult: 'scalar' missing" if "num_cols" not in resultExec: emsg = "checkScalarResult: 'num_cols' missing" if "num_rows" not in resultExec: emsg = "checkScalarResult: 'num_rows' missing" elif "cols" not in resultExec: emsg = "checkScalarResult: 'cols' missing" else: emsg = None num_cols = resultExec["num_cols"] num_rows = resultExec["num_rows"] cols = resultExec["cols"] # print "cols:", h2o.dump_json(cols) if emsg: print "\nKey: '" + str(resultKey) + "' resultExec:\n", h2o.dump_json(resultExec) sys.stdout.flush() raise Exception("exec result (resultExec) missing what we expected. Look at json above. " + emsg) if (cols and (not num_rows or num_rows == 0)) and not allowEmptyResult: print "resultExec[0]:", h2o.dump_json(resultExec) raise Exception( "checkScalarResult says 'cols' exist in exec json response," + " but num_rows: %s is 0 or None. Is that an expected 'empty' key state?" % num_rows + " Use 'allowEmptyResult if so." ) # Cycle thru rows and extract all the meta-data into a dict? # assume "0" and "row" keys exist for each list entry in rows # FIX! the key for the value can be 0 or 1 or ?? (apparently col?) Should change H2O here # cols may not exist..if the result was just scalar? 
if not cols: # just return the scalar result then scalar = resultExec["scalar"] if scalar is None: raise Exception("both cols and scalar are null: %s %s" % (cols, scalar)) checkForBadFP(scalar, json=resultExec) return scalar metaDict = cols[0] for key, value in metaDict.items(): print "Inspect metaDict:", key, value min_value = metaDict["min"] stype = metaDict["type"] # if it's an enum col, it's okay for min to be NaN .. checkForBadFP(min_value, nanOkay=stype == "Enum", json=metaDict) return min_value
def test_json_browse_both_exec(self):
    """Fire randomized exec expressions at random nodes against covtype.data,
    interleaving inspects on multiple nodes to probe for races.
    """
    lenNodes = len(h2o.nodes)
    csvPathname = 'standard/covtype.data'
    hex_key = 'c.hex'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
    print "\nParse key is:", parseResult['destination_key']
    ## h2b.browseTheCloud()
    # for trial in range(53):
    trial = 0
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            colX = random.randint(1,54)
            row = random.randint(1,400000)
            # substitute randomized values into the expression template
            execExpr = exprTemplate
            execExpr = re.sub('<col1>',str(colX),execExpr)
            execExpr = re.sub('<col2>',str(colX+1),execExpr)
            execExpr = re.sub('<n>',str(n),execExpr)
            execExpr = re.sub('<row>',str(row),execExpr)
            execExpr = re.sub('<keyX>',str(hex_key),execExpr)
            # pick a random node to execute it on
            randNode = random.randint(0,lenNodes-1)
            print "\nexecExpr:", execExpr, "on node", randNode
            start = time.time()
            resultExec = h2o_cmd.runExec(node=h2o.nodes[randNode], execExpr=execExpr, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))
            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                inspectMe = random.choice(inspectList)
                # inspect the same key from three different nodes
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?
            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2b.browseJsonHistoryAsUrlLastMatch("Exec")
            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
def test_GBMGrid_basic_prostate(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': .1, 'ntrees': '4,100', 'max_depth': 8, 'min_rows': 1, 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs) if not DO_POLL: print "\nfirst GBMResult:", h2o.dump_json(GBMResult) statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." # FIX! after gbm grid, have to get the model keys from the json? gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey) print h2o.dump_json(gbmGridView) if 1==0: gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_exec2_xorsum(self):
    """Generate a random real-valued dataset, run the exprList execs (sum then
    xorsums), and print the fp results against the expected 64-bit patterns.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10000, 1, 'r1', 0, 10, None),
    ]
    ullResultList = []
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # dynamic range of the data may be useful for estimating error
        maxDelta = expectedMax - expectedMin
        csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        print "Creating random", csvPathname
        # writer returns the expected xorsum bit pattern and fp sum for this data
        (expectedUll, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)
        # looking at the 8 bytes of bits for the h2o doubles
        # xorsum will zero out the sign and exponent
        for execExpr in exprList:
            start = time.time()
            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            print 'exec took', time.time() - start, 'seconds'
            print "execResult:", h2o.dump_json(execResult)
            print ""
            print "%30s" % "fpResult:", "%.15f" % fpResult
            # reinterpret the double's bits as an unsigned 64-bit int for comparison
            ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
            print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult
            print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll
            # print "%30s" % "hex(bitResult):", hex(ullResult)
            ullResultList.append((ullResult, fpResult))
            h2o.check_sandbox_for_errors()
        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
        expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll)
        print "%30s" % "expectedUll (0.16x):", "0x%0.16x %s" % (expectedUll, expectedUllAsDouble)
        expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex'} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_rf_covtype_fvec(self):
    """Dispatch RF jobs on covtype.data without polling, then wait for the
    jobs and poll each RFView to completion.
    """
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    rfViewInitial = []
    for jobDispatch in range(1):
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(jobDispatch)
        # don't poll for fvec
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
        elapsed = time.time() - start
        print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print h2o.dump_json(rfResult)
        # FIX! are these already in there?
        # remember what we dispatched so we can poll each RFView later
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = kwargs['model_key']
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)
    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
def test_GBM_cancel_model_reuse(self):
    """Parse nflx data, launch background GBM jobs, cancel them all, and
    repeat -- exercising model-key reuse after cancel (PUB-361).
    """
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        # have to use col name for response?
        ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
        # ("standard", "covtype.data", 54),
        # ("standard", "covtype20x.data", 54),
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll
    # pop open a browser on the cloud
    # h2b.browseTheCloud()
    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=50)
        parseResult = h2i.import_parse(
            bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False
        ) # can't do summary until parse result is correct json
        h2o.check_sandbox_for_errors()
        # wait for it to show up in jobs?
        ## time.sleep(2)
        # no pattern waits for all
        ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        # print "\nparseResult", h2o.dump_json(parseResult)
        print "Parse result['destination_key']:", parseResult['destination_key']
        ## What's wrong here? too big?
        ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)
        h2o.check_sandbox_for_errors()
        # have to avoid this on nflx data. colswap with exec
        # Exception: rjson error in gbm: Argument 'response' error:
        # Only integer or enum/factor columns can be classified
        if DO_CLASSIFICATION:
            # need to flip the right col! (R wise)
            execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1, response + 1)
            kwargs = {'str': execExpr}
            resultExec = h2o_cmd.runExec(**kwargs)
        # lets look at the response column now
        s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
        # x = range(542)
        # remove the output too! (378)
        ignoreIndex = [
            3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
            424, 425, 426, 540, 541, response
        ]
        # have to add 1 for col start with 1, now. plus the C
        xIgnore = ",".join(["C" + str(i + 1) for i in ignoreIndex])
        params = {
            'destination_key': None,
            'ignored_cols_by_name': xIgnore,
            'learn_rate': .1,
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': "C" + str(response + 1),
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 4,
        }
        kwargs = params.copy()
        timeoutSecs = 1800
        for i in range(5):
            # now issue a couple background GBM jobs that we'll kill
            jobids = []
            for j in range(5):
                # FIX! apparently we can't reuse a model key after a cancel
                kwargs['destination_key'] = 'GBMBad' + str(j)
                # rjson error in poll_url: Job was cancelled by user!
                GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                jobids.append(GBMFirstResult['job_key'])
                h2o.check_sandbox_for_errors()
                # try ray's 'models' request to see if anything blows up
                modelsParams = {
                    'key': None,
                    'find_compatible_frames': 0,
                    'score_frame': None
                }
                modelsResult = h2o.nodes[0].models(timeoutSecs=10, **modelsParams)
                print "modelsResult:", h2o.dump_json(modelsResult)
            # have to pass the job id
            # for j in jobids:
            #     h2o.nodes[0].jobs_cancel(key=j)
            h2o_jobs.cancelAllJobs()
            # PUB-361. going to wait after cancel before reusing keys
            time.sleep(3)
            # am I getting a subsequent parse job cancelled?
            h2o_jobs.showAllJobs()
        if DELETE_KEYS:
            h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_parse_libsvm(self):
    """Parse several libsvm datasets, check col0 min/max against expected
    values, optionally download to csv, reparse, and JsonDiff the inspects.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # just do the import folder once
    importFolderPath = "libsvm"
    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        ("mnist_train.svm", "cM", 30, 0, 9.0, False, False),
        ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True),
        # multi-label target like 1,2,5 ..not sure what that means
        # ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False),
        # illegal non-ascending cols
        # ("syn_6_1000_10.svm", "cK", 30, -36, 36, True, False),
        # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
        # fails csvDownload
        ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
        ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
        ("news20.svm", "cH", 30, 1, 20.0, False, False),
        ("connect4.svm", "cB", 30, -1, 1.0, False, False),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
        ("gisette_scale.svm", "cF", 30, -1, 1.0, False, False),
        ("mushrooms.svm", "cG", 30, 1, 2.0, False, False),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    firstDone = False
    for (csvFilename, hex_key, timeoutSecs, expectedCol0Min, expectedCol0Max,
            enableDownloadReparse, enableSizeChecks) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            hex_key=hex_key, timeoutSecs=2000)
        print csvPathname, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspectFirst = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
        # look at the min/max for the target col (0) and compare to expected for the dataset
        imin = float(inspectFirst['cols'][0]['min'])
        # print h2o.dump_json(inspectFirst['cols'][0])
        imax = float(inspectFirst['cols'][0]['max'])
        if expectedCol0Min:
            self.assertEqual(
                imin, expectedCol0Min,
                msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min))
        if expectedCol0Max:
            h2o_util.assertApproxEqual(
                imax, expectedCol0Max, tol=0.00000001,
                msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max))
        print "\nmin/max for col0:", imin, imax

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the firs tone
        if DO_SUMMARY:
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
            missingValuesListA = h2o_cmd.infoFromInspect(inspectFirst, csvPathname)
            num_colsA = inspectFirst['num_cols']
            num_rowsA = inspectFirst['num_rows']
            row_sizeA = inspectFirst['row_size']
            value_size_bytesA = inspectFirst['value_size_bytes']
            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
            print "Trying csvDownload of", csvDownloadPathname
            h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)
            # remove the original parsed key. source was already removed by h2o
            # don't have to now. we use a new name for hex_keyB
            # h2o.nodes[0].remove_key(hex_key)
            start = time.time()
            hex_keyB = hex_key + "_B"
            parseResultB = h2o_cmd.parseResult = h2i.import_parse(
                path=csvDownloadPathname, schema='put', hex_key=hex_keyB)
            print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_keyB)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            # compare the original inspect vs the re-parsed download
            df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
            for i, d in enumerate(df.difference):
                # ignore mismatches in these
                #  "variance"
                #  "response.time"
                #  "key"
                if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d:
                    pass
                else:
                    raise Exception(
                        "testing %s, found unexpected mismatch in df.difference[%d]: %s" % (csvPathname, i, d))

            if DO_SIZE_CHECKS and enableSizeChecks:
                # if we're allowed to do size checks. ccompare the full json response!
                print "Comparing original inspect to the inspect after parsing the downloaded csv"
                # vice_versa=True
                # ignore the variance diffs. reals mismatch when they're not?
                filtered = [v for v in df.difference if not 'variance' in v]
                self.assertLess(len(filtered), 3,
                    msg="Want < 3, not %d differences between the two rfView json responses. %s" % \
                    (len(filtered), h2o.dump_json(filtered)))
                # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes
                # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen
                # make the check conditional based on the dataset
                self.assertEqual(
                    row_sizeA, row_sizeB,
                    "row_size mismatches after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB))
                h2o_util.assertApproxEqual(
                    value_size_bytesA, value_size_bytesB, tol=0.00000001,
                    msg="value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB))
            print "missingValuesListA:", missingValuesListA
            print "missingValuesListB:", missingValuesListB
            self.assertEqual(
                missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result")
            self.assertEqual(
                num_colsA, num_colsB,
                "num_cols mismatches after re-parse of downloadCsv result %d %d" % (num_colsA, num_colsB))
            self.assertEqual(
                num_rowsA, num_rowsB,
                "num_rows mismatches after re-parse of downloadCsv result %d %d" % (num_rowsA, num_rowsB))
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def log_iostats(self, initOnly=False): if ((self.snapshotTime - self.pollStats['lastJstackTime']) < self.IOSTATSINTERVAL): return DO_IO_RW = True DO_IOP = True DO_BLOCKED = False node = h2o.nodes[0] stats = node.iostatus() ### h2o.verboseprint("log_iostats:", h2o.dump_json(stats)) histogram = stats['histogram'] def log_window(k, w): ## in case the window disappears from h2o, print what's available with this line ## print k['window'] if k['window'] == w: i_o = k['i_o'] node = k['cloud_node_idx'] if k['r_w'] == 'read': r_w = 'rd' elif k['r_w'] == 'write': r_w = 'wr' else: r_w = k['r_w'] for l, v in k.iteritems(): fmt = "iostats: window{:<2d} node {:d} {:<4s} {:s} {:s} MB/sec: {:6.2f}" if 'peak' in l: ## logging.critical(fmt.format(w, node, i_o, r_w, "peak", (v/1e6))) pass if 'effective' in l: logging.critical( fmt.format(w, node, i_o, r_w, "eff.", (v / 1e6))) return True else: return False # not found if DO_IO_RW: print "\nlog_iotstats probing node:", str(node.addr) + ":" + str( node.port) found = False for k in histogram: ### print k found |= log_window(k, 60) ### log_window(30) if not found: print "iostats: desired window not found in histogram" # 1 5 60 300 available # we want to sort the results before we print them, so grouped by node if DO_IOP: iopList = [] raw_iops = stats['raw_iops'] ### print for k in raw_iops: ### print k node = k['node'] i_o = k['i_o'] r_w = k['r_w'] size = k['size_bytes'] blocked = k['blocked_ms'] duration = k['duration_ms'] if duration != 0: blockedPct = "%.2f" % (100 * blocked / duration) + "%" else: blockedPct = "no duration" iopMsg = "node: %s %s %s %d bytes. blocked: %s" % ( node, i_o, r_w, size, blockedPct) # FIX! don't dump for now iopList.append([node, iopMsg]) iopList.sort(key=lambda iop: iop[0]) # sort by node totalSockets = len(iopList) # something wrong if 0? if totalSockets == 0: print "WARNING: is something wrong with this io stats response?" 
print h2o.dump_json(stats) logging.critical("iostats: " + "Total sockets: " + str(totalSockets)) if DO_BLOCKED: for i in iopList: logging.critical("iostats:" + i[1]) # don't save anything self.save(iostats=True)
def test_ddply_plot(self):
    """Run ddply over synthetic 2-key data for increasing group ranges,
    check the group count and a1==a2 reproducibility, and optionally plot
    elapsed time vs groups.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    if DO_KNOWN_FAIL:
        tryList = [
            (1000000, 5, 'cD', 0, 320, 30),
        ]
    else:
        tryList = [
            (1000000, 5, 'cD', 0, 10, 30),
            (1000000, 5, 'cD', 0, 20, 30),
            (1000000, 5, 'cD', 0, 40, 30),
            (1000000, 5, 'cD', 0, 50, 30),
            (1000000, 5, 'cD', 0, 80, 30),
            (1000000, 5, 'cD', 0, 160, 30),
            # fails..don't do
            # (1000000, 5, 'cD', 0, 320, 30),
            # (1000000, 5, 'cD', 0, 320, 30),
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30),
            # (1000000, 5, 'cD', 0, 1280, 30),
        ]
    if DO_APPEND_KNOWN_FAIL2:
        tryList.append(
            (1000000, 5, 'cD', 0, 160, 30),
        )
        tryList.append(
            (1000000, 5, 'cD', 0, 320, 30),
        )
    ### h2b.browseTheCloud()
    xList = []
    eList = []
    fList = []
    trial = 0
    for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        if DO_KNOWN_FAIL:
            # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails
            # csvFilename = 'a1' # fails
            csvFilename = "syn_ddply_1Mx5_0_320.gz"
            bucket = "home-0xdiag-datasets"
            csvPathname = "standard/" + csvFilename
            minInt = 0
            maxInt = 320
        else:
            bucket = None
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt-minInt)+1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)
        for lll in range(1):
            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)
            inspect = h2o_cmd.runInspect(key=hexKey)
            missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename)
            self.assertEqual(missingValuesList, [],
                "a1 should have no NAs in parsed dataset: %s" % missingValuesList)
            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

            #*****************************************************************************************
            # two columns. so worse case every combination of each possible value
            # only true if enough rows (more than the range?)
            maxExpectedGroups = ((maxInt - minInt) + 1) ** 2

            # do it twice..to get the optimal cached delay for time?
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            groups = execResult['num_rows']
            # this is a coarse comparision, statistically not valid for small rows, and certain ranges?
            h2o_util.assertApproxEqual(groups, maxExpectedGroups, rel=0.2,
                msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)
            a1dump = h2o_cmd.runInspect(key="a1")
            print "a1", h2o.dump_json(a1dump)
            # should never have any NAs in this result
            missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1")
            self.assertEqual(missingValuesList, [],
                "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial))

            #*****************************************************************************************
            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            groups = execResult['num_rows']
            # this is a coarse comparision, statistically not valid for small rows, and certain ranges?
            h2o_util.assertApproxEqual(groups, maxExpectedGroups, rel=0.2,
                msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)
            a2dump = h2o_cmd.runInspect(key="a2")
            print "a2", h2o.dump_json(a2dump)
            # should never have any NAs in this result
            missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2")
            self.assertEqual(missingValuesList, [],
                "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial))

            #*****************************************************************************************
            # should be same answer in both cases
            execExpr = "sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            execExpr = "s=c(0); s=(a1!=a2)"
            (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
            print "execResult", h2o.dump_json(execResult)

            #*****************************************************************************************
            # should never have any NAs in this result
            sdump = h2o_cmd.runInspect(key="s")
            print "s", h2o.dump_json(sdump)
            self.assertEqual(result, 1,
                "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult)))

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)

    if DO_PLOT:
        xLabel = 'groups'
        eLabel = 'ddplyElapsed'
        fLabel = 'ddplyElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_iostatus(self):
    """Poll each node's iostatus endpoint and print windowed and raw I/O stats.

    Makes no numeric assertions; exercises the API and prints, per node,
    the 10-second-window TCP/disk bandwidth plus a node-sorted list of raw
    per-socket operations with their blocked-time percentage.  Only warns
    (doesn't fail) if the raw iop list comes back empty.
    """
    # wait a bit first?
    time.sleep(5)
    # Ask each node for iostatus statistics
    for node in h2o.nodes:
        stats = node.iostatus()
        h2o.verboseprint(h2o.dump_json(stats))
        histogram = stats['histogram']
        # Example histogram entry:
        # {
        # u'i_o': u'TCP',
        # u'peak_bytes_/_sec': 199690496.78920883,
        # u'effective_bytes_/_sec': 21850666.666666668,
        # u'r_w': u'write',
        # u'cloud_node_idx': 2,
        # u'window': 10
        # }
        print "\nProbing node:", str(node.addr) + ":" + str(node.port)
        for k in histogram:
            ### print k
            # only report entries for the 10-second window
            if k['window'] == 10:
                i_o = k['i_o']
                # NOTE(review): rebinds the outer loop variable 'node' to an int
                # index; harmless here since the node object isn't used again
                # before the outer loop reassigns it, but worth renaming someday.
                node = k['cloud_node_idx']
                r_w = k['r_w']
                for l, v in k.iteritems():
                    fmt = "iostats: window10 node {:d} {:s} {:s} {:s} MB/sec: {:.2f}"
                    if 'peak' in l:
                        print fmt.format(node, i_o, r_w, "peak", (v / 1e6))
                    if 'effective' in l:
                        print fmt.format(node, i_o, r_w, "eff.", (v / 1e6))
        # Example raw_iops entry:
        # {
        # u'node': u'/192.168.0.37:54321',
        # u'i_o': u'TCP',
        # u'closeTime': '10:31:47:370',
        # u'r_w': u'write',
        # u'duration_ms': 4,
        # u'blocked_ns': 463132,
        # u'size_bytes': 65552
        # }
        # we want to sort the results before we print them, so grouped by node
        iopList = []
        raw_iops = stats['raw_iops']
        print
        for k in raw_iops:
            ### print k
            node = k['node']
            i_o = k['i_o']
            r_w = k['r_w']
            size = k['size_bytes']
            blocked = k['blocked_ns']
            duration = k['duration_ms'] * 1e6  # convert to ns
            # guard: a 0ms duration would divide by zero below
            if duration != 0:
                blockedPct = "%.2f" % (100 * blocked / duration) + "%"
            else:
                blockedPct = "no duration"
            iopMsg = "node: %s %s %s %d bytes. blocked: %s" % (
                node, i_o, r_w, size, blockedPct)
            iopList.append([node, iopMsg])
        iopList.sort(key=lambda iop: iop[0])  # sort by node
        totalSockets = len(iopList)
        # something wrong if 0?
        if totalSockets == 0:
            print "WARNING: is something wrong with this io stats response?"
            print h2o.dump_json(stats)
        print "iostats: Total sockets:", totalSockets
        for i in iopList:
            print "iostats:", i[1]
def kmeans_doit(self, csvFilename, bucket, csvPathname, num_rows, timeoutSecs=30):
    """Parse one dataset replication and run KMeans with k=1 on it.

    Checks the single cluster center against hard-coded expected per-column
    means, then compares this run's centers against the first run's centers
    (stashed in self.clusters1), since the input files are replications of
    the same data and should yield similar results.
    """
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put',
        hex_key=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        'k': 1,
        'initialization': 'Furthest',
        'destination_key': 'KMeansModel.hex',
        'max_iter': 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

    # expected center for the single cluster (k=1): the per-column means,
    # plus expected row count (num_rows); None = don't check within-SS
    expected = [([
        -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
        0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
        0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
        0.0021324000161308796, 0.00154], num_rows, None)]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    # inspect doesn't work
    # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'])
    # KMeansModel = inspect['KMeansModel']
    modelView = h2o.nodes[0].kmeans_model_view(model='KMeansModel.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
    model = modelView['model']
    clusters = model['centers']
    within_cluster_variances = model['within_cluster_variances']
    total_within_SS = model['total_within_SS']
    print "within_cluster_variances:", within_cluster_variances
    print "total_within_SS:", total_within_SS

    # first call stashes the centers; later calls compare against them
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
def test_parse_bounds_csv_fvec(self): h2o.beta_features = True print "Random 0/1 for col1. Last has max col = 1, All have zeros for class." # h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 50, 'cC', 300), (1000, 999, 'cC', 300), (1000, 1000, 'cA', 300), # (1000, 100000, 'cB', 300), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below synSumList = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE********************** parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # INSPECT******************* inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) numCols = inspect['numCols'] numRows = inspect['numRows'] print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) iCols = inspect['cols'] iStats = [] for stats in iCols: iName = stats['name'] # just touching to make sure they are there iNaCnt = stats['naCnt'] iMin = float(stats['min']) iMax = float(stats['max']) iMean = float(stats['mean']) iStats.append({ 'name': iName, 'naCnt': iNaCnt, 'min': iMin, 'max': iMax, 'mean': iMean, }) # SUMMARY******************************** summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=colCount, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows)) columnsList = summaryResult['summaries'] self.assertEqual( colCount, len(columnsList), msg= "generated %s cols (including output). 
summary has %s columns" % (colCount, len(columnsList))) c = 0 for column in columnsList: # get info from the inspect col for comparison iMin = iStats[c]['min'] iMax = iStats[c]['max'] iMean = iStats[c]['mean'] iNaCnt = iStats[c]['naCnt'] c += 1 colname = column['colname'] stats = column['stats'] stype = column['type'] hstep = column['hstep'] hbrk = column['hstep'] hstart = column['hstart'] smax = stats['maxs'] smin = stats['mins'] sd = stats['sd'] smean = stats['mean'] # no zeroes if enum, but we're not enum here zeros = stats['zeros'] self.assertEqual( iMin, smin[0], "inspect min %s != summary min %s" % (iMin, smin)) self.assertEqual( iMax, smax[0], "inspect max %s != summary max %s" % (iMax, smax)) self.assertEqual( iMean, smean, "inspect mean %s != summary mean %s" % (iMean, smean)) # no comparison for 'zeros' # now, also compare expected values if colname == "V1": synNa = 0 # can reverse-engineer the # of zeroes, since data is always 1 synSum = synSumList[ 1] # could get the same sum for all ccols synZeros = numRows - synSum synSigma = 0.50 synMean = (synSum + 0.0) / numRows synMin = [0.0, 1.0] synMax = [1.0, 0.0] elif colname == "V2": synSum = 0 synSigma = 0 synMean = 0 if DO_NAN: synZeros = 0 synNa = numRows synMin = [] synMax = [] else: synZeros = numRows synNa = 0 synMin = [0.0] synMax = [0.0] # a single 1 in the last col elif colname == "V" + str(colCount - 1): # h2o puts a "V" prefix synNa = 0 synSum = synSumList[colCount - 1] synZeros = numRows - 1 # stddev.p # http://office.microsoft.com/en-us/excel-help/stdev-p-function-HP010335772.aspx synMean = 1.0 / numRows # why does this need to be a 1 entry list synSigma = math.sqrt(pow((synMean - 1), 2) / numRows) print "last col with single 1. 
synSigma:", synSigma synMin = [0.0, 1.0] synMax = [1.0, 0.0] else: synNa = 0 synSum = 0 synZeros = numRows synSigma = 0.0 synMean = 0.0 synMin = [0.0] synMax = [0.0] if DO_MEAN: self.assertAlmostEqual( float(smean), synMean, places=6, msg='col %s mean %s is not equal to generated mean %s' % (colname, smean, synMean)) # why are min/max one-entry lists in summary result. Oh..it puts N min, N max self.assertTrue( smin >= synMin, msg='col %s min %s is not >= generated min %s' % (colname, smin, synMin)) self.assertTrue( smax <= synMax, msg='col %s max %s is not <= generated max %s' % (colname, smax, synMax)) # reverse engineered the number of zeroes, knowing data was always 1 if present? if colname == "V65536" or colname == "V65537": print "columns around possible zeros mismatch:", h2o.dump_json( columns) self.assertEqual( zeros, synZeros, msg='col %s zeros %s is not equal to generated zeros %s' % (colname, zeros, synZeros))
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    """Validate and optionally print the contents of a Summary response.

    Handles both response formats: the beta_features ('summaries' list) and
    the older VA format ('summary'/'columns').  For numeric columns it checks
    the stats for bad floating-point values (NaN/Inf where not allowed),
    warns when mins/maxs come back empty, and (VA path) checks percentile
    values fall within [min, max].

    summaryResult -- dict response from the Summary endpoint (must be non-empty)
    noPrint       -- suppress the per-column dump when True
    numCols       -- expected number of columns (count check currently disabled)
    numRows       -- expected number of rows; used to detect all-NA columns

    Raises Exception on empty input, or (VA path) on empty mins / bad percentiles.
    """
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")
    if h2o.beta_features:
        # names = summaryResult['names']
        # means = summaryResult['means']
        summaries = summaryResult['summaries']
        # what if we didn't get the full # of cols in this summary view?
        # I guess the test should deal with that
        # NOTE: '1 == 0 and' deliberately disables this check
        if 1 == 0 and numCols and (len(summaries) != numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries)))
        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            h2o_exec.checkForBadFP(nacnt,
                'nacnt for colname: %s stattype: %s' % (colname, stattype))

            if stattype == 'Enum':
                cardinality = stats['cardinality']
                h2o_exec.checkForBadFP(cardinality,
                    'cardinality for colname: %s stattype: %s' % (colname, stattype))
            else:
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                # check for NaN/Infinity in some of these
                # apparently we can get NaN in the mean for a numerica col with all NA?
                h2o_exec.checkForBadFP(mean,
                    'mean for colname: %s stattype: %s' % (colname, stattype),
                    nanOkay=True, infOkay=True)
                h2o_exec.checkForBadFP(sd,
                    'sd for colname: %s stattype %s' % (colname, stattype),
                    nanOkay=True, infOkay=True)
                h2o_exec.checkForBadFP(zeros,
                    'zeros for colname: %s stattype %s' % (colname, stattype))

                if numRows and (nacnt == numRows):
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (colname, stattype)
                else:
                    # empty mins/maxs only warn here (the raise is disabled)
                    if not mins:
                        print h2o.dump_json(column)
                        # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                        print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows)
                    if not maxs:
                        # this is failing on maprfs best buy...why? (va only?)
                        print h2o.dump_json(column)
                        # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))
                        print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows)

            # histogram fields are present for both Enum and numeric columns
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            if not noPrint:
                print "\n\n************************"
                print "colname:", colname
                print "coltype:", coltype
                print "nacnt:", nacnt
                print "stattype:", stattype
                if stattype == 'Enum':
                    print "cardinality:", cardinality
                else:
                    print "mean:", mean
                    print "sd:", sd
                    print "zeros:", zeros
                    print "mins:", mins
                    print "maxs:", maxs
                    print "pct:", pct
                    print "pctile:", pctile
                # histogram stuff
                print "hstart:", hstart
                print "hstep:", hstep
                print "hbrk:", hbrk
                print "hcnt:", hcnt
    else:
        # older (VA) response format
        summary = summaryResult['summary']
        columnList = summary['columns']
        # can't get the right number of columns in summary? have to ask for more cols (does va support > 1000)
        # NOTE: '1 == 0 and' deliberately disables this check
        if 1 == 0 and numCols and (len(columnList) != numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(columnList)))
        for column in columnList:
            N = column['N']
            # self.assertEqual(N, rowCount)
            name = column['name']
            stype = column['type']
            histogram = column['histogram']
            bin_size = histogram['bin_size']
            bin_names = histogram['bin_names']
            # if not noPrint:
            #     for b in bin_names:
            #         print "bin_name:", b
            bins = histogram['bins']
            nbins = histogram['bins']
            if not noPrint:
                print "\n\n************************"
                print "N:", N
                print "name:", name
                print "type:", stype
                print "bin_size:", bin_size
                print "len(bin_names):", len(bin_names), bin_names
                print "len(bins):", len(bins), bins
                print "len(nbins):", len(nbins), nbins

            # not done if enum
            if stype != "enum":
                zeros = column['zeros']
                na = column['na']
                maxs = column['max']
                mins = column['min']
                mean = column['mean']
                sigma = column['sigma']
                if not noPrint:
                    print "zeros:", zeros
                    print "na:", na
                    print "maxs:", maxs
                    print "mins:", mins
                    print "mean:", mean
                    print "sigma:", sigma

                if numRows and (na == numRows):
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (name, stype)
                else:
                    # here (unlike the beta path) empty mins is fatal
                    if not mins:
                        print h2o.dump_json(column)
                        raise Exception(
                            "Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stype, N, na, numRows))
                    if not maxs:
                        print h2o.dump_json(column)
                        # bestbuy dataset in maprfs is failing this ..for va only? not sure why. some nas?
                        print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stype, N, na, numRows)

                # sometimes we don't get percentiles? (if 0 or 1 bins?)
                if len(bins) >= 2:
                    percentiles = column['percentiles']
                    thresholds = percentiles['thresholds']
                    values = percentiles['values']

                    if not noPrint:
                        # h2o shows 5 of them, ordered
                        print "len(max):", len(maxs), maxs
                        print "len(min):", len(mins), mins
                        print "len(thresholds):", len(thresholds), thresholds
                        print "len(values):", len(values), values

                    for v in values:
                        # 0 is the most max or most min
                        if not v >= mins[0]:
                            m = "Percentile value %s should all be >= the min dataset value %s" % (v, mins[0])
                            raise Exception(m)
                        if not v <= maxs[0]:
                            m = "Percentile value %s should all be <= the max dataset value %s" % (v, maxs[0])
                            raise Exception(m)
def test_GBMGrid_basic_many(self): trainFilename = 'prostate.csv' train_key = 'prostate.hex' timeoutSecs = 300 csvPathname = "logreg/" + trainFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put') pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList labelListUsed = list(labelList) numColsUsed = numCols parameters = { 'validation_frame': train_key, 'ignored_columns': '[ID]', # this has to have [] 'score_each_iteration': True, 'response_column': 'CAPSULE', 'do_classification': 1 if DO_CLASSIFICATION else 0, # 'balance_classes': # 'max_after_balance_size': 'ntrees': '8, 10', 'max_depth': '8, 9', 'min_rows': '1, 2', 'nbins': 40, 'learn_rate': '.1, .2', # FIX! doesn't like it? # 'loss': 'Bernoulli', # FIX..no variable importance for GBM yet? 'variable_importance': False, # 'seed': } jobs = [] # kick off 5 of these GBM grid jobs, with different tree choices start = time.time() totalGBMGridJobs = 0 for i in range(5): modelKey = 'GBMGrid_prostate_%s', i bmResult = h2o.n0.build_model(algo='gbm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') print "GBMResult:", h2o.dump_json(bm) job_key = bm.job_key model_key = bm.destination_key jobs.append((job_key, model_key)) totalGBMGridJobs += 1 h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start print "All GBM jobs completed in", elapsed, "seconds." 
print "totalGBMGridJobs:", totalGBMGridJobs for job_key, model_key in jobs: modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mmResultShort = mmResult['model_metrics'][0] del mmResultShort['frame'] # too much! mm = OutputObj(mmResultShort, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def pollWaitJobs(pattern=None, timeoutSecs=30, pollTimeoutSecs=30, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=-1):
    """Poll jobs_admin on node 0 until no job is busy (or until timeout).

    pattern          -- if given, collect destination_keys containing this substring
    timeoutSecs      -- overall wait budget; exceeding it raises Exception
    pollTimeoutSecs  -- per-request timeout for the jobs_admin call
    retryDelaySecs   -- sleep between polls
    benchmarkLogging -- optional list (e.g. ['cpu','disk','jstack']) saved each poll
    stallForNJobs    -- if != -1, stop waiting once the number of in-progress
                        jobs drops to this count (allows returning while some
                        jobs still run)

    Returns the list of matching destination_keys from the last poll.
    Raises Exception if busy jobs remain after timeoutSecs.
    """
    anyBusy = True
    waitTime = 0
    while (anyBusy):
        patternKeys = []
        # timeout checking has to move in here now! just count loops
        anyBusy = False
        a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs)
        ## print "jobs_admin():", h2o.dump_json(a)
        jobs = a['jobs']
        stall = -1
        if stallForNJobs != -1:
            # count jobs still in progress (empty end_time == busy)
            stall = 0
            for j in jobs:
                stall += 1 if j['end_time'] == '' else 0
            if stall <= stallForNJobs:
                break
            print str(stall), " jobs in progress.", "Waiting to poll on ", str(
                stallForNJobs), " jobs."
        for j in jobs:
            ### h2o.verboseprint(j)
            # save the destination keys for any GLMModel in progress
            if pattern and pattern in j['destination_key']:
                patternKeys.append(j['destination_key'])

            if j['end_time'] == '':
                anyBusy = True
                h2o.verboseprint("waiting", waitTime, "secs, still not done - ",\
                    "destination_key:", j['destination_key'], \
                    "progress:", j['progress'], \
                    "cancelled:", j['cancelled'],\
                    "end_time:", j['end_time'])
            else:
                # NOTE(review): this decrements 'stall' (the busy-job count)
                # for each *completed* job, then re-checks the stall target —
                # looks like it can undercount; confirm intended semantics.
                if stallForNJobs != -1:
                    stall -= 1
                    if stall <= stallForNJobs:
                        anyBusy = False
                        break
                    print str(
                        stall
                    ), " jobs in progress.", "Waiting to poll on ", str(
                        stallForNJobs), " jobs."

        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if (anyBusy and waitTime > timeoutSecs):
            print h2o.dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)
    return patternKeys
def test_quantile_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. 
expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" h2o.beta_features = True for i, threshold in enumerate(thresholds): # FIX! 
do two of the same?..use same one for the 2nd if i!=0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols,1) self.assertEqual(numRows,len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
def test_parse_bounds_libsvm (self): print "Random 0/1 for col1. Last has max col = 1, All have zeros for class." ## h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 100, 'cA', 300), (100000, 100, 'cB', 300), (100, 100000, 'cC', 300), ] # h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] row_size = inspect['row_size'] value_size_bytes = inspect['value_size_bytes'] print "\n" + csvPathname, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols), \ " value_size_bytes:", "{:,}".format(value_size_bytes), \ " row_size:", "{:,}".format(row_size) expectedRowSize = num_cols * 1 # plus output expectedValueSize = expectedRowSize * num_rows self.assertEqual(row_size, expectedRowSize, msg='row_size %s is not expected num_cols * 1 byte: %s' % \ (row_size, expectedRowSize)) self.assertEqual(value_size_bytes, expectedValueSize, msg='value_size_bytes %s is not expected row_size * rows: %s' % \ (value_size_bytes, expectedValueSize)) summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output). 
parsed to %s cols" % (colNumberMax+1, num_cols)) self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows)) summary = summaryResult['summary'] columnsList = summary['columns'] self.assertEqual(colNumberMax+1, len(columnsList), msg="generated %s cols (including output). summary has %s columns" % (colNumberMax+1, len(columnsList))) for columns in columnsList: N = columns['N'] # self.assertEqual(N, rowCount) name = columns['name'] stype = columns['type'] histogram = columns['histogram'] bin_size = histogram['bin_size'] bin_names = histogram['bin_names'] bins = histogram['bins'] nbins = histogram['bins'] # definitely not enums zeros = columns['zeros'] na = columns['na'] smax = columns['max'] smin = columns['min'] mean = columns['mean'] sigma = columns['sigma'] # a single 1 in the last col if name == "V" + str(colNumberMax): # h2o puts a "V" prefix synZeros = num_rows - 1 synSigma = None # not sure..depends on the # rows somehow (0 count vs 1 count) synMean = 1.0/num_rows # why does this need to be a 1 entry list synMin = [0.0, 1.0] synMax = [1.0, 0.0] elif name == ("V1"): # can reverse-engineer the # of zeroes, since data is always 1 synSum = synColSumDict[1] # could get the same sum for all ccols synZeros = num_rows - synSum synSigma = 0.50 synMean = (synSum + 0.0)/num_rows synMin = [0.0, 1.0] synMax = [1.0, 0.0] else: synZeros = num_rows synSigma = 0.0 synMean = 0.0 synMin = [0.0] synMax = [0.0] # print zeros, synZeros self.assertAlmostEqual(float(mean), synMean, places=6, msg='col %s mean %s is not equal to generated mean %s' % (name, mean, 0)) # why are min/max one-entry lists in summary result. Oh..it puts N min, N max self.assertEqual(smin, synMin, msg='col %s min %s is not equal to generated min %s' % (name, smin, synMin)) # reverse engineered the number of zeroes, knowing data was always 1 if present? 
if name == "V65536" or name == "V65537": print "columns around possible zeros mismatch:", h2o.dump_json(columns) self.assertEqual(zeros, synZeros, msg='col %s zeros %s is not equal to generated zeros count %s' % (name, zeros, synZeros)) self.assertEqual(stype, 'number', msg='col %s type %s is not equal to %s' % (name, stype, 'number')) # our random generation will have some variance for col 1. so just check to 2 places if synSigma: self.assertAlmostEqual(float(sigma), synSigma, delta=0.03, msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma)) if CHECK_MAX: self.assertEqual(smax, synMax, msg='col %s max %s is not equal to generated max %s' % (name, smax, synMax)) self.assertEqual(0, na, msg='col %s num_missing_values %d should be 0' % (name, na))
def test_hdfs_cdh5_fvec(self):
    """Round-trip datasets through HDFS: parse from HDFS, export back to
    HDFS (/tmp2), then re-import and re-parse the exported copy.

    No numeric assertions — the test passes if every import/parse/export
    completes within its timeout.  Prints elapsed time for each step.
    """
    h2o.beta_features = True
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    # (filename, per-file export timeout in seconds)
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        ("covtype.169x.data", 600),
        ("prostate_long_1G.csv", 600),
        ("airlines_all.csv", 900),
    ]

    # pick 8 randomly!
    # NOTE: '1 == 0' deliberately disables random sampling
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    print h2o.dump_json(d)
    d = h2i.import_only(path="datasets/*", schema='hdfs', timeoutSecs=1000)
    print h2o.dump_json(d)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs',
            hex_key=hex_key, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

        # export the parsed frame back to HDFS
        start = time.time()
        print "Saving", csvFilename, 'to HDFS'
        print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
        csvPathname = "tmp2/a%s.csv" % trial
        path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
        h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
        print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
        trial += 1

        # re-import and re-parse what we just exported
        print "Re-Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a2.hex"
        time.sleep(2)
        d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
        print h2o.dump_json(d)
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs',
            hex_key=hex_key, timeoutSecs=1000)
        print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
def test_kmeans_iris_fvec(self): csvFilename = 'iris.csv' csvPathname = 'iris/' + csvFilename print "\nStarting", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key) k = 3 ignored_cols = 'C5' for trial in range(3): # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = { 'ignored_cols': ignored_cols, # ignore the output 'k': k, 'max_iter': 25, 'initialization': 'Furthest', 'destination_key': 'iris.hex', 'seed': 0, } timeoutSecs = 90 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ # if ignored_cols isn't used # ([5, 3.4, 1.46, 0.244, 0.0], 50, 15.24) , # ([5.9, 2.76, 4.26, 1.33, 1.02], 51, 32.9) , # ([6.6, 2.98, 5.57, 2.03, 2.0], 49, 39.15) , ([ 5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999 ], 50, 15.240400000000003), ([ 5.901612903225807, 2.748387096774194, 4.393548387096775, 1.4338709677419357 ], 62, 39.82096774193549), ([ 6.8500000000000005, 3.073684210526315, 5.742105263157894, 2.0710526315789473 ], 38, 23.87947368421053), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) gs = h2o.nodes[0].gap_statistic(source=hex_key, ignored_cols=ignored_cols, k_max=k + 1) print "gap_statistic:", h2o.dump_json(gs) k_best = gs['gap_model']['k_best'] self.assertTrue(k_best != 0, msg="k_best shouldn't be 0: %s" % k_best)
def test_c5_KMeans_sphere_h1m(self):
    """Large-scale KMeans benchmark on generated sphere data.

    For 6 trials: re-parses a big synthetic dataset (HDFS or local folder
    depending on FROM_HDFS), logs parse throughput to the benchmark log,
    then (when DO_KMEANS) runs a 15-cluster KMeans with a rotating
    initialization scheme and compares centers/sizes against hard-coded
    expected values.
    """
    # a kludge
    h2o.setup_benchmark_log()

    if DO_REAL:
        csvFilename = 'syn_sphere_gen_real_1.49M.csv'
    else:
        csvFilename = 'syn_sphere_gen_h1m.csv'
    # used only to compute the MB/sec number below
    # (presumably the on-disk size of the dataset — TODO confirm)
    totalBytes = 183538602156

    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    # (center, size, withinss) per expected cluster
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    # successive overrides kept for history; only the last assignment
    # ([] — logging disabled) takes effect
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)
        else:
            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)

        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
            numRows=numRows, numCols=numCols, timeoutSecs=300)
        h2o_cmd.infoFromSummary(summary)

        # KMeans ****************************************
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 30,
            # 'normalize': 1,
            'normalize': 0,  # temp try
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # we get NaNs if whole col is NA
            'ignored_cols': 'C1',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        # rotate the initialization scheme per trial
        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeans)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial),
            csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
            allowError=True, trial=trial)

        if DELETE_KEYS_EACH_ITER:
            h2i.delete_keys_at_all_nodes()
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    """Validate and (optionally) print per-column info from a Summary2 result.

    For each column summary: checks nacnt / cardinality / mean / sd / zeros
    for bad floating-point values (NaN/Inf), warns on empty mins/maxs
    (unless the whole column is NAs), and prints stats plus histogram info
    unless noPrint.

    Args:
        summaryResult: json dict from a Summary2 request (needs 'summaries').
        noPrint: suppress the per-column printing.
        numCols: expected number of columns (only used by a disabled check).
        numRows: if given and equal to a column's nacnt, the column is
            treated as all-NAs and min/max checks are skipped.

    Raises:
        Exception: if summaryResult is empty/falsy.
    """
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")
    summaries = summaryResult['summaries']
    # what if we didn't get the full # of cols in this summary view?
    # I guess the test should deal with that (check deliberately disabled via 1 == 0)
    if 1 == 0 and numCols and (len(summaries) != numCols):
        raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries)))
    for column in summaries:
        colname = column['colname']
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']
        h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype))
        if stattype == 'Enum':
            cardinality = stats['cardinality']
            h2o_exec.checkForBadFP(cardinality, 'cardinality for colname: %s stattype: %s' % (colname, stattype))
        else:
            mean = stats['mean']
            sd = stats['sd']
            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            pctile = stats['pctile']
            # check for NaN/Infinity in some of these
            # apparently we can get NaN in the mean for a numeric col with all NAs?
            h2o_exec.checkForBadFP(mean, 'mean for colname: %s stattype: %s' % (colname, stattype),
                nanOkay=True, infOkay=True)
            h2o_exec.checkForBadFP(sd, 'sd for colname: %s stattype %s' % (colname, stattype),
                nanOkay=True, infOkay=True)
            h2o_exec.checkForBadFP(zeros, 'zeros for colname: %s stattype %s' % (colname, stattype))
            if numRows and (nacnt == numRows):
                print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (colname, stattype)
            else:
                # empty mins/maxs would normally be an error; only warn for now
                if not mins:
                    print h2o.dump_json(column)
                    # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                    print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows)
                if not maxs:
                    # this is failing on maprfs best buy...why? (va only?)
                    print h2o.dump_json(column)
                    # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))
                    print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows)
            # histogram fields (numeric cols only)
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']
        if not noPrint:
            print "\n\n************************"
            print "colname:", colname
            print "coltype:", coltype
            print "nacnt:", nacnt
            print "stattype:", stattype
            if stattype == 'Enum':
                print "cardinality:", cardinality
            else:
                print "mean:", mean
                print "sd:", sd
                print "zeros:", zeros
                print "mins:", mins
                print "maxs:", maxs
                print "pct:", pct
                print "pctile:", pctile
                # histogram stuff
                print "hstart:", hstart
                print "hstep:", hstep
                print "hbrk:", hbrk
                print "hcnt:", hcnt
def test_get_cloud(self): # Ask each node for jstack statistics. do it 100 times SLEEP_AFTER = False GET_CLOUD_ALL_NODES = True TRIALMAX = 25 NODE = 1 PRINT_GET_CLOUD = True eList = [] xList = [] sList = [] for trial in range(TRIALMAX): print "Starting Trial", trial print "Just doing node[%s]" % NODE getCloudFirst = None for i,n in enumerate(h2o.nodes): if GET_CLOUD_ALL_NODES or i==NODE: # just track times on 0 # we just want the string start = time.time() getCloud = n.get_cloud() elapsed = int(1000 * (time.time() - start)) # milliseconds print "get_cloud completes to node", i, "in", "%s" % elapsed, "millisecs" getCloudString = json.dumps(getCloud) if PRINT_GET_CLOUD: print h2o.dump_json(getCloud) h2o.verboseprint(json.dumps(getCloud,indent=2)) if i==NODE: # just track times on 0 sList.append(len(getCloudString)) xList.append(trial) eList.append(elapsed) if SLEEP_AFTER: delay = 1 print "Sleeping for", delay, "sec" time.sleep(delay) if h2o.python_username=='kevin': import pylab as plt if eList: print "xList", xList print "eList", eList print "sList", sList plt.figure() plt.plot (xList, eList) plt.xlabel('trial') plt.ylabel('get_cloud completion latency (millisecs)') plt.title('Back to Back get_cloud requests to node['+str(NODE)+']') plt.draw() plt.figure() plt.plot (xList, sList) plt.xlabel('trial') plt.ylabel('node['+str(NODE)+'] get_cloud response string length') plt.title('Back to Back get_cloud requests to node['+str(NODE)+']') plt.title('Back to Back get_cloud') plt.draw() plt.show()
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs): # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter # h2o GLM will verboseprint the result and print errors. # so don't have to do that # different when cross validation is used? No trainingErrorDetails? if h2o.beta_features: GLMModel = glm['glm_model'] else: GLMModel = glm['GLMModel'] if not GLMModel: raise Exception("GLMModel didn't exist in the glm response? %s" % h2o.dump_json(glm)) warnings = None if 'warnings' in GLMModel and GLMModel['warnings']: warnings = GLMModel['warnings'] # stop on failed x = re.compile("failed", re.IGNORECASE) # don't stop if fail to converge c = re.compile("converge", re.IGNORECASE) for w in warnings: print "\nwarning:", w if re.search(x,w) and not allowFailWarning: if re.search(c,w): # ignore the fail to converge warning now pass else: # stop on other 'fail' warnings (are there any? fail to solve? raise Exception(w) # for key, value in glm.iteritems(): print key # not in GLMGrid? # FIX! don't get GLMParams if it can't solve? if h2o.beta_features: GLMParams = GLMModel['glm'] else: GLMParams = GLMModel["GLMParams"] family = GLMParams["family"] if h2o.beta_features: # number of submodels = number of lambda # min of 2. lambda_max is first submodels = GLMModel['submodels'] lambdas = GLMModel['lambdas'] # since all our tests?? only use one lambda, the best_lamda_idx should = 1 best_lambda_idx = GLMModel['best_lambda_idx'] print "best_lambda_idx:", best_lambda_idx lambda_max = GLMModel['lambda_max'] print "lambda_max:", lambda_max # currently lambda_max is not set by tomas. 
..i.e.not valid if 1==0 and lambda_max <= lambdas[best_lambda_idx]: raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, lambdas[best_lambda_idx])) # submodels0 = submodels[0] # submodels1 = submodels[-1] # hackery to make it work when there's just one if (best_lambda_idx >= len(lambdas)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(lambdas))) if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels))) submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one iterations = submodels1['iteration'] else: iterations = GLMModel['iterations'] print "GLMModel/iterations:", iterations # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter if maxExpectedIterations is not None and iterations > maxExpectedIterations: raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) ) if h2o.beta_features: if 'validation' not in submodels1: raise Exception("Should be a 'validation' key in submodels1: %s" % h2o.dump_json(submodels1)) validationsList = submodels1['validation'] validations = validationsList else: # pop the first validation from the list if 'validations' not in GLMModel: raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel)) validationsList = GLMModel['validations'] # don't want to modify validationsList in case someone else looks at it validations = validationsList[0] # xval. compare what we asked for and what we got. n_folds = kwargs.setdefault('n_folds', None) # not checked in v2? if not h2o.beta_features: if not 'xval_models' in validations: if n_folds > 1: raise Exception("No cross validation models returned. 
Asked for "+n_folds) else: xval_models = validations['xval_models'] if n_folds and n_folds > 1: if len(xval_models) != n_folds: raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds) else: # should be default 10? if len(xval_models) != 10: raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10") if h2o.beta_features: print "GLMModel/validations" validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance']) validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance']) print "%15s %s" % ("null_deviance:\t", validations['null_deviance']) print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance']) else: print "GLMModel/validations" validations['err'] = h2o_util.cleanseInfNan(validations['err']) validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev']) validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev']) print "%15s %s" % ("err:\t", validations['err']) print "%15s %s" % ("nullDev:\t", validations['nullDev']) print "%15s %s" % ("resDev:\t", validations['resDev']) # threshold only there if binomial? 
# auc only for binomial if family=="binomial": print "%15s %s" % ("auc:\t", validations['auc']) if h2o.beta_features: best_threshold = validations['best_threshold'] thresholds = validations['thresholds'] print "%15s %s" % ("best_threshold:\t", best_threshold) # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break assert best_index!=None, "%s %s" % (best_threshold, thresholds) print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1] submodels = glm['glm_model']['submodels'] cms = submodels[0]['validation']['_cms'] assert best_index<len(cms), "%s %s" % (best_index, len(cms)) # if we want 0.5..rounds to int # mid = len(cms)/2 # cm = cms[mid] cm = cms[best_index] print "cm:", h2o.dump_json(cm['_arr']) predErr = cm['_predErr'] classErr = cm['_classErr'] # compare to predErr pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); print "predErr:", predErr print "calculated pctWrong from cm:", pctWrong print "classErr:", classErr # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) else: print "%15s %s" % ("threshold:\t", validations['threshold']) if family=="poisson" or family=="gaussian": print "%15s %s" % ("aic:\t", validations['aic']) if not h2o.beta_features: if math.isnan(validations['err']): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err']) raise Exception(emsg) if math.isnan(validations['resDev']): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev']) raise Exception(emsg) # legal? 
if math.isnan(validations['nullDev']): pass # get a copy, so we don't destroy the original when we pop the intercept if h2o.beta_features: coefficients_names = GLMModel['coefficients_names'] idxs = submodels1['idxs'] column_names = coefficients_names # always check both normalized and normal coefficients norm_beta = submodels1['norm_beta'] if norm_beta and len(column_names)!=len(norm_beta): print len(column_names), len(norm_beta) raise Exception("column_names and normalized_norm_beta from h2o json not same length. column_names: %s normalized_norm_beta: %s" % (column_names, norm_beta)) beta = submodels1['beta'] if len(column_names)!=len(beta): print len(column_names), len(beta) raise Exception("column_names and beta from h2o json not same length. column_names: %s beta: %s" % (column_names, beta)) # test wants to use normalized? if doNormalized: beta_used = norm_beta else: beta_used = beta coefficients = {} # create a dictionary with name, beta (including intercept) just like v1 for n,b in zip(column_names, beta_used): coefficients[n] = b print "coefficients:", coefficients print "beta:", beta print "norm_beta:", norm_beta print "intercept demapping info:", \ "column_names[-i]:", column_names[-1], \ "idxs[-1]:", idxs[-1], \ "coefficients_names[[idxs[-1]]:", coefficients_names[idxs[-1]], \ "beta_used[-1]:", beta_used[-1], \ "coefficients['Intercept']", coefficients['Intercept'] # idxs has the order for non-zero coefficients, it's shorter than beta_used and column_names for i in idxs: if beta_used[i]==0.0: raise Exception("idxs shouldn't point to any 0 coefficients i: %s beta_used[i]:" (i, beta_used[i])) intercept = coefficients.pop('Intercept', None) # intercept demapping info: idxs[-1]: 54 coefficient_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099 # the last one shoudl be 'Intercept' ? 
column_names.pop() else: if doNormalized: coefficients = GLMModel['normalized_coefficients'].copy() else: coefficients = GLMModel['coefficients'].copy() column_names = GLMModel['column_names'] # get the intercept out of there into it's own dictionary intercept = coefficients.pop('Intercept', None) print "First intercept:", intercept # have to skip the output col! get it from kwargs # better always be there! if h2o.beta_features: y = kwargs['response'] else: y = kwargs['y'] # the dict keys are column headers if they exist...how to order those? new: use the 'column_names' # from the response # Tomas created 'column_names which is the coefficient list in order. # Just use it to index coefficients! works for header or no-header cases # I guess now we won't print the "None" cases for dropped columns (constant columns!) # Because Tomas doesn't get everything in 'column_names' if dropped by GLMQuery before # he gets it? def add_to_coefficient_list_and_string(c, cList, cString): if c in coefficients: cValue = coefficients[c] cValueString = "%s: %.5e " % (c, cValue) else: print "Warning: didn't see '" + c + "' in json coefficient response.",\ "Inserting 'None' with assumption it was dropped due to constant column)" cValue = None cValueString = "%s: %s " % (c, cValue) cList.append(cValue) # we put each on newline for easy comparison to R..otherwise keep condensed if prettyPrint: cValueString = "H2O coefficient " + cValueString + "\n" # not mutable? 
return cString + cValueString # creating both a string for printing and a list of values cString = "" cList = [] # print in order using col_names # column_names is input only now..same for header or no header, or expanded enums for c in column_names: cString = add_to_coefficient_list_and_string(c, cList, cString) if prettyPrint: print "\nH2O intercept:\t\t%.5e" % intercept print cString else: if not noPrint: print "\nintercept:", intercept, cString print "\nTotal # of coefficients:", len(column_names) # pick out the coefficent for the column we enabled for enhanced checking. Can be None. # FIX! temporary hack to deal with disappearing/renaming columns in GLM if (not allowZeroCoeff) and (colX is not None): absXCoeff = abs(float(coefficients[str(colX)])) self.assertGreater(absXCoeff, 1e-26, ( "abs. value of GLM coefficients['" + str(colX) + "'] is " + str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) )) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater(absIntercept, 1e-26, ( "abs. value of GLM coefficients['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" )) # this is good if we just want min or max # maxCoeff = max(coefficients, key=coefficients.get) # for more, just invert the dictionary and ... if (len(coefficients)>0): maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1] print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey] minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1] print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey] else: print "Warning, no coefficients returned. Must be intercept only?" # many of the GLM tests aren't single column though. # quick and dirty check: if all the coefficients are zero, # something is broken # intercept is in there too, but this will get it okay # just sum the abs value up..look for greater than 0 # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff? 
if (not allowZeroCoeff) and (len(coefficients)>1): s = 0.0 for c in coefficients: v = coefficients[c] s += abs(float(v)) self.assertGreater(s, 1e-26, ( "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26" )) if h2o.beta_features: print "submodels1, run_time (milliseconds):", submodels1['run_time'] else: print "GLMModel model time (milliseconds):", GLMModel['model_time'] print "GLMModel validation time (milliseconds):", validations['val_time'] print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time'] # shouldn't have any errors h2o.check_sandbox_for_errors() return (warnings, cList, intercept)
def test_anomaly_uniform_w_NA(self):
    """Autoencoder anomaly detection on synthetic uniform data with NAs.

    For each (rows, cols, key, min, max) config: writes a random csv,
    parses it, trains a deep-learning autoencoder, runs the Anomaly page
    against the trained model, and checks the anomaly output frame shape
    (one column; row count inflated by NA_ROW_RATIO — presumably NA rows
    injected by write_syn_dataset).
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, expectedMin, expectedMax)
    tryList = [
        (ROWS, COLS, 'x.hex', 1, 20000),
        (ROWS, COLS, 'x.hex', -5000, 0),
        (ROWS, COLS, 'x.hex', -100000, 100000),
        (ROWS, COLS, 'x.hex', -1, 1),
        (ROWS, COLS, 'A.hex', 1, 100),
        (ROWS, COLS, 'A.hex', -99, 99),
        (ROWS, COLS, 'B.hex', 1, 10000),
        (ROWS, COLS, 'B.hex', -100, 100),
        (ROWS, COLS, 'C.hex', 1, 100000),
        (ROWS, COLS, 'C.hex', -101, 101),
    ]
    trial = 1
    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]
        print "numRows:", numRows, "numCols:", numCols

        model_key = "m.hex"
        # deep learning autoencoder params
        kwargs = {
            'ignored_cols': None,
            'response': numCols - 1,
            'classification': 0,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117',
            'adaptive_rate': 0,
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            # 'loss' : 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale' : 0.01,
            'epochs': 2.0,
            'destination_key': model_key,
            # 'validation' : None,
            'score_interval': 10000,
            'autoencoder': 1,
        }
        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end. took", time.time() - start, "seconds"

        # score the training frame against the autoencoder model
        kwargs = {
            'destination_key': "a.hex",
            'source': parseResult['destination_key'],
            'dl_autoencoder_model': model_key,
            'thresh': 1.0
        }
        anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs)
        inspect = h2o_cmd.runInspect(None, "a.hex")
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]
        print "anomaly: numRows:", numRows, "numCols:", numCols
        self.assertEqual(numCols, 1)
        # twice as many rows because of NA injection
        self.assertEqual(numRows, rowCount * (1 + NA_ROW_RATIO))

        # first col has the anomaly info. other cols are the same as orig data
        aSummary = h2o_cmd.runSummary(key='a.hex', cols=0)
        h2o_cmd.infoFromSummary(aSummary)

        print "anomaly:", h2o.dump_json(anomaly)
        trial += 1
        h2i.delete_keys_at_all_nodes()
def import_parse(node=None, schema='local', bucket=None, path=None,
                 src_key=None, hex_key=None,
                 timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5,
                 pollTimeoutSecs=180, noise=None, benchmarkLogging=None,
                 noPoll=False, doSummary=True, noPrint=True, **kwargs):
    """Import a dataset and parse it, optionally following up with a SummaryPage.

    Delegates to import_only() then parse_only(). When doSummary is set and
    the parse was actually polled to completion, also runs Inspect plus
    SummaryPage on the parsed key for extra API coverage. Returns the parse
    result dict.
    """
    if not node:
        node = h2o.nodes[0]

    (impResult, impPattern) = import_only(
        node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs,
        noise, benchmarkLogging, noPoll, doSummary, src_key, **kwargs)

    h2o.verboseprint("importPattern:", impPattern)
    h2o.verboseprint("importResult", h2o.dump_json(impResult))

    parsed = parse_only(
        node, impPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs,
        noise, benchmarkLogging, noPoll, **kwargs)
    h2o.verboseprint("parseResult:", h2o.dump_json(parsed))

    # do SummaryPage here too, just to get some coverage — but only if the
    # parse was polled to completion (otherwise the key isn't ready yet)
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation: find stack traces here
        # rather than having the next request blow up
        h2o.check_sandbox_for_errors()
        inspect = node.inspect(parsed['destination_key'], timeoutSecs=timeoutSecs)
        # key names differ between the beta (fvec) and classic json layouts
        rowsKey, colsKey = ('numRows', 'numCols') if h2o.beta_features else ('num_rows', 'num_cols')
        # pass numCols so summary checking can detect all-NA columns
        # (and skip min/max/mean/sigma checks for them)
        node.summary_page(parsed['destination_key'], timeoutSecs=timeoutSecs,
                          noPrint=noPrint,
                          numRows=inspect[rowsKey], numCols=inspect[colsKey])
        # for now, don't worry about error isolating the summary
    else:
        # isolate a parse from the next thing
        h2o.check_sandbox_for_errors()

    return parsed
def test_summary2_exp(self):
    """Compare Summary (v1) vs Summary2 on exponentially-distributed data.

    Generates single-column exponential datasets (random lambda), runs the
    old Summary and the new Summary2, prints percentile/histogram info from
    both, checks selected quantiles against the expected tuple, and finally
    cross-checks the median (or 0.999 quantile, per DO_MEDIAN) via
    h2o_summ.quantile_comparisons.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    # (rowCount, colCount, hex_key, rangeMin, rangeMax,
    #  expected=(colname, min, 25th, 50th, 75th, max))
    tryList = [
        (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
        (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
        (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
        (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None, None)),
        (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
        (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None, None)),
    ]
    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60
    # rangeMin and rangeMax are not used right now
    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        h2o.beta_features = False
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
            lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        # tolerance for the quantile comparisons below
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        h2o.beta_features = False
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["num_rows"]
        numCols = inspect["num_cols"]

        # Summary (v1)
        h2o.beta_features = False
        summary1Result = h2o_cmd.runSummary(key=hex_key)
        h2o.verboseprint("Summary1 summary1Result:", h2o.dump_json(summary1Result))
        percentiles1 = summary1Result['summary']['columns'][0]['percentiles']
        thresholds1 = percentiles1['thresholds']
        values1 = percentiles1['values']
        print "Summary1 thresholds", h2o_util.twoDecimals(thresholds1)
        print "Summary1 values", h2o_util.twoDecimals(values1)

        # Summary2 (beta/fvec)
        h2o.beta_features = True
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats['pctile']
        # the thresholds h2o used, should match what we expected
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        # histogram info
        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)
        print "Can't estimate the bin distribution"

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        if colname != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=True,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
            )
start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)) elapsed = time.time() - start print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { 'cols': None, 'initialization': 'Furthest', 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \ if __name__ == '__main__': h2o.unit_main()
def test_RF(self): paramsTrainRF = { 'seed': '1234567890', # if I use 100, and just one tree, I should get same results for sorted/shuffled? # i.e. the bagging always sees everything. Means oobe will be messed up # so will specify validation = the 10pct holdout data (could reuse the training data?) 'sample_rate': 1.0, 'ntrees': 3, 'max_depth': 300, 'nbins': 200, 'timeoutSecs': 600, 'response': 'C55', } paramsScoreRF = { 'vactual': 'C55', 'timeoutSecs': 600, } # 90% data trainKey1 = self.loadData(trainDS1) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs) (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1) # self.assertEqual(4.29, classification_error1) # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1) # with new RNG 9/26/14 self.assertEqual(4.4, classification_error1) self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1) self.assertEqual(58101, totalScores1) kwargs = paramsScoreRF.copy() scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) # 10% data trainKey2 = self.loadData(trainDS2) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs) (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2) # self.assertEqual(4.29, classification_error2) # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2) # with new RNG 9/26/14 self.assertEqual(4.4, classification_error1) self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1) self.assertEqual(58101, totalScores2) kwargs = paramsScoreRF.copy() scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(trainResult1, trainResult2, 
with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) # should only be two diffs if len(df.difference) > 2: raise Exception( "Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def test_exec2_log_like_R(self):
    """Time R-style exec expressions repeatedly and optionally plot the results.

    Parses a dataset to key 'r1', runs the module-level initList once, then for
    300 trials re-runs every expression in the module-level exprList (with the
    trial number substituted into 'Last.value' temp names for uniqueness),
    recording per-expression exec time and per-node key/value-size stats.
    Timing curves are plotted via h2o_gbm.plotLists when DO_PLOT is set
    (look for eplot.jpg and fplot.jpg in the local dir).
    """
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'airlines/year2013.csv'
    # alternate (larger) datasets kept for manual experimentation:
    # csvPathname = '1B/reals_100000x1000_15f.data'
    # csvPathname = '1B/reals_1000000x1000_15f.data'
    # csvPathname = '1B/reals_1000000x1_15f.data'
    # csvPathname = '1B/reals_1B_15f.data'
    # csvPathname = '1B/reals_100M_15f.data'
    hex_key = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)
    inspect = h2o_cmd.runInspect(key=hex_key)
    print "numRows:", inspect['numRows']
    print "numCols:", inspect['numCols']
    inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
    print "inspect offset = -1:", h2o.dump_json(inspect)

    # plot-point accumulators: x (trial number) vs. e/f (exec times)
    xList = []
    eList = []
    fList = []
    # one-time setup expressions
    for execExpr in initList:
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
    for trial in range(300):
        for execExpr in exprList:
            # put the trial number into the temp for uniqueness
            execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr)
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            execTime = time.time() - start
            print 'exec took', execTime, 'seconds'
            # snapshot per-node key counts and value sizes from the cloud status
            c = h2o.nodes[0].get_cloud()
            c = c['nodes']
            # print (h2o.dump_json(c))
            k = [i['num_keys'] for i in c]
            v = [i['value_size_bytes'] for i in c]
            print "keys: %s" % " ".join(map(str, k))
            print "value_size_bytes: %s" % " ".join(map(str, v))
            # print "result:", result
            if DO_ORIG:
                # separate curves: 'r1' expressions on eList, 'log' expressions on fList
                if 'r1' in execExpr:
                    xList.append(trial)
                    eList.append(execTime)
                if 'log' in execExpr:
                    fList.append(execTime)
            else:
                xList.append(trial)
                eList.append(execTime)
                fList.append(execTime)

    h2o.check_sandbox_for_errors()

    # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
    if DO_PLOT:
        xLabel = 'trial'
        if DO_ORIG:
            eLabel = 'time: Last.value<trial>.4 = r1[,c(1)]'
            fLabel = 'time: Last.value<trial>.7 = log(Last.value<trial>.6)'
        else:
            eLabel = 'time: Last.value.3 = r2+1'
            fLabel = 'time: Last.value.3 = r2+1'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def sub_c3_nongz_fvec_long(self, csvFilenameList):
    """Benchmark parse (and optionally GLM) on non-gz'ed manyfiles-nflx data.

    For each (csvFilepattern, csvFilename, totalBytes, timeoutSecs) tuple,
    parses the pattern into "A.hex", logs throughput (MB/sec) into the shared
    benchmark log (h2o.cloudPerfH2O), and when DO_GLM is set converts the
    response column to binomial via exec and runs/validates a GLM.
    """
    h2o.beta_features = True  # a kludge
    h2o.setup_benchmark_log()

    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    print "Using nongz'ed files in", importFolderPath

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        if DO_DOUBLE_IMPORT:
            # import once ahead of the parse's own import, to check for fails
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key="A.hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs,
            benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        if totalBytes is not None:
            # log parse throughput into the benchmark log
            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # remove the output too! (378)
            ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
            # 0-based indices -> 1-based "C<n>" column names
            ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))
            GLMkwargs = {
                'ignored_cols': ignore_x,
                'response': 'C379',
                'max_iter': 4,
                'n_folds': 1,
                'family': 'binomial',
                'alpha': 0.2,
                'lambda': 1e-5
            }

            # convert to binomial
            # execExpr="A.hex=%s" % parseResult['destination_key']
            # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            # threshold the response column so it's 0/1 (binomial)
            execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            aHack = {'destination_key': "A.hex"}
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()
            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") result = h2o.dump_json(storeViewResult) 
f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_ddply_plot(self):
    """Run ddply grouped on the first two columns over generated datasets.

    For each config, generates a random dataset, runs the same ddply expression
    twice (a1 and a2), checks the resulting group count is within 20% of the
    maximum possible ((range)^2), asserts a1 == a2 via an exec sum-compare, and
    optionally plots elapsed time vs. group count via h2o_gbm.plotLists.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs)
    tryList = [
        (1000000, 5, 'cD', 0, 10, 30),
        (1000000, 5, 'cD', 0, 20, 30),
        (1000000, 5, 'cD', 0, 30, 30),
        (1000000, 5, 'cD', 0, 40, 30),
        (1000000, 5, 'cD', 0, 50, 30),
        (1000000, 5, 'cD', 0, 70, 30),
        (1000000, 5, 'cD', 0, 100, 30),
        (1000000, 5, 'cD', 0, 130, 30),
        (1000000, 5, 'cD', 0, 160, 30),
        # (1000000, 5, 'cD', 0, 320, 30),
        # starts to fail here. too many groups?
        # (1000000, 5, 'cD', 0, 640, 30),
        # (1000000, 5, 'cD', 0, 1280, 30),
    ]

    ### h2b.browseTheCloud()
    # plot accumulators: x = group count, e/f = ddply elapsed times
    xList = []
    eList = []
    fList = []
    trial = 0
    for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1
        write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

        # PARSE train****************************************
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

        # do it twice..to get the optimal cached delay for time?
        execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
        start = time.time()
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        ddplyElapsed = time.time() - start
        print "ddplyElapsed:", ddplyElapsed

        execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
        start = time.time()
        (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        # num_rows of the ddply result == number of distinct (col1,col2) groups
        groups = execResult['num_rows']
        maxExpectedGroups = ((maxInt - minInt) + 1)**2
        h2o_util.assertApproxEqual(
            groups, maxExpectedGroups, rel=0.2,
            msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups))
        ddplyElapsed = time.time() - start
        print "ddplyElapsed:", ddplyElapsed
        print "execResult", h2o.dump_json(execResult)

        # should be same answer in both cases
        execExpr = "d=sum(a1!=a2)==0"
        (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
        print "execResult", h2o.dump_json(execResult)
        self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

        # xList.append(ntrees)
        trial += 1
        # this is the biggest it might be ..depends on the random combinations
        # groups = ((maxInt - minInt) + 1) ** 2
        xList.append(groups)
        eList.append(ddplyElapsed)
        fList.append(ddplyElapsed)

    if DO_PLOT:
        xLabel = 'groups'
        eLabel = 'ddplyElapsed'
        fLabel = 'ddplyElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_summary2_uniform(self):
    """Check Summary2 stats and quantiles on generated uniform random data.

    For each config, writes a uniform dataset, parses it, runs Summary and the
    quantiles endpoint, and verifies min/max (near-exact), the 25/50/75
    percentiles (within a distribution-variance tolerance), and that the
    histogram bins are roughly uniform. Finally cross-checks the h2o median
    (or .999 quantile) against a scipy/sort computation via h2o_summ.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, 1, 'x.hex', 0.0, 20000.0, ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
        (ROWS, 1, 'x.hex', -5000.0, 0.0, ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
        (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
        (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
        (ROWS, 1, 'A.hex', 1.0, 100.0, ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
        (ROWS, 1, 'A.hex', -99.0, 99.0, ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
        (ROWS, 1, 'B.hex', 1.0, 10000.0, ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
        (ROWS, 1, 'B.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        (ROWS, 1, 'C.hex', 1.0, 100000.0, ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
        (ROWS, 1, 'C.hex', -100.0, 100.0, ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount,
            expectedMin, expectedMax, SEEDPERFILE)
        # adjust the min/max depending on what the min/max actually was!
        # the expected 25%/50%/75% will still be off
        expected[1] = actualMin
        expected[5] = actualMax

        # max error = half the bin size?
        # use this for comparing to sklearn/sort
        expectedRange = expectedMax - expectedMin
        # because of floor and ceil effects we potentially lose 2 bins (worst case)
        # the extra bin for the max value, is an extra bin..ignore
        expectedBin = expectedRange / (MAX_QBINS - 2)
        maxDelta = 1 * expectedBin

        # how much error do we get in the random distribution gen? pain. It's a probability issue
        # smaller error likely with larger # of values.
        # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
        # at actual data
        # this is way too coarse. can't get the distribution tight?
        maxDeltaPlusDistVariance = 10 * maxDelta
        # allow some fuzz in the comparison to scipy/sort
        maxDelta = 1.1 * maxDelta

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        self.assertEqual(colname, expected[0])

        quantile = 0.5 if DO_MEDIAN else .999
        # get both answers since we feed both below for checking
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
            quantile=quantile, max_qbins=MAX_QBINS,
            multiple_pass=2, interpolation_type=7)  # linear
        qresult = q['result']
        qresult_single = q['result_single']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", q['iterations'])
        h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
        print h2o.dump_json(q)

        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']

        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        # these should match exactly except for fp compare error?
        h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [
            0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
        ]

        pctile = stats['pctile']
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
            msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
            msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
            msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
            (expectedMin, expectedMax))

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        # too hard to estimate when there are ints now, due to floor/ceil int alignment?
        # don't check the last two bins
        for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows / len(hcnt)
            # NOTE(review): the assertion compares against rowCount/len(hcnt)
            # while the message reports e (numRows/len(hcnt)) — presumably
            # numRows == rowCount here; confirm.
            self.assertAlmostEqual(b, rowCount / len(hcnt), delta=.01 * rowCount,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1

        # don't check if colname is empty..means it's a string and scipy doesn't parse right?
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                h2oQuantilesApprox=qresult_single,
                h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxDelta,
            )

        # per-trial cleanup so keys don't accumulate across configs
        h2o.nodes[0].remove_all_keys()
def test_GBM_regression_rand2(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set print "Regression!" 
params = { 'response': 'C55', # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0, 'validation': testKey, } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) print "gbmTrainView:", h2o.dump_json(gbmTrainView) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # for regression, the cms are all null, so don't print # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_hdfs_hdp2_1(self):
    """Load files from HDFS, exercise typeahead on the datasets dir, and parse.

    For each file: checks typeahead with limit 2/0/None/-1 against the HDFS
    datasets directory, parses the file, and (when DO_EXPORT is set) exports
    the parsed key back to HDFS under /tmp2 and re-imports/re-parses it.
    """
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        # ("covtype.169x.data", 1200),
        ("prostate_long_1G.csv", 200),
        ("airlines_all.csv", 1200),
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename

        # Do a simple typeahead check on the directory
        # typeaheadResult 2: {
        #   "__meta": {
        #     "schema_name": "TypeaheadV2",
        #     "schema_type": "Iced",
        #     "schema_version": 2
        #   },
        #   "limit": 2,
        #   "matches": [
        #     "hdfs://172.16.2.186/datasets/15Mx2.2k.csv",
        #     "hdfs://172.16.2.186/datasets/1Mx2.2k.NAs.csv"
        #   ],
        #   "src": "hdfs://172.16.2.186/datasets/"
        # }
        typeaheadPath = "hdfs://" + h2o.nodes[
            0].hdfs_name_node + "/datasets/"
        # NOTE(review): bare dump_json below (elsewhere this file uses
        # h2o.dump_json) — presumably imported from h2o at the top of the
        # file; confirm the import exists.
        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=2)
        print "typeaheadResult 2:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) == 2

        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=0)
        print "typeaheadResult 0:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) > 2

        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=None)
        print "typeaheadResult 0:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) > 2

        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=-1)
        print "typeaheadResult -1:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) > 2

        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs',
            hex_key=hex_key, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        if DO_EXPORT:
            start = time.time()
            print "Saving", csvFilename, 'to HDFS'
            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            print "Unique per-user to avoid permission issues"
            username = getpass.getuser()
            csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
            # reuse the file name to avoid running out of space
            csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)
            path = "hdfs://" + h2o.nodes[
                0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
            trial += 1

            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs',
                hex_key=hex_key, timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'