def process_dataset(self, parseResult, Y, e_coefs, e_ndev, e_rdev, e_aic, **kwargs):
    # no regularization
    kwargs['alpha'] = 0
    kwargs['lambda'] = 0
    kwargs['response'] = 'CAPSULE'
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, **kwargs)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
    cstring = "".join([("%.5e " % c) for c in clist])
    h2p.green_print("h2o coefficient list:", cstring)
    h2p.green_print("h2o intercept", "%.5e " % intercept)

    # other stuff in the json response
    # the first submodel is the right one, if only one lambda is provided as a parameter above
    glm_model = glmResult['glm_model']
    submodels = glm_model['submodels'][0]
    validation = submodels['validation']
    null_deviance = validation['null_deviance']
    residual_deviance = validation['residual_deviance']

    errors = []
    # FIX! our null deviance doesn't seem to match
    h2o.verboseprint("Comparing:", null_deviance, e_ndev)
    # if abs(float(nullDev) - e_ndev) > (0.001 * e_ndev):
    #     errors.append('NullDeviance: %f != %s' % (e_ndev, nullDev))

    # FIX! our res deviance doesn't seem to match
    h2o.verboseprint("Comparing:", residual_deviance, e_rdev)
    # if abs(float(resDev) - e_rdev) > (0.001 * e_rdev):
    #     errors.append('ResDeviance: %f != %s' % (e_rdev, resDev))

    # FIX! we don't have an AIC to compare?
    return errors
def test_exec_assign(self): ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) trial = 0 while (trial < 200): for execExpr in initList: if (trial==100): print "\nNow switching between nodes" if (trial < 100): nodeX = 0 else: nodeX = random.randint(0,lenNodes-1) ### print nodeX resultKey = "Result" + str(trial % period) execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey=resultKey, timeoutSecs=4) ### print "\nexecResult:", execResultInspect print "trial: #" + str(trial), min_value, execExpr h2o.verboseprint("min_value: ", min_value, "trial:", trial) self.assertEqual(float(min_value), float((trial % period) - 1), "exec constant assigns don't seem to be getting done and visible to Inspect") sys.stdout.write('.') sys.stdout.flush() ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") trial += 1
def test_GLM_from_import_hosts(self):
    if localhost:
        csvFilenameList = [
            'covtype.data',
        ]
    else:
        csvFilenameList = [
            'covtype200x.data',
            'covtype200x.data',
            'covtype.data',
            'covtype.data',
            'covtype20x.data',
            'covtype20x.data',
        ]

    # a browser window too, just because we can
    ## h2b.browseTheCloud()

    importFolderPath = "standard"
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        # can't pass lambda as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm['GLMModel']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()

        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write('.')
        sys.stdout.flush()
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult["destination_key"] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key, ":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! type = inspectGG["type"] if "unparsed" in type: print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now" print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG["response"] # dict ### rows = inspectGG['rows'] value_size_bytes = inspectGG["value_size_bytes"] model0 = glmGridResult["models"][0] alpha = model0["alpha"] area_under_curve = model0["area_under_curve"] error_0 = model0["error_0"] error_1 = model0["error_1"] key = model0["key"] print "best GLM model key:", key glm_lambda = model0["lambda"] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, key) h2o.verboseprint("GLMGrid inspectGLM:", h2o.dump_json(inspectGLM)) simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
def test_exec2_fast_locks(self):
    csvPathname = 'iris/iris2.csv'
    src_key = 'iris.csv'
    if not AVOID_BUG:
        # need the key name (pattern) to feed to parse
        (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
            src_key=src_key, timeoutSecs=10)
        # just as a reminder of what these return values look like
        print "importResult:", h2o.dump_json(importResult)
        print "importPattern:", h2o.dump_json(importPattern)

    y = 4
    for trial in range(1, 100):
        if AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these return values look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)

        # make sure each parse is a unique dest key (not in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
            delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
        execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

    # just show the jobs still going, if any. maybe none, because short (iris)
    a = h2o.nodes[0].jobs_admin()
    h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def doBoth(): h2o.verboseprint("Trial", trial) start = time.time() # make sure ntrees and max_depth are the same for both rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed1 = time.time() - start (totalError1, classErrorPctList1, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed2 = time.time() - start (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) print "Checking that results are similar (within 20%)" print "DRF2 then SpeeDRF" print "per-class variance is large..basically we can't check very well for this dataset" for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)): print "classErrorPctList[%s]:i %s %s" % (i, j, k) # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], # delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i) print "totalError: %s %s" % (totalError1, totalError2) self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF") print "elapsed: %s %s" % (elapsed1, elapsed2) self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5", timeoutSecs=30, pollTimeoutSecs=30): print "\nStarting GLM score of", csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "GLMScore in", (time.time() - start), "secs (python)" h2o.verboseprint(h2o.dump_json(glmScore)) ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, # the results # should be similar? # UPDATE: format for returning results is slightly different than normal GLM validation = glmScore['validation'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1) else: self.validations1 = copy.deepcopy(validation)
def tryThemAll(self,set,rows): for eolCase in range(len(self.eolDict)): eol = self.eolDict[eolCase] # change tokens must be first for tokenCase in range(len(self.tokenChangeDict)): newRows1 = self.changeTokens(rows,tokenCase) for sepCase in range(len(self.sepChangeDict)): newRows2 = self.changeSep(newRows1,sepCase) csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \ str(set) + "_" + \ str(eolCase) + "_" + \ str(tokenCase) + "_" + \ str(sepCase) + \ '.data' self.writeRows(csvPathname,newRows2,eol) parseResult = h2i.import_parse(path=csvPathname, schema='local', noPrint=not h2o.verbose) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] self.assertEqual(num_cols, 4, "Parsed wrong number of cols: %s" % num_cols) self.assertEqual(num_rows, 29, "Parsed wrong number of rows: %s" % num_rows) h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10, retryDelaySecs=1.0, noPrint=True) h2o.verboseprint("Set", set) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_import_file(self): timeoutSecs = 500 cAll = [ 'smalldata/jira/v-3.csv', 'smalldata/jira/v-3.csv', 'smalldata/jira/v-3.csv', 'smalldata/jira/v-3.csv', ] # pop open a browser on the cloud # h2b.browseTheCloud() for c in cAll: for i in range(10): # race between remove and import? csvPathname = h2o.find_file('smalldata/jira/v-3.csv') h2o.nodes[0].remove_all_keys() importResult = h2o.nodes[0].import_files(csvPathname, timeoutSecs=15) h2o.verboseprint(h2o.dump_json(importResult)) files = importResult['files'] keys = importResult['keys'] fails = importResult['fails'] dels = importResult['dels'] if len(files) == 0: raise Exception("empty files: %s after import" % files) if len(keys) == 0: raise Exception("empty keys: %s after import" % keys) if len(fails) != 0: raise Exception("non-empty fails: %s after import" % fails) if len(dels) != 0: raise Exception("non-empty dels: %s after import" % dels)
def exec_list(exprList, lenNodes, csvFilename, key2):
    h2e.exec_zero_list(zeroList)
    # start with trial = 1 because trial-1 is used to point to Result0 which must be initted
    trial = 1
    while (trial < 100):
        for exprTemplate in exprList:
            # do each expression at a random node, to facilitate key movement
            nodeX = random.randint(0, lenNodes-1)
            colX = random.randint(1, 54)
            # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
            row = str(random.randint(1, 400000))

            execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2)
            execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                resultKey="Result" + str(trial) + ".hex", timeoutSecs=60)

            eri0 = execResultInspect[0]
            eri1 = execResultInspect[1]
            columns = eri0.pop('cols')
            columnsDict = columns[0]
            print "\nexecResult columns[0]:", h2o.dump_json(columnsDict)
            print "\nexecResult [0]:", h2o.dump_json(eri0)
            print "\nexecResult [1] :", h2o.dump_json(eri1)

            min = columnsDict["min"]
            h2o.verboseprint("min: ", min, "trial:", trial)
            ### self.assertEqual(float(min), float(trial), "what can we check here")

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            h2o.check_sandbox_for_errors()
            print "Trial #", trial, "completed\n"
            trial += 1
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! for x in [10000]: # Have to split the string out to list for pipe shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) # the algorithm for creating the path and filename is hardwired in parity.pl..i.e csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! trial = 1 for x in xrange (1,10,1): sys.stdout.write('.') sys.stdout.flush() # just use one file for now csvFilename = "parity_128_4_" + str(10000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF parseResult = h2i.import_parse(path=csvPathname, schema='put') h2o.verboseprint("Trial", trial) h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480) # don't change tree count yet ## trees += 10 ### timeoutSecs += 2 trial += 1
def parseS3File(self, s3bucket, filename, **kwargs):
    start = time.time()
    parseKey = h2o_cmd.parseS3File(bucket=s3bucket, filename=filename, **kwargs)
    parse_time = time.time() - start
    h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
    parseKey['python_call_timer'] = parse_time
    return parseKey
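# Hypothetical usage sketch for parseS3File() above: read back the 'python_call_timer'
# field it adds. The bucket and file names are placeholders (not from the source) and
# this assumes an S3-capable cloud is already built.
def example_parseS3File_usage(self):
    parseKey = self.parseS3File(s3bucket='h2o-datasets', filename='covtype.data', timeoutSecs=300)
    print "py-S3 parse + python overhead:", parseKey['python_call_timer'], "secs"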
def simpleCheckGBMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! #type = inspectGG['type'] #if 'unparsed' in type: # print "Warning: GBM Grid result destination_key is unparsed, can't interpret. Ignoring for now" # print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG['response'] # dict ### rows = inspectGG['rows'] #value_size_bytes = inspectGG['value_size_bytes'] model0 = glmGridResult['models'][0] alpha = model0['alpha'] area_under_curve = model0['area_under_curve'] error_0 = model0['error_0'] error_1 = model0['error_1'] model_key = model0['key'] print "best GBM model key:", model_key glm_lambda = model0['lambda'] # now indirect to the GBM result/model that's first in the list (best) inspectGBM = h2o_cmd.runInspect(None, model_key) h2o.verboseprint("GBMGrid inspectGBM:", h2o.dump_json(inspectGBM)) simpleCheckGBM(self, inspectGBM, colX, allowFailWarning=allowFailWarning, **kwargs)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = ( "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR ) h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range(1, 3): sys.stdout.write(".") sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + "/" + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30 ) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds" print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def tryThemAll(self,set,rows): for eolCase in range(len(self.eolDict)): eol = self.eolDict[eolCase] # change tokens must be first for tokenCase in range(len(self.tokenChangeDict)): newRows1 = self.changeTokens(rows,tokenCase) for sepCase in range(len(self.sepChangeDict)): newRows2 = self.changeSep(newRows1,sepCase) csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \ str(set) + "_" + \ str(eolCase) + "_" + \ str(tokenCase) + "_" + \ str(sepCase) + \ '.data' self.writeRows(csvPathname,newRows2,eol) if "'" in self.tokenChangeDict[tokenCase][0]: single_quotes = 1 else: single_quotes = 0 parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes, noPrint=not h2o.verbose) if DO_RF: h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=30, retryDelaySecs=0.1) h2o.verboseprint("Set", set) sys.stdout.write('.') sys.stdout.flush()
def test_1ktrees_job_cancel_many_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print "kick off jobs, then cancel them" for trial in range (1,5): # random 0 or 1 delay delay = random.uniform(0,1) time.sleep(delay) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25) print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds' ### h2o_jobs.cancelAllJobs(timeoutSecs=10) h2o.check_sandbox_for_errors() # do one last good one rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def exec_expr_list_across_cols(lenNodes, exprList, keyX, minCol=0, maxCol=54, timeoutSecs=10, incrementingResult=True):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):
            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0, lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult:
                # the Result<col> pattern
                resultKey = "Result" + str(colX)
            else:
                # assume it's a re-assign to self
                resultKey = keyX

            # kbn
            # v1
            # execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, resultKey, timeoutSecs)
            # v2
            execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, None, timeoutSecs)
            print "\nexecResult:", h2o.dump_json(execResultInspect)

            execResultKey = execResultInspect[0]['key']
            # v2: Exec2 'apply' can have no key field? (null) maybe just use keyX then
            if execResultKey:
                resultInspect = h2o_cmd.runInspect(None, execResultKey)
            else:
                resultInspect = h2o_cmd.runInspect(None, keyX)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

            # min is keyword. shouldn't use.
            if incrementingResult:
                # a col will have a single min
                min_value = checkScalarResult(execResultInspect, resultKey)
                h2o.verboseprint("min_value: ", min_value, "col:", colX)
                print "min_value: ", min_value, "col:", colX
            else:
                min_value = None

            sys.stdout.write('.')
            sys.stdout.flush()

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if (h2o.check_sandbox_for_errors()):
                raise Exception("Found errors in sandbox stdout or stderr, on col #%s." % colX)

        print "Column #", colX, "completed\n"
        colResultList.append(min_value)

    return colResultList
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range (1,5): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds' print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! #type = inspectGG['type'] #if 'unparsed' in type: # print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now" # print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG['response'] # dict ### rows = inspectGG['rows'] #value_size_bytes = inspectGG['value_size_bytes'] # FIX! does error_0/1 only exist for binomial? for m, model in enumerate(glmGridResult['models']): alpha = model['alpha'] area_under_curve = model['area_under_curve'] # FIX! should check max error? error_0 = model['error_0'] error_1 = model['error_1'] model_key = model['key'] print "#%s GLM model key: %s" % (m, model_key) glm_lambda = model['lambda'] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key']) h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM)) g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs) return g
def test_1ktrees_job_cancel_many(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print "Kick off twenty, then cancel them all..there's a timeout on the wait after cancelling" for trial in range (1,20): h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() h2o_jobs.cancelAllJobs(timeoutSecs=10)
def test_rf_1ktrees_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [500]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        # use the filename generated above (x=500), so the parse isn't pointed at a missing file
        csvFilename = "parity_128_4_" + str(500) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
def test_exec_filter_slice2(self):
    timeoutSecs = 10
    csvFilename = "covtype.data"
    csvPathname = 'UCI/UCI-large/covtype/covtype.data'
    hex_key = 'c'
    parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
    print csvFilename, 'parse time:', parseResult['response']['time']
    print "Parse result['destination_key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

    for trial in range(10):
        print "Doing the execs in order, to feed filters into slices"
        nodeX = 0
        for exprTemplate in exprList:
            execExpr = h2e.fill_in_expr_template(exprTemplate, colX=0, n=0, row=1, keyX=hex_key, m=2)
            time.sleep(2)
            h2o.check_sandbox_for_errors()
            execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                resultKey="Result.hex", timeoutSecs=4)
            print "min_value:", min_value, "execExpr:", execExpr
            h2o.verboseprint("min: ", min_value, "trial:", trial)
def test_GLM2_model_key_unique(self):
    h2o.beta_features = True
    modelKeyDict = {}
    for trial in range(1, 5):
        csvPathname = 'iris/iris2.csv'
        start = time.time()
        # make sure each parse is a unique dest key (not in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        y = 4
        execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # h2o.py now sets destination_key for a fixed default model name,
        # we want h2o to create model names for this test, so use None here
        kwargs = {'destination_key': None, 'response': 4, 'family': 'gaussian'}
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, noPoll=True, **kwargs)
        print "GLM #%d" % trial, "started on ", csvPathname, 'took', time.time() - start, 'seconds'

        model_key = glmResult['destination_key']
        print "GLM model_key:", model_key
        if model_key in modelKeyDict:
            raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" % (trial, modelKeyDict[model_key]))
        modelKeyDict[model_key] = trial

    # just show the jobs still going, if any. maybe none, because short (iris)
    a = h2o.nodes[0].jobs_admin()
    h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def tryThemAll(self, set, rows, enumsOnly=False):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        if enumsOnly:
            tcd = self.tokenChangeDict
        else:
            tcd = self.tokenChangeDictEnumsOnly
        for tokenCase in range(len(tcd)):
            newRows1 = self.changeTokens(rows, tokenCase, tcd)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                # index into the same dict used for changeTokens above, so the quote check
                # can't go out of range when the two token dicts differ in length
                if "'" in tcd[tokenCase]:
                    single_quotes = 1
                else:
                    single_quotes = 0
                parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes,
                    noPrint=not h2o.verbose)
                h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10, retryDelaySecs=0.1,
                    noPrint=True, print_params=True)

    h2o.verboseprint("Set", set)
    h2o.check_sandbox_for_errors()
    sys.stdout.write('.')
    sys.stdout.flush()
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in xrange (1,3,1): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF key = h2o.nodes[0].put_file(csvPathname) parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex") h2o.verboseprint("Trial", trial) start = time.time() cmd.runRFOnly(parseKey=parseKey, trees=10000, depth=2, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'
def test_exec2_result_race(self):
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    # zero the list of Results using node[0]
    # FIX! is the zero list not being seen correctly? is it not initializing to non-zero?
    for execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=20)
        ### print "\nexecResult:", execResult

    trial = 0
    while (trial < 200):
        for execExpr in exprList:
            # for the first 100 trials: do each expression at node 0,
            # for the second 100 trials: do each expression at a random node, to facilitate key movement
            # FIX! there's some problem with the initList not taking if rotated amongst nodes?
            if (trial < 100):
                nodeX = 0
            else:
                nodeX = random.randint(0, lenNodes-1)

            resultKey = "Result.hex"
            execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                resultKey=resultKey, timeoutSecs=20)

            print min_value, execExpr
            h2o.verboseprint("min_value: ", min_value, "trial:", trial)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            trial += 1
def runRFOnly(node=None, parseKey=None, trees=5, timeoutSecs=20, retryDelaySecs=2, rfview=True, noise=None, noPrint=False, **kwargs): if not parseKey: raise Exception('No parsed key for RF specified') if not node: node = h2o.nodes[0] #! FIX! what else is in parseKey that we should check? h2o.verboseprint("runRFOnly parseKey:", parseKey) Key = parseKey['destination_key'] rf = node.random_forest(Key, trees, timeoutSecs, **kwargs) # FIX! check all of these somehow? # if we model_key was given to rf via **kwargs, remove it, since we're passing # model_key from rf. can't pass it in two places. (ok if it doesn't exist in kwargs) data_key = rf['data_key'] kwargs.pop('model_key',None) model_key = rf['model_key'] rfCloud = rf['response']['h2o'] # same thing. if we use random param generation and have ntree in kwargs, get rid of it. kwargs.pop('ntree',None) # this is important. it's the only accurate value for how many trees RF was asked for. ntree = rf['ntree'] # /ip:port of cloud (can't use h2o name) rfClass= rf['response_variable'] rfViewResult = None if rfview: rfViewResult = runRFView(node, data_key, model_key, ntree, timeoutSecs, retryDelaySecs, noise=noise, noPrint=noPrint, **kwargs) return rfViewResult
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30): print "\nStarting GLM score of", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs) y = "10" # save and restore the model h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1) # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not) h2o.nodes[0].load_model(path=modelPathname) start = time.time() glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False) print "GLMScore in", (time.time() - start), "secs (python)" h2o.verboseprint(h2o.dump_json(glmScore)) # compare this glm to the first one. since the files are replications, # the results # should be similar? # UPDATE: format for returning results is slightly different than normal GLM if self.glmScore1: h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1) else: self.glmScore1 = copy.deepcopy(glmScore)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = ["covtype.data"] else: csvFilenameList = [ "covtype200x.data", "covtype200x.data", "covtype.data", "covtype.data", "covtype20x.data", "covtype20x.data", ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = "/home/0xdiag/datasets/standard" validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # have to re-import each iteration now, since the source key # is removed and if we re-parse it, it's not there h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey["destination_key"]) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm["GLMModel"] coefficients = GLMModel["coefficients"] validationsList = GLMModel["validations"] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, "err", validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write(".") sys.stdout.flush()
def file_append(infile, outfile):
    h2o.verboseprint("\nAppending", infile, "to", outfile)
    start = time.time()
    in_file = open(infile, 'rb')
    out_file = open(outfile, 'a')
    out_file.write(in_file.read())
    in_file.close()
    out_file.close()
    h2o.verboseprint("\nAppend took", (time.time() - start), "secs")
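# Variant sketch (not part of the harness): file_append() above slurps the whole input
# with in_file.read(), which is fine for small synthetic files but not for very large
# appends. Copying in fixed-size chunks keeps memory flat; chunkBytes is an assumed knob.
def file_append_chunked(infile, outfile, chunkBytes=1024 * 1024):
    start = time.time()
    with open(infile, 'rb') as in_file, open(outfile, 'ab') as out_file:
        while True:
            chunk = in_file.read(chunkBytes)
            if not chunk:
                break
            out_file.write(chunk)
    h2o.verboseprint("\nChunked append took", (time.time() - start), "secs")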
def find_key(pattern=None):
    found = None
    kwargs = {'filter': pattern}
    storeViewResult = h2o.nodes[0].store_view(**kwargs)
    keys = storeViewResult['keys']
    if len(keys) == 0:
        return None
    if len(keys) > 1:
        h2o.verboseprint("Warning: multiple imported keys match the key pattern given, Using: %s" % keys[0]['key'])
    return keys[0]['key']
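# Hypothetical usage of find_key() above; the 'covtype' pattern and the .hex destination
# name are placeholders, not taken from the source.
def example_find_key_usage():
    key = find_key('covtype')
    if key is None:
        raise Exception("no imported key matching 'covtype' in StoreView")
    # parse whatever the StoreView match turned up
    parseKey = h2o.nodes[0].parse(key, 'covtype_from_store_view.hex')
    return parseKey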
def exec_expr(node, execExpr, resultKey="Result.hex", timeoutSecs=10, ignoreH2oError=False):
    start = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    # 5/14/13 removed escape_nan=0
    resultExec = h2o_cmd.runExecOnly(node, expression=execExpr,
        timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError)
    h2o.verboseprint(resultExec)
    h2o.verboseprint('exec took', time.time() - start, 'seconds')
    ### print 'exec took', time.time() - start, 'seconds'

    h2o.verboseprint("\nfirst look at the default Result key")
    # new offset=-1 to get the metadata?
    defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1)
    checkScalarResult(defaultInspectM1, "Result.hex")

    h2o.verboseprint("\nNow look at the assigned " + resultKey + " key")
    resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1)
    min_value = checkScalarResult(resultInspectM1, resultKey)

    return resultInspectM1, min_value
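# exec_expr() above leans on checkScalarResult(), which isn't shown in this file.
# A minimal sketch of what it plausibly does, assuming the Inspect response layout used
# elsewhere here (a 'cols' list with a per-column 'min'); the real helper may check more.
def checkScalarResult(inspect, resultKey):
    cols = inspect.get('cols', [])
    if not cols:
        raise Exception("%s has no columns in its Inspect result" % resultKey)
    # a scalar exec result shows up as a one-column key; its min is the value
    return cols[0]['min']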
def test_rf_1ktrees_job_cancel_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str( x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range(1, 5): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse( path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() # without rfview, do we get the 'first" rf json? rfv = h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, rfView=False, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "started on ", csvFilename, 'took', time.time( ) - start, 'seconds' # rf_model = rfv['drf_model'] # data_key = rf_model['_dataKey'] # model_key = rf_model['_key'] data_key = rfv['source']['_key'] model_key = rfv['destination_key'] print "model_key:", model_key # FIX! need to get more intelligent here a = h2o.nodes[0].jobs_admin() print "jobs_admin():", h2o.dump_json(a)
def check_cloud_and_setup_next():
    h2b.browseTheCloud()
    h2o.verify_cloud_size()
    h2o.check_sandbox_for_errors()
    print "Tearing down cloud of size", len(h2o.nodes)
    h2o.tear_down_cloud()
    # this will delete the flatfile in sandbox
    h2o.clean_sandbox()
    # wait to make sure no sticky ports or anything os-related
    # so let's expand the delay if larger number of jvms
    # 1 second per node seems good
    h2o.verboseprint("Waiting", node_count, "seconds to avoid OS sticky port problem")
    time.sleep(node_count)
def test_A_putfile_to_all_nodes(self):
    csvfile = h2o.find_file(file_to_put())
    origSize = h2o.get_file_size(csvfile)

    # Putfile to each node and check the returned size
    for node in h2o.nodes:
        sys.stdout.write('.')
        sys.stdout.flush()
        h2o.verboseprint("put_file:", csvfile, "node:", node, "origSize:", origSize)
        key = node.put_file(csvfile)
        resultSize = node.inspect(key)['value_size_bytes']
        self.assertEqual(origSize, resultSize)
def test_1ktrees_job_cancel_many_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str( x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print "kick off jobs, then cancel them" for trial in range(1, 5): # random 0 or 1 delay delay = random.uniform(0, 1) time.sleep(delay) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25) print "RF #", trial, "started on ", csvFilename, 'took', time.time( ) - start, 'seconds' ### h2o_jobs.cancelAllJobs(timeoutSecs=10) h2o.check_sandbox_for_errors() # do one last good one rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_rf_1ktrees_job_cancel_3_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str( x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range(1, 20): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse( path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=2, rfView=False, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "started on ", csvFilename, 'took', time.time( ) - start, 'seconds' # FIX! need to get more intelligent here time.sleep(1) a = h2o.nodes[0].jobs_admin() print "jobs_admin():", h2o.dump_json(a) # "destination_key": "pytest_model", # FIX! using 'key': 'pytest_model" with no time delay causes a failure time.sleep(1) jobsList = a['jobs'] for j in jobsList: b = h2o.nodes[0].jobs_cancel(key=j['key']) print "jobs_cancel():", h2o.dump_json(b)
def tryThemAll(self, set, rows): for eolCase in range(len(self.eolDict)): eol = self.eolDict[eolCase] # change tokens must be first for tokenCase in range(len(self.tokenChangeDict)): newRows1 = self.changeTokens(rows, tokenCase) for sepCase in range(len(self.sepChangeDict)): newRows2 = self.changeSep(newRows1, sepCase) csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \ str(set) + "_" + \ str(eolCase) + "_" + \ str(tokenCase) + "_" + \ str(sepCase) + \ '.data' self.writeRows(csvPathname, newRows2, eol) # use the single_quotes param if single quote in the # tokenCase (creates token wrapper) if "'" in self.tokenChangeDict[tokenCase][0]: single_quotes = 1 else: single_quotes = 0 parseResult = h2i.import_parse(path=csvPathname, schema='local', single_quotes=single_quotes, noPrint=not h2o.verbose) inspect = h2o_cmd.runInspect( key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] self.assertEqual( num_cols, 4, "Parsed wrong number of cols: %s" % num_cols) self.assertEqual( num_rows, 29, "Parsed wrong number of rows: %s" % num_rows) h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10, retryDelaySecs=1.0, noPrint=True) h2o.verboseprint("Set", set) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def changeTokens(self, rows, tokenCase):
    [cOpen, cClose] = self.tokenChangeDict[tokenCase]
    newRows = []
    for r in rows:
        # don't quote lines that start with #
        # can quote lines start with some spaces or tabs? maybe
        comment = re.match(r'^[ \t]*#', r)
        empty = re.match(r'^$', r)
        if not (comment or empty):
            r = re.sub('^', cOpen, r)
            r = re.sub('\|', cClose + '|' + cOpen, r)
            r = re.sub('$', cClose, r)
        h2o.verboseprint(r)
        newRows.append(r)
    return newRows
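# Illustration of changeTokens() above, assuming a tokenChangeDict entry of ['"', '"']
# (the dict contents aren't shown in this file): a data row like
#     1|0|1|1
# comes back as
#     "1"|"0"|"1"|"1"
# while '#'-comment lines and empty lines pass through unchanged.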
def runRFOnly(node=None, parseKey=None, trees=5, timeoutSecs=20, retryDelaySecs=2, rfView=True, noise=None, noPrint=False, **kwargs): if not parseKey: raise Exception('No parsed key for RF specified') if not node: node = h2o.nodes[0] #! FIX! what else is in parseKey that we should check? h2o.verboseprint("runRFOnly parseKey:", parseKey) Key = parseKey['destination_key'] rf = node.random_forest(Key, trees, timeoutSecs, **kwargs) if h2o.beta_features and rfView == False: # just return for now return rf # FIX! check all of these somehow? # if we model_key was given to rf via **kwargs, remove it, since we're passing # model_key from rf. can't pass it in two places. (ok if it doesn't exist in kwargs) data_key = rf['data_key'] kwargs.pop('model_key', None) model_key = rf['model_key'] rfCloud = rf['response']['h2o'] # same thing. if we use random param generation and have ntree in kwargs, get rid of it. kwargs.pop('ntree', None) # this is important. it's the only accurate value for how many trees RF was asked for. ntree = rf['ntree'] response_variable = rf['response_variable'] if rfView: # ugly..we apparently pass/use response_variable in RFView, gets passed thru kwargs here # print kwargs['response_variable'] rfViewResult = runRFView(node, data_key, model_key, ntree, timeoutSecs, retryDelaySecs, noise=noise, noPrint=noPrint, **kwargs) return rfViewResult else: return rf
def scoreRF(scoreParseKey, trainResult, **kwargs):
    # Run validation on dataset
    rfModelKey = trainResult['model_key']
    ntree = trainResult['ntree']

    start = time.time()
    data_key = scoreParseKey['destination_key']
    scoreResult = h2o_cmd.runRFView(None, data_key, rfModelKey, ntree, **kwargs)
    rftime = time.time() - start

    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
def testCloud(self):
    base_port = 54300
    ports_per_node = 2
    for tryNodes in range(2, 8):
        sys.stdout.write('.')
        sys.stdout.flush()

        start = time.time()
        h2o.build_cloud(use_this_ip_addr="127.0.0.1", base_port=base_port,
            node_count=tryNodes, timeoutSecs=30, retryDelaySecs=2, java_heap_GB=1)
        print "Build cloud of %d in %d secs" % (tryNodes, (time.time() - start))

        h2o.verboseprint(h2o.nodes)
        h2o.verify_cloud_size()
        h2o.tear_down_cloud(h2o.nodes)
def wait_for_live_port(ip, port, retries=3):
    h2o.verboseprint("Waiting for {0}:{1} {2} times...".format(ip, port, retries))
    if not port_live(ip, port):
        count = 0
        while count < retries:
            if port_live(ip, port):
                count += 1
            else:
                count = 0
            time.sleep(1)
            dot()
    if not port_live(ip, port):
        raise Exception("[h2o_cmd] Error waiting for {0}:{1} {2} times...".format(ip, port, retries))
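# wait_for_live_port() above assumes a port_live() helper that isn't shown here.
# A minimal sketch of such a check, using a short TCP connect; the real helper may differ.
import socket

def port_live(ip, port):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(0.5)
    try:
        s.connect((ip, port))
        return True
    except socket.error:
        return False
    finally:
        s.close()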
def parseFile(self, bucket, pathname, timeoutSecs, header, **kwargs):
    # this can get redirected
    if USE_LOCAL:
        schema = None
    else:
        schema = 's3n'

    start = time.time()
    # NOTE: the schema computed above and the timeoutSecs/header args aren't passed through here;
    # the parse is currently hardwired to schema='local' with a 180 sec timeout
    parseResult = h2i.import_parse(bucket=bucket, path=pathname, schema='local', timeoutSecs=180)
    parse_time = time.time() - start
    h2o.verboseprint("parse took {0} sec".format(parse_time))
    parseResult['python_call_timer'] = parse_time
    return parseResult
def test_Cloud(self):
    # FIX! weird timeout H2O exceptions with >8? maybe shouldn't
    # don't know if we care
    base_port = 54300
    ports_per_node = 2
    for tryNodes in range(2, 17):
        h2o.verboseprint("Trying cloud of", tryNodes)
        sys.stdout.write('.')
        sys.stdout.flush()

        start = time.time()
        h2o.build_cloud(tryNodes, base_port=base_port, retryDelaySecs=2,
            timeoutSecs=max(30, 10*tryNodes), java_heap_GB=1)
        print "Built cloud of %d in %d s" % (tryNodes, (time.time() - start))

        h2o.verify_cloud_size()
        h2o.tear_down_cloud()
def test_F_no_mc_loop(self):
    print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
    allAcceptIptables()
    multicastDropReceiveIptables()
    showIptables()

    csvPathname = h2o.find_file('smalldata/poker/poker1000')
    for x in range(1, 5):
        h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
        h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
        h2o.tear_down_cloud()
        h2o.verboseprint("Waiting", nodes_per_host, "seconds to avoid OS sticky port problem")
        time.sleep(nodes_per_host)
        print "Trial", x
        sys.stdout.write('.')
        sys.stdout.flush()
def test_rand_inspect(self): ### h2b.browseTheCloud() csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/'+ csvFilename) print "\n" + csvPathname parseKey = h2o_cmd.parseFile(None, csvPathname, key=csvFilename, timeoutSecs=10) destination_key = parseKey['destination_key'] print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", destination_key def inspect_and_check(nodeX,destination_key,offset,view,inspect=None): inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view) # FIX! get min/max/mean/variance for a col too? constantNames = [ 'num_cols', 'num_rows', ] if inspect is not None: for i in constantNames: self.assertEqual(inspect[i], inspectNew[i]) return inspectNew # going to use this to compare against future. num_rows/num_cols should always # be the same, regardless of the view. just a coarse sanity check origInspect = inspect_and_check(0,destination_key,0,1) h2o.verboseprint(h2o.dump_json(origInspect)) num_rows = origInspect['num_rows'] num_cols = origInspect['num_cols'] lenNodes = len(h2o.nodes) for i in range (1000): # we want to use the boundary conditions, so have two level of random choices offset = good_choices(num_rows) view = good_choices(num_cols) # randomize the node used nodeX = random.randint(0,lenNodes-1) print "nodeX:", nodeX, "offset:", offset, "view:", view inspect_and_check(nodeX,destination_key,offset,view,origInspect) # do it again, once in a while r = random.randint(0,10) if (r==0): inspect_and_check(nodeX,destination_key,offset,view,origInspect)
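# test_rand_inspect() above relies on a good_choices() helper that isn't shown here.
# A sketch of the idea the test describes ("use the boundary conditions"): usually return
# a random in-range value, sometimes an edge value. The real helper may differ.
def good_choices(n):
    if random.randint(0, 4) == 0:
        # hit the boundaries now and then
        return random.choice([0, 1, max(0, n - 1), n])
    return random.randint(0, n)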
def test_F_no_mc_loop(self):
    print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
    allAcceptIptables()
    multicastDropReceiveIptables()
    showIptables()

    for x in range(1, 5):
        h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=50, timeoutSecs=10)
        h2o.tear_down_cloud()
        h2o.verboseprint("Waiting", nodes_per_host, "seconds to avoid OS sticky port problem")
        time.sleep(nodes_per_host)
        print "Trial", x
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in xrange(1, 20, 1): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF key = h2o.nodes[0].put_file(csvPathname) parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex") h2o.verboseprint("Trial", trial) start = time.time() # rfview=False used to inhibit the rfview completion h2o_cmd.runRFOnly(parseKey=parseKey, trees=trial, depth=2, rfview=False, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "started on ", csvFilename, 'took', time.time( ) - start, 'seconds' # FIX! need to get more intelligent here time.sleep(1) a = h2o.nodes[0].jobs_admin() print "jobs_admin():", h2o.dump_json(a) # "destination_key": "pytest_model", # FIX! using 'key': 'pytest_model" with no time delay causes a failure time.sleep(1) jobsList = a['jobs'] for j in jobsList: b = h2o.nodes[0].jobs_cancel(key=j['key']) print "jobs_cancel():", h2o.dump_json(b)
def simpleCheckKMeans(self, kmeans, **kwargs): ### print h2o.dump_json(kmeans) warnings = None if 'warnings' in kmeans: warnings = kmeans['warnings'] # catch the 'Failed to converge" for now x = re.compile("[Ff]ailed") for w in warnings: print "\nwarning:", w if re.search(x, w): raise Exception(w) # Check other things in the json response dictionary 'kmeans' here if h2o.beta_features: destination_key = kmeans['model']['_key'] # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame # can't use inspect on a model key? now? kmeansResult = kmeans else: destination_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=destination_key) if h2o.beta_features: model = kmeansResult['model'] clusters = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] iterations = model["iterations"] normalized = model["normalized"] max_iter = model["max_iter"] else: h2o.verboseprint('kmeans result:', h2o.dump_json(kmeansResult)) model = kmeansResult['KMeansModel'] clusters = model['clusters'] error = model["error"] for i, c in enumerate(clusters): for n in c: if math.isnan(float(n)): raise Exception("center", i, "has NaN:", n, "center:", c) # shouldn't have any errors h2o.check_sandbox_for_errors() return warnings
def test(n, tries=None): rfView = n.random_forest_view(data_key, model_key, timeoutSecs, noise=noise, **kwargs) status = rfView['response']['status'] numberBuilt = rfView['trees']['number_built'] if status == 'done': if numberBuilt != ntree: raise Exception("RFview done but number_built!=ntree: %s %s", numberBuilt, ntree) return True if status != 'poll': raise Exception('Unexpected status: ' + status) progress = rfView['response']['progress'] progressTotal = rfView['response']['progress_total'] # want to double check all this because it's new # and we had problems with races/doneness before errorInResponse = \ numberBuilt<0 or ntree<0 or numberBuilt>ntree or \ progress<0 or progressTotal<0 or progress>progressTotal or \ ntree!=rfView['ntree'] ## progressTotal!=ntree or # rfView better always agree with what RF ntree was if errorInResponse: raise Exception("\nBad values in response during RFView polling.\n" + "progress: %s, progressTotal: %s, ntree: %s, numberBuilt: %s, status: %s" % \ (progress, progressTotal, ntree, numberBuilt, status)) # don't print the useless first poll. # UPDATE: don't look for done. look for not poll was missing completion when looking for done if (status == 'poll'): if numberBuilt == 0: h2o.verboseprint(".") else: h2o.verboseprint( "\nRFView polling #", tries, "Status: %s. %s trees done of %s desired" % (status, numberBuilt, ntree)) return (status != 'poll')
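# Hedged sketch of how a poll callback like test() above is typically driven; the harness
# has its own retry helper, so the names and timeouts here are illustrative only.
def poll_rfview_until_done(node, test_fn, timeoutSecs=60, retryDelaySecs=2):
    start = time.time()
    tries = 0
    while time.time() - start < timeoutSecs:
        tries += 1
        # test_fn returns True once the RFView status is no longer 'poll'
        if test_fn(node, tries=tries):
            return True
        time.sleep(retryDelaySecs)
    raise Exception("RFView didn't complete within %s secs" % timeoutSecs)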
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range(1, 5): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, rfView=False, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "started on ", csvFilename, 'took', time.time( ) - start, 'seconds' model_key = rfResult['model_key'] print "model_key:", model_key # FIX! need to get more intelligent here a = h2o.nodes[0].jobs_admin() print "jobs_admin():", h2o.dump_json(a) # this is the wrong key to ancel with # "destination_key": "pytest_model", print "cancelling with a bad key" b = h2o.nodes[0].jobs_cancel(key=model_key) print "jobs_cancel():", h2o.dump_json(b)
def trainRF(trainParseResult, scoreParseResult=None, **kwargs):
    # Train RF
    start = time.time()
    if scoreParseResult:
        trainResult = h2o_cmd.runRF(
            parseResult=trainParseResult,
            validation=scoreParseResult['destination_key'],
            **kwargs)
    else:
        trainResult = h2o_cmd.runRF(parseResult=trainParseResult, **kwargs)
    rftime = time.time() - start

    h2o.verboseprint("RF train results: ", trainResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    trainResult['python_call_timer'] = rftime
    return trainResult
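# Hypothetical end-to-end use of trainRF() above together with scoreRF(); the dataset
# path and RF kwargs are placeholders, not taken from the source.
def example_train_then_score():
    trainParse = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
    scoreParse = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
    trainResult = trainRF(trainParse, scoreParse, trees=50, timeoutSecs=60)
    scoreResult = scoreRF(scoreParse, trainResult, timeoutSecs=60)
    print "train secs:", trainResult['python_call_timer'], "score secs:", scoreResult['python_call_timer']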
def pollWaitJobs(pattern=None, timeoutSecs=30, pollTimeoutSecs=30, retryDelaySecs=5, benchmarkLogging=None): anyBusy = True waitTime = 0 while (anyBusy): # timeout checking has to move in here now! just count loops anyBusy = False a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs) ## print "jobs_admin():", h2o.dump_json(a) jobs = a['jobs'] patternKeys = [] for j in jobs: ### h2o.verboseprint(j) # save the destination keys for any GLMModel in progress if pattern and pattern in j['destination_key']: patternKeys.append(j['destination_key']) if j['end_time'] == '': anyBusy = True h2o.verboseprint("waiting", waitTime, "secs, still not done - ",\ "destination_key:", j['destination_key'], \ "progress:", j['progress'], \ "cancelled:", j['cancelled'],\ "end_time:", j['end_time']) ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs") if (anyBusy and waitTime > timeoutSecs): print h2o.dump_json(jobs) raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds") sys.stdout.write('.') sys.stdout.flush() time.sleep(retryDelaySecs) waitTime += retryDelaySecs # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack) # test would pass ['cpu','disk','jstack'] kind of list if benchmarkLogging: h2o.cloudPerfH2O.get_log_save(benchmarkLogging) return patternKeys
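# Hypothetical usage of pollWaitJobs() above after kicking off no-poll jobs; the
# 'GLMModel' pattern and the logging list are placeholders, not from the source.
def example_pollWaitJobs_usage():
    patternKeys = pollWaitJobs(pattern='GLMModel', timeoutSecs=300, retryDelaySecs=5,
        benchmarkLogging=['cpu', 'disk'])
    print "GLM destination keys seen while polling:", patternKeys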
def test_exec2_fast_locks_overlap(self): csvPathname = 'iris/iris2.csv' src_key='iris.csv' if not AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) y = 4 lastHexKey = None for trial in range (1, 100): if AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) # make sure each parse is unique dest key (not in use) hex_key = "iris2_" + str(trial) + ".hex" # what if we kicked off another parse without waiting for it? I think the src key gets locked # so we'd get lock issues on the src_key parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10) # wait until iteration 2, when lastHexKey is available, so you can operate on that if lastHexKey: execExpr="%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) lastHexKey = hex_key # since we are using the same source file, and potentially re-uploading if AVOID_BUG # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to # use it next iteration h2o_jobs.pollWaitJobs(timeoutSecs=10) # just show the jobs still going. Shouldn't be any a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    # "grid": {
    #     "destination_keys": [
    #         "GLMGridResults__8222a49156af52532a34fb3ce4304308_0",
    #         "GLMGridResults__8222a49156af52532a34fb3ce4304308_1",
    #         "GLMGridResults__8222a49156af52532a34fb3ce4304308_2"
    #     ]
    # },
    destination_key = glmGridResult['grid']['destination_keys'][0]
    inspectGG = h2o.nodes[0].glm_view(destination_key)
    models = inspectGG['glm_model']['submodels']
    h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0]))
    g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs)

    # just to get some save_model testing
    for i, m in enumerate(glmGridResult['grid']['destination_keys']):
        print "Saving model", m, "to model" + str(i)
        h2o.nodes[0].save_model(model=m, path='model' + str(i), force=1)

    return g
def test_exec_assign(self):
    ### h2b.browseTheCloud()
    trial = 0
    while (trial < 200):
        for execExpr in initList:
            # always a one node stream. shouldn't fail
            nodeX = 0
            resultKey = "Result" + str(trial % period)
            execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                resultKey=resultKey, timeoutSecs=4)

            print "trial: #" + str(trial), min_value, execExpr
            h2o.verboseprint("min: ", min_value, "trial:", trial)
            self.assertEqual(float(min_value), float((trial % period) - 1),
                "exec constant assigns don't seem to be getting done and visible to Inspect")

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            trial += 1
def parseFile(self, s3bucket, localbucket, pathname, timeoutSecs, header, **kwargs):
    if USE_LOCAL:
        schema = "/"
        bucket = localbucket
        URI = schema + bucket + pathname
        importResult = h2o.nodes[0].import_files(URI)
    else:
        schema = "s3n://"
        bucket = s3bucket
        URI = schema + bucket + pathname
        importResult = h2o.nodes[0].import_hdfs(URI)

    start = time.time()
    # pattern match, so nfs and s3n case is the same
    parseKey = h2o.nodes[0].parse("*" + pathname, timeoutSecs=timeoutSecs, header=header)
    parse_time = time.time() - start
    h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
    parseKey['python_call_timer'] = parse_time
    return parseKey