def completionHack(jobKey, modelKey):
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
    print "FIX! how do we get the GLM result"
    # hack it!
    params = {'job_key': jobKey, 'destination_key': modelKey}
    a = h2o.nodes[0].completion_redirect(jsonRequest="2/GLMProgressPage2.json", params=params)
    print "GLM result from completion_redirect:", h2o.dump_json(a)
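# A minimal usage sketch (assumption, not part of the original tests): completionHack is
# meant to be called right after a GLM2 job is dispatched with noPoll=True, passing the
# job_key from the initial response and the destination_key the caller chose, as
# test_GLM_prostate does inline further below. The parseResult and kwargs are illustrative.
def example_completionHack_usage(parseResult):
    kwargs = {'response': 'CAPSULE', 'family': 'binomial', 'n_folds': 0, 'destination_key': 'GLM_example'}
    glmFirstResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=60, noPoll=True, **kwargs)
    completionHack(glmFirstResult['job_key'], kwargs['destination_key'])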
def test_rf_big1_nopoll(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

    rfViewInitial = []
    # dispatch multiple jobs back to back
    for jobDispatch in range(1):
        start = time.time()
        kwargs = {}
        # FIX! what model keys do these get?
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model" + str(jobDispatch),
            timeoutSecs=300, noPoll=True, **kwargs)
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=30, pollTimeoutSecs=120, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected)
    for rfView in rfViewInitial:
        print "Checking completed job, with no polling:", rfView
        a = h2o.nodes[0].poll_url(rfView['response'], noPoll=True)
        h2o_rf.simpleCheckRFView(None, a)
def test_RF_poker100(self):
    MISSING_RESPONSE = False
    DO_MODEL_INSPECT = False
    trees = ",".join(map(str, range(10, 50, 2)))
    timeoutSecs = 20
    csvPathname = 'poker/poker100'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')

    jobs = []
    for i in range(1):
        if MISSING_RESPONSE:
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        else:
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs)
        job_key = rfResult['job_key']
        model_key = rfResult['destination_key']
        jobs.append((job_key, model_key))

    h2o_jobs.pollWaitJobs(timeoutSecs=300)

    for job_key, model_key in jobs:
        gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
        print "speedrf grid result for %s:" % model_key, h2o.dump_json(gridResult)
        print "speedrf grid result errors:", gridResult['prediction_errors']

        for i, j in enumerate(gridResult['jobs']):
            if DO_MODEL_INSPECT:
                print "\nspeedrf result %s:" % i, h2o.dump_json(h2o_cmd.runInspect(key=j['destination_key']))
            else:
                model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                print "model:", h2o.dump_json(model)
def test_rf_big1_nopoll_fvec(self):
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=30, schema='put')

    rfViewInitial = []
    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)

        kwargs['ntrees'] = 1
        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            kwargs['ntrees'] += 1

        kwargs['seed'] = random.randint(0, sys.maxint)

        # FIX! what model keys do these get?
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        rfView = h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected)
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['_dataKey']
        model_key = rfView['_key']
        ntree = rfView['ntree']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
        if first is None: # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
def test_GLM2grid_covtype_many(self):
    h2o.beta_features = True
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10)
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8
    y = "54"
    kwargs = {
        'response': y,
        'family': 'gaussian',
        'n_folds': 2,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3,
        'lambda': '0,0.5,0.8',
        'alpha': '0,1e-8,1e-4',
    }

    start = time.time()
    jobs = []
    totalGLMGridJobs = 0

    for i in range(3):
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
        # print "glmResult:", h2o.dump_json(glmResult)
        # assuming it doesn't complete right away, this is the first response
        # it differs for the last response
        job_key = glmResult['job_key']
        grid_key = glmResult['destination_key']
        jobs.append((job_key, grid_key))
        totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
                src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False)

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start

    # 2/GLMGridView.html?grid_key=asd
    # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
    # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
    for job_key, grid_key in jobs:
        gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
        h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

    print "All GLMGrid jobs completed in", elapsed, "seconds."
    print "totalGLMGridJobs:", totalGLMGridJobs
def test_GLM_big1_nopoll(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    y = "106"
    x = ""
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

    glmInitial = []
    # dispatch multiple jobs back to back
    start = time.time()
    for jobDispatch in range(40):
        kwargs = {'x': x, 'y': y, 'n_folds': 1}
        # FIX! what model keys do these get?
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, noPoll=True, **kwargs)
        glmInitial.append(glm)
        print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    timeoutSecs = 200
    h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=timeoutSecs, retryDelaySecs=10)
    elapsed = time.time() - start
    print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected)
    for glm in glmInitial:
        print "Checking completed job, with no polling:", glm
        a = h2o.nodes[0].poll_url(glm['response'], noPoll=True)
        h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
def test_sequential_diff_dest(self):
    csvPathname = 'poker/poker-hand-testing.data'
    for trials in range(30):
        hex_key = csvPathname + "_" + str(trials)
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=120, noPoll=False, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
def test_GLM_big1_nopoll(self):
    csvPathname = 'hhp_107_01.data.gz'
    print "\n" + csvPathname
    y = "106"
    x = ""
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

    glmInitial = []
    # dispatch multiple jobs back to back
    start = time.time()
    for jobDispatch in range(10):
        kwargs = {'x': x, 'y': y, 'n_folds': 1}
        # FIX! what model keys do these get?
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
        glmInitial.append(glm)
        print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    timeoutSecs = 200
    h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10)
    elapsed = time.time() - start
    print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected)
    for glm in glmInitial:
        print "Checking completed job, with no polling using initial response:", h2o.dump_json(glm)
        a = h2o.nodes[0].poll_url(glm, noPoll=True)
        h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
def test_parse_airline_multi_hdfs_many(self):
    # default
    csvFilename = "hex_10"
    csvFilePattern = '*' # all files in the folder

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
            use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True

        timeoutSecs = 500
        importFolderPath = "datasets/airlines_multi"
        csvPathname = importFolderPath + "/" + csvFilePattern
        parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

        for trial in range(TRIAL_MAX):
            # each parse now just does one
            csvFilePattern = "*%s.csv" % trial
            # if we want multifile
            # csvFilePattern = "*"
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()

            # print "Don't wait for completion. Just load things up!"
            print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
            print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
            parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_RF_poker100(self):
    MISSING_RESPONSE = True
    trees = ",".join(map(str, range(1, 4)))
    trees = "1,2"
    timeoutSecs = 20
    csvPathname = 'poker/poker100'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')

    jobs = []
    for i in range(1):
        if MISSING_RESPONSE:
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        else:
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs)
        job_key = rfResult['job_key']
        model_key = rfResult['destination_key']
        jobs.append((job_key, model_key))

    h2o_jobs.pollWaitJobs(timeoutSecs=300)

    for job_key, model_key in jobs:
        gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
        # h2o_rf.showRFGridResults(GBMResult, 15)
        print "speedrf grid result for %s:" % model_key, h2o.dump_json(gridResult)
def test_parse_unlock(self):
    importFolderPath = "mnist"
    csvFilelist = [
        ("mnist_training.csv.gz", 600),
        ("mnist_testing.csv.gz", 600),
    ]

    trial = 0
    allDelta = []
    for (csvFilename, timeoutSecs) in csvFilelist:
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        # can't import the dir again while the src file is being parsed
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename,
            schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs,
            intermediateResults=DO_INTERMEDIATE_RESULTS, noPoll=True)
        trial += 1

    # can't unlock while jobs are running
    # Session WARN: java.lang.UnsupportedOperationException: Cannot unlock all keys since locking jobs are still running.
    h2j.pollWaitJobs()
    h2o.n0.unlock()
def test_B_kmeans_benign(self):
    h2o.beta_features = True
    csvPathname = "logreg"
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/" + csvFilename,
        schema='local', hex_key=csvFilename + ".hex", noPoll=True, doSummary=False)
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

    expected = [
        ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476),
        ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667),
        ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595),
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    # loop, to see if we get same centers
    for trial in range(2):
        params = {
            'k': 3,
            'initialization': 'Furthest',
            'ignored_cols': None,
            'destination_key': 'benign_k.hex',
            'max_iter': 50,
            'seed': 265211114317615310,
        }
        kwargs = params.copy()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_GLM_big1_nopoll(self):
    csvPathname = 'hhp_107_01.data.gz'
    print "\n" + csvPathname
    y = "106"
    x = ""
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

    glmInitial = []
    # dispatch multiple jobs back to back
    start = time.time()
    for jobDispatch in range(10):
        kwargs = {'x': x, 'y': y, 'n_folds': 1}
        # FIX! what model keys do these get?
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
        glmInitial.append(glm)
        print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    timeoutSecs = 200
    h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10)
    elapsed = time.time() - start
    print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected)
    for glm in glmInitial:
        print "Checking completed job, with no polling using initial response:", h2o.dump_json(glm)
        a = h2o.nodes[0].poll_url(glm, noPoll=True)
        h2o_glm.simpleCheckGLM(self, a, 'C58', **kwargs)
def test_C_kmeans_prostate(self):
    h2o.beta_features = True
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path='logreg/' + csvFilename,
        schema='local', hex_key=csvFilename + ".hex")
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

    # loop, to see if we get same centers
    expected = [
        ([55.63235294117647], 68, 667.8088235294117),
        ([63.93984962406015], 133, 611.5187969924812),
        ([71.55307262569832], 179, 1474.2458100558654),
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    for trial in range(2):
        params = {
            'k': 3,
            'initialization': 'Furthest',
            'ignored_cols': "ID",
            'destination_key': 'prostate_k.hex',
            'max_iter': 100,
            'seed': 265211114317615310,
        }
        kwargs = params.copy()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_GBM_parseTrain(self):
    bucket = 'home-0xdiag-datasets'
    files = [('standard', 'covtype.data', 'covtype.hex', 1800, 54)]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename,
            hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 1,
            'max_depth': 1,
            'min_rows': 1,
            'response': response,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        # noPoll -> False when GBM finished
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
        h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
        # print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
        #     "%f pct. of timeout" % (GBMResult['python_%timeout'])
        GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
        print GBMView['gbm_model']['errs']
def test_GLM2grid_covtype_many(self):
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=20)
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8
    y = "54"
    kwargs = {
        'response': y,
        'family': 'gaussian',
        'n_folds': 2,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3,
        'lambda': '0,0.5,0.8',
        'alpha': '0,1e-8,1e-4',
    }

    start = time.time()
    jobs = []
    totalGLMGridJobs = 0

    for i in range(3):
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
        # print "glmResult:", h2o.dump_json(glmResult)
        # assuming it doesn't complete right away, this is the first response
        # it differs for the last response
        job_key = glmResult['job_key']
        grid_key = glmResult['destination_key']
        jobs.append((job_key, grid_key))
        totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
                src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False)

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start

    # 2/GLMGridView.html?grid_key=asd
    # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
    # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
    for job_key, grid_key in jobs:
        gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
        h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

    print "All GLMGrid jobs completed in", elapsed, "seconds."
    print "totalGLMGridJobs:", totalGLMGridJobs
def test_overlap_same_dest_nopoll(self):
    for num_trials in range(30):
        csvPathname = 'poker/poker-hand-testing.data'
        src_key = csvPathname
        hex_key = csvPathname + '.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
            src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=True, doSummary=False)

    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
def test_GBMGrid_basic_prostate(self):
    h2o.beta_features = True
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON']

    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': .1,
        'ntrees': '4,100',
        'max_depth': 8,
        'min_rows': 1,
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }

    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
    if not DO_POLL:
        print "\nfirst GBMResult:", h2o.dump_json(GBMResult)

        statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        num_cpus = statMean['num_cpus'],
        my_cpu_pct = statMean['my_cpu_%'],
        sys_cpu_pct = statMean['sys_cpu_%'],
        system_load = statMean['system_load']

        # shouldn't need this?
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    # FIX! after gbm grid, have to get the model keys from the json?
    gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)
    print h2o.dump_json(gbmGridView)

    if 1==0:
        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_parse_nflx_loop_hdfs_fvec(self):
    h2o.beta_features = True
    print "Using the -.gz files from hdfs"
    # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz

    # default
    csvFilename = "hex_10"
    csvFilePattern = '*' # all files in the folder

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        else:
            h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True

        timeoutSecs = 500
        importFolderPath = "datasets/airlines_multi"
        csvPathname = importFolderPath + "/" + csvFilePattern
        parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

        for trial in range(TRIAL_MAX):
            # each parse now just does one
            csvFilePattern = "*%s.csv" % trial
            # if we want multifile
            # csvFilePattern = "*"
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()

            # print "Don't wait for completion. Just load things up!"
            print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
            print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
            parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_GBM_parseTrain(self):
    # folderpath, filename, keyname, timeout
    bucket = 'home-0xdiag-datasets'
    files = [('mnist', 'mnist_training.csv.gz', 'mnistsmalltrain.hex', 1800, 0)]

    grid = [[1, 10, 100, 1000], [0.0, 0.01, 0.001, 0.0001, 1], [1, 2], [1, 10, 100]]
    grid = list(itertools.product(*grid))
    grid = random.sample(grid, 10)  # don't do all 120, but get a random sample

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename,
            schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "parse result:", parseResult['destination_key']

        csv_header = ('nJVMs', 'java_heap_GB', 'dataset', 'ntrees', 'max_depth', 'learn_rate', 'min_rows', 'trainTime')
        for ntree, learn_rate, max_depth, min_rows in grid:
            if not os.path.exists('gbm_grid.csv'):
                output = open('gbm_grid.csv', 'w')
                output.write(','.join(csv_header) + '\n')
            else:
                output = open('gbm_grid.csv', 'a')
            csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                dialect='excel', extrasaction='ignore', delimiter=',')

            java_heap_GB = h2o.nodes[0].java_heap_GB
            params = {
                'destination_key': 'GBMKEY',
                'learn_rate': learn_rate,
                'ntrees': ntree,
                'max_depth': max_depth,
                'min_rows': min_rows,
                'response': response,
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True

            # noPoll -> False when GBM finished
            start = time.time()
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=3600, pollTimeoutSecs=3600)
            # print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
            #     "%f pct. of timeout" % (GBMResult['python_%timeout'])
            # print GBMResult
            GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            print GBMView['gbm_model']['errs']
            elapsed = time.time() - start

            row = {'nJVMs': len(h2o.nodes), 'java_heap_GB': java_heap_GB, 'dataset': 'mnist_training.csv.gz',
                'learn_rate': learn_rate, 'ntrees': ntree, 'max_depth': max_depth,
                'min_rows': min_rows, 'trainTime': elapsed}
            print row
            csvWrt.writerow(row)
def test_GBMGrid_basic_many(self):
    h2o.beta_features = True
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON']

    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': '.1,.2',
        'ntrees': '8,10',
        'max_depth': '8,9',
        'min_rows': '1,2',
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
        'grid_parallelism': 1,
    }

    kwargs = params.copy()
    timeoutSecs = 1800
    jobs = []
    # kick off 5 of these GBM grid jobs, with different tree choices
    start = time.time()
    totalGBMGridJobs = 0

    # for more in range(8): # fast
    # for more in range(9):
    for i in range(5):
        kwargs = params.copy()
        kwargs['min_rows'] = '1,2,3'
        if DO_FROM_TO_STEP:
            kwargs['max_depth'] = '5:10:1'
        else:
            kwargs['max_depth'] = '5,6,10'

        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        # print "GBMResult:", h2o.dump_json(GBMResult)
        job_key = GBMResult['job_key']
        model_key = GBMResult['destination_key']
        jobs.append((job_key, model_key))
        totalGBMGridJobs += 1

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start

    for job_key, model_key in jobs:
        GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key)
        h2o_gbm.showGBMGridResults(GBMResult, 15)

    print "All GBM jobs completed in", elapsed, "seconds."
    print "totalGBMGridJobs:", totalGBMGridJobs
def test_GBM_basic_prostate(self):
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON']

    modelKey = 'GBM_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'validation': parseResult['destination_key'],
        'ignored_cols_by_name': 'ID',
        'learn_rate': .1,
        'ntrees': 10,
        'max_depth': 20,
        'min_rows': 1,
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }

    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
    # no pattern waits for all
    h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
    errsLast = gbmTrainView['gbm_model']['errs'][-1]
    print "GBM 'errsLast'", errsLast

    if DO_CLASSIFICATION:
        cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)
    else:
        print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_c9b_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        # passes 5, fails 15
        # for depth in [5,15,25,40]:
        for depth in [5, 5, 5, 5, 5]:
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': 10,
                'max_depth': depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            num_cpus = statMean['num_cpus'],
            my_cpu_pct = statMean['my_cpu_%'],
            sys_cpu_pct = statMean['sys_cpu_%'],
            system_load = statMean['system_load']

            # shouldn't need this?
            h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_rf_covtype_fvec(self):
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)

    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
        header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

    rfViewInitial = []
    for jobDispatch in range(1):
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(jobDispatch)

        # don't poll for fvec
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
        elapsed = time.time() - start
        print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        print h2o.dump_json(rfResult)
        # FIX! are these already in there?
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = kwargs['model_key']
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)

        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected)
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
def test_GLM_prostate(self):
    h2o.beta_features = True
    importFolderPath = "logreg"
    csvFilename = 'prostate.csv'
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', hex_key=hex_key,
        timeoutSecs=180, noPoll=True, doSummary=False)
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print inspect
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    x = 'ID'
    y = 'CAPSULE'
    family = 'binomial'
    alpha = '0.5'
    lambda_ = '1E-4'
    nfolds = '5' # fails
    nfolds = '0'
    case_mode = '='
    case_val = '1'
    f = 'prostate'
    modelKey = 'GLM(' + f + ')'

    kwargs = {
        'response': y,
        'ignored_cols': x,
        'family': family,
        'lambda': lambda_,
        'alpha': alpha,
        'n_folds': nfolds, # passes if 0, fails otherwise
        # 'case_mode': case_mode,
        # 'case_val': case_val,
        'destination_key': modelKey,
    }

    BUG1 = True
    timeoutSecs = 60
    start = time.time()
    glmFirstResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=0.25,
        pollTimeoutSecs=180, noPoll=BUG1, **kwargs)
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

    print "FIX! how do we get the GLM result"
    # hack it!
    job_key = glmFirstResult['job_key']
    # is the job finishing before polling would say it's done?
    params = {'job_key': job_key, 'destination_key': modelKey}
    a = h2o.nodes[0].completion_redirect(jsonRequest="2/GLMProgressPage2.json", params=params)
    print "GLM result from completion_redirect:", h2o.dump_json(a)
def test_GBMGrid_basic_many(self):
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON']

    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': '.1,.2',
        'ntrees': '8,10',
        'max_depth': '8,9',
        'min_rows': '1,2',
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
        'grid_parallelism': 1,
    }

    kwargs = params.copy()
    timeoutSecs = 1800
    jobs = []
    # kick off 5 of these GBM grid jobs, with different tree choices
    start = time.time()
    totalGBMGridJobs = 0

    # for more in range(8): # fast
    # for more in range(9):
    for i in range(50 if DO_FAIL_CASE else 10):
        kwargs = params.copy()
        kwargs['min_rows'] = '1,2,3'
        if DO_FROM_TO_STEP:
            kwargs['max_depth'] = '5:10:1'
        else:
            kwargs['max_depth'] = '5,6,10'

        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        # print "GBMResult:", h2o.dump_json(GBMResult)
        job_key = GBMResult['job_key']
        model_key = GBMResult['destination_key']
        jobs.append((job_key, model_key))
        totalGBMGridJobs += 1

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start

    for job_key, model_key in jobs:
        GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key)
        h2o_gbm.showGBMGridResults(GBMResult, 15)

    print "All GBM jobs completed in", elapsed, "seconds."
    print "totalGBMGridJobs:", totalGBMGridJobs
def test_B_kmeans_benign(self):
    h2o.beta_features = True # fvec
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename

    # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1,
        timeoutSecs=180, noPoll=not DO_POLL, doSummary=False)
    if not DO_POLL:
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        parseResult['destination_key'] = hex_key

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\nStarting", csvFilename

    expected = [
        ([10.5, 2.8, 40.3, 0.0, 12.0, 0.8, 1.6, 21.1, 11.4, 0.7, 2.9, 206.2, 36.7, 1.5], 15, 0),
        ([23.72897196261682, 2.3271028037383177, 44.81308411214953, 0.34579439252336447, 13.093457943925234, 1.4579439252336448, 1.3177570093457944, 24.16129367150993, 13.317757009345794, 0.5071931108136043, 2.6604011393039024, 121.6822429906542, 40.13084112149533, 1.691588785046729], 110, 0),
        ([29.2625, 2.7, 48.5125, 0.1625, 12.0625, 1.0375, 1.4875, 23.023665714263917, 12.6875, 0.5073033705353737, 3.090870788693428, 160.95, 43.3, 1.65], 71, 0),
        ([38.333333333333336, 2.3333333333333335, 52.666666666666664, 0.0, 14.333333333333334, 2.3333333333333335, 1.6666666666666667, 25.85955047607422, 12.0, 0.5056179761886597, 3.2846442063649497, 261.6666666666667, 43.0, 1.0], 4, 0),
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01, 0.01)
    # loop, to see if we get same centers
    if DO_IGNORE:
        kwargs = {'k': 4, 'ignored_cols': 'STR', 'destination_key': 'benign_k.hex',
            'seed': 265211114317615310, 'max_iter': 50}
    else:
        kwargs = {'k': 4, 'ignored_cols': None, 'destination_key': 'benign_k.hex',
            'seed': 265211114317615310, 'max_iter': 50}

    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs)
    if not DO_POLL:
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        # hack..supposed to be there like va
        kmeans['destination_key'] = 'benign_k.hex'

    ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
    modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

    model = modelView['model']
    clusters = model['clusters']
    cluster_variances = model['cluster_variances']
    error = model['error']
    print "cluster_variances:", cluster_variances
    print "error:", error

    # make this fvec legal?
    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_parse_with_cancel(self):
    mustWait = 10
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameList = [
        ("standard", "covtype.data", 54),
        ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        ("standard", "covtype20x.data", 54),
        ("manyfiles-nflx-gz", "file_[100-109].dat.gz", 378),
    ]

    # just loop on the same file. If remnants exist and are locked, we will blow up?
    # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)

        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
            timeoutSecs=500, noPoll=True, doSummary=False)
        job_key = parseResult['job_key']

        # give it a little time to start
        time.sleep(3)
        h2o.nodes[0].jobs_cancel(key=job_key)

        # now wait until the job cancels, and we're idle
        h2o_jobs.pollWaitJobs(timeoutSecs=30)
        elapsed = time.time() - start
        print "Cancelled parse completed in", elapsed, "seconds."

        h2o.check_sandbox_for_errors()

        # get a list of keys from storeview. 20 is fine..shouldn't be many, since we putfile, not import folder
        # there may be a lot since we import the whole "standard" folder
        # find the ones that pattern match the csvFilename, and inspect them. Might be none
        storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
        keys = storeViewResult['keys']
        for k in keys:
            keyName = k['key']
            print "kevin:", keyName
            if csvFilename in keyName:
                h2o_cmd.runInspect(key=keyName)
                h2o.check_sandbox_for_errors()

        # This will tell h2o to delete using the key name from the import file, whatever pattern matches to csvFilename
        # we shouldn't have to do this..the import/parse should be able to overwrite without deleting.
        # h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

        # If you cancel a parse, you aren't allowed to reparse the same file or import a directory with that file,
        # or cause the key name that the parse would have used, for 5 seconds after the cancel request gets a json
        # response
        print "Waiting", mustWait, "seconds before next reparse-cancel."
        time.sleep(mustWait)
def test_GBMGrid_basic_benign(self):
    h2o.beta_features = True
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')

    # columns start at 0
    # cols 0-13. 3 is output
    # no member id in this one

    # fails with n_folds
    print "Not doing n_folds with benign. Fails with 'unable to solve?'"
    # check the first in the models list. It should be the best
    colNames = ['STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK',
        'AGP1', 'AGMN', 'NLV', 'LIV', 'WT', 'AGLP', 'MST']

    modelKey = 'GBMGrid_benign'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'STR',
        'learn_rate': '.1,.2',
        'ntrees': 2,
        'max_depth': 8,
        'min_rows': 1,
        'response': 'FNDX',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }

    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
    if not DO_POLL:
        # no pattern waits for all
        print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)

    if 1==0:
        # FIX! get model?
        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def parseImportFolderFile(node=None, csvFilename=None, path=None, key2=None,
        timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=1, pollTimeoutSecs=180,
        noise=None, benchmarkLogging=None, noPoll=False, **kwargs):
    if not node:
        node = h2o.nodes[0]
    if not csvFilename:
        raise Exception('parseImportFolderFile: No csvFilename')

    # We like the short parse key2 name.
    # We don't drop anything from csvFilename, unlike H2O default
    if key2 is None:
        # don't rely on h2o default key name
        myKey2 = csvFilename + '.hex'
    else:
        myKey2 = key2

    print "Waiting for the slow parse of the file:", csvFilename

    # a little hack to redirect import folder tests to an s3 folder
    if node.redirect_import_folder_to_s3_path:
        # why no leading / for s3 key here. only one / after s3:/ ?
        path = re.sub('/home/0xdiag/datasets', 'home-0xdiag-datasets', path)
        parseKey = parseImportS3File(node, csvFilename, path, myKey2,
            timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging, noPoll)
    elif node.redirect_import_folder_to_s3n_path:
        path = re.sub('/home/0xdiag/datasets', '/home-0xdiag-datasets', path)
        parseKey = parseImportHdfsFile(node, csvFilename, path, myKey2, "s3n",
            timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging, noPoll)
    else:
        if getpass.getuser() == 'jenkins':
            print "Now: not doing Temp hack of /home/0xdiag/datasets/standard to /home/0xdiag/datasets"
            ### path = re.sub('/home/0xdiag/datasets/standard', '/home/0xdiag/datasets', path)
        importKey = "nfs:/" + path + "/" + csvFilename
        if h2o.beta_features:
            print "Temp hack to look at the jobs list for parse completion. No multiple outstanding parses"
            print "The parse result will be just from the first noPoll response. Parse is done as noPoll"

        parseKey = node.parse(importKey, myKey2,
            timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging,
            noPoll=noPoll or h2o.beta_features, **kwargs)

        if h2o.beta_features:
            print "Temp hack to look at the jobs list for parse completion. No multiple outstanding parses"
            print "The parse result will be just from the first noPoll response."
            print "\nWaiting on Parse job for ", importKey
            start = time.time()
            h2o_jobs.pollWaitJobs(pattern='arse', timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=5)
            print "Parse job end for ", importKey, 'took', time.time() - start, 'seconds'

    # a hack so we know what the source_key was, back at the caller
    parseKey['python_source_key'] = importKey
    print "\nParse result:", parseKey
    return parseKey
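# A minimal usage sketch (assumption, not part of the original module): parse one file from
# a folder the caller has already imported (e.g. via h2i.setupImportFolder, as
# test_rf_covtype_fvec above does), then read back the destination and source keys.
# The folder path and key names are illustrative only.
def example_parseImportFolderFile_usage():
    importFolderPath = "/home/0xdiag/datasets/standard"
    parseKey = parseImportFolderFile(None, 'covtype.data', importFolderPath,
        key2='covtype.data.hex', header=0, timeoutSecs=180)
    print "parsed key:", parseKey['destination_key']
    print "source key:", parseKey['python_source_key']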
def test_sequential_same_dest_del(self):
    csvFilename = 'poker-hand-testing.data'
    csvPathname = 'poker/' + csvFilename
    for trials in range(30):
        src_key = csvPathname
        hex_key = csvPathname + '.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
            src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=False, doSummary=False)
        h2o.nodes[0].remove_key(src_key)
        h2o.nodes[0].remove_key(hex_key)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
def test_KMeansGrid_params_rand2_fvec(self):
    if h2o.localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ("covtype.data", 800),
        ]
    else:
        csvFilenameList = [
            ("covtype.data", 800),
        ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect["numRows"]), \
            " numCols:", "{:,}".format(inspect["numCols"])

        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            destinationKey = csvFilename + "_" + str(trial) + ".hex"
            params = {"k": "2,3", "destination_key": destinationKey}
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            print "FIX! how do we get results..need redirect_url"
            print "Have to inspect different models? (grid)"
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)

            # destination_key is ignored by kmeans...what are the keys for the results
            # inspect = h2o_cmd.runInspect(None, key=destinationKey)
            # print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
def test_C_kmeans_prostate(self):
    h2o.beta_features = True # fvec
    importFolderPath = "logreg"
    csvFilename = "prostate.csv"
    hex_key = "prostate.hex"
    csvPathname = importFolderPath + "/" + csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\nStarting", csvFilename

    # loop, to see if we get same centers
    expected = [
        ([43.07058823529412, 0.36470588235294116, 67.70588235294117, 1.1058823529411765, 2.3529411764705883, 1.2117647058823529, 17.33529411764706, 14.201176470588232, 6.588235294117647], 103, 0),
        ([166.04347826086956, 0.4658385093167702, 66.09316770186335, 1.0807453416149069, 2.3043478260869565, 1.0807453416149069, 15.0632298136646, 16.211118012422357, 6.527950310559007], 136, 0),
        ([313.4029850746269, 0.35074626865671643, 64.91791044776119, 1.0820895522388059, 2.1791044776119404, 1.0746268656716418, 14.601492537313437, 16.35686567164179, 6.082089552238806], 141, 0),
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    kwargs = {
        'k': 3,
        'initialization': 'Furthest',
        'destination_key': 'prostate_k.hex',
        'max_iter': 50,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails)
        'seed': 265211114317615310,
    }

    # for fvec only?
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs)
    if not DO_POLL:
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        # hack..supposed to be there like va
        kmeans['destination_key'] = 'prostate_k.hex'

    # FIX! how do I get the kmeans result?
    ### print "kmeans result:", h2o.dump_json(kmeans)
    # can't do this
    # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
    modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

    model = modelView['model']
    clusters = model['clusters']
    cluster_variances = model['cluster_variances']
    error = model['error']
    print "cluster_variances:", cluster_variances
    print "error:", error

    # variance of 0 might be legal with duplicated rows. wasn't able to remove the duplicate rows of NAs at
    # bottom of benign.csv in ec2
    # for i, c in enumerate(cluster_variances):
    #     if c < 0.1:
    #         raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i))

    # make this fvec legal?
    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_c9_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        for depth in [5, 15]:
            params = {
                "destination_key": "GBMKEY",
                "learn_rate": 0.2,
                "nbins": 1024,
                "ntrees": 10,
                "max_depth": depth,
                "min_rows": 10,
                "response": response,
                "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            timeoutSecs = 1800
            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            num_cpus = (statMean["num_cpus"],)
            my_cpu_pct = (statMean["my_cpu_%"],)
            sys_cpu_pct = (statMean["sys_cpu_%"],)
            system_load = statMean["system_load"]

            # shouldn't need this?
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_KMeansGrid_params_rand2_fvec(self):
    h2o.beta_features = True
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype.data', 800),
        ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            destinationKey = csvFilename + "_" + str(trial) + '.hex'
            params = {'k': '2,3', 'destination_key': destinationKey}
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            print "FIX! how do we get results..need redirect_url"
            print "Have to inspect different models? (grid)"
            print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)

            # destination_key is ignored by kmeans...what are the keys for the results
            # inspect = h2o_cmd.runInspect(None, key=destinationKey)
            # print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
def test_B_kmeans_benign(self): importFolderPath = "standard" csvFilename = "benign.csv" hex_key = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right? parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180, noPoll=h2o.beta_features, doSummary=False) if h2o.beta_features: h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) parseResult['destination_key'] = hex_key inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename expected = [ ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) , ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) , ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for k in range(2, 6): kwargs = {'k': k, 'ignored_cols_by_name': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? kwargs.update({'max_iter': 10}) kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=h2o.beta_features, **kwargs) if h2o.beta_features: h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) # hack..supposed to be there like va kmeans['destination_key'] = 'benign_k.hex' ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans)) modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['clusters'] cluster_variances = model['cluster_variances'] error = model['error'] print "cluster_variances:", cluster_variances print "error:", error
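The expected centers and allowedDelta above are built but never actually checked in this version of the test. A rough sketch of the comparison they appear to be intended for; this helper is an assumption, not the real h2o_kmeans check.

# Hedged sketch: match each returned center to the nearest expected center and
# require every coordinate to agree within a relative tolerance. Illustrative only.
def check_centers_against_expected(clusters, expected, relTol=0.01):
    def sqdist(a, b):
        return sum((p - q) ** 2 for p, q in zip(a, b))
    for center in clusters:
        # expected entries look like (centerCoords, clusterSize, withinSS); compare coords only
        wantCoords = min(expected, key=lambda e: sqdist(center, e[0]))[0]
        for got, want in zip(center, wantCoords):
            if want != 0 and abs(got - want) / abs(want) > relTol:
                raise Exception("center coordinate %s is more than %s away (relative) from expected %s" % (got, relTol, want))

# usage, after the kmeans_model_view call above:
# check_centers_against_expected(clusters, expected, relTol=allowedDelta[0])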
def test_KMeansGrid_params_rand2(self): if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype.data', 800), ] else: csvFilenameList = [ ('covtype.data', 800), ] importFolderPath = "standard" for csvFilename, timeoutSecs in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params(SEED) h2o.beta_features = True # no grid for VA for trial in range(3): # default destinationKey = csvFilename + "_" + str(trial) + '.hex' params = {'k': 'c(2,3)', 'destination_key': destinationKey} h2o_kmeans.pickRandKMeansParams(paramDict, params) kwargs = params.copy() start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "FIX! how do we get results..need redirect_url" print "Have to inspect different models? (grid)" print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) # destination_key is ignored by kmeans...what are the keys for the results # inspect = h2o_cmd.runInspect(None,key=destinationKey) # print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_overlap_diff_dest_stallN(self): noPoll = True num_trials = 0 stallForNJobs = 25 for i in range(2): for j in range(30): csvFilename = 'poker-hand-testing.data' csvPathname = 'poker/' + csvFilename src_key = csvFilename + "_" + str(i) + "_" + str(j) hex_key = csvFilename + "_" + str(num_trials) + '.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=noPoll, doSummary=False) num_trials += 1 h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5,stallForNJobs=stallForNJobs)
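The stallForNJobs argument is what keeps this overlap test from queueing work without bound: all the parses are dispatched with noPoll=True and the final poll only returns once the backlog has drained. A loose sketch of the same throttling idea as a helper; the helper name, and the exact draining semantics of stallForNJobs, are assumptions.

# Hedged sketch: dispatch parses without polling, but stall once a cap of
# outstanding dispatches is reached, mirroring the stallForNJobs usage above.
import h2i
import h2o_jobs

def dispatch_with_backpressure(csvPathnames, maxOutstanding=25, timeoutSecs=300):
    outstanding = 0
    for n, csvPathname in enumerate(csvPathnames):
        h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
            hex_key='overlap_%d.hex' % n, timeoutSecs=timeoutSecs, noPoll=True, doSummary=False)
        outstanding += 1
        if outstanding >= maxOutstanding:
            h2o_jobs.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs,
                retryDelaySecs=5, stallForNJobs=maxOutstanding)
            outstanding = 0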
def test_GBM_parseTrain(self): bucket = 'home-0xdiag-datasets' files = [('standard', 'covtype200x.data', 'covtype.hex', 1800, 54), ('mnist', 'mnist8m.csv', 'mnist8m.hex', 1800, 0), ('manyfiles-nflx-gz', 'file_95.dat.gz', 'nflx.hex', 1800, 256), ('standard', 'allyears2k.csv', 'allyears2k.hex', 1800, 'IsArrDelayed'), ('standard', 'allyears.csv', 'allyears2k.hex', 1800, 'IsArrDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** params = { 'destination_key': "GBMKEY", 'learn_rate': .1, 'ntrees': 1, 'max_depth': 1, 'min_rows': 1, 'response': response } print "Using these parameters for GBM: ", params kwargs = params.copy() #noPoll -> False when GBM finished GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs) h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800) #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \ # "%f pct. of timeout" % (GBMResult['python_%timeout']) GBMView = h2o_cmd.runGBMView(model_key='GBMKEY') print GBMView['gbm_model']['errs']
def test_C_kmeans_prostate(self): importFolderPath = "standard" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for k in range(2, 6): kwargs = {'k': k, 'initialization': 'Furthest', 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? kwargs.update({'max_iter': 50}) kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=h2o.beta_features, **kwargs) if h2o.beta_features: h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) # hack..supposed to be there like va kmeans['destination_key'] = 'prostate_k.hex' # FIX! how do I get the kmeans result? ### print "kmeans result:", h2o.dump_json(kmeans) # can't do this # inspect = h2o_cmd.runInspect(key='prostate_k.hex') modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['clusters'] cluster_variances = model['cluster_variances'] error = model['error'] print "cluster_variances:", cluster_variances print "error:", error for i,c in enumerate(cluster_variances): if c < 0.1: raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i))
def test_small_parse_overlap_same_dest(self): noPoll = True timeoutSecs = 180 num_trials = 0 stallForNJobs = 100 for i in range(50): for j in range(200): csvPathname = h2o.find_file('smalldata/poker') csvFilename = csvPathname + '/' + 'poker-hand-testing.data' key = csvFilename + "_" + str(i) + "_" + str(j) key2 = key + "_" + str(num_trials) + '.hex' parseKey = h2o_cmd.parseFile(csvPathname=csvFilename, key=key, key2=key2, timeoutSecs=timeoutSecs, noPoll=noPoll, doSummary=False) num_trials += 1 h2o_jobs.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=5,stallForNJobs=stallForNJobs)
def test_PCA_UCIwine(self): csvFilename = "wine.data" timeoutSecs = 300 trialStart = time.time() #parse trainKey = "wine.hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] #PCA params params = { 'destination_key': "python_PCA_key", 'tolerance': 0.0, 'standardize': 1 } kwargs = params.copy() h2o.beta_features = True #TODO(spencer): Hack around no polling FVEC PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) #time.sleep(100) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) #check PCA results pcaView = h2o_cmd.runPCAView(modelKey="python_PCA_key") h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView)
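resultsCheckPCA presumably validates the variance decomposition reported by the PCA view. As a loose outside cross-check, the same standardized-PCA standard deviations can be computed directly with numpy; this sketch is illustrative only, and the assumption that wine.data carries its class label in the first column is mine.

# Hedged sketch: numpy cross-check of standardized PCA on wine.data.
# Assumes the class label is in column 0 and the feature columns follow.
import numpy as np

def pca_sdev_numpy(csvPath):
    data = np.genfromtxt(csvPath, delimiter=',')
    X = data[:, 1:]                                # drop the (assumed) class-label column
    X = (X - X.mean(axis=0)) / X.std(axis=0)       # standardize, matching 'standardize': 1
    # eigenvalues of the (co)variance matrix of standardized data are the per-component variances
    eigvals = np.linalg.eigvalsh(np.cov(X, rowvar=False))[::-1]
    return np.sqrt(eigvals)                        # standard deviation of each principal component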
def test_exec2_fast_locks_overlap(self): csvPathname = 'iris/iris2.csv' src_key='iris.csv' if not AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) y = 4 lastHexKey = None for trial in range (1, 100): if AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) # make sure each parse is unique dest key (not in use) hex_key = "iris2_" + str(trial) + ".hex" # what if we kicked off another parse without waiting for it? I think the src key gets locked # so we'd get lock issues on the src_key parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10) # wait until iteration 2, when lastHexKey is available, so you can operate on that if lastHexKey: execExpr="%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) lastHexKey = hex_key # since we are using the same source file, and potentially re-uploading if AVOID_BUG # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to # use it next iteration h2o_jobs.pollWaitJobs(timeoutSecs=10) # just show the jobs still going. Shouldn't be any a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_GLM2_big1_nopoll(self): h2o.beta_features = True csvPathname = 'hhp_107_01.data.gz' print "\n" + csvPathname y = "106" x = "" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) glmInitial = [] # dispatch multiple jobs back to back start = time.time() for jobDispatch in range(5): kwargs = {'response': y, 'n_folds': 1, 'family': 'binomial'} # FIX! what model keys do these get? glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) glmInitial.append(glm) print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch timeoutSecs = 200 h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for g in glmInitial: print "Checking completed job, with no polling using initial response:" # this format is only in the first glm response (race?) modelKey = g['destination_key'] glm = h2o.nodes[0].glm_view(_modelKey=modelKey) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm)
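pp_cm_summary reports the overall percent-wrong from the confusion matrix pulled out of the GLM model above; the arithmetic behind that number is just the off-diagonal count over the total. This is an illustrative sketch, not the real h2o_gbm helper, and it assumes the cm has no totals row or column.

# Hedged sketch: percent-wrong from a square confusion matrix (rows = actual, cols = predicted).
def pct_wrong_from_cm(cm):
    total = sum(sum(row) for row in cm)
    correct = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - correct) / total if total else 0.0

# usage:
# print "pctWrong:", pct_wrong_from_cm(cm)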
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 synSize = 183 if 1==0: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600), ] if 1==1: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), # for now, take too long on 2x100GB heap on 164 # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz' print "Using .gz'ed 
files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/standard' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? # os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? # ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200), ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split 
out the pattern match and the filename used for the hex trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack'] # benchmarkLogging = None benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks' jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = ' -Dcom.sun.management.jmxremote.port=54330' + \ ' -Dcom.sun.management.jmxremote.authenticate=false' + \ ' -Dcom.sun.management.jmxremote.ssl=false' + \ ' -Dcom.sun.management.jmxremote' + \ ' -Dcom.sun.management.jmxremote.local.only=false' jea = ' -Dlog.printAll=true' for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) # pop open a browser on the cloud ### h2b.browseTheCloud() # to avoid sticky ports? ### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder(None, importFolderPath) importFullList = importFolderResult['files'] importFailList = importFolderResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]: x.remove(i) x = ",".join(map(str,x)) GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
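The MB/sec figure logged by this benchmark is just total uncompressed bytes over wall-clock seconds; here it is factored out for clarity (the helper name is an assumption).

# Hedged sketch: the throughput line the benchmark logs, pulled into a helper.
def parse_throughput_line(nJvms, heapGB, csvFilepattern, csvFilename, totalBytes, elapsed):
    fileMBS = (totalBytes / 1e6) / elapsed
    return '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
        nJvms, heapGB, csvFilepattern, csvFilename, fileMBS, elapsed)

# usage:
# l = parse_throughput_line(len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, totalBytes, elapsed)
# print l
# h2o.cloudPerfH2O.message(l)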
def test_rf_covtype_fvec(self): h2o.beta_features = True # fvec importFolderPath = "standard" # Parse Train ****************************************************** csvTrainFilename = 'covtype.shuffled.90pct.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) # Parse Test ****************************************************** csvTestFilename = 'covtype.shuffled.10pct.data' csvTestPathname = importFolderPath + "/" + csvTestFilename hex_key = csvTestFilename + ".hex" parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key']) rfViewInitial = [] xList = [] eList = [] fList = [] trial = 0 depthList = [10, 20, 30, 40] ntreesList = [5, 10, 20, 30] # ntreesList = [2] nbinsList = [10, 100, 1000] if TRY == 'max_depth': tryList = depthList elif TRY == 'ntrees': tryList = ntreesList elif TRY == 'nbins': tryList = nbinsList else: raise Exception("huh? %s" % TRY) for d in tryList: if TRY == 'max_depth': paramDict['max_depth'] = d elif TRY == 'ntrees': paramDict['ntrees'] = d elif TRY == 'nbins': paramDict['nbins'] = d else: raise Exception("huh? %s" % TRY) # adjust timeoutSecs with the number of trees # seems ec2 can be really slow if DO_OOBE: paramDict['validation'] = None else: paramDict['validation'] = parseTestResult['destination_key'] timeoutSecs = 30 + paramDict['ntrees'] * 200 # do ten starts, to see the bad id problem? TRIES = 5 for i in range(TRIES): lastOne = i == (TRIES - 1) # have unique model names trial += 1 kwargs = paramDict.copy() model_key = 'RFModel_' + str(trial) kwargs['destination_key'] = model_key data_key = parseTrainResult['destination_key'] start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs) trainElapsed = time.time() - start print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds' # don't cancel the last one if not lastOne: time.sleep(1) h2o_jobs.cancelAllJobs(timeoutSecs=2) ### print "rfView", h2o.dump_json(rfView) print "We have a result from the RF above, completed but didn't do RFView yet" # could the RF indicate 'done' too soon? # if rfResult['state']=='RUNNING': # raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult)) # if 'drf_model' not in rfResult: # raise Exception("How come there's no drf_model in this RF result? 
%s" % h2o.dump_json(rfResult)) h2o_jobs.pollWaitJobs(timeoutSecs=300) rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, retryDelaySecs=5, doSimpleCheck=False) print "rfView:", h2o.dump_json(rfView) rf_model = rfView['drf_model'] cms = rf_model['cms'] ### print "cm:", h2o.dump_json(cm) ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] varimp = rf_model['varimp'] treeStats = rf_model['treeStats'] print "maxDepth:", treeStats['maxDepth'] print "maxLeaves:", treeStats['maxLeaves'] print "minDepth:", treeStats['minDepth'] print "minLeaves:", treeStats['minLeaves'] print "meanLeaves:", treeStats['meanLeaves'] print "meanDepth:", treeStats['meanDepth'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) # we iterate over params, so can't really do this check # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) print "classErrorPctList:", classErrorPctList self.assertEqual( len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict" ) # FIX! should update this expected classification error predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key) eList.append(classErrorPctList[4]) fList.append(trainElapsed) if DO_PLOT: if TRY == 'max_depth': xLabel = 'max_depth' elif TRY == 'ntrees': xLabel = 'ntrees' elif TRY == 'nbins': xLabel = 'nbins' else: raise Exception("huh? %s" % TRY) xList.append(paramDict[xLabel]) if DO_PLOT: eLabel = 'class 4 pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_flashgordon(self): # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 csvFilenameList = [ ("100.dat.gz", "dat_1", 1 * avgSynSize, 700), ("11[0-9].dat.gz", "dat_10", 10 * avgSynSize, 700), ("1[32][0-9].dat.gz", "dat_20", 20 * avgSynSize, 800), ("1[5-9][0-9].dat.gz", "dat_50", 50 * avgSynSize, 900), # ("1[0-9][0-9].dat.gz", "dat_100", 100 * avgSynSize, 1200), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz USE_S3 = False noPoll = True benchmarkLogging = ['cpu', 'disk'] bucket = "home-0xdiag-datasets" if USE_S3: URI = "s3://flashgordon" protocol = "s3" else: URI = "s3n://flashgordon" protocol = "s3n/hdfs" # split out the pattern match and the filename used for the hex trialMax = 1 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): ## for tryHeap in [54, 28]: for tryHeap in [54]: print "\n", tryHeap, "GB heap, 1 jvm per host, import", protocol, "then parse" h2o_hosts.build_cloud_with_hosts( node_count=1, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10, # all hdfs info is done thru the hdfs_config michal's ec2 config sets up? # this is for our amazon ec hdfs # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n hdfs_name_node='10.78.14.235:9000', hdfs_version='0.20.2') # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandbox_ignore_errors = True for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. if USE_S3: importResult = h2o.nodes[0].import_s3(bucket) else: importResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importResult['succeeded'] for k in s3nFullList: key = k['key'] # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in key: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key break else: ### print key pass ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? 
self.assertGreater(len(s3nFullList), 8, "Didn't see more than 8 files in s3n?") s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 start = time.time() parseKey = h2o.nodes[0].parse( s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse2Key = h2o.nodes[0].parse( s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noPoll=noPoll, benchmarkLogging=benchmarkLogging) time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse3Key = h2o.nodes[0].parse( s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print s3nKey, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs( pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed print "\nMB/sec (before uncompress)", "%6.2f" % fileMBS h2o.cloudPerfH2O.message( '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs' .format(len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)) # BUG here? if not noPoll: # We should be able to see the parse result? inspect = h2o_cmd.runInspect( key=parseKey['destination_key']) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! (spill) for k in s3nFullList: deleteKey = k['key'] if csvFilename in deleteKey and not ".hex" in key: pass # h2o removes key after parse now ### print "Removing", deleteKey ### removeKeyResult = h2o.nodes[0].remove_key(key=deleteKey) ### print "removeKeyResult:", h2o.dump_json(removeKeyResult) h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(120)
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row): bench = "bench" if debug: print "Doing GBM DEBUG" bench = "bench/debug" date = '-'.join([str(x) for x in list(time.localtime())][0:3]) for f in fs['train']: overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv' if not os.path.exists(gbmbenchcsv): output = open(gbmbenchcsv, 'w') output.write(','.join(csv_header) + '\n') else: output = open(gbmbenchcsv, 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + folderPath if (f in [ 'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x', 'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x' ]): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, header=1, header_from_file=headerKey, separator=44, timeoutSecs=7200, retryDelaySecs=5, pollTimeoutSecs=7200) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime, " seconds." inspect_train = h2o.nodes[0].inspect( parseResult['destination_key']) inspect_test = h2o.nodes[0].inspect(testFilehex) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len( h2o_hosts.hosts) row.update({ 'h2o_build': build, 'nMachines': nMachines, 'nJVMs': len(h2o.nodes), 'Xmx/JVM': java_heap_GB, 'dataset': f, 'nTrainRows': inspect_train['numRows'], 'nTestRows': inspect_test['numRows'], 'nCols': inspect_train['numCols'], 'trainParseWallTime': parseWallTime, 'classification': classification, }) params = { 'destination_key': 'GBM(' + f + ')', 'response': response, 'ignored_cols_by_name': ignored_cols, 'classification': classification, 'validation': testFilehex, 'ntrees': ntrees, 'max_depth': depth, 'min_rows': minrows, 'nbins': nbins, 'learn_rate': learnRate, } kwargs = params.copy() gbmStart = time.time() #TODO(spencer): Uses jobs to poll for gbm completion h2o.beta_features = True gbm = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=4800, **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5) h2o.beta_features = False gbmTime = time.time() - gbmStart row.update({ 'gbmBuildTime': gbmTime, }) #TODO(spencer): Add in gbm scoring #gbmScoreStart = time.time() #gbmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key']) #scoreTime = time.time() - gbmScoreStart csvWrt.writerow(row) finally: output.close()
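doGBM appends one row per run to a cumulative benchmark CSV; csv_header and build come from the enclosing module, so the field names below are assumed for illustration. A self-contained sketch of that bookkeeping:

# Hedged sketch of the benchmark-CSV bookkeeping in doGBM. The field list is
# illustrative; the real module defines its own csv_header.
import csv
import os

csv_header = ['h2o_build', 'nMachines', 'nJVMs', 'Xmx/JVM', 'dataset',
              'nTrainRows', 'nTestRows', 'nCols', 'trainParseWallTime',
              'classification', 'gbmBuildTime']

def append_benchmark_row(path, row):
    newFile = not os.path.exists(path)
    output = open(path, 'a')
    try:
        if newFile:
            output.write(','.join(csv_header) + '\n')
        writer = csv.DictWriter(output, fieldnames=csv_header, restval=None,
            dialect='excel', extrasaction='ignore', delimiter=',')
        writer.writerow(row)
    finally:
        output.close()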
def test_rf_big1_overwrite_model_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=15, schema='put') firstRfView = None # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" if OVERWRITE_RF_MODEL: kwargs['ntrees'] = 1 + jobDispatch else: kwargs['ntrees'] = 1 # don't change the seed if we're overwriting the model. It should get # different results just from changing the tree count kwargs['seed'] = random.randint(0, sys.maxint) # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)] h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=True, **kwargs) # FIX! are these already in there? rfView = {} rfView['_dataKey'] = hex_key rfView['_key'] = model_key print "rf job dispatch end on ", csvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # we're going to compare rf results to previous as we go along (so we save rf view results h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # In this test we're waiting after each one, so we can save the RFView results for comparison to future print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['_dataKey'] model_key = rfView['_key'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=False) if firstRfView is None: # we'll use this to compare the others firstRfView = rfViewResult.copy() firstModelKey = model_key print "firstRfView", h2o.dump_json(firstRfView) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference) self.assertGreater(len(df.difference), 29, msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \ (len(df.difference), h2o.dump_json(df.difference)))
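h2o_util.JsonDiff is what this test leans on to prove that successive RFView responses really differ. A minimal sketch of the kind of recursive diff it presumably performs; illustrative only, not the real implementation.

# Hedged sketch: recursive dict/list diff, returning a list of paths that differ.
def json_diff(a, b, path=""):
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        for k in set(a.keys()) | set(b.keys()):
            if k not in a:
                diffs.append("%s.%s only in second" % (path, k))
            elif k not in b:
                diffs.append("%s.%s only in first" % (path, k))
            else:
                diffs += json_diff(a[k], b[k], "%s.%s" % (path, k))
    elif isinstance(a, list) and isinstance(b, list):
        for i, (x, y) in enumerate(zip(a, b)):
            diffs += json_diff(x, y, "%s[%d]" % (path, i))
        if len(a) != len(b):
            diffs.append("%s: length %d vs %d" % (path, len(a), len(b)))
    elif a != b:
        diffs.append("%s: %r != %r" % (path, a, b))
    return diffs

# usage:
# print "differences:", json_diff(rfViewResult, firstRfView)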
def test_GBM_manyfiles_multijob(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) # inc by 1 for R col # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate # only a problem if they share the dataset, do classification with integers. # change to factor here, to avoid the problem execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** csvPathname = importFolderPath + "/" + testFilename parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! # plus 1 for R indexing execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] # add 1 for start-with-1 ignored_cols_by_name = ",".join( map(lambda x: "C" + str(x + 1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have an output value too)" % ('C' + str(response + 1)) ntrees = 10 trial = 0 # ignore 300 random cols (not the response) print "Kicking off multiple GBM jobs at once" # GBM train**************************************** if DO_FAIL: cases = [5, 10, 20, 40] else: cases = [5, 10, 20] for max_depth in cases: trial += 1 params = { 'response': "C" + str(response + 1), 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'validation': parseTestResult['destination_key'], 'ignored_cols_by_name': ignored_cols_by_name, 'grid_parallelism': 1, 'classification': 1 if DO_CLASSIFICATION else 0, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() # can take 4 times as long with 4 jobs? gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs) trainElapsed = time.time() - trainStart print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5) num_cpus = statMean['num_cpus'] my_cpu_pct = statMean['my_cpu_%'] sys_cpu_pct = statMean['sys_cpu_%'] system_load = statMean['system_load'] h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
def test_benchmark_import(self):
    covtype200xSize = 15033863400
    csvFilenameList = [
        ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
    ]

    trialMax = 1
    base_port = 54321
    tryHeap = 28
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap/2, base_port=base_port, enable_benchmark_log=True)

        for trial in range(trialMax):
            csvPathname = "/home/0xdiag/datasets/standard/" + csvFilepattern
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex",
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll, benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # only one parse is dispatched per trial here, so there are no extra byte counts to add in
                elapsed = time.time() - start

            h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}
            print "\n" + csvFilepattern

            #**********************************************************************************
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(54)
                # don't include the output column
                x = ",".join(map(str, x))
                GLMkwargs = {'x': x, 'y': 54, 'case': 1, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            #**********************************************************************************
            h2o_cmd.checkKeyDistribution()

        h2o.tear_down_cloud()
        sys.stdout.write('.')
        sys.stdout.flush()