def test_delete_all_keys(self):
    # FIX! should have some model keys in here too, from RF etc.
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    for trial in range(3):
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
            elapsed = time.time() - start
            print csvFilename, "parsed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
            print csvFilename, 'H2O reports parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            print "Delete all keys"
            h2o.nodes[0].remove_all_keys()
            print "This shouldn't see any keys"
            h2i.delete_keys_at_all_nodes()
        print "\nTrial", trial, "completed\n"
def test_delete_all_keys(self):
    # FIX! should have some model keys in here too, from RF etc.
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    for trial in range(2):
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            h2i.delete_keys_at_all_nodes()
            print "Delete all keys. Shouldn't be any more?"
            h2o.nodes[0].remove_all_keys()
        print "\nTrial", trial, "completed\n"
def test_delete_all_keys(self):
    h2o.beta_features = True
    # FIX! should have some model keys in here too, from RF etc.
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    for trial in range(3):
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
            elapsed = time.time() - start
            print csvFilename, "parsed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs), "\n"
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            print "Delete all keys"
            h2o.nodes[0].remove_all_keys()
            print "This shouldn't see any keys"
            h2i.delete_keys_at_all_nodes()
        print "\nTrial", trial, "completed\n"
def test_from_import_fvec(self):
    csvFilenameAll = [
        ("covtype.data", 500),
        # ("covtype20x.data", 1000),
    ]

    for (csvFilename, timeoutSecs) in csvFilenameAll:
        # creates csvFilename.hex from file in importFolder dir
        hex_key = csvFilename + '.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename,
            schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
        h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        # h2o_cmd.infoFromSummary(summaryResult)

        trees = 2
        start = time.time()
        rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1,
            parseResult=parseResult, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. ' \
            'trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
            trees, classification_error, classErrorPctList, totalScores)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_from_import(self):
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]
    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for csvFilename in csvFilenameAll:
        # creates csvFilename.hex from file in importFolder dir
        hex_key = csvFilename + '.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename,
            schema='put', hex_key=hex_key, timeoutSecs=500)
        if not h2o.beta_features:
            print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

        if not h2o.beta_features:
            RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)
            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_parse_200k_cols_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (10, 100000, 'cA', 200, 200),
        (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds"

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))

        # if not h2o.browse_disable:
        #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        #     time.sleep(5)

        h2i.delete_keys_at_all_nodes()
def test_parse_100k_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 1000, 'cA', 200, 200),
        (10, 2000, 'cA', 200, 200),
        (10, 4000, 'cA', 200, 200),
        (10, 8000, 'cA', 200, 200),
        (10, 9000, 'cA', 200, 200),
        (10, 10000, 'cA', 200, 200),
        (10, 100000, 'cA', 200, 200),
        # (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        start = time.time()
        # does it blow up if it sets columnNames?
        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False, columnNames=None,
            intermediateResults=DO_INTERMEDIATE_RESULTS)
        print "Parse:", csvFilename, "took", time.time() - start, "seconds"

        print "Skipping the row/cols check for now"
        if 1 == 1:
            start = time.time()
            inspect = h2o_cmd.runInspect(None, hex_key, timeoutSecs=timeoutSecs2)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            print "\n" + csvPathname, \
                " numRows:", "{:,}".format(numRows), \
                " numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(numCols, colCount,
                "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
            self.assertEqual(numRows, rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (numRows, rowCount))

        print "Skipping the delete keys for now"
        if 1 == 0:
            # if not h2o.browse_disable:
            #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #     time.sleep(5)
            h2i.delete_keys_at_all_nodes()
def test_parse_100k_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 1000, 'cA', 200, 200),
        (10, 2000, 'cA', 200, 200),
        (10, 4000, 'cA', 200, 200),
        (10, 8000, 'cA', 200, 200),
        (10, 9000, 'cA', 200, 200),
        (10, 10000, 'cA', 200, 200),
        (10, 100000, 'cA', 200, 200),
        # (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False, columnNames=None,
            intermediateResults=DO_INTERMEDIATE_RESULTS)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount,
            expectedMissinglist=[])

        print "Skipping the delete keys for now"
        if 1 == 0:
            # if not h2o.browse_disable:
            #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #     time.sleep(5)
            h2i.delete_keys_at_all_nodes()
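# write_syn_dataset() is called by the synthetic wide-column tests above but not
# defined in this excerpt (each test file carries its own module-level version).
# A minimal sketch of the shape it would need here, with an assumed 0/1 value
# range and comma separator:
def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
    r1 = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        # one random int per column; the real generator may use a wider range
        rowData = [str(r1.randint(0, 1)) for j in range(colCount)]
        dsf.write(",".join(rowData) + "\n")
    dsf.close()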
def tearDownClass(cls):
    # DON"T ### h2o.tear_down_cloud()
    # Instead: All tests should delete their keys..i.e. leave things clean for the next test
    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
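# These tearDownClass hooks plug into the standard unittest class lifecycle. A
# minimal sketch of the surrounding harness (the build_cloud() arguments here are
# placeholders; the real tests drive this from per-test configs):
import unittest

class Basic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        h2o.build_cloud(1, java_heap_GB=1)

    @classmethod
    def tearDownClass(cls):
        # leave keys clean for the next test instead of tearing down the cloud
        h2i.delete_keys_at_all_nodes()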
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b
        h2b.browseTheCloud()
        sleep(3600)

    if not nodeList:
        nodeList = h2o_nodes.nodes

    # this could fail too. Should this be set by -uc/--usecloud? or command line argument
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now
    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown" from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud
    # don't tear down with -ccj either
    # FIX! what about usecloud or cloud_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes.
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?
            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah subtle. we might get excepts in issuing the shutdown, don't abort out
        # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
        # so we might? nodes are shutting down?
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
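# Per the comments in tear_down_cloud() above, shutdown is expected to come from a
# last, dedicated "test" rather than from per-test teardown. A hedged sketch of
# that pattern (the class/test names are illustrative, not from this file):
import unittest

class ShutdownCloud(unittest.TestCase):
    def test_shutdown_cloud(self):
        # force=True skips the usecloud/clone_cloud_json guard in tear_down_cloud()
        tear_down_cloud(force=True)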
def test_parse_manyfiles_1(self):
    h2o.beta_features = True
    # these will be used as directory imports/parse
    csvDirname = "manyfiles-nflx-gz"
    timeoutSecs = 600
    trial = 0
    for iteration in range(ITERATIONS):
        csvFilename = "file_1.dat.gz"
        csvPathname = csvDirname + "/" + csvFilename
        trialStart = time.time()

        # PARSE****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA,
            hex_key=hex_key, delete_on_done=DELETE_ON_DONE,
            # importParentDir=IMPORT_PARENT_DIR,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
        elapsed = time.time() - start
        print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        self.assertEqual(numCols, 542)
        self.assertEqual(numRows, 100000)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        for node in h2o.nodes:
            h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

        # convert to binomial
        if DO_EXEC:
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
            # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
            # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

        if DO_DELETE_MYSELF:
            h2o_import.delete_keys_at_all_nodes()

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
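# ITERATIONS, SCHEMA, DELETE_ON_DONE, DO_EXEC and DO_DELETE_MYSELF above are
# module-level knobs that sit at the top of the test file, outside this excerpt.
# Plausible defaults, for illustration only:
ITERATIONS = 5
SCHEMA = 'local'          # or 'put'/'hdfs', depending on the target cloud
DELETE_ON_DONE = 1
DO_EXEC = False
DO_DELETE_MYSELF = False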
def test_c9b_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [
        ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
    ]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        # passes 5, fails 15
        # for depth in [5,15,25,40]:
        for depth in [5, 5, 5, 5, 5]:
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': 10,
                'max_depth': depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            # shouldn't need this?
            h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_c8_rf_airlines_hdfs(self):
    trainParseResult = self.loadTrainData()
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

    scoreParseResult = self.loadScoreData()
    kwargs = paramsScoreRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)
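# loadTrainData/loadScoreData and the paramsTrainRF/paramsScoreRF dicts are defined
# elsewhere in this release test's file. A hedged sketch of the param-dict shape,
# modeled on the RF params used by the airlines test below (keys/values assumed):
paramsTrainRF = {
    'ntrees': 10,
    'max_depth': 20,
    'nbins': 200,
    'timeoutSecs': 14800,
}
paramsScoreRF = {
    # scoring reuses the trained model, so mostly just a timeout here
    'timeoutSecs': 14800,
}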
def tearDownClass(cls):
    print "tearDownClass"
    # DON"T ### h2o.tear_down_cloud()

    # this could fail too
    if h2o.nodes[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_parse_long_colnames(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 1000, 'cA', 200, 200),
        (10, 1100, 'cA', 200, 200),
        (10, 1200, 'cA', 200, 200),
        (10, 1300, 'cA', 200, 200),
        (10, 1400, 'cA', 200, 200),
        (10, 1500, 'cA', 200, 200),
        (10, 1600, 'cA', 200, 200),
        (10, 1700, 'cA', 200, 200),
        (10, 1800, 'cA', 200, 200),
        (10, 1900, 'cA', 200, 200),
        (10, 2000, 'cA', 200, 200),
        (10, 4000, 'cA', 200, 200),
        (10, 8000, 'cA', 200, 200),
        (10, 9000, 'cA', 200, 200),
        (10, 10000, 'cA', 200, 200),
        # (10, 100000, 'cA', 200, 200),
        # (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False, column_names=None,
            intermediateResults=DO_INTERMEDIATE_RESULTS)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount,
            expectedMissinglist=[])

        print "Skipping the delete keys for now"
        if 1 == 0:
            # if not h2o.browse_disable:
            #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #     time.sleep(5)
            h2i.delete_keys_at_all_nodes()
def test_c9_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        for depth in [5, 15]:
            params = {
                "destination_key": "GBMKEY",
                "learn_rate": 0.2,
                "nbins": 1024,
                "ntrees": 10,
                "max_depth": depth,
                "min_rows": 10,
                "response": response,
                "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            timeoutSecs = 1800

            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            num_cpus = statMean["num_cpus"]
            my_cpu_pct = statMean["my_cpu_%"]
            sys_cpu_pct = statMean["sys_cpu_%"]
            system_load = statMean["system_load"]

            # shouldn't need this?
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_rf_airlines_2013_fvec(self):
    h2o.beta_features = True
    h2b.browseTheCloud()
    csvFilename = 'year2013.csv'
    hex_key = 'year2013.hex'
    importFolderPath = 'airlines'
    csvPathname = importFolderPath + "/" + csvFilename
    start = time.time()
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False)
    parse_time = time.time() - start
    print "parse took {0} sec".format(parse_time)

    start = time.time()
    # noise=['JStack','cpu','disk'])
    h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
    elapsed = time.time() - start
    print "summary took {0} sec".format(elapsed)

    trees = 10
    paramsTrainRF = {
        'ntrees': trees,
        'max_depth': 20,
        'nbins': 200,
        'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
        'timeoutSecs': 14800,
    }
    kwargs = paramsTrainRF.copy()
    start = time.time()
    rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
    elapsed = time.time() - start
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. ' \
        'trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
        len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
        trees, classification_error, classErrorPctList, totalScores)
    print "\n" + l
    h2o.cloudPerfH2O.message(l)

    # just to make sure we test this
    h2i.delete_keys_at_all_nodes(pattern=hex_key)
def tearDownClass(cls):
    print "tearDownClass"
    # DON"T ### h2o.tear_down_cloud()

    # try to download the logs...may fail again!
    h2o.nodes[0].log_download()

    # this could fail too
    if h2o.nodes[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_GBM_mnist_restart_many(self):
    importFolderPath = "mnist"
    csvFilename = "train.csv.gz"
    timeoutSecs = 1800
    trialStart = time.time()

    for trial in range(10):
        # PARSE train****************************************
        trainKey = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename,
            schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 8,
            'min_rows': 1,
            'response': 784,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }
        kwargs = params.copy()
        h2o.beta_features = True
        timeoutSecs = 1800
        # noPoll -> False when GBM finished
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o.beta_features = False

        # if it fails, should happen within 8 secs
        time.sleep(8)
        h2j.cancelAllJobs()
        h2o.check_sandbox_for_errors()
        print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial

        if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
            h2i.delete_keys_at_all_nodes()
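# DO_DELETE_KEYS_AND_CAUSE_PROBLEM is a module-level flag; the name suggests the
# delete was known to destabilize this cancel/restart sequence, so it presumably
# defaults off. Assumed value, for illustration only:
DO_DELETE_KEYS_AND_CAUSE_PROBLEM = False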
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary)
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4 * 3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")

    while (totalTime < maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
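# 'beginning' is read by test_build_for_clone() but never assigned in it; per the
# comment about 'start' name collisions, it is presumably a module-level timestamp
# taken at import time. Assumed definition:
beginning = time.time()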
def test_from_import(self):
    h2o.beta_features = True
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]
    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for csvFilename in csvFilenameAll:
        # creates csvFilename.hex from file in importFolder dir
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path="standard/" + csvFilename,
            schema="local", hex_key=hex_key, timeoutSecs=500, doSummary=False)
        print "Parse result['destination_key']:", parseResult["destination_key"]

        inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], verbose=True)
        h2o_cmd.infoFromInspect(inspect, parseResult["destination_key"])
        summaryResult = h2o_cmd.runSummary(key=parseResult["destination_key"])
        h2o_cmd.infoFromSummary(summaryResult)

        if not h2o.beta_features:
            RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)
            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_c6_maprfs_fvec(self):
    h2o.beta_features = True
    print "\nLoad a list of files from maprfs, parse and do 1 RF tree"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle",
        "TEST-poker1000.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        "bestbuy_test.csv",
        "bestbuy_train.csv",
        # "billion_rows.csv.gz",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        "covtype.4x.shuffle.data",
        "covtype.data",
        "covtype4x.shuffle.data",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        # duplicate column header "A"
        # "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        "leads.csv",
        "prostate_long_1G.csv",
    ]

    # find_cloud.py won't set these correctly. Let's just set them here
    # h2o.nodes[0].use_maprfs = True
    # h2o.nodes[0].use_hdfs = False
    # h2o.nodes[0].hdfs_version = 'mapr3.0.1',
    # h2o.nodes[0].hdfs_name_node = '192.168.1.171:7222'

    h2o.setup_benchmark_log()

    # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
    # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
    # benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    # pick 8 randomly!
    if DO_RANDOM_SAMPLE:
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # save the first, for all comparisons, to avoid slow drift with each iteration
    importFolderPath = "datasets"
    trial = 0
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 1000

        # do an import first, because we want to get the size of the file
        (importResult, importPattern) = h2i.import_only(path=csvPathname, schema="maprfs", timeoutSecs=timeoutSecs)
        print "importResult:", h2o.dump_json(importResult)
        succeeded = importResult['files']
        fails = importResult['fails']
        if len(succeeded) < 1:
            raise Exception("Should have imported at least 1 key for %s" % csvPathname)

        # just do a search
        foundIt = None
        for f in succeeded:
            if csvPathname in f:
                foundIt = f
                break
        if not foundIt:
            raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname))

        totalBytes = 0

        print "Loading", csvFilename, 'from maprfs'
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="maprfs", timeoutSecs=timeoutSecs,
            pollTimeoutSecs=360, doSummary=True, benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features)
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
        print "parse result:", parseResult['destination_key']
        elapsed = time.time() - start
        fileMBS = (totalBytes/1e6)/elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        if DO_RF:
            print "\n" + csvFilename
            start = time.time()
            kwargs = {'ntrees': 1}
            paramsString = json.dumps(kwargs)
            RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features, **kwargs)
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

        print "Deleting all keys, to make sure our parse times don't include spills"
        h2i.delete_keys_at_all_nodes()
        trial += 1
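# DO_RANDOM_SAMPLE and DO_RF are module-level switches for the maprfs test above,
# not shown in this excerpt. Assumed defaults, for illustration only:
DO_RANDOM_SAMPLE = True   # pick 8 files at random instead of walking the list
DO_RF = True              # run the 1-tree RF after each parse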
def test_summary2_small(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        # if rowCount is None, we'll just use the data values
        # None in expected values means no compare
        (None, 1,     'x.hex', [-1, 0, 1], ('C1', None, None, -1, None, None)),
        (None, 2,     'x.hex', [-1, 0, 1], ('C1', None, None, -1, None, None)),
        (None, 10,    'x.hex', [-1, 0, 1], ('C1', None, None, -1, None, None)),
        (None, 100,   'x.hex', [-1, 0, 1], ('C1', None, None, -1, None, None)),
        (None, 1000,  'x.hex', [-1, 0, 1], ('C1', None, None, -1, None, None)),
        (None, 10000, 'x.hex', [-1, 0, 1], ('C1', None, None, -1, None, None)),
        # (COLS, 1, 'x.hex', [1,0,-1], ('C1', None, None, None, None, None)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, values, expected) in tryList:
        # max error = half the bin size?
        expectedMax = max(values)
        expectedMin = min(values)
        maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # hmm...say we should be 100% accurate for these tests?
        maxDelta = 0

        h2o.beta_features = False
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        if not rowCount:
            rowFile = len(values)
        else:
            rowFile = rowCount
        csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

        h2o.beta_features = False
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["num_rows"]
        numCols = inspect["num_cols"]

        h2o.beta_features = True
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        quantile = 0.5 if DO_MEDIAN else .999
        q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
        qresult = q['result']
        qresult_single = q['result_single']
        qresult_iterations = q['iterations']
        qresult_interpolated = q['interpolated']
        h2p.blue_print("h2o quantiles result:", qresult)
        h2p.blue_print("h2o quantiles result_single:", qresult_single)
        h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
        h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
        print h2o.dump_json(q)
        self.assertLess(qresult_iterations, 16,
            msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        maxs = stats['maxs']
        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

        pctile = stats['pctile']
        if expected[0]:
            self.assertEqual(colname, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        # don't check the last bin
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            e = numRows/len(hcnt)
            # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
            # don't check the edge bins
            # compare against numRows (rowCount can be None in tryList)
            self.assertAlmostEqual(b, numRows/len(hcnt), delta=.01*numRows,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = twoDecimals(pctile)
        mx = twoDecimals(maxs)
        mn = twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        if DO_TRY_SCIPY and colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist())
            print scipyCol, pctile[10]
            generate_scipy_comparison(csvPathnameFull, col=scipyCol,
                # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single)
                h2oMedian=pctile[5 if DO_MEDIAN else 10],
                h2oMedian2=qresult)

    h2i.delete_keys_at_all_nodes()
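# twoDecimals() above is a small formatting helper defined elsewhere in the test
# file. A sketch consistent with how it's called on both scalars and lists
# (the exact implementation is an assumption):
def twoDecimals(l):
    if isinstance(l, (list, tuple)):
        return ["%.2f" % x for x in l]
    else:
        return "%.2f" % l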
def test_c5_KMeans_sphere_26GB_fvec(self):
    # a kludge
    h2o.setup_benchmark_log()

    # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    csvFilename = "syn_sphere15_gen_26GB.csv"
    # csvFilename = 'syn_sphere_gen_h1m.csv'
    # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
    # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
    totalBytes = 183538602156
    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + "/" + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + "/" + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    if NA_COL_BUG:
        expected = [
            # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
            # so shouldn't be used for 26GB
            # or it should be divided by 7
            # the distribution is the same, obviously.
            ([-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
            ([1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
            ([5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
            ([10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
            ([11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
            ([12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
            ([19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
            ([20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
            ([21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
            ([25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
            ([39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
            ([40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
            ([42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
            ([48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
            ([147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
        ]
    else:
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
        ]

    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)
        else:
            parseResult = h2i.import_parse(path=csvPathname, schema="local", hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)

        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        summary = h2o_cmd.runSummary(key=parseResult['destination_key'], numRows=numRows,
            numCols=numCols, timeoutSecs=300)
        h2o_cmd.infoFromSummary(summary)

        # KMeans ****************************************
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 500,
            # 'normalize': 1,
            'normalize': 0,  # temp try
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # we get NaNs if whole col is NA
            'ignored_cols': 'C1',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeansResult)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'KMeans', 'trial ' + str(trial),
            csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        # this does predict
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname,
            parseResult, 'd', **kwargs)

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        # these clusters were sorted compared to the cluster order in training
        h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
        # why is the expected # of rows not right in KMeans2. That means predictions are wrong
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
            allowError=False, allowRowError=True, trial=trial)

        # the tupleResultList has the size during predict? compare it to the sizes during training
        # I assume they're in the same order.
        model = kmeansResult['model']
        size = model['size']
        size2 = [t[1] for t in tupleResultList]

        if 1 == 1:  # debug
            print "training size:", size
            print "predict size2:", size2
            print "training sorted(size):", sorted(size)
            print "predict sorted(size2):", sorted(size2)
            print h2o.nodes[0].http_addr
            print h2o.nodes[0].port

        clusters = model['centers']
        cluster_variances = model['within_cluster_variances']
        error = model['total_within_SS']
        iterations = model['iterations']
        normalized = model['normalized']
        max_iter = model['max_iter']

        print "iterations", iterations
        if iterations >= (max_iter - 1):
            # h2o hits the limit at max_iter-1..shouldn't hit it
            raise Exception("trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s" %
                (trial, iterations, max_iter))

        # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

        # can't do this compare, because size2 is sorted by center order..
        # so we don't know how to reorder size the same way
        # we could just sort the two of them, for some bit of comparison.
        if sorted(size) != sorted(size2):
            raise Exception("trial: %s training cluster sizes: %s not the same as predict on same data: %s" %
                (trial, size, size2))

        # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram
        expectedSize = [t[1] / SCALE_SIZE for t in expected]
        if size2 != expectedSize:
            raise Exception("trial: %s training cluster sizes: %s not the same as expected: %s" %
                (trial, size, expectedSize))

        if DELETE_KEYS_EACH_ITER:
            h2i.delete_keys_at_all_nodes()
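# FROM_HDFS, NA_COL_BUG, DO_KMEANS, SCALE_SIZE and DELETE_KEYS_EACH_ITER are
# module-level switches not shown in this excerpt. Illustrative defaults only;
# SCALE_SIZE = 7 matches the "or it should be divided by 7" note in the
# expected[] comments (180GB row counts vs the 26GB file):
FROM_HDFS = True
NA_COL_BUG = False
DO_KMEANS = True
SCALE_SIZE = 7
DELETE_KEYS_EACH_ITER = True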
def test_parse_multi_header_rand_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    allowedLetters = 'abcdeABCDE01234[]'
    headerChoices = []
    for n in range(500):  # max # of cols below is 500
        done = False
        while not done:
            l = random.randint(1, 64)  # random length headers
            headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
            # we keep trying if we already have that header name. Has to be unique.
            done = headerName not in headerChoices
        headerChoices.append(headerName)

    tryList = [
        (3, 5, 9, 'cA', 60, 0),
        # (3, 5, 25, 'cA', 60, 0),
        # (10, 100, 500, 'cA', 60, 0),
    ]

    for trial in range(20):
        (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
        print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        totalHeaderRows = 0

        # random selection of parse param choices
        # HEADER_HAS_HDR_ROW = random.randint(0,1)
        HEADER_HAS_HDR_ROW = 1
        DATA_HAS_HDR_ROW = random.randint(0, 1)
        PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1)
        # DATA_FIRST_IS_COMMENT = random.randint(0,1)
        # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
        # FIX! doesn't seem to like just comment in the header file
        DATA_FIRST_IS_COMMENT = 0
        HEADER_FIRST_IS_COMMENT = 0
        GZIP_DATA = random.randint(0, 1)
        GZIP_HEADER = random.randint(0, 1)
        SEP_CHAR_GEN = random.choice(paramsDict['separator'])

        HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
        if HEADER_SEP_CHAR_GEN == 'same':
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # don't put a header in a data file with a different separator?
        # Hack: if both data and header files have a header, then, just in case,
        # the header and data files should have the same separator
        # if they don't, make header match data
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # New for fvec? if separators are not the same, then the header separator needs to be comma
        if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
            HEADER_SEP_CHAR_GEN = ','

        # screw it. make them always match
        HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        if HEADER_SEP_CHAR_GEN in (',', ' '):
            pass
            # extra spaces? Don't add any
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

        kwargs = {}
        for k, v in paramsDict.items():
            kwargs[k] = random.choice(v)

        kwargs['separator'] = SEP_CHAR_GEN
        # parse doesn't auto-detect tab. will autodetect space and comma
        if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
            del kwargs['separator']
        else:
            kwargs['separator'] = ord(SEP_CHAR_GEN)

        # randomly add leading and trailing white space
        # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
        if SEP_CHAR_GEN in (',', ' '):
            if random.randint(0, 1):
                SEP_CHAR_GEN = " " + SEP_CHAR_GEN
            if random.randint(0, 1):
                SEP_CHAR_GEN = SEP_CHAR_GEN + " "

        print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
        print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
        print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
        print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
        print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
        print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
        print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
        print 'GZIP_DATA:', GZIP_DATA
        print 'GZIP_HEADER:', GZIP_HEADER

        # they need to both use the same separator (h2o rule)
        # can't have duplicates
        hfhList = random.sample(headerChoices, colCount) + ["output"]
        # UPDATE: always use comma or space for header separator?? it should work no matter what
        # separator the data uses?
        headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
        print "headerForHeader:", headerForHeader

        # make these different
        # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
        # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
        hfdList = hfhList
        headerForData = SEP_CHAR_GEN.join(hfdList)

        # create data files
        for fileN in range(fileNum):
            csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            csvFilename = 'syn_data_' + csvFilenameSuffix
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
            (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount,
                headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_DATA:
                csvPathnamegz = csvPathname + ".gz"
                print "gzipping to", csvPathnamegz
                h2o_util.file_gzip(csvPathname, csvPathnamegz)
                os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                # pattern match should find the right key with csvPathname

        # create the header file
        hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
        hdrFilename = 'syn_header_' + hdrFilenameSuffix
        hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
        # dataRowsWithHeader = 0 # temp hack
        (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader,
            headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
            commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
        # only include header file data rows if the parse pattern includes it
        if PARSE_PATTERN_INCLUDES_HEADER:
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
        if GZIP_HEADER:
            hdrPathnamegz = hdrPathname + ".gz"
            print "gzipping to", hdrPathnamegz
            h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
            os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
            # pattern match should find the right key with hdrPathname

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = "syn_dst" + str(trial) + ".hex"

        # DON"T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files.
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)

        h2o_cmd.runStoreView()
        headerKey = h2i.find_key(hdrFilename)
        dataKey = h2i.find_key(csvFilename)
        # use regex. the only files in the dir will be the ones we just created
        # with *fileN* match
        print "Header Key =", headerKey

        # put the right name in
        if kwargs['header_from_file'] == 'header':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = headerKey
        # use one of the data files?
        elif kwargs['header_from_file'] == 'data':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = dataKey

        # if there's no header in the header file, turn off the header_from_file
        if not HEADER_HAS_HDR_ROW:
            kwargs['header_from_file'] = None

        if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
            ignoreForRf = hfhList[0]
        elif DATA_HAS_HDR_ROW:
            ignoreForRf = hfdList[0]
        else:
            ignoreForRf = None

        print "If header_from_file= , required to force header=1 for h2o"
        if kwargs['header_from_file']:
            kwargs['header'] = 1
        # if we have a header in a data file, tell h2o (for now)
        elif DATA_HAS_HDR_ROW:
            kwargs['header'] = 1
        else:
            kwargs['header'] = 0

        # may have error if h2o doesn't get anything!
        start = time.time()
        if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
            pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
        else:
            pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

        # don't pass to parse
        kwargs.pop('hdr_separator', None)
        parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
        print "parseResult['destination_key']: " + parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # more reporting: (we can error here if extra col in header,
        # causes all NA for missing col of data)
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], totalCols, \
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

        # do we end up parsing one data row as a header because of mismatch in gen/param
        h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW
        # header in data file gets treated as data
        h2oGainsOneData = (headerRowsDone != 0) and (kwargs['header'] == 1) and \
            DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
        h2oGainsOneData = False
        print "h2oLosesOneData:", h2oLosesOneData
        print "h2oGainsOneData:", h2oGainsOneData
        if h2oLosesOneData:
            totalDataRows -= 1
        if h2oGainsOneData:
            totalDataRows += 1

        if 1 == 0:  # FIX!
# don't check for now
    self.assertEqual(inspect['numRows'], totalDataRows,
        "parse created result with the wrong number of rows h2o %s gen'ed: %s" %
        (inspect['numRows'], totalDataRows))

# put in an ignore param that will fail unless headers were parsed correctly
# (doesn't matter if the header got a comment; we should see it)
kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
start = time.time()
# h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
elapsed = time.time() - start
print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
    'took', time.time() - start, 'seconds'
h2o.check_sandbox_for_errors()
h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
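# Hedged sketch, not the real h2o_util implementation: a minimal gzip helper
# with the (src, dst) call shape the test above assumes for
# h2o_util.file_gzip, built only on the stdlib gzip/shutil modules.
# It streams in chunks so a large synthetic dataset never has to fit in memory.
import gzip
import shutil

def file_gzip_sketch(srcPathname, dstPathname):
    with open(srcPathname, 'rb') as src:
        dst = gzip.open(dstPathname, 'wb')
        try:
            shutil.copyfileobj(src, dst)
        finally:
            dst.close()

# usage (illustrative names): file_gzip_sketch('syn_data_0.csv', 'syn_data_0.csv.gz')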
def test_GBM_sphere15_180GB(self): csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) , ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) , ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) , ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) , ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) , ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) , ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) , ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) , ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) , ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) , ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) , ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) , ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) , ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) , ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) , ] benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. 
# PARSE ****************************************
print "Parse starting: " + csvFilename
# hex_key = csvFilename + "_" + str(trial) + ".hex"
hex_key = "C" + str(trial)
start = time.time()
timeoutSecs = 2 * 3600
kwargs = {}
if FROM_HDFS:
    parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
        benchmarkLogging=benchmarkLogging, **kwargs)
else:
    parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
        benchmarkLogging=benchmarkLogging, **kwargs)

elapsed = time.time() - start
fileMBS = (totalBytes / 1e6) / elapsed
l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
    len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
print "\n" + l
h2o.cloudPerfH2O.message(l)

# GBM ****************************************
if not DO_GBM:
    continue

# make col 2 a binomial (negative numbers in src col 2 become 0)
col = 2
execExpr = "%s[,%s] = (%s[,%s]>-7 ? 1 : 0)" % (hex_key, col, hex_key, col)
resultExec = h2o_cmd.runExec(str=execExpr)

params = {
    'destination_key': "GBMKEY",
    'learn_rate': .1,
    'ntrees': 2,
    'max_depth': 8,
    'min_rows': 1,
    'response': col,  # should be binomial from above
}
kwargs = params.copy()
h2o.beta_features = True
timeoutSecs = 1800
start = time.time()
GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
# wait for it to show up in jobs?
time.sleep(2)
# no pattern waits for all
h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
elapsed = time.time() - start
print "GBM training completed in", elapsed, "seconds.", \
    "%f pct. of timeout" % (GBMResult['python_%timeout'])
print "\nGBMResult:", GBMResult
# print "\nGBMResult:", h2o.dump_json(GBMResult)
h2o.beta_features = False
h2o.check_sandbox_for_errors()

if DELETE_KEYS:
    h2i.delete_keys_at_all_nodes()
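# For reference, a hedged pure-Python sketch of what the exec expression above
# asks h2o to do: threshold one column in place so values greater than -7
# become 1 and everything else 0. The function and argument names here are
# illustrative only, not part of the h2o API.
def make_binomial_sketch(rows, col, thresh=-7):
    # rows is a list of row lists; mutates the chosen column in place
    for row in rows:
        row[col] = 1 if row[col] > thresh else 0

# usage: make_binomial_sketch([[5, -9.2], [1, 3.0]], col=1) turns col 1 into [0, 1]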
def sub_c2_nongz_fvec_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath csvFilenameList= [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern # double import still causing problems? # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # importFullList = importResult['files'] # importFailList = importResult['fails'] # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # remove the output too! (378) ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541] ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'family': 'binomial', 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") # convert to binomial execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_c9_GLM_airlines_fvec(self): h2o.beta_features = True files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 30, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # FIX! should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_summary2_NY0(self): SYNDATASETS_DIR = h2o.make_syn_dir() choicesList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), (' N', ' Y', ' 0'), (' n', ' y', ' 0'), (' F', ' T', ' 0'), (' f', ' t', ' 0'), ] # white space is stripped expectedList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ] tryList = [ # colname, (min, 25th, 50th, 75th, max) (100, 200, 'x.hex', choicesList[4], expectedList[4]), (100, 200, 'x.hex', choicesList[5], expectedList[5]), (100, 200, 'x.hex', choicesList[6], expectedList[6]), (100, 200, 'x.hex', choicesList[7], expectedList[7]), (100, 200, 'x.hex', choicesList[3], expectedList[3]), (1000, 200, 'x.hex', choicesList[2], expectedList[2]), (10000, 200, 'x.hex', choicesList[1], expectedList[1]), (100000, 200, 'x.hex', choicesList[0], expectedList[0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, choices, expected) in tryList: # max error = half the bin size? SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(colCount): # walks across the columns triggering a summary on the col desired # runSummary returns a column object now. inspect and parse don't. They return json. # maybe eventually will make them return object? But I also pass expected stuff to them # should I pass expected to summary? no, more complex? co = h2o_cmd.runSummary(key=hex_key, column=i) print co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) print "\nComparing column %s to expected" % i self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \ (i, expectedNaCnt[i], co.missing_count)) self.assertEqual(rowCount - expectedNaCnt[i], sum(co.histogram_bins)) h2p.green_print("\nDone with trial", trial) trial += 1 h2i.delete_keys_at_all_nodes()
def test_summary2_NY0(self): SYNDATASETS_DIR = h2o.make_syn_dir() choicesList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), (' N', ' Y', ' 0'), (' n', ' y', ' 0'), (' F', ' T', ' 0'), (' f', ' t', ' 0'), ] # white space is stripped expectedList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ] tryList = [ # colname, (min, 25th, 50th, 75th, max) (100, 200, 'x.hex', choicesList[4], expectedList[4]), (100, 200, 'x.hex', choicesList[5], expectedList[5]), (100, 200, 'x.hex', choicesList[6], expectedList[6]), (100, 200, 'x.hex', choicesList[7], expectedList[7]), (100, 200, 'x.hex', choicesList[3], expectedList[3]), (1000, 200, 'x.hex', choicesList[2], expectedList[2]), (10000, 200, 'x.hex', choicesList[1], expectedList[1]), (100000, 200, 'x.hex', choicesList[0], expectedList[0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, choices, expected) in tryList: # max error = half the bin size? SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, numRows=numRows, numCols=numCols) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column for i in range(colCount): column = summaryResult['summaries'][i] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] self.assertEqual( nacnt, expectedNaCnt[i], "Column %s Expected %s. nacnt %s incorrect" % (i, expectedNaCnt[i], nacnt)) stats = column['stats'] stattype = stats['type'] self.assertEqual(stattype, 'Enum') # FIX! we should compare mean and sd to expected? cardinality = stats['cardinality'] hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] self.assertEqual(hbrk, [expected[0], expected[1]]) hcnt = column['hcnt'] hcntTotal = hcnt[0] + hcnt[1] self.assertEqual(hcntTotal, rowCount - expectedNaCnt[i]) self.assertEqual(rowCount, numRows, msg="numRows %s should be %s" % (numRows, rowCount)) trial += 1 h2i.delete_keys_at_all_nodes()
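# Hedged sketch of the write_syn_dataset generator both test_summary2_NY0
# variants rely on; the real helper isn't shown here, and the per-column NA
# count return value is inferred from the assertions above. Each cell is drawn
# from the three choices, with an occasional empty field written as an NA.
import random

def write_syn_dataset_sketch(csvPathname, rowCount, colCount, seed, choices, naRate=0.01):
    r = random.Random(seed)
    naCnt = [0] * colCount
    f = open(csvPathname, 'w')
    for _ in range(rowCount):
        rowData = []
        for col in range(colCount):
            if r.random() < naRate:
                rowData.append('')  # empty field parses as NA
                naCnt[col] += 1
            else:
                rowData.append(r.choice(choices))
        f.write(','.join(rowData) + '\n')
    f.close()
    return naCnt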
def test_c6_hdfs(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle",
        "TEST-poker1000.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        "bestbuy_test.csv",
        "bestbuy_train.csv",
        # "billion_rows.csv.gz",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        "covtype.4x.shuffle.data",
        "covtype.data",
        "covtype4x.shuffle.data",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        # has duplicated col name
        # "hhp2.os.noisy.0_1.data",
        # "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        "leads.csv",
        "prostate_long_1G.csv",
    ]
    csvFilenameAll = [
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
    ]

    # find_cloud.py won't set these correctly. Let's just set them here
    # update: look in the runner*sh ..the find_cloud.py has args now to get it right
    # we have two cdh's though. I guess we're going to use whatever got setup
    # h2o.nodes[0].use_maprfs = False
    # h2o.nodes[0].use_hdfs = True
    # h2o.nodes[0].hdfs_version = 'hdp2.1'
    # h2o.nodes[0].hdfs_name_node = '172.16.2.186'

    h2o.setup_benchmark_log()

    # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
    # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
    # benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    # pick 8 randomly!
    if DO_RANDOM_SAMPLE:
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # save the first, for all comparisons, to avoid slow drift with each iteration
    importFolderPath = "datasets"
    trial = 0
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 1000
        # do an import first, because we want to get the size of the file
        print "Loading", csvFilename, 'from hdfs'
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="hdfs",
            timeoutSecs=timeoutSecs, doSummary=True, benchmarkLogging=benchmarkLogging)
        print "parse result:", parseResult['destination_key']

        elapsed = time.time() - start
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        if DO_RF:
            print "\n" + csvFilename
            start = time.time()
            kwargs = {'ntree': 1}
            paramsString = json.dumps(kwargs)
            RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial " + str(trial),
                csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

        if 1 == 0:
            print "Deleting all keys, to make sure our parse times don't include spills"
            h2i.delete_keys_at_all_nodes()

        trial += 1
def test_anomaly_uniform_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, COLS, 'x.hex', 1, 20000), (ROWS, COLS, 'x.hex', -5000, 0), (ROWS, COLS, 'x.hex', -100000, 100000), (ROWS, COLS, 'x.hex', -1, 1), (ROWS, COLS, 'A.hex', 1, 100), (ROWS, COLS, 'A.hex', -99, 99), (ROWS, COLS, 'B.hex', 1, 10000), (ROWS, COLS, 'B.hex', -100, 100), (ROWS, COLS, 'C.hex', 1, 100000), (ROWS, COLS, 'C.hex', -101, 101), ] trial = 1 x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) numRows = inspect["numRows"] numCols = inspect["numCols"] print "numRows:", numRows, "numCols:", numCols model_key = "m.hex" kwargs = { 'ignored_cols': None, 'response': numCols - 1, 'classification': 0, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117', 'adaptive_rate': 0, 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, # 'loss' : 'CrossEntropy', 'max_w2': 15, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 2.0, 'destination_key': model_key, # 'validation' : None, 'score_interval': 10000, 'autoencoder': 1, } timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end. took", time.time() - start, "seconds" kwargs = { 'destination_key': "a.hex", 'source': parseResult['destination_key'], 'dl_autoencoder_model': model_key, 'thresh': 1.0 } anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs) inspect = h2o_cmd.runInspect(None, "a.hex") numRows = inspect["numRows"] numCols = inspect["numCols"] print "anomaly: numRows:", numRows, "numCols:", numCols self.assertEqual(numCols, 1) # twice as many rows because of NA injection self.assertEqual(numRows, rowCount * (1 + NA_ROW_RATIO)) # first col has the anomaly info. other cols are the same as orig data aSummary = h2o_cmd.runSummary(key='a.hex', cols=0) h2o_cmd.infoFromSummary(aSummary) print "anomaly:", h2o.dump_json(anomaly) trial += 1 h2i.delete_keys_at_all_nodes()
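# Hedged sketch of the NA injection the anomaly test above depends on: it
# asserts numRows == rowCount * (1 + NA_ROW_RATIO), so the generator
# presumably appends NA_ROW_RATIO all-NA rows for each real row. The names
# and the uniform draw are assumptions, not the real write_syn_dataset.
import random

def write_uniform_with_na_sketch(csvPathname, rowCount, colCount,
                                 expectedMin, expectedMax, seed, naRowRatio=1):
    r = random.Random(seed)
    f = open(csvPathname, 'w')
    for _ in range(rowCount):
        rowData = ['%f' % r.uniform(expectedMin, expectedMax) for _ in range(colCount)]
        f.write(','.join(rowData) + '\n')
        for _ in range(naRowRatio):
            f.write(','.join([''] * colCount) + '\n')  # all-NA row
    f.close()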
def test_c9_GLM_rc_fvec(self): h2o.beta_features = True files = [ ('c16', '140k_train_anonymised.csv', 'rc.hex', 1800, None) ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() # avoid printing the coefficient names in jenkins output # the last col is the response, so we use a number to point to it below parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path=csvPathname, schema='local', hex_key=trainKey, header=0, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numRows = inspect['numRows'] numCols = inspect['numCols'] response = numCols-1 # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 0, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # FIX! should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_c5_KMeans_sphere_67MB_fvec(self): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() csvFilename = 'syn_sphere_gen_h1m_no_na.csv' totalBytes = 67306997 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) , ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) , ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) , ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) , ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) , ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) , ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) , ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) , ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) , ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) , ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) , ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) , ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) , ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) , ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) , ] benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. 
# PARSE **************************************** print "Parse starting: " + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) else: parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n"+l h2o.cloudPerfH2O.message(l) # clear out all NAs (walk across cols)..clear to 0 # temp ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key) ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500) h2o_cmd.infoFromInspect(inspect, csvPathname) summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500) h2o_cmd.infoFromSummary(summary) # KMeans **************************************** if not DO_KMEANS: continue print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?" kwargs = { 'k': 15, 'max_iter': 10, 'normalize': 1, 'initialization': 'Furthest', 'destination_key': 'junk.hex', # reuse the same seed, to get deterministic results 'seed': 265211114317615310, # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work } if (trial%3)==0: kwargs['initialization'] = 'PlusPlus' elif (trial%3)==1: kwargs['initialization'] = 'Furthest' else: kwargs['initialization'] = None timeoutSecs = 4 * 3600 params = kwargs paramsString = json.dumps(params) start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print "kmeans result:", h2o.dump_json(kmeans) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString) print l h2o.cloudPerfH2O.message(l) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial) h2i.delete_keys_at_all_nodes()
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1,64) # random length headers headerName = ''.join([random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0,1) GZIP_HEADER = random.randint(0,1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k,v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN==" " or SEP_CHAR_GEN==",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0,1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0,1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathnameh # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. 
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) h2o_cmd.runStoreView() headerKey = h2i.find_key(hdrFilename) dataKey = h2i.find_key(csvFilename) # use regex. the only files in the dir will be the ones we just created # with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'header': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'data': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey): ignoreForRf = hfhList[0] elif DATA_HAS_HDR_ROW: ignoreForRf = hfdList[0] else: ignoreForRf = None print "If header_from_file= , required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*' else: pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*' # don't pass to parse kwargs.pop('hdr_separator', None) parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, # causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) h2oGainsOneData = False print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 if 1==0: # FIX! 
# don't check for now
    self.assertEqual(inspect['numRows'], totalDataRows,
        "parse created result with the wrong number of rows h2o %s gen'ed: %s" %
        (inspect['numRows'], totalDataRows))

# put in an ignore param that will fail unless headers were parsed correctly
# (doesn't matter if the header got a comment; we should see it)
kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
start = time.time()
# h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
elapsed = time.time() - start
print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
    'took', time.time() - start, 'seconds'
h2o.check_sandbox_for_errors()
h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
def test_KMeans_sphere15_180GB(self): # a kludge h2o.setup_benchmark_log() csvFilename = "syn_sphere15_2711545732row_6col_180GB_from_7x.csv" totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + "/" + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + "/" + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? # removed 0's from first col because we set "ignored_cols" expected = [ ( [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988, ), ( [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98, ), ( [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253, ), ( [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474, ), ( [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094, ), ( [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475, ), ( [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035, ), ( [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276, ), ( [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314, ), ( [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955, ), ( [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215, ), ( [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249, ), ( [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379, ), ( [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982, ), ( [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646, ), ] benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"] benchmarkLogging = ["cpu", "disk", "network", "iostats"] # IOStatus can hang? benchmarkLogging = ["cpu", "disk", "network"] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. 
# PARSE **************************************** print "Parse starting: " + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse( path=csvPathname, schema="hdfs", hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs ) else: parseResult = h2i.import_parse( path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs ) elapsed = time.time() - start fileMBS = (totalBytes / 1e6) / elapsed l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed ) print "\n" + l h2o.cloudPerfH2O.message(l) # KMeans **************************************** if not DO_KMEANS: continue print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?" kwargs = { "max_iter": 30, "k": 15, "initialization": "Furthest", "cols": None, "destination_key": "junk.hex", # reuse the same seed, to get deterministic results "seed": 265211114317615310, } if (trial % 3) == 0: kwargs["initialization"] = "PlusPlus" elif (trial % 3) == 1: kwargs["initialization"] = "Furthest" else: kwargs["initialization"] = None timeoutSecs = 4 * 3600 params = kwargs paramsString = json.dumps(params) start = time.time() kmeans = h2o_cmd.runKMeans( parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs ) elapsed = time.time() - start print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial), csvFilename, elapsed, paramsString, ) print l h2o.cloudPerfH2O.message(l) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, "d", **kwargs ) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected( self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial ) h2i.delete_keys_at_all_nodes()
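# Hedged sketch of the tolerance check h2o_kmeans.compareResultsToExpected
# presumably applies above: allowedDelta = (0.01, 0.01, 0.01) reads as relative
# tolerances for center coordinates, row counts, and error. The exact matching
# logic is an assumption; this only illustrates the relative-delta idea.
def within_rel_tol_sketch(actual, expected, relTol):
    # relative comparison; an expected 0.0 must be matched (near) exactly,
    # since a relative delta against zero is degenerate
    if expected == 0.0:
        return abs(actual) <= relTol
    return abs(actual - expected) / abs(expected) <= relTol

def center_matches_sketch(actualCenter, expectedCenter, relTol=0.01):
    return all(within_rel_tol_sketch(a, e, relTol)
               for a, e in zip(actualCenter, expectedCenter))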
def test_c5_KMeans_sphere_26GB_fvec(self):
    h2o.beta_features = True  # a kludge
    h2o.setup_benchmark_log()

    # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    csvFilename = 'syn_sphere15_gen_26GB.csv'
    # csvFilename = 'syn_sphere_gen_h1m.csv'
    # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
    # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
    totalBytes = 183538602156
    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
    csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    if NA_COL_BUG:
        expected = [
            # The centers are the same for the 26GB and 180GB datasets. The row counts are
            # right for 180GB, so they shouldn't be used for 26GB (or should be divided by 7).
            # The distribution is the same, obviously.
            ([-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
            ([1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
            ([5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
            ([10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
            ([11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
            ([12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
            ([19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
            ([20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
            ([21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
            ([25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
            ([39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
            ([40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
            ([42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
            ([48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
            ([147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
        ]
    else:
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
        ]

    # only the last assignment takes effect; the others are kept for easy switching
    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']  # IOStatus can hang?
    # benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT **********************************************
        # since H2O deletes the source key, re-import every iteration

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        schema = 'hdfs' if FROM_HDFS else 'local'
        parseResult = h2i.import_parse(path=csvPathname, schema=schema, hex_key=hex_key,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
            benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)
        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
            numRows=numRows, numCols=numCols, timeoutSecs=300)
        h2o_cmd.infoFromSummary(summary)

        # KMeans ****************************************
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + ", but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 500,
            # 'normalize': 1,
            'normalize': 0,  # temp try
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # we get NaNs if the whole col is NA
            'ignored_cols': 'C1',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }
        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeansResult)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial),
            csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        # this does predict
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeansResult, csvPathname, parseResult, 'd', **kwargs)
        # all are multipliers of the expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        # these clusters were sorted compared to the cluster order in training
        h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
        # why is the expected # of rows not right in KMeans2? That means predictions are wrong
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
            allowError=False, allowRowError=True, trial=trial)

        # the tupleResultList has the size during predict? compare it to the sizes
        # during training. I assume they're in the same order.
        model = kmeansResult['model']
        size = model['size']
        size2 = [t[1] for t in tupleResultList]

        if 1 == 1:  # debug
            print "training size:", size
            print "predict size2:", size2
            print "training sorted(size):", sorted(size)
            print "predict sorted(size2):", sorted(size2)
            print h2o.nodes[0].http_addr
            print h2o.nodes[0].port

        clusters = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        iterations = model["iterations"]
        normalized = model["normalized"]
        max_iter = model["max_iter"]
        print "iterations", iterations

        # h2o hits the limit at max_iter-1..shouldn't hit it
        if iterations >= (max_iter - 1):
            raise Exception(
                "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s" %
                (trial, iterations, max_iter))

        # this size stuff should be compared now in compareResultsToExpected()..
        # leave it here to make sure.

        # can't do an ordered compare, because size2 is sorted by center order..
        # so we don't know how to reorder size the same way.
        # we can just sort the two of them, for some bit of comparison.
        if sorted(size) != sorted(size2):
            raise Exception(
                "trial: %s training cluster sizes: %s not the same as predict on same data: %s" %
                (trial, size, size2))

        # our expected result is sorted by cluster center order,
        # but the sizes are from the predicted histogram
        expectedSize = [t[1] / SCALE_SIZE for t in expected]
        if size2 != expectedSize:
            raise Exception(
                "trial: %s predicted cluster sizes: %s not the same as expected: %s" %
                (trial, size2, expectedSize))

        if DELETE_KEYS_EACH_ITER:
            h2i.delete_keys_at_all_nodes()
def test_G_StoreView(self):
    h2i.delete_keys_at_all_nodes(timeoutSecs=30)
def test_c9_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs',
            hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        for depth in [5, 15]:
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': 10,
                'max_depth': depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            timeoutSecs = 1800

            start = time.time()
            print "Start time is: ", time.time()
            # noPoll=True: runGBM returns immediately; we poll stats while the job runs
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,
                timeoutSecs=timeoutSecs, **kwargs)

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                pollTimeoutSecs=30, retryDelaySecs=5)
            # no trailing commas here, or these would all become 1-tuples
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            # shouldn't need this?
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=timeoutSecs,
                pollTimeoutSecs=timeoutSecs, retryDelaySecs=5)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)
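# Illustrative sketch (hypothetical helpers, not the real h2j API): the
# noPoll=True + pollStatsWhileBusy + pollWaitJobs sequence above boils down to
# "start the job, then spin until the cloud's job list drains or we time out".
# get_jobs and is_running here are stand-ins for whatever the wrappers call.
def wait_for_jobs_sketch(get_jobs, is_running, timeoutSecs=1800, retryDelaySecs=5):
    import time
    start = time.time()
    while time.time() - start < timeoutSecs:
        # get_jobs() returns the current job list; is_running(job) tests one entry
        if not any(is_running(j) for j in get_jobs()):
            return  # all jobs finished
        time.sleep(retryDelaySecs)
    raise Exception("jobs still running after %s secs" % timeoutSecs)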
def test_GBM_sphere15_180GB(self):
    csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    totalBytes = 183538602156
    importFolderPath = "datasets/kmeans_big"
    csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    # only the last assignment takes effect; the others are kept for easy switching
    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']  # IOStatus can hang?
    # benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT **********************************************
        # since H2O deletes the source key, re-import every iteration.

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        # hex_key = csvFilename + "_" + str(trial) + ".hex"
        hex_key = "C" + str(trial)
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # GBM ****************************************
        if not DO_GBM:
            continue

        # make col 2 a binomial (negative numbers in src col 2)
        col = 2
        execExpr = "%s[,%s] = (%s[,%s] > -7 ? 1 : 0)" % (hex_key, col, hex_key, col)
        resultExec = h2o_cmd.runExec(str=execExpr)

        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': col,  # should be binomial from above
        }
        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        # wait for it to show up in jobs?
        time.sleep(2)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds.", \
            "%f pct. of timeout" % (GBMResult['python_%timeout'])
        print "\nGBMResult:", GBMResult
        # print "\nGBMResult:", h2o.dump_json(GBMResult)

        h2o.check_sandbox_for_errors()
        if DELETE_KEYS:
            h2i.delete_keys_at_all_nodes()
def test_summary2_uniform_w_NA(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (rowCount, colCount, hex_key, expectedMin, expectedMax,
        #  (colname, min, 25th, 50th, 75th, max))
        (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
        (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
        (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
        (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)),
        (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)),
        (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
        (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
        (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100.00)),
        (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
        (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size? add 5% for fp errors?
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        maxDelta = 1.05 * maxDelta

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]

        summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False,
            max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        self.assertEqual(colname, expected[0])
        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
            msg='min is not approx. expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
            msg='max is not approx. expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats['pctile']
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
            msg='25th percentile is not approx. expected')
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
            msg='50th percentile (median) is not approx. expected')
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
            msg='75th percentile is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        print "numRows:", numRows, "rowCount: ", rowCount
        self.assertEqual((1 + NA_ROW_RATIO) * rowCount, numRows,
            msg="numRows %s should be %s" % (numRows, (1 + NA_ROW_RATIO) * rowCount))

        # don't check the edge bins; we sometimes get a messed up histogram for
        # all-NA cols? just don't let them go thru here
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            # expect 21 thresholds, so 20 bins, each ~5% of rows (uniform distribution).
            # NA rows should be ignored.
            e = rowCount / len(hcnt)
            self.assertAlmostEqual(b, e, delta=2 * e,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        scipyCol = 1

    h2i.delete_keys_at_all_nodes()
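# Illustrative sketch of the tolerance arithmetic used above (not part of the test):
# with 20 equal-width bins over [expectedMin, expectedMax], a quantile estimate can be
# off by up to half a bin width, plus ~5% slop for floating point error.
def _max_delta_sketch(expectedMin, expectedMax, nbins=20, fpSlop=1.05):
    binWidth = (expectedMax - expectedMin) / float(nbins)
    return fpSlop * (binWidth / 2.0)
# e.g. for the 1..20000 case: _max_delta_sketch(1, 20000) ~= 524.97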
def test_summary2_uniform_w_NA(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (rowCount, colCount, hex_key, expectedMin, expectedMax,
        #  (colname, min, 25th, 50th, 75th, max))
        (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
        (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
        (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
        (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)),
        (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)),
        (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
        (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
        (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100.00)),
        (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
        (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size? add 5% for fp errors?
        maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
        maxDelta = 1.05 * maxDelta

        h2o.beta_features = False
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        h2o.beta_features = False
        parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["num_rows"]
        numCols = inspect["num_cols"]

        h2o.beta_features = True
        summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False,
            max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

        # only one column
        column = summaryResult['summaries'][0]
        colname = column['colname']
        self.assertEqual(colname, expected[0])
        coltype = column['type']
        nacnt = column['nacnt']

        stats = column['stats']
        stattype = stats['type']

        # FIX! we should compare mean and sd to expected?
        mean = stats['mean']
        sd = stats['sd']
        print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
        print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

        zeros = stats['zeros']
        mins = stats['mins']
        h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta,
            msg='min is not approx. expected')
        maxs = stats['maxs']
        h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta,
            msg='max is not approx. expected')

        pct = stats['pct']
        # the thresholds h2o used, should match what we expected
        expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
        pctile = stats['pctile']
        h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta,
            msg='25th percentile is not approx. expected')
        h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta,
            msg='50th percentile (median) is not approx. expected')
        h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta,
            msg='75th percentile is not approx. expected')

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        print "pct:", pct
        print ""
        print "hcnt:", hcnt
        print "len(hcnt)", len(hcnt)

        print "numRows:", numRows, "rowCount: ", rowCount
        self.assertEqual((1 + NA_ROW_RATIO) * rowCount, numRows,
            msg="numRows %s should be %s" % (numRows, (1 + NA_ROW_RATIO) * rowCount))

        # don't check the edge bins; we sometimes get a messed up histogram for
        # all-NA cols? just don't let them go thru here
        for b in hcnt[1:-1]:
            # should we be able to check for a uniform distribution in the files?
            # expect 21 thresholds, so 20 bins, each ~5% of rows (uniform distribution).
            # NA rows should be ignored.
            e = rowCount / len(hcnt)
            self.assertAlmostEqual(b, e, delta=2 * e,
                msg="Bins not right. b: %s e: %s" % (b, e))

        pt = h2o_util.twoDecimals(pctile)
        mx = h2o_util.twoDecimals(maxs)
        mn = h2o_util.twoDecimals(mins)
        print "colname:", colname, "pctile (2 places):", pt
        print "colname:", colname, "maxs: (2 places):", mx
        print "colname:", colname, "mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
        print "maxs colname:", colname, "(2 places):", mx
        print "mins colname:", colname, "(2 places):", mn

        trial += 1
        scipyCol = 1

    h2i.delete_keys_at_all_nodes()
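# Side note (assumption, based on the assertion above): write_syn_dataset presumably
# appends NA_ROW_RATIO extra all-NA rows per data row, which is why the parsed frame
# is expected to satisfy numRows == (1 + NA_ROW_RATIO) * rowCount:
def _expected_total_rows_sketch(rowCount, naRowRatio):
    # data rows plus the appended NA rows
    return (1 + naRowRatio) * rowCount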
def test_c6_hdfs_fvec(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle",
        "TEST-poker1000.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        "bestbuy_test.csv",
        "bestbuy_train.csv",
        # "billion_rows.csv.gz",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        "covtype.4x.shuffle.data",
        "covtype.data",
        "covtype4x.shuffle.data",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        # has duplicated col name
        # "hhp2.os.noisy.0_1.data",
        # "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        "leads.csv",
        "prostate_long_1G.csv",
    ]
    # override: parse the same file 8 times
    csvFilenameAll = [
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype4x.shuffle.data",
    ]

    # find_cloud.py won't set these correctly. Let's just set them here.
    # we have two cdh's though. I guess we're going to use whatever got setup
    # h2o.nodes[0].use_maprfs = False
    # h2o.nodes[0].use_hdfs = True
    # h2o.nodes[0].hdfs_version = 'cdh4'
    # h2o.nodes[0].hdfs_name_node = '172.16.2.176'

    h2o.setup_benchmark_log()

    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    if DO_RANDOM_SAMPLE:
        # pick 8 randomly!
        csvFilenameList = random.sample(csvFilenameAll, 8)
    else:
        # Alternatively: do the list in order! Note the order is easy to hard
        csvFilenameList = csvFilenameAll

    # save the first, for all comparisons, to avoid slow drift with each iteration
    importFolderPath = "datasets"
    trial = 0
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 1000

        # do an import first, because we want to get the size of the file
        print "Loading", csvFilename, 'from hdfs'
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="hdfs",
            timeoutSecs=timeoutSecs, doSummary=False, benchmarkLogging=benchmarkLogging)
        print "parse result:", parseResult['destination_key']
        elapsed = time.time() - start

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        if DO_RF:
            print "\n" + csvFilename
            start = time.time()
            kwargs = {'ntrees': 1}
            paramsString = json.dumps(kwargs)
            RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial " + str(trial),
                csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

        if 1 == 0:
            print "Deleting all keys, to make sure our parse times don't include spills"
            h2i.delete_keys_at_all_nodes()

        trial += 1
def sub_c3_nongz_fvec_long(self, csvFilenameList):
    # a kludge
    h2o.setup_benchmark_log()

    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    print "Using nongz'ed files in", importFolderPath

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        if DO_DOUBLE_IMPORT:
            (importResult, importPattern) = h2i.import_only(
                bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key="A.hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
            exceptionOnMissingValues=False)

        if totalBytes is not None:
            fileMBS = (totalBytes / 1e6) / elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB,
                csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # output 378 can't be in this ignore list
            ignore_x = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]
            # plus 1 because the column names are 1-based, not 0-based
            ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))
            GLMkwargs = {
                'ignored_cols': ignore_x,
                'response': 'C379',
                'max_iter': 10,
                'n_folds': 1,
                'family': 'binomial',
                'alpha': 0.2,
                'lambda': 1e-5,
            }

            # convert to binomial
            # execExpr = "A.hex=%s" % parseResult['destination_key']
            # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            aHack = {'destination_key': "A.hex"}

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
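# Side note (illustrative): the map/lambda above converts 0-based column indices into
# H2O's 1-based "C<n>" column names. An equivalent spelled out:
def _col_names_sketch(indices):
    # 0-based index 3 -> "C4", etc.
    return ",".join("C" + str(i + 1) for i in indices)
# _col_names_sketch([3, 4, 540]) == "C4,C5,C541"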
def sub_c2_nongz_fvec_long(self):
    # a kludge
    h2o.setup_benchmark_log()

    avgMichalSize = 237270000
    bucket = 'home-0xdiag-datasets'
    ### importFolderPath = 'more1_1200_link'
    importFolderPath = 'manyfiles-nflx'
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
    ]

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        # double import still causing problems?
        # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        # importFullList = importResult['files']
        # importFailList = importResult['fails']
        # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
            benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
            exceptionOnMissingValues=False)

        if totalBytes is not None:
            fileMBS = (totalBytes / 1e6) / elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB,
                csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # these are all the columns that are enums in the dataset...too many for GLM!
            x = range(542)  # don't include the output column
            # remove the output too! (378)
            ignore_x = []
            for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]:
                x.remove(i)
                ignore_x.append(i)

            # plus 1 because we are no longer 0 offset
            x = ",".join(map(lambda x: "C" + str(x + 1), x))
            ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

            GLMkwargs = {
                'ignored_cols': ignore_x,
                'response': 'C379',
                'max_iter': 4,
                'n_folds': 1,
                'family': 'binomial',
                'alpha': 0.2,
                'lambda': 1e-5,
            }

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            # convert to binomial
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            aHack = {'destination_key': "A.hex"}

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
def test_rand_inspect(self):
    ### h2b.browseTheCloud()

    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = csvFilename + ".hex"
    print "\n" + csvPathname

    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='put', hex_key=hex_key, timeoutSecs=10)
    destination_key = parseResult['destination_key']
    print csvFilename, 'parse time:', parseResult['response']['time']
    print "Parse result['destination_key']:", destination_key

    def inspect_and_check(nodeX, destination_key, offset, view, inspectOld=None):
        inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view)
        if h2o.beta_features:
            pass  # print "Inspect2:", h2o.dump_json(inspectNew)
        else:
            pass  # print "Inspect:", h2o.dump_json(inspectNew)

        # FIX! get min/max/mean/variance for a col too?
        constantNames = [
            ('num_cols', 'numCols'),
            ('num_rows', 'numRows'),
            ('value_size_bytes', 'byteSize'),
            ('cols', 'cols'),
        ]
        colNames = [
            ('num_missing_values', 'naCnt'),
        ]
        for (i, j) in constantNames:
            # check the fields, even if you don't have a previous one to compare to
            if h2o.beta_features:
                # hack in extra info for now, from the new names to old names
                if not j in inspectNew:
                    raise Exception("Can't find %s, Inspect2 result should have it?" % j)
                inspectNew[i] = inspectNew[j]

            # don't compare if cols
            if inspectOld and i != 'cols':
                if h2o.beta_features and i == 'value_size_bytes':
                    # Inspect2 should be smaller
                    self.assertGreater(inspectOld[i], inspectNew[i])
                else:
                    # for cols it will just compare length?
                    self.assertEqual(inspectOld[i], inspectNew[i])

            if i == 'cols':
                for (m, n) in colNames:
                    if h2o.beta_features:
                        if not n in inspectNew[i][0]:
                            print h2o.dump_json(inspectNew[i][0])
                            raise Exception("Can't find %s, Inspect2 result['cols'][0] should have it?" % n)
                        inspectNew[i][0][m] = inspectNew[i][0][n]
                    # just compare col 0
                    if inspectOld is not None:
                        self.assertEqual(inspectOld[i][0][m], inspectNew[i][0][m])

        return inspectNew

    # going to use this to compare against, in the future. num_rows/num_cols should
    # always be the same, regardless of the view. just a coarse sanity check
    origInspect = inspect_and_check(0, destination_key, 0, 1, None)
    h2o.verboseprint(h2o.dump_json(origInspect))
    origStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60)

    num_rows = origInspect['num_rows']
    num_cols = origInspect['num_cols']

    lenNodes = len(h2o.nodes)
    for trial in range(10):
        h2p.green_print("\nTrial", trial)
        # we want to use the boundary conditions, so have two levels of random choices
        offset = good_choices(num_rows)
        view = good_choices(num_cols)
        # randomize the node used
        nodeX = random.randint(0, lenNodes - 1)
        print "nodeX:", nodeX, "offset:", offset, "view:", view

        h2o.beta_features = False
        inspect_and_check(nodeX, destination_key, offset, view, origInspect)
        print "trying Inspect2 by flipping h2o.nodes[0].beta_features"
        h2o.beta_features = True
        # delay between the two inspects...bug around not getting autoframe in storeview?
        time.sleep(1)
        inspect_and_check(nodeX, destination_key, offset, view, origInspect)
        h2o.beta_features = False

        # a fvec frame should have been created in the storeView
        time.sleep(1)

        # loop looking for the autoframe to show up
        # o = len(origStoreViewResult['keys'])
        o = h2i.count_keys_at_all_nodes()
        retry = 0
        okay = False
        while retry == 0 or not okay:
            newStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60)
            ## p = len(newStoreViewResult['keys'])
            p = h2i.count_keys_at_all_nodes()
            print "number of keys in the two StoreViews, o:", o, "p:", p
            ## print "newStoreViewResult:", h2o.dump_json(newStoreViewResult)
            oOkay = {1, 2, 3, 4, 5, 6, 7, 8}
            pOkay = {1, 2, 3, 4, 5}
            print o, pOkay, p, oOkay
            if (o in oOkay) and (p in pOkay):
                print "Good"
                okay = True
            else:
                print "Unexpected o,p after autoframe, looking at total keys in system: %s %s" % (o, p)

            if retry == 10:
                raise Exception("StoreView didn't get autoframe, after %s retries" % retry)
            ## h2b.browseJsonHistoryAsUrlLastMatch("StoreView")

            # so it gets recreated??
            deleted = h2i.delete_keys_at_all_nodes(pattern='autoframe')

            # The autoframe key may not show up!!
            if INVISIBLE_AUTOFRAME:
                # can be 0 or 1
                if not (deleted == 0 or deleted == 1):
                    msg = "Should have deleted a total of 0 or 1 keys, looking at all nodes. Did %s" % deleted
                    raise Exception(msg)
            else:
                # must be exactly 1
                if not (deleted == 1):
                    msg = "Should have deleted a total of 1 keys, looking at all nodes. Did %s" % deleted
                    raise Exception(msg)

            time.sleep(1)
            retry += 1
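# good_choices() is used above but defined elsewhere in the original file. Per the
# "boundary conditions" comment, it presumably biases offset/view toward edge values.
# A plausible minimal version, purely as a sketch (not the real implementation):
def good_choices_sketch(n):
    import random
    # half the time pick a boundary-ish value, otherwise any legal value
    if random.random() < 0.5:
        return random.choice([0, 1, max(n - 1, 0), n])
    return random.randint(0, n)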
def test_c6_maprfs_fvec(self):
    h2o.beta_features = True
    print "\nLoad a list of files from maprfs, parse and do 1 RF tree"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle",
        "TEST-poker1000.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        "bestbuy_test.csv",
        "bestbuy_train.csv",
        # "billion_rows.csv.gz",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        "covtype.4x.shuffle.data",
        "covtype.data",
        "covtype4x.shuffle.data",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        # duplicate column header "A"
        # "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        "leads.csv",
        "prostate_long_1G.csv",
    ]

    # find_cloud.py won't set these correctly. Let's just set them here
    # h2o.nodes[0].use_maprfs = True
    # h2o.nodes[0].use_hdfs = False
    # h2o.nodes[0].hdfs_version = 'mapr3.0.1',
    # h2o.nodes[0].hdfs_name_node = '172.16.2.171:7222'

    h2o.setup_benchmark_log()

    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    if DO_RANDOM_SAMPLE:
        # pick 8 randomly!
        csvFilenameList = random.sample(csvFilenameAll, 8)
    else:
        # Alternatively: do the list in order! Note the order is easy to hard
        csvFilenameList = csvFilenameAll

    # save the first, for all comparisons, to avoid slow drift with each iteration
    importFolderPath = "datasets"
    trial = 0
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 1000

        # do an import first, because we want to get the size of the file
        (importResult, importPattern) = h2i.import_only(path=csvPathname,
            schema="maprfs", timeoutSecs=timeoutSecs)
        print "importResult:", h2o.dump_json(importResult)
        succeeded = importResult['files']
        fails = importResult['fails']
        if len(succeeded) < 1:
            raise Exception("Should have imported at least 1 key for %s" % csvPathname)

        # just do a search
        foundIt = None
        for f in succeeded:
            if csvPathname in f:
                foundIt = f
                break
        if not foundIt:
            raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname))

        # FIX! totalBytes is a placeholder (0), so the fileMBS below is always 0 and unused
        totalBytes = 0

        print "Loading", csvFilename, 'from maprfs'
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="maprfs",
            timeoutSecs=timeoutSecs, pollTimeoutSecs=360, doSummary=True,
            benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features)
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
        print "parse result:", parseResult['destination_key']
        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        if DO_RF:
            print "\n" + csvFilename
            start = time.time()
            kwargs = {'ntrees': 1}
            paramsString = json.dumps(kwargs)
            RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features, **kwargs)
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial " + str(trial),
                csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

        print "Deleting all keys, to make sure our parse times don't include spills"
        h2i.delete_keys_at_all_nodes()

        trial += 1