def test_from_import_fvec(self):
    print "Sets h2o.beta_features like -bf at command line"
    print "this will redirect import and parse to the 2 variants"
    h2o.beta_features = True  # this will redirect import and parse to the 2 variants

    importFolderPath = '/home/0xdiag/datasets/standard'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        if not h2o.beta_features:
            print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'], timeoutSecs=30)

        if not h2o.beta_features:
            RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)
            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

        # just to make sure we test this
        # FIX! currently the importFolderResult is empty for fvec
        if 1 == 0:
            h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
            # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
            # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800),
            ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800),
            # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
        ]

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600),
            # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600),
            # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
            # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
            # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            # ("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
            # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            # for now, take too long on 2x100GB heap on 164
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home2/0xdiag/datasets'
        print "Using non-.gz'ed files in", importFolderPath
        csvFilenameAll = [
            # I use different files to avoid OS caching effects
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
            # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/standard'
        print "Using .gz'ed files in", importFolderPath
        # all exactly the same prior to gzip!
        # could use this, but remember import folder -> import folder s3 for jenkins?
        # how would it get it right?
        # os.path.getsize(f)
        csvFilenameAll = [
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
            # 100 files takes too long on two machines?
            # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
            # I use different files to avoid OS caching effects
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
            # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200),
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            # do it twice
            # ("covtype.data", "covtype.data"),
            # ("covtype20x.data", "covtype20x.data"),
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 28
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False
    # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
    jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
    jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
          ' -Dcom.sun.management.jmxremote.authenticate=false' + \
          ' -Dcom.sun.management.jmxremote.ssl=false' + \
          ' -Dcom.sun.management.jmxremote' + \
          ' -Dcom.sun.management.jmxremote.local.only=false'
    jea = ' -Dlog.printAll=true'

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                # java_extra_args=jea,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(base_port=base_port,
                # java_extra_args=jea,
                enable_benchmark_log=True)
        # pop open a browser on the cloud
        ### h2b.browseTheCloud()
        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['files']
            importFailList = importFolderResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)

            if noPoll:
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)
                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            #**********************************************************************************
            # Do GLM too
            # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                    x.remove(i)
                x = ",".join(map(str, x))
                GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)
            #**********************************************************************************

            h2o_cmd.checkKeyDistribution()
            h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
            ### time.sleep(3600)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            ### time.sleep(30)
        sys.stdout.write('.')
        sys.stdout.flush()
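# --- Illustrative sketch only, not part of the original test class ---
# The noPoll branch in test_benchmark_import fires up to three non-blocking parses by
# forward-indexing i+1 and i+2 in csvFilenameList, then waits on the Jobs queue before
# computing an aggregate MB/sec. A minimal standalone sketch of that pattern, reusing the
# same h2i/h2o_jobs calls the test uses; the function name and argument list are hypothetical.
def fire_overlapping_parses_sketch(csvFilenameList, i, importFolderPath,
        retryDelaySecs, pollTimeoutSecs, benchmarkLogging):
    totalBytes = 0
    lastCsvFilename = None
    # issue up to three parses with noPoll=True so they overlap inside H2O
    for j in range(i, min(i + 3, len(csvFilenameList))):
        (csvFilepattern, csvFilename, fileBytes, timeoutSecs) = csvFilenameList[j]
        h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
            noPoll=True, benchmarkLogging=benchmarkLogging)
        totalBytes += fileBytes
        lastCsvFilename = csvFilename
        time.sleep(1)  # brief stagger between submissions, as in the test
    # then block until the queued parse jobs finish (pattern matches the last key2, as the test does)
    h2o_jobs.pollWaitJobs(pattern=lastCsvFilename,
        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
    # summed so MB/sec can be computed over all overlapped parses: (totalBytes/1e6)/elapsed
    return totalBytes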
def test_parse_nflx_loop_s3n_hdfs(self):
    DO_GLM = True
    DO_GLMGRID = False
    USE_S3 = False
    noPoll = False
    benchmarkLogging = ['jstack', 'iostats']
    benchmarkLogging = ['iostats']
    benchmarkLogging = []

    # typical size of the michal files
    avgMichalSize = 116561140
    avgSynSize = 4020000
    synSize = 183

    csvFilenameList = [
        (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
        (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
        (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
        (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
        # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use
        (["A-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize / 2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize / 2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200),
    ]

    print "Using the -.gz files from s3"
    # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    pollTimeoutSecs = 180
    retryDelaySecs = 10

    # use i to forward reference in the list, so we can do multiple outstanding parses below
    for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        bucket = "home-0xdiag-datasets"
        ## for tryHeap in [54, 28]:
        h2oPerNode = 1
        # h1.4xlarge 60.5GB dram
        for tryHeap in [28]:
            if USE_S3:
                protocol = "s3"
            else:
                protocol = "s3n"
            print "\n", tryHeap, "GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"

            # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
            # jea = "-Dh2o.find-ByteBuffer-leaks=true"
            h2o_hosts.build_cloud_with_hosts(h2oPerNode, java_heap_GB=tryHeap,
                # java_extra_args=jea,
                enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10,
                # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                # this is for our amazon ec hdfs
                # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                hdfs_name_node='10.78.14.235:9000',
                hdfs_version='0.20.2')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandbox_ignore_errors = True

            for trial in range(trialMax):
                # import a list of folders, one at a time (hdfs import can't take pattern match
                # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                # too slow
                for csvFolder in csvFolderList:
                    if USE_S3:
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"
                    else:
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"

                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        importResult = h2o.nodes[0].import_s3(bucket)
                    else:
                        importResult = h2o.nodes[0].import_hdfs(URI)

                    foundKeys = 0
                    for s in importResult['succeeded']:
                        # just print the first tile
                        # if 'nflx' in key and 'file_1.dat.gz' in key:
                        if csvFilepattern in s['key']:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", s['key']
                            break
                        else:
                            pass
                        foundKeys += 1

                ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                # error if none?
                self.assertGreater(foundKeys, 8, "Didn't see more than 8 files in s3n?")

                s3nKey = csvFilepattern
                key2 = csvFilename + "_" + str(trial) + ".hex"
                print "Loading", protocol, "key:", s3nKey, "to", key2
                start = time.time()
                parseKey = h2o.nodes[0].parse(s3nKey, key2,
                    timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                        s3nKey = URI + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse2Key = h2o.nodes[0].parse(s3nKey, key2,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                    if (i + 2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                        s3nKey = URI + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse3Key = h2o.nodes[0].parse(s3nKey, key2,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print s3nKey, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                y = 378
                if not noPoll:
                    x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM or DO_GLMGRID:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    if DO_GLM:
                        algo = 'GLM'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                            'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                        start = time.time()
                        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    else:
                        algo = 'GLMGrid'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                            'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                            'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5'}
                        start = time.time()
                        glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                #**********************************************************************************

                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                    "Otherwise it would just parse the cached key."
                ### storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                # have to do the pattern match ourself, to figure out what keys to delete
                # we're deleting the keys in the initial import. We leave the keys we created
                # by the parse. We use unique dest keys for those, so no worries.
                # Leaving them is good because things fill up! (spill)
                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importResult)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)
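# --- Illustrative sketch only, not part of the original test class ---
# The import loop in test_parse_nflx_loop_s3n_hdfs counts keys with a plain substring test
# (csvFilepattern in s['key']), not glob matching. A small hypothetical helper showing that
# check in isolation; the optional fnmatch branch is an assumption about how true glob
# semantics could be applied, it is not what the test does.
def count_matching_import_keys_sketch(importResult, csvFilepattern, useGlob=False):
    import fnmatch
    foundKeys = 0
    exampleKey = None
    for s in importResult['succeeded']:
        key = s['key']
        if useGlob:
            # hypothetical: treat the pattern as a glob against the full URI
            matched = fnmatch.fnmatch(key, "*" + csvFilepattern)
        else:
            # what the test actually does: substring containment
            matched = csvFilepattern in key
        if matched:
            if exampleKey is None:
                # e.g. s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                exampleKey = key
            foundKeys += 1
    return foundKeys, exampleKey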
def test_parse_10k_files(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn.csv.gz"
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 10
    maxFilenum = 10000

    for filenum in range(maxFilenum):
        rowData = rand_rowData()
        filePrefix = "%04d" % filenum
        csvPathname = SYNDATASETS_DIR + '/' + filePrefix + "_" + csvFilename
        write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData)

    avgFileSize = os.path.getsize(csvPathname)

    importFolderPath = SYNDATASETS_DIR
    print "\nimportFolderPath:", importFolderPath
    csvFilenameList = [
        # try one thousand files first
        ("*[1][0-9][0-9][0-9]_syn.csv.gz", "syn_all.1000.csv", maxFilenum * avgFileSize, 1200),
        # try two thousand
        ("*[1-2][0-9][0-9][0-9]_syn.csv.gz", "syn_all.2000.csv", maxFilenum * avgFileSize, 1200),
    ]

    trialMax = 1
    base_port = 54321
    tryHeap = 4
    DO_GLM = True
    noPoll = False
    benchmarkLogging = ['cpu', 'disk', 'iostats']  # , 'jstack'
    benchmarkLogging = ['cpu', 'disk']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(3, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        ### h2b.browseTheCloud()

        # don't let the config json redirect import folder to s3 or s3n, because
        # we're writing to the syn_datasets locally. (just have to worry about node 0's copy of this state)
        print "This test creates files in syn_datasets for import folder\n" + \
            "so h2o and python need to be same machine"
        h2o.nodes[0].redirect_import_folder_to_s3_path = False
        h2o.nodes[0].redirect_import_folder_to_s3n_path = False

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['files']
            print "importFullList:", importFullList
            importFailList = importFolderResult['fails']
            print "importFailList:", importFailList
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                totalBytes += totalBytes2 + totalBytes3

            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                h2o_cmd.get_columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)

            print "\n" + csvFilepattern

            #**********************************************************************************
            # Do GLM too
            # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
            if DO_GLM:
                GLMkwargs = {'y': 0, 'case': 1, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)
            #**********************************************************************************

            h2o_cmd.checkKeyDistribution()
            h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)
        sys.stdout.write('.')
        sys.stdout.flush()
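# --- Illustrative sketch only, not part of the original test class ---
# test_parse_10k_files relies on rand_rowData() and write_syn_dataset_gz(), which are defined
# elsewhere in the original file. A minimal sketch of what they plausibly do: generate one
# comma-separated row matching the 9-column headerData, and gzip a small CSV with the header
# first. The function names below carry a _sketch suffix and the value ranges are assumptions.
import gzip
import random

def rand_rowData_sketch():
    # one row of random integers; the real generator may use different ranges/types
    return ",".join(str(random.randint(0, 7)) for _ in range(9))

def write_syn_dataset_gz_sketch(csvPathname, totalRows, headerData, rowData):
    dsf = gzip.open(csvPathname, 'wb')
    dsf.write(headerData + "\n")
    for _ in range(totalRows):
        dsf.write(rowData + "\n")
    dsf.close()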
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 28
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False
    # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        # pop open a browser on the cloud
        ### h2b.browseTheCloud()
        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['files']
            importFailList = importFolderResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)

            if noPoll:
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)
                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            #**********************************************************************************
            # Do GLM too
            # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                    x.remove(i)
                x = ",".join(map(str, x))
                GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)
            #**********************************************************************************

            h2o_cmd.checkKeyDistribution()
            h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
            ### time.sleep(3600)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            ### time.sleep(30)
        sys.stdout.write('.')
        sys.stdout.flush()
def test_parse_nflx_loop_s3n_hdfs(self):
    DO_GLM = True
    DO_GLMGRID = False
    USE_S3 = False
    noPoll = False
    benchmarkLogging = ['jstack', 'iostats']
    benchmarkLogging = ['iostats']
    benchmarkLogging = []

    # typical size of the michal files
    avgMichalSize = 116561140
    avgSynSize = 4020000
    synSize = 183

    csvFilenameList = [
        (["manyfiles-nflx-gz"], "file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
        (["manyfiles-nflx-gz"], "file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
        (["manyfiles-nflx-gz"], "file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
        (["manyfiles-nflx-gz"], "file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
        # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use
        (["A-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
    ]

    print "Using the -.gz files from s3"
    # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    pollTimeoutSecs = 180
    retryDelaySecs = 10

    # use i to forward reference in the list, so we can do multiple outstanding parses below
    for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        bucket = "home-0xdiag-datasets"
        ## for tryHeap in [54, 28]:
        h2oPerNode = 1
        # h1.4xlarge 60.5GB dram
        for tryHeap in [28]:
            if USE_S3:
                protocol = "s3"
            else:
                protocol = "s3n"
            print "\n", tryHeap, "GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"

            # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
            # jea = "-Dh2o.find-ByteBuffer-leaks=true"
            h2o_hosts.build_cloud_with_hosts(h2oPerNode, java_heap_GB=tryHeap,
                # java_extra_args=jea,
                enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10,
                # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                # this is for our amazon ec hdfs
                # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                hdfs_name_node='10.78.14.235:9000',
                hdfs_version='0.20.2')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandbox_ignore_errors = True

            for trial in range(trialMax):
                # import a list of folders, one at a time (hdfs import can't take pattern match
                # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                # too slow
                for csvFolder in csvFolderList:
                    if USE_S3:
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"
                    else:
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"

                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        importResult = h2o.nodes[0].import_s3(bucket)
                    else:
                        importResult = h2o.nodes[0].import_hdfs(URI)

                    foundKeys = 0
                    for s in importResult['succeeded']:
                        # just print the first tile
                        # if 'nflx' in key and 'file_1.dat.gz' in key:
                        if csvFilepattern in s['key']:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", s['key']
                            break
                        else:
                            pass
                        foundKeys += 1

                ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                # error if none?
                self.assertGreater(foundKeys, 8, "Didn't see more than 8 files in s3n?")

                s3nKey = csvFilepattern
                key2 = csvFilename + "_" + str(trial) + ".hex"
                print "Loading", protocol, "key:", s3nKey, "to", key2
                start = time.time()
                parseKey = h2o.nodes[0].parse(s3nKey, key2,
                    timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        s3nKey = URI + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse2Key = h2o.nodes[0].parse(s3nKey, key2,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        s3nKey = URI + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse3Key = h2o.nodes[0].parse(s3nKey, key2,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print s3nKey, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                y = 378
                if not noPoll:
                    x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM or DO_GLMGRID:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    if DO_GLM:
                        algo = 'GLM'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                            'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                        start = time.time()
                        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    else:
                        algo = 'GLMGrid'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                            'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                            'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5'}
                        start = time.time()
                        glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                #**********************************************************************************

                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                    "Otherwise it would just parse the cached key."
                ### storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                # have to do the pattern match ourself, to figure out what keys to delete
                # we're deleting the keys in the initial import. We leave the keys we created
                # by the parse. We use unique dest keys for those, so no worries.
                # Leaving them is good because things fill up! (spill)
                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importResult)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)