def test_benchmark_import(self):
    covtype200xSize = 15033863400
    csvFilenameList = [
        ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
    ]

    trialMax = 1
    base_port = 54321
    tryHeap = 28
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap/2, base_port=base_port,
                enable_benchmark_log=True)

        for trial in range(trialMax):
            csvPathname = "/home/0xdiag/datasets/standard/" + csvFilepattern

            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex",
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # only one file pattern (and one parse) is outstanding in this test,
                # so totalBytes already covers everything parsed
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}
            print "\n" + csvFilepattern

            #**********************************************************************************
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(54)    # don't include the output column
                x = ",".join(map(str, x))
                GLMkwargs = {'x': x, 'y': 54, 'case': 1, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            #**********************************************************************************
            h2o_cmd.checkKeyDistribution()
            h2o.tear_down_cloud()

            sys.stdout.write('.')
            sys.stdout.flush()
def test_GLM_enums_score_superset(self): print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?" SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] print "\n" + csvFilename missingValuesDict = h2o_cmd.check_enums_from_inspect(parseKey) if missingValuesDict: m = [str(k) + ":" + str(v) for k,v in missingValuesDict.iteritems()] raise Exception("Looks like columns got flipped to NAs: " + ", ".join(m)) y = colCount kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] print "classErr:", classErr print "err:", err print "auc:", auc
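# ---------------------------------------------------------------------------
# The test above calls create_enum_list() and write_syn_dataset(), which are
# defined elsewhere in its test file and not shown in this section. The
# sketch below is a minimal, hypothetical version of both, based only on how
# the test uses them: a list of random enum strings, and a writer that fills
# every data column with values drawn from that list (plus a 0/1 output
# column, since the test sets y = colCount) using the requested separators.
# ---------------------------------------------------------------------------
import random

def create_enum_list(listSize=10, strLen=8):
    # hypothetical: random lowercase strings act as the enum levels
    letters = 'abcdefghijklmnopqrstuvwxyz'
    return [''.join(random.choice(letters) for _ in range(strLen))
            for _ in range(listSize)]

def write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEED,
                      colSepChar=",", rowSepChar="\n"):
    # hypothetical: seeded so the model-building and scoring files are
    # reproducible for a given test case
    r = random.Random(SEED)
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        rowData = [r.choice(enumList) for _ in range(colCount)]
        rowData.append(str(r.randint(0, 1)))    # assumed 0/1 output column
        dsf.write(colSepChar.join(rowData) + rowSepChar)
    dsf.close()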
def test_parse_nflx_loop_s3n_hdfs(self): # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 csvFilenameList = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz"), # 100 files takes too long on two machines? # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz DO_GLM = True USE_S3 = False noPoll = False benchmarkLogging = ['cpu','disk','jstack','iostats'] bucket = "home-0xdiag-datasets" if USE_S3: URI = "s3://home-0xdiag-datasets" protocol = "s3" else: URI = "s3n://home-0xdiag-datasets" protocol = "s3n/hdfs" # split out the pattern match and the filename used for the hex trialMax = 1 pollTimeoutSecs = 180 retryDelaySecs = 10 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): ## for tryHeap in [54, 28]: for tryHeap in [30]: print "\n", tryHeap,"GB heap, 1 jvm per host, import", protocol, "then parse" h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10, # all hdfs info is done thru the hdfs_config michal's ec2 config sets up? # this is for our amazon ec hdfs # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n hdfs_name_node='10.78.14.235:9000', hdfs_version='0.20.2') # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandbox_ignore_errors = True for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. 
if USE_S3: importResult = h2o.nodes[0].import_s3(bucket) else: importResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importResult['succeeded'] for k in s3nFullList: key = k['key'] # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in key: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key break else: ### print key pass ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? self.assertGreater(len(s3nFullList),8,"Didn't see more than 8 files in s3n?") s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse2Key = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse3Key = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print s3nKey, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) #********************************************************************************** # Do GLM too # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column x.remove(3) x.remove(4) x.remove(5) x.remove(6) x.remove(7) x.remove(8) x.remove(9) x.remove(10) x.remove(11) x.remove(14) x.remove(16) x.remove(17) x.remove(18) x.remove(19) x.remove(20) x.remove(424) x.remove(425) x.remove(426) x.remove(540) x.remove(541) # remove the output too! 
x.remove(378) x = ",".join(map(str,x)) # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! (spill) h2o_cmd.delete_csv_key(csvFilename, s3nFullList) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
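# ---------------------------------------------------------------------------
# h2o_cmd.delete_csv_key() isn't shown in this section. Per the comments in
# the test above, it has to pattern-match the imported key names itself and
# remove only the keys from the initial import, leaving the parse's unique
# destination keys alone. The snippet below sketches just that matching step,
# assuming each entry in the import list is a dict with a 'key' field, the
# same shape the test's own loop over s3nFullList relies on; the actual
# remove call is framework-specific and not reproduced here.
# ---------------------------------------------------------------------------
import fnmatch

def keys_matching(importFullList, pattern):
    # hypothetical helper: pick the imported key names that match a glob,
    # e.g. keys_matching(s3nFullList, "*file_[2][0-9].dat.gz")
    return [k['key'] for k in importFullList
            if fnmatch.fnmatch(k['key'], pattern)]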
def test_GLM_many_enums(self): SYNDATASETS_DIR = h2o.make_syn_dir() if not localhost: n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] else: n = 150 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: # just randomly pick the row and col cases. colSepCase = random.randint(0,1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase==0): colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename missingValuesDict = h2o_cmd.check_enums_from_inspect(parseKey) if missingValuesDict: m = [str(k) + ":" + str(v) for k,v in missingValuesDict.iteritems()] raise Exception("Looks like columns got flipped to NAs: " + ", ".join(m)) y = colCount kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0} start = time.time() ### glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
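# ---------------------------------------------------------------------------
# This test's write_syn_dataset() takes an extra quoteChars argument, used
# when the hive ^A separator is picked so that commas and quotes can appear
# inside the (unquoted) enum strings. Its definition isn't shown here; this
# is a minimal, hypothetical version consistent with how the test calls it.
# ---------------------------------------------------------------------------
import random

def write_syn_dataset(csvPathname, rowCount, colCount, SEED,
                      colSepChar=",", rowSepChar="\n", quoteChars=""):
    r = random.Random(SEED)
    cellChars = 'abcd' + quoteChars    # extra chars allowed inside a cell
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        rowData = []
        for _ in range(colCount):
            rowData.append(''.join(r.choice(cellChars)
                                   for _ in range(r.randint(1, 8))))
        rowData.append(str(r.randint(0, 1)))    # assumed output col (y = colCount)
        dsf.write(colSepChar.join(rowData) + rowSepChar)
    dsf.close()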
def test_parse_nflx_loop_s3n_hdfs(self): # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 avgAzSize = 75169317 # bytes csvFilenameList = [ ("a", "a_1.dat", 1 * avgAzSize, 300), ("[a-j]", "a_10.dat", 10 * avgAzSize, 300), ("[k-v]", "b_10.dat", 10 * avgAzSize, 800), ("[w-z]", "c_4.dat", 4 * avgAzSize, 900), # ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), # ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), # ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("manyfiles-nflx-gz/file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 1800), # ("manyfiles-nflx-gz/file_1[5-9][0-9].dat.gz", "file_50_C.dat.gz", 50 * avgMichalSize, 1800), # ("manyfiles-nflx-gz/file_*.dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 2400), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz USE_S3 = False noPoll = True benchmarkLogging = ['cpu','disk'] bucket = "to-delete-soon" if USE_S3: URI = "s3://to-delete-soon" protocol = "s3" else: URI = "s3n://to-delete-soon" protocol = "s3n/hdfs" # split out the pattern match and the filename used for the hex trialMax = 1 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): ## for tryHeap in [54, 28]: for tryHeap in [54]: print "\n", tryHeap,"GB heap, 1 jvm per host, import", protocol, "then parse" h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=180, retryDelaySecs=10, # all hdfs info is done thru the hdfs_config michal's ec2 config sets up? # this is for our amazon ec hdfs # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n hdfs_name_node='10.78.14.235:9000', hdfs_version='0.20.2') # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandbox_ignore_errors = True for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. if USE_S3: importResult = h2o.nodes[0].import_s3(bucket) else: importResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importResult['succeeded'] for k in s3nFullList: key = k['key'] # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in key: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key break else: ### print key pass ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? 
self.assertGreater(len(s3nFullList),8,"Didn't see more than 8 files in s3n?") s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse2Key = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse3Key = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print s3nKey, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! (spill) h2o_cmd.delete_csv_key(csvFilename, s3nFullList) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
def test_find_numbers(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 3 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness if COL_SEP_HIVE: colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename missingValuesDict = h2o_cmd.check_enums_from_inspect(parseKey) if missingValuesDict: m = [str(k) + ":" + str(v) for k,v in missingValuesDict.iteritems()] raise Exception("Looks like columns got flipped to NAs: " + ", ".join(m))
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1==1:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1
    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 28
    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False
    # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder(None, importFolderPath) importFullList = importFolderResult['files'] importFailList = importFolderResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. 
# constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]: x.remove(i) x = ",".join(map(str,x)) GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
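# ---------------------------------------------------------------------------
# The GLM block above builds its 'x' argument by starting from range(542),
# removing the enum columns plus the output column (378), and joining with
# commas. The small helper below is only an illustration of that
# column-selection step; the hard-coded ignore list is the one used above.
# ---------------------------------------------------------------------------
def glm_x_excluding(colCount, ignored):
    # returns the comma-separated column-index string GLM expects,
    # with the ignored indices left out
    skip = set(ignored)
    return ",".join(str(c) for c in range(colCount) if c not in skip)

# equivalent to the x.remove(...) loop above:
# x = glm_x_excluding(542, [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
#                           424, 425, 426, 540, 541, 378])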
def test_GLM_many_enums(self): SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: n = 4000 tryList = [ (n, 1000, 'cI', 300), ] else: n = 5 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: # can randomly pick the row and col cases. ### colSepCase = random.randint(0,1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase==0): colSepHexString = '01' else: colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) if DO_TEN_INTEGERS: csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv' else: csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename missingValuesDict = h2o_cmd.check_enums_from_inspect(parseKey) if missingValuesDict: # we allow some NAs in the list above pass ### m = [str(k) + ":" + str(v) for k,v in missingValuesDict.iteritems()] ### raise Exception("Looks like columns got flipped to NAs: " + ", ".join(m)) y = colCount x = range(colCount) x = ",".join(map(str,x)) kwargs = {'x': x, 'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("GLM.json")
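# ---------------------------------------------------------------------------
# Here the test only shows DO_TEN_INTEGERS switching the synthetic filename
# prefix (int10 vs enums); presumably its write_syn_dataset() branches on the
# same module flag. Neither is defined in this section, so the sketch below
# is hypothetical, matching the call signature the test uses.
# ---------------------------------------------------------------------------
import random

DO_TEN_INTEGERS = False    # assumed module flag referenced by the test above

def write_syn_dataset(csvPathname, rowCount, colCount, SEED,
                      colSepChar=",", rowSepChar="\n"):
    r = random.Random(SEED)
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        if DO_TEN_INTEGERS:
            # ten integer levels per column
            rowData = [str(r.randint(0, 9)) for _ in range(colCount)]
        else:
            # short random enum strings per column
            rowData = [''.join(r.choice('abcd') for _ in range(r.randint(1, 8)))
                       for _ in range(colCount)]
        rowData.append(str(r.randint(0, 1)))    # assumed output col (y = colCount)
        dsf.write(colSepChar.join(rowData) + rowSepChar)
    dsf.close()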
def test_parse_1k_files(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn.csv.gz" headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 10 maxFilenum = 10000 for filenum in range(maxFilenum): rowData = rand_rowData() filePrefix = "%04d" % filenum csvPathname = SYNDATASETS_DIR + '/' + filePrefix + "_" + csvFilename write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData) avgFileSize = os.path.getsize(csvPathname) importFolderPath = os.path.abspath(SYNDATASETS_DIR) print "\nimportFolderPath:", importFolderPath csvFilenameList = [ ("*_syn.csv.gz", "syn_all.csv", maxFilenum * avgFileSize, 1200), ] trialMax = 1 base_port = 54321 tryHeap = 4 DO_GLM = True noPoll = False benchmarkLogging = ['cpu','disk', 'iostats'] # , 'jstack' benchmarkLogging = ['cpu','disk'] pollTimeoutSecs = 120 retryDelaySecs = 10 for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(3,java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True) ### h2b.browseTheCloud() # don't let the config json redirect import folder to s3 or s3n, because # we're writing to the syn_datasets locally. (just have to worry about node 0's copy of this state) print "This test creates files in syn_datasets for import folder\n" + \ "so h2o and python need to be same machine" h2o.nodes[0].redirect_import_folder_to_s3_path = False h2o.nodes[0].redirect_import_folder_to_s3n_path = False for trial in range(trialMax): importFolderResult = h2i.setupImportFolder(None, importFolderPath) importFullList = importFolderResult['succeeded'] print "importFullList:", importFullList importFailList = importFolderResult['failed'] print "importFailList:", importFailList print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? 
h2o_cmd.check_enums_from_inspect(parseKey) print "\n" + csvFilepattern #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: GLMkwargs = {'y': 0, 'case': 1, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.check_key_distribution() h2o_cmd.delete_csv_key(csvFilename, importFullList) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
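# ---------------------------------------------------------------------------
# test_parse_1k_files relies on rand_rowData() and write_syn_dataset_gz(),
# neither of which appears in this section. A minimal, hypothetical version:
# one random row to go with the ID,CAPSULE,AGE,... header, repeated totalRows
# times and gzipped, so the import-folder glob picks up thousands of tiny
# .csv.gz files.
# ---------------------------------------------------------------------------
import gzip
import random

def rand_rowData(colCount=8):
    # hypothetical: a row id of 1 followed by small random integer columns
    return "1," + ",".join(str(random.randint(0, 7)) for _ in range(colCount))

def write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData):
    # header plus the same row repeated; gzip keeps each file tiny on disk
    f = gzip.open(csvPathname, 'wb')
    f.write(headerData + "\n")
    for _ in range(totalRows):
        f.write(rowData + "\n")
    f.close()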
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 if 1==0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), ] if 1==1: importFolderPath = '/home/0xdiag/datasets' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? # os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? # ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1200), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split out the pattern match and the filename used for the hex trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 10 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) 
    noPoll = False
    benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['succeeded']
            importFailList = importFolderResult['failed']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll, benchmarkLogging=benchmarkLogging)

            if noPoll:
                # fire off two more parses from the same import folder, so three are outstanding
                if (i+1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                    parse2Key = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll, benchmarkLogging=benchmarkLogging)

                if (i+2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                    parse3Key = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll, benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}
            print "\n" + csvFilepattern

            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            h2o_cmd.check_key_distribution()
            h2o_cmd.delete_csv_key(csvFilename, importFullList)

            h2o.tear_down_cloud()
            if not localhost:
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
            sys.stdout.write('.')
            sys.stdout.flush()
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/10k_small_gz'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize, 700),
            ("[1][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize, 700),
            ("[1][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize, 700),
            ("[1][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize, 700),
            ("[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize, 700),
            ("[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz", 10000 * synSize, 700),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
            # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
            # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800),
            ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800),
            ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800),
            # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
        ]

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600),
            # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600),
            # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
            # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
            # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            # ("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
            # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/more1_300_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600),
        ]

    if 1 == 0:
        importFolderPath = '/home2/0xdiag/datasets'
        print "Using non-.gz'ed files in", importFolderPath
        csvFilenameAll = [
            # I use different files to avoid OS caching effects
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
            # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets'
        print "Using .gz'ed files in", importFolderPath
        # all exactly the same prior to gzip!
        # could use this, but remember import folder -> import folder s3 for jenkins?
        # how would it get it right?
        # os.path.getsize(f)
        csvFilenameAll = [
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
            # 100 files takes too long on two machines?
            # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
            # I use different files to avoid OS caching effects
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
            # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200),
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            # do it twice
            # ("covtype.data", "covtype.data"),
            # ("covtype20x.data", "covtype20x.data"),
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1

    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 28

    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False

    # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    # (a comma was missing between 'disk' and 'network' here, which silently concatenated
    # them into 'disknetwork')
    benchmarkLogging = ['cpu', 'disk', 'network']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    # successive overrides; only the last jea assignment is kept
    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
    jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
    jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
          ' -Dcom.sun.management.jmxremote.authenticate=false' + \
          ' -Dcom.sun.management.jmxremote.ssl=false' + \
          ' -Dcom.sun.management.jmxremote' + \
          ' -Dcom.sun.management.jmxremote.local.only=false'
    jea = ' -Dlog.printAll=true'

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                # java_extra_args=jea,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap/2, base_port=base_port,
                # java_extra_args=jea,
                enable_benchmark_log=True)

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['succeeded']
            importFailList = importFolderResult['failed']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)

            if noPoll:
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex",
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex",
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            #**********************************************************************************
            # Do GLM too
            # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542) # don't include the output column
                # remove the output too! (378)
                # (use a name other than 'i' so the enumerate index above isn't clobbered;
                # see the standalone sketch after this test)
                for dropCol in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541, 378]:
                    x.remove(dropCol)
                x = ",".join(map(str, x))
                GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            #**********************************************************************************
            h2o_cmd.check_key_distribution()
            h2o_cmd.delete_csv_key(csvFilename, importFullList)
            ### time.sleep(3600)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            ### time.sleep(30)
        sys.stdout.write('.')
        sys.stdout.flush()
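#**********************************************************************************
# Aside (added for clarity, not part of the original test): the GLM 'x' argument in
# the DO_GLM block above is just "all 542 columns, minus the enum columns and the
# output column 378", joined into a comma-separated string. A standalone sketch of
# only that step; the column indices are copied from the test and nothing here talks
# to H2O.
def _sketch_glm_column_spec():
    dropCols = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                424, 425, 426, 540, 541, 378]
    x = [col for col in range(542) if col not in dropCols]
    return ",".join(map(str, x))   # e.g. starts with "0,1,2,12,13,15,21,..."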
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400

    if 1 == 0:
        importFolderPath = '/home2/0xdiag/datasets'
        print "Using non-.gz'ed files in", importFolderPath
        csvFilenameAll = [
            # I use different files to avoid OS caching effects
            ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
        ]

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets'
        print "Using .gz'ed files in", importFolderPath
        # all exactly the same prior to gzip!
        # could use this, but remember import folder -> import folder s3 for jenkins?
        # how would it get it right?
        # os.path.getsize(f)
        csvFilenameAll = [
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
            # 100 files takes too long on two machines?
            # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
            # I use different files to avoid OS caching effects
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
            # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_*.dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1200),
            # do it twice
            # ("covtype.data", "covtype.data"),
            # ("covtype20x.data", "covtype20x.data"),
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1

    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 10

    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    noPoll = False
    benchmarkLogging = ['cpu', 'disk', 'iostats', 'jstack']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    # enumerate so the noPoll branch below can index the next entries in csvFilenameList
    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['succeeded']
            importFailList = importFolderResult['failed']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            # (the expected totalBytes values assume each pattern matches a fixed number of
            # files; see the fnmatch sketch after this test)
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)

            if noPoll:
                # NOTE: this branch still references URI and protocol, which are not defined in
                # this version of the test (it looks carried over from the s3n variant); they
                # would have to be supplied before running with noPoll=True.
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    parse2Key = h2o.nodes[0].parse(s3nKey, key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    parse3Key = h2o.nodes[0].parse(s3nKey, key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            h2o_cmd.check_key_distribution()
            h2o_cmd.delete_csv_key(csvFilename, importFullList)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)
        sys.stdout.write('.')
        sys.stdout.flush()
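#**********************************************************************************
# Aside (added for clarity, not part of the original test): the expected-bytes column
# in csvFilenameAll assumes each shell-style pattern matches a known number of files
# (e.g. "file_[2][0-9].dat.gz" -> 10 files, hence 10 * avgMichalSize). A sketch of
# checking that assumption with fnmatch against a hypothetical directory listing; the
# generated names are stand-ins, not the real datasets directory.
import fnmatch

def _sketch_pattern_file_count():
    listing = ['file_%d.dat.gz' % n for n in range(1, 385)]   # hypothetical dir contents
    pattern = 'file_[2][0-9].dat.gz'
    matches = fnmatch.filter(listing, pattern)
    print('%s -> %d files' % (pattern, len(matches)))         # expect 10 (file_20..file_29)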
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183

    if 1 == 0:
        # importFolderPath = '/home/0xdiag/datasets'
        importFolderPath = '/home/0xdiag/datasets'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("10k_small_gz/*", "file_400.dat.gz", 10000 * synSize, 700),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 1800),
        ]

    if 1 == 1:
        importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            # ("*[1-8][0-9][0-9].dat.gz", "file_800.dat.gz", 800 * avgMichalSize, 1800),
            # ("*[1-4][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 1800),  # fails tcp reset
            # ("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 1800),  # fails tcp
            # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),  # fails tcp
            # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
            # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 1800),
            # ("*1[0-1][0-9].dat.gz", "file_20_2jvm.dat.gz", 20 * avgMichalSize, 1800),
            # ("*1[0-4][0-9].dat.gz", "file_50_2jvm.dat.gz", 50 * avgMichalSize, 1800),
            ("*10[0-9].dat.gz", "file_10_2jvm.dat.gz", 10 * avgMichalSize, 1800),
            # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 1800),
            # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/more1_300_link'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 1800),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameAll = [
            # this should hit the "more" files too?
            ("*_[123][0-9][0-9]*.dat.gz", "file_600.dat.gz", 2 * 300 * avgMichalSize, 1800),
            ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 2 * 50 * avgMichalSize, 1800),
        ]

    if 1 == 0:
        importFolderPath = '/home2/0xdiag/datasets'
        print "Using non-.gz'ed files in", importFolderPath
        csvFilenameAll = [
            # I use different files to avoid OS caching effects
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
            # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
            # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
            # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
        ]

    if 1 == 0:
        importFolderPath = '/home/0xdiag/datasets'
        print "Using .gz'ed files in", importFolderPath
        # all exactly the same prior to gzip!
        # could use this, but remember import folder -> import folder s3 for jenkins?
        # how would it get it right?
        # os.path.getsize(f)
        csvFilenameAll = [
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
            # 100 files takes too long on two machines?
            # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
            # I use different files to avoid OS caching effects
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
            # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
            # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
            # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_*.dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1200),
            ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
            ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
            # ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            # do it twice
            # ("covtype.data", "covtype.data"),
            # ("covtype20x.data", "covtype20x.data"),
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # split out the pattern match and the filename used for the hex
    trialMax = 1

    # rebuild the cloud for each file
    base_port = 54321
    tryHeap = 4

    # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
    DO_GLM = False
    noPoll = False

    # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
    # benchmarkLogging = None
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    pollTimeoutSecs = 120
    retryDelaySecs = 10

    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
    jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'

    for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # to avoid sticky ports?
        ### base_port += 2

        for trial in range(trialMax):
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            importFullList = importFolderResult['succeeded']
            importFailList = importFolderResult['failed']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # creates csvFilename.hex from file in importFolder dir
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                key2=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                noPoll=noPoll,
                benchmarkLogging=benchmarkLogging)

            if noPoll:
                if (i + 1) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex",
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                if (i + 2) < len(csvFilenameList):
                    time.sleep(1)
                    h2o.check_sandbox_for_errors()
                    (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                    parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath,
                        key2=csvFilename + ".hex",
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # print stats on all three if noPoll
            if noPoll:
                # does it take a little while to show up in Jobs, from where we issued the parse?
                time.sleep(2)
                # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                h2o_jobs.pollWaitJobs(pattern=csvFilename,
                    timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                # for getting the MB/sec closer to 'right'
                totalBytes += totalBytes2 + totalBytes3
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                # (see the standalone formatting sketch after this test)
                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            print csvFilepattern, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # BUG here?
            if not noPoll:
                # We should be able to see the parse result?
                h2o_cmd.check_enums_from_inspect(parseKey)

            # the nflx data doesn't have a small enough # of classes in any col
            # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
            origKey = parseKey['destination_key']
            # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
            execExpr = 'a = slice('+origKey+',1,200)'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            # runRFOnly takes the parseKey directly
            newParseKey = {'destination_key': 'a'}

            print "\n" + csvFilepattern
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            print "Temporarily hacking to do nothing instead of RF on the parsed file"
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            #**********************************************************************************
            # Do GLM too
            # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542) # don't include the output column
                # remove the output too! (378)
                # (use a name other than 'i' so the enumerate index above isn't clobbered)
                for dropCol in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541, 378]:
                    x.remove(dropCol)
                x = ",".join(map(str, x))
                GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                    'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                print l
                h2o.cloudPerfH2O.message(l)

            #**********************************************************************************
            h2o_cmd.check_key_distribution()
            h2o_cmd.delete_csv_key(csvFilename, importFullList)
            ### time.sleep(3600)

        h2o.tear_down_cloud()
        if not localhost:
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            ### time.sleep(30)
        sys.stdout.write('.')
        sys.stdout.flush()
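#**********************************************************************************
# Aside (added for clarity, not part of the original tests): the one-line summaries
# written via h2o.cloudPerfH2O.message() in these tests all use str.format() with a
# fixed field order (jvm count, heap GB, file pattern, hex name, then the measurement).
# A standalone sketch with made-up values, only to show what the logged line looks like.
def _sketch_benchmark_log_line():
    nodeCount = 2
    tryHeap = 4
    csvFilepattern = '*10[0-9].dat.gz'
    csvFilename = 'file_10_2jvm.dat.gz'
    fileMBS = 123.45
    elapsed = 9.42
    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
        nodeCount, tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
    print(l)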