def test_B_slow_junit(self):
    """Run the slow junit tests (ConcurrentKeyTest, MinorityClassTest) against a fresh 2-node cloud.

    Fix: the cloud built here was never torn down after the junit run; the
    sibling junit tests wrap the run in try/finally with tear_down_cloud(),
    so do the same here.
    """
    h2o.tear_down_cloud()
    h2o.build_cloud(node_count=2)
    # we don't have the port or ip configuration here
    # that util/h2o.py does? Keep this in synch with spawn_h2o there.
    # also don't have --nosigar here?
    try:
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                'java',
                '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                '-Dh2o.arg.name='+h2o.cloud_name(),
                '-Dh2o.arg.ip='+h2o.get_ip_address(),
                '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                '-mainClass', 'org.junit.runner.JUnitCore',
                # The tests
                'water.ConcurrentKeyTest',
                'hex.MinorityClassTest'
                ])
        # getting UDP receiver stack traces if we shut down quickly after Junit
        # may need to wait a little bit before shutdown?
        time.sleep(3)
        rc = ps.wait(None)
        out = file(stdout).read()
        err = file(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        # match the other junit tests: always tear down the cloud we built
        h2o.tear_down_cloud()
def test_D_no_mc_snd(self): print "\nwith flatfile, with multicast disabled" allAcceptIptables() multicastBlockSendIptables() showIptables() h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True) h2o.tear_down_cloud()
def test_C_no_mc_rcv(self): print "\nwith flatfile, with multicast disabled" allAcceptIptables() multicastDropReceiveIptables() showIptables() h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True) h2o.tear_down_cloud()
def test_Cloud(self): base_port = 54300 ports_per_node = 2 for trials in range(0,5): for tryNodes in range(3,6): sys.stdout.write('.') sys.stdout.flush() start = time.time() # start by cleaning sandbox (in build_cloud). # so nosetest works which doesn't do unit_main # done in build_cloud now ### h2o.write_flatfile(node_count=tryNodes, base_port=base_port) h2o.build_cloud(node_count=tryNodes, java_heap_GB=1, timeoutSecs=30, retryDelaySecs=2, base_port=base_port, use_flatfile=True) print "loop %d: Build cloud of %d in %d s" % (trials, tryNodes, (time.time() - start)) for i in range(2): print "nodes report size: %s consensus: %s expected: %d." % h2o.verify_cloud_size() h2o.tear_down_cloud() # with so many jvms, wait for sticky ports to be freed up..slow os stuff? # changed, to increment the base_port, to avoid reuse immediately time.sleep(1) base_port += ports_per_node * tryNodes
def test_A_all_junit(self):
    """Run water.suites.AllTestsSuite via JUnitCore alongside a 2-node, 3GB cloud; fail on timeout or nonzero rc."""
    try:
        h2o.build_cloud(node_count=2, java_heap_GB=3)
        # we don't have the port or ip configuration here
        # that util/h2o.py does? Keep this in synch with spawn_h2o there.
        # also don't have --nosigar here?
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
            'java',
            '-Xms3G',
            '-Xmx3G',
            '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
            '-Dh2o.arg.name='+h2o.cloud_name(),
            '-Dh2o.arg.ip='+h2o.get_ip_address(),
            '-Dh2o.arg.port=54666',
            '-ea', '-jar', h2o.find_file('target/h2o.jar'),
            '-mainClass', 'org.junit.runner.JUnitCore',
            # The all test suite
            'water.suites.AllTestsSuite'
        ])
        rc = ps.wait(None)
        out = open(stdout).read()
        err = open(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        h2o.tear_down_cloud()
def testCloud(self): baseport = 54300 ports_per_node = 2 print "\nTest was written because seeing a bigger cloud than we want sometimes" print "You'll see the problem in the cloud in the browser" print "\nWorks if real ip address used. fails with 127.0.0.1 (intermittent)" print "Builds cloud with 3, the extra being a non-127.0.0.1 node (the real ip)" print "Eventually it goes away, around 1 minute?" for trial in range(20): for tryNodes in range(2,3): sys.stdout.write('.') sys.stdout.flush() start = time.time() ### this works ### h2o.build_cloud(use_this_ip_addr="192.168.0.37", # this intermittently fails h2o.build_cloud(use_this_ip_addr="127.0.0.1", node_count=tryNodes, base_port=base_port, java_heap_GB=1, timeoutSecs=15, retryDelaySecs=2) print "trial #%d: Build cloud of %d in %d secs" % (trial, tryNodes, (time.time() - start)) h2o.verify_cloud_size() h2o.tear_down_cloud() # increment the base_port to avoid sticky ports when we do another # we only use two ports now? baseport += ports_per_node * tryNodes
def test_import_covtype_parse_loop(self): csvFilename = "covtype.data" importFolderPath = "/home/0xdiag/datasets/standard" trialMax = 2 localhost = h2o.decide_if_localhost() for tryHeap in [4, 3, 2, 1]: print "\n", tryHeap, "GB heap, 1 jvms, import folder, then loop parsing 'covtype.data' to unique keys" if (localhost): h2o.build_cloud(node_count=1, java_heap_GB=tryHeap) else: h2o_hosts.build_cloud_with_hosts( node_count=1, java_heap_GB=tryHeap) for trial in range(trialMax): # import each time, because h2o deletes source file after parse h2i.setupImportFolder(None, importFolderPath) key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2i.parseImportFolderFile( None, csvFilename, importFolderPath, key2=key2, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() time.sleep(2)
def test_import_billion_rows_parse_loop(self): print "Apparently we can't handle 1B rows .gzed" csvFilename = "billion_rows.csv.gz" importFolderPath = "standard" csvPathname = importFolderPath + "/" + csvFilename trialMax = 3 for tryHeap in [4,16]: print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \ "then loop parsing 'billion_rows.csv' to unique keys" h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap) timeoutSecs=800 for trial in range(trialMax): hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again. ", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # sticky ports? h2o.tear_down_cloud() time.sleep(5)
def testAll(self):
    """Run water.parser.ParserTest via JUnitCore against a 2-node cloud; fail on timeout or nonzero rc."""
    try:
        h2o.build_cloud(node_count=2)
        # we don't have the port or ip configuration here
        # that util/h2o.py does? Keep this in synch with spawn_h2o there.
        # also don't have --nosigar here?
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
            'java',
            '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
            '-Dh2o.arg.name=pytest-'+getpass.getuser(),
            '-Dh2o.arg.ip='+h2o.get_ip_address(),
            '-ea', '-jar', h2o.find_file('target/h2o.jar'),
            '-mainClass', 'org.junit.runner.JUnitCore',
            # The tests
            'water.parser.ParserTest',
        ])
        rc = ps.wait(None)
        out = open(stdout).read()
        err = open(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        h2o.tear_down_cloud()
def test_parse_nflx_loop_hdfs_fvec(self): print "Using the -.gz files from hdfs" # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz csvFilename = "file_10.dat.gz" csvFilepattern = "file_1[0-9].dat.gz" trialMax = 2 for tryHeap in [24]: print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse" h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, use_hdfs=True, hdfs_name_node='mr-0x6', hdfs_version='cdh4') timeoutSecs = 500 importFolderPath = "datasets/manyfiles-nflx-gz" for trial in range(trialMax): hex_key = csvFilename + "_" + str(trial) + ".hex" csvFilePattern = 'file_1.dat.gz' # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_parse_covtype20x_loop(self): csvFilename = "covtype20x.data" importFolderPath = "/home/0xdiag/datasets" trialMax = 2 for tryJvms in [1,2,3,4]: for tryHeap in [1,3]: print "\n", tryHeap,"GB heap,", tryJvms, "jvm per host, import folder,", \ "then loop parsing 'covtype20x.data' to unique keys" h2o_hosts.build_cloud_with_hosts(node_count=tryJvms, java_heap_GB=tryHeap) timeoutSecs=300 for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again h2i.setupImportFolder(None, importFolderPath) key2 = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # h2o removes key after parse now ## print "Removing", parseKey['source_key'] ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key']) ### print "removeKeyResult:", h2o.dump_json(removeKeyResult) # sticky ports? h2o.tear_down_cloud() time.sleep(tryJvms * 5)
def test_parse_covtype20x_loop_s3n_hdfs(self): bucket = 'home-0xdiag-datasets' importFolderPath = "standard" csvFilename = "covtype20x.data" csvPathname = importFolderPath + "/" + csvFilename timeoutSecs = 500 trialMax = 3 for tryHeap in [4,12]: print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \ "then parse 'covtype20x.data'" h2o.init(java_heap_GB=tryHeap) # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandboxIgnoreErrors = True for trial in range(trialMax): hex_key = csvFilename + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Trial #", trial, "completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) removeKeyResult = h2o.nodes[0].remove_key(key=hex_key) h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_import_covtype_parse_3jvm_fvec(self): h2o.beta_features = True csvFilename = "covtype.data" importFolderPath = "standard" trialMax = 2 for tryHeap in [1]: print "\n", tryHeap,"GB heap, 3 jvms, import folder, then loop parsing 'covtype.data' to unique keys" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(node_count=3, java_heap_GB=tryHeap) else: h2o_hosts.build_cloud_with_hosts(node_count=3, java_heap_GB=tryHeap) for trial in range(trialMax): # import each time, because h2o deletes source file after parse csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() time.sleep(5) # print "Waiting 60 secs for TIME_WAIT sockets to go away" # time.sleep(60) time.sleep(2)
def test_parse_airline_multi_hdfs(self):
    # Import the whole datasets/airlines_multi folder from hdfs, then parse all
    # '*csv' files into one unique hex key per trial, timing each import+parse.
    csvFilename = "hex_10"
    csvFilePattern = '*' # all files in the folder
    trialMax = 2
    for tryHeap in [24]:
        print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        # NOTE(review): NAME_NODE/VERSION/RANDOM_UDP_DROP/DISABLE_ASSERTIONS look
        # like module-level config constants defined outside this view — confirm.
        h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
            disable_assertions=DISABLE_ASSERTIONS,
            use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        timeoutSecs = 3600
        importFolderPath = "datasets/airlines_multi"
        for trial in range(trialMax):
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            importResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            print "importResult:", h2o.dump_json(importResult)
            parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def tearDownClass(cls):
    """Tear down the cloud when this test case owns it; when attached to a clone, only scan the sandbox.

    Fixes: use 'is None' instead of '== None' for the singleton comparison,
    and drop the no-op 'else: None' branch.
    """
    if h2o.clone_cloud_json is None:
        # only tear down if this run built its own cloud and the flag allows it
        if ModelManagementTestCase.tear_down_cloud:
            h2o.tear_down_cloud()
    else:
        h2o.check_sandbox_for_errors(sandboxIgnoreErrors=False, python_test_name="test_model_management")
def test_E_no_mc_snd_no_mc_rcv(self): print "\nwith flatfile, with multicast disabled" allAcceptIptables() multicastDropReceiveIptables() multicastBlockSendIptables() showIptables() h2o.init(nodes_per_host, use_flatfile=True) h2o.tear_down_cloud()
def test_import_nflx_parse_loop(self): print "Using the -.gz files from hdfs" # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz csvFilename = "file_10.dat.gz" csvFilepattern = "file_1[0-9].dat.gz" trialMax = 2 for tryHeap in [24]: print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(node_count=1, java_heap_GB=tryHeap, use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3') else: h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap, use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3') # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandbox_ignore_errors = True timeoutSecs = 500 importFolderPath = "/datasets/manyfiles-nflx-gz" for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again importHdfsResult = h2i.setupImportHdfs(path=importFolderPath) hdfsFullList = importHdfsResult['succeeded'] for k in hdfsFullList: key = k['key'] # just print the first tile if 'nflx' in key and 'file_1.dat.gz' in key: # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key ### print "hdfsFullList:", h2o.dump_json(hdfsFullList) # error if none? 
self.assertGreater(len(hdfsFullList),8,"Didn't see more than 8 files in hdfs?") key2 = csvFilename + "_" + str(trial) + ".hex" csvFilePattern = 'file_1.dat.gz' # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", time.sleep(5) print "Loading from hdfs:", importFolderPath + "/" + csvFilePattern start = time.time() parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilePattern, path=importFolderPath, key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print hdfsKey, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_parse_nflx_loop_hdfs_fvec(self):
    # Fvec version: import datasets/airlines_multi from hdfs once, then per
    # trial parse one "*<trial>.csv" pattern with noPoll and wait on the jobs.
    h2o.beta_features = True
    print "Using the -.gz files from hdfs"
    # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
    # default
    csvFilename = "hex_10"
    csvFilePattern = '*' # all files in the folder
    for tryHeap in [24]:
        print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        else:
            h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True
        timeoutSecs = 500
        importFolderPath = "datasets/airlines_multi"
        csvPathname = importFolderPath + "/" + csvFilePattern
        # import once up front; the per-trial parses pick files out of this import
        parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
        for trial in range(TRIAL_MAX):
            # each parse now just does one
            csvFilePattern = "*%s.csv" % trial
            # if we want multifile
            # csvFilePattern = "*"
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            # print "Don't wait for completion. Just load things up!"
            print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
            print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
            parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow
            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def check_cloud_and_setup_next(): h2b.browseTheCloud() h2o.verify_cloud_size() h2o.check_sandbox_for_errors() print "Tearing down cloud of size", len(h2o.nodes) h2o.tear_down_cloud() h2o.clean_sandbox() # wait to make sure no sticky ports or anything os-related # so let's expand the delay if larger number of jvms # 1 second per node seems good h2o.verboseprint("Waiting", node_count, "seconds to avoid OS sticky port problem") time.sleep(node_count)
def testAll(self):
    # Run the curated junit test list below via JUnitCore against a 2-node
    # cloud; fail on timeout or nonzero rc, always tear the cloud down.
    try:
        h2o.build_cloud(node_count=2)
        # we don't have the port or ip configuration here
        # that util/h2o.py does? Keep this in synch with spawn_h2o there.
        # also don't have --nosigar here?
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                'java',
                '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                '-Dh2o.arg.name='+h2o.cloud_name(),
                '-Dh2o.arg.ip='+h2o.get_ip_address(),
                '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                '-mainClass', 'org.junit.runner.JUnitCore',
                # The tests
                #'hex.GLMGridTest',
                'hex.HistogramTest',
                'hex.GLMTest',
                'hex.KMeansTest',
                'hex.MinorityClassTest',
                'hex.rf.RandomForestTest',
                'hex.rf.RFPredDomainTest',
                'water.AtomicTest',
                'water.AutoSerialTest',
                'water.BitCmpTest',
                #'water.ConcurrentKeyTest.java',
                'water.KeyToString',
                'water.KVTest',
                #'water.KVSpeedTest',
                'water.api.RStringTest',
                'water.exec.ExprTest',
                'water.exec.RBigDataTest',
                'water.parser.DatasetCornerCasesTest',
                'water.parser.ParseCompressedAndXLSTest',
                'water.parser.ParseFolderTest',
                'water.parser.ParseProgressTest',
                'water.parser.ParserTest',
                'water.parser.RReaderTest',
                'water.score.ScorePmmlTest',
                'water.score.ScoreTest'
                ])
        # rc None means the process hadn't exited (treated as a timeout below)
        rc = ps.wait(None)
        out = file(stdout).read()
        err = file(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        h2o.tear_down_cloud()
def test_import_covtype20x_parse_loop(self):
    # Parse covtype20x.data from an s3n bucket (accessed thru HDFS) trialMax
    # times per heap size, deleting the s3n source key after each parse so the
    # next trial re-fetches instead of using the cached key.
    csvFilename = "covtype20x.data"
    importFolderPath = "/home/0xdiag/datasets/standard"
    trialMax = 3
    for tryHeap in [4,12]:
        print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \
            "then parse 'covtype20x.data'"
        h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
            # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
            # this is for our amazon ec hdfs
            # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
            hdfs_name_node='10.78.14.235:9000',
            hdfs_version='0.20.2')
        # don't raise exception if we find something bad in h2o stdout/stderr?
        h2o.nodes[0].sandbox_ignore_errors = True
        timeoutSecs = 500
        URI = "s3n://home-0xdiag-datasets"
        s3nKey = URI + "/" + csvFilename
        for trial in range(trialMax):
            # since we delete the key, we have to re-import every iteration, to get it again
            # s3n URI thru HDFS is not typical.
            importHDFSResult = h2o.nodes[0].import_hdfs(URI)
            s3nFullList = importHDFSResult['succeeded']
            ### print "s3nFullList:", h2o.dump_json(s3nFullList)
            # error if none?
            self.assertGreater(len(s3nFullList),8,"Didn't see more than 8 files in s3n?")
            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey, key2,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print s3nKey, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']
            print "Trial #", trial, "completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                "Otherwise it would just parse the cached key."
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            print "Removing", s3nKey
            removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)
            ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def testCloud(self): ports_per_node = 2 for tryNodes in range(2,8): sys.stdout.write('.') sys.stdout.flush() start = time.time() h2o.init(use_this_ip_addr="127.0.0.1", node_count=tryNodes, timeoutSecs=30, retryDelaySecs=2, java_heap_GB=1) print "Build cloud of %d in %d secs" % (tryNodes, (time.time() - start)) h2o.verboseprint(h2o.nodes) h2o.verify_cloud_size() h2o.tear_down_cloud(h2o.nodes)
def test_Cloud(self): # FIX! weird timeout H2O exceptions with >8? maybe shouldn't # don't know if we care ports_per_node = 2 for tryNodes in range(2,17): h2o.verboseprint("Trying cloud of", tryNodes) sys.stdout.write('.') sys.stdout.flush() start = time.time() h2o.build_cloud(tryNodes, retryDelaySecs=2, timeoutSecs=max(30,10*tryNodes), java_heap_GB=1) print "Built cloud of %d in %d s" % (tryNodes, (time.time() - start)) h2o.verify_cloud_size() h2o.tear_down_cloud()
def test_Nuke(self): h2o.build_cloud(1) # wait 10 seconds for zombies to latch on? print "Waiting 10 secs for unknown number of possible zombies" time.sleep(10) c = h2o.nodes[0].get_cloud() cloudSize = c['cloud_size'] print "This node thought there was", cloudSize, "nodes in its cloud" # FIX! I added shutdown_all to LocalH2O so maybe this is now redundant h2o.nodes[0].shutdown_all() # this doesn't send a graceful shutdown? but should be tolerant of missing process? h2o.tear_down_cloud()
def test_import_covtype_parse_2jvm_fvec(self): csvFilename = "covtype.data" importFolderPath = "standard" trialMax = 2 for tryHeap in [1]: print "\n", tryHeap,"GB heap, 2 jvms, import folder, then loop parsing 'covtype.data' to unique keys" h2o.init(2, java_heap_GB=tryHeap) for trial in range(trialMax): # import each time, because h2o deletes source file after parse csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() time.sleep(3)
def test_F_no_mc_loop(self): print "\nwith flatfile, with multicast disabled, and RF, 5 trials" allAcceptIptables() multicastDropReceiveIptables() showIptables() for x in range(1,5): h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True) parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, trees=50, timeoutSecs=10) h2o.tear_down_cloud() h2o.verboseprint("Waiting", nodes_per_host, "seconds to avoid OS sticky port problem") time.sleep(nodes_per_host) print "Trial", x sys.stdout.write('.') sys.stdout.flush()
def test_F_no_mc_loop(self): print "\nwith flatfile, with multicast disabled, and RF, 5 trials" allAcceptIptables() multicastDropReceiveIptables() showIptables() csvPathname = h2o.find_file('smalldata/poker/poker1000') for x in range(1,5): h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True) h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname) h2o.tear_down_cloud() h2o.verboseprint("Waiting", nodes_per_host, "seconds to avoid OS sticky port problem") time.sleep(nodes_per_host) print "Trial", x sys.stdout.write('.') sys.stdout.flush()
def test_parse_airline_multi_hdfs(self):
    # Fvec version: import datasets/airlines_multi from hdfs, then parse all
    # '*csv' files into one unique hex key per trial, timing each import+parse.
    h2o.beta_features = True
    csvFilename = "hex_10"
    csvFilePattern = '*' # all files in the folder
    trialMax = 2
    for tryHeap in [24]:
        print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                disable_assertions=DISABLE_ASSERTIONS,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        else:
            # why is 55609 already in use??
            h2o_hosts.build_cloud_with_hosts(sandbox_ignore_errors=True, force_tcp=True,
                java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55604,
                disable_assertions=DISABLE_ASSERTIONS,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True
        timeoutSecs = 3600
        importFolderPath = "datasets/airlines_multi"
        for trial in range(trialMax):
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            importResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            print "importResult:", h2o.dump_json(importResult)
            parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_Cloud(self): # FIX! weird timeout H2O exceptions with >8? maybe shouldn't # don't know if we care base_port = 54500 ports_per_node = 2 tryNodes = 5 for trial in range(10): h2o.verboseprint("Trying cloud of", tryNodes) sys.stdout.write('.') sys.stdout.flush() start = time.time() h2o_hosts.build_cloud_with_hosts(tryNodes, base_port=base_port, retryDelaySecs=2, timeoutSecs=max(30,10*tryNodes), java_heap_GB=1) print "trial #%d: Build cloud of %d in %d secs" % (trial, tryNodes, (time.time() - start)) h2o.verify_cloud_size() h2o.tear_down_cloud()
def testCloud(self): baseport = 54300 ports_per_node = 3 for tryNodes in range(2,8): sys.stdout.write('.') sys.stdout.flush() start = time.time() h2o.build_cloud(use_this_ip_addr="127.0.0.1", base_port=baseport, node_count=tryNodes, timeoutSecs=30) print "Build cloud of %d in %d secs" % (tryNodes, (time.time() - start)) h2o.verboseprint(h2o.nodes) h2o.verify_cloud_size() h2o.tear_down_cloud(h2o.nodes) # increment the base_port to avoid sticky ports when we do another # we only use two ports now? baseport += ports_per_node * tryNodes
def test_import_nflx_parse_loop(self): print "Using the -.gz files from hdfs" # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz csvFilename = "file_10.dat.gz" csvFilepattern = "file_1[0-9].dat.gz" trialMax = 2 for tryHeap in [24]: print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse" h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap, use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3u5') # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandbox_ignore_errors = True URI = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/datasets/manyfiles-nflx-gz" hdfsKey = URI + "/" + csvFilepattern timeoutSecs = 500 for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again importHdfsResult = h2o.nodes[0].import_hdfs(URI) hdfsFullList = importHdfsResult['succeeded'] for k in hdfsFullList: key = k['key'] # just print the first tile if 'nflx' in key and 'file_1.dat.gz' in key: # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key ### print "hdfsFullList:", h2o.dump_json(hdfsFullList) # error if none? self.assertGreater(len(hdfsFullList),8,"Didn't see more than 8 files in hdfs?") key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading hdfs key: ", hdfsKey start = time.time() parseKey = h2o.nodes[0].parse(hdfsKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print hdfsKey, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Deleting key in H2O so we get it from hdfs", \ "Otherwise it would just parse the cached key." 
storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! (spill) for k in hdfsFullList: deleteKey = k['key'] if csvFilename in deleteKey and not ".hex" in key: print "Removing", deleteKey removeKeyResult = h2o.nodes[0].remove_key(key=deleteKey) ### print "removeKeyResult:", h2o.dump_json(removeKeyResult) h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_B(self): print "\nwith flatfile, Build allowing anything" allAcceptIptables() showIptables() h2o.init(nodes_per_host, use_flatfile=True) h2o.tear_down_cloud()
def tearDownClass(cls):
    """Shut down the shared cloud after all tests in this class have run."""
    h2o.tear_down_cloud()
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 synSize = 183 if 1 == 0: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600), ] if 1 == 1: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? 
# ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), # for now, take too long on 2x100GB heap on 164 # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * 
avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ] if 1 == 0: importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600), ] if 1 == 0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ] if 1 == 0: importFolderPath = '/home/0xdiag/datasets/standard' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? # os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? 
# ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200), ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split out the 
pattern match and the filename used for the hex trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack'] # benchmarkLogging = None benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk' 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks' jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = ' -Dcom.sun.management.jmxremote.port=54330' + \ ' -Dcom.sun.management.jmxremote.authenticate=false' + \ ' -Dcom.sun.management.jmxremote.ssl=false' + \ ' -Dcom.sun.management.jmxremote' + \ ' -Dcom.sun.management.jmxremote.local.only=false' jea = ' -Dlog.printAll=true' for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud( 2, java_heap_GB=tryHeap, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts( base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) # pop open a browser on the cloud ### h2b.browseTheCloud() # to avoid sticky ports? 
### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder( None, importFolderPath) importFullList = importFolderResult['files'] importFailList = importFolderResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json( importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message( "Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile( None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i + 1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1] parseKey = h2i.parseImportFolderFile( None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i + 2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2] parseKey = h2i.parseImportFolderFile( None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response'][ 'time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.columnInfoFromInspect( parseKey['destination_key'], exceptionOnMissingValues=False) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice(' + origKey + ',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378 ]: x.remove(i) x = ",".join(map(str, x)) GLMkwargs = { 'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
def tearDownClass(cls):
    """Shut down the cloud and restore permissive iptables rules so later
    suites aren't affected by this suite's firewall changes."""
    h2o.tear_down_cloud()
    allAcceptIptables()
    showIptables()
# column Cancelled 21 type: int # column CancellationCode 22 type: enum enum_domain_size: 5 num_missing_values: 38955823 # column Diverted 23 type: int # column CarrierDelay 24 type: int num_missing_values: 123534969 # column WeatherDelay 25 type: int num_missing_values: 123534969 # column NASDelay 26 type: int num_missing_values: 123534969 # column SecurityDelay 27 type: int num_missing_values: 123534969 # column LateAircraftDelay 28 type: int num_missing_values: 123534969 # column IsArrDelayed 29 type: enum enum_domain_size: 2 # column IsDepDelayed 30 type: enum enum_domain_size: 2 # run allstate run_glms(files['allstate'],[{'y':'Claim_Amount','lambda':l,'alpha':a,'family':'gaussian','n_folds':1} # for l in (1e-4,1e-5) # for a in (1.0,0.5,0.0)]) for l in [1e-4] for a in [0.5]]) # was: # x = '0,1,2,3,4,5,6,7,8,9,12,16,17,18' x = '0,1,2,3,5,7,8,9,16,17' # run airlines run_glms(files['airlines'],[{'y':'IsArrDelayed','x':x,'lambda':l,'alpha':a,'family':'gaussian','n_folds':1,'case':1} # for l in (0.035,0.025,1e-2,5e-3,1e-3,5e-4,1e-4,5e-5,1e-5,1e-8) # for a in (1.0,0.5,0.0)]) for l in [1e-4] for a in [0.5]]) h2o.tear_down_cloud()
def tearDownClass(cls):
    """Shut down all nodes; the browse branch is a disabled hook for keeping
    the cloud up long enough to inspect it in a browser."""
    if not h2o.browse_disable:
        # time.sleep(500000)
        pass
    h2o.tear_down_cloud(h2o.nodes)
def test_A(self):
    """Build a cloud WITHOUT the flatfile while iptables allows all traffic
    (multicast discovery baseline for this firewall test suite)."""
    print "\nno flatfile, Build allowing anything"
    # open the firewall completely before building
    allAcceptIptables()
    showIptables()
    h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=False)
    h2o.tear_down_cloud()
def tearDownClass(cls):
    """Shut down the cloud, ignoring sandbox errors during teardown."""
    # the node state is gone when we tear down the cloud, so pass the ignore here also.
    h2o.tear_down_cloud(sandbox_ignore_errors=True)
def tearDownClass(cls):
    """Shut down the H2O cloud after all tests in this class have run."""
    ### time.sleep(3600)
    h2o.tear_down_cloud()
def test_benchmark_import(self): covtype200xSize = 15033863400 csvFilenameList = [ ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), ] trialMax = 1 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False benchmarkLogging = ['cpu', 'disk' 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 bucket = 'home-0xdiag-datasets' importFolderPath = 'standard' for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): h2o.init(2, java_heap_GB=tryHeap, enable_benchmark_log=True) for trial in range(trialMax): csvPathname = importFolderPath + "/" + csvFilepattern h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message( "Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseResult) # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseResult['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice(' + origKey + ',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRF takes the parseResult directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern #********************************************************************************** if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! 
x = range(54) # don't include the output column x = ",".join(map(str, x)) GLMkwargs = { 'x': x, 'y': 54, 'case': 1, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o.tear_down_cloud() sys.stdout.write('.') sys.stdout.flush()
def test_parse_10k_files(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn.csv.gz" headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 10 maxFilenum = 10000 for filenum in range(maxFilenum): rowData = rand_rowData() filePrefix = "%04d" % filenum csvPathname = SYNDATASETS_DIR + '/' + filePrefix + "_" + csvFilename write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData) avgFileSize = os.path.getsize(csvPathname) importFolderPath = SYNDATASETS_DIR print "\nimportFolderPath:", importFolderPath csvFilenameList = [ # try one thousand files first ("*[1][0-9][0-9][0-9]_syn.csv.gz", "syn_all.1000.csv", maxFilenum * avgFileSize, 1200), # try two thousand ("*[1-2][0-9][0-9][0-9]_syn.csv.gz", "syn_all.2000.csv", maxFilenum * avgFileSize, 1200), ] trialMax = 1 tryHeap = 4 DO_GLM = True noPoll = False benchmarkLogging = ['cpu','disk', 'iostats'] # , 'jstack' benchmarkLogging = ['cpu','disk'] pollTimeoutSecs = 120 retryDelaySecs = 10 for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): h2o.init(3,java_heap_GB=tryHeap, enable_benchmark_log=True) ### h2b.browseTheCloud() # don't let the config json redirect import folder to s3 or s3n, because # we're writing to the syn_datasets locally. 
(just have to worry about node 0's copy of this state) print "This test creates files in syn_datasets for import folder\n" + \ "so h2o and python need to be same machine" h2o.nodes[0].redirect_import_folder_to_s3_path = False h2o.nodes[0].redirect_import_folder_to_s3n_path = False for trial in range(trialMax): # nice to have the list of what got imported, so we delete "just that" down below # doing this just so we can see what we import (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*") importFullList = importResult['files'] print "importFullList:", importFullList importFailList = importResult['fails'] print "importFailList:", importFailList print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(path=importFolderPath+"/*", hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print "Parse result['destination_key']:", parseResult['destination_key'] # BUG here? if not noPoll: h2o_cmd.get_columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) print "\n" + csvFilepattern #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: GLMkwargs = {'y': 0, 'case': 1, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult) h2o.tear_down_cloud() if not h2o.localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
def tearDownClass(cls): pool.close() # pool.join() h2o.tear_down_cloud()
def test_C_build_cloud_relaxed_1(self):
    """Build a single-node 1GB cloud with conservative=False, verify its
    size, then tear it down and pause briefly for port reuse."""
    attempts = 1
    for attempt in range(attempts):
        h2o.init(1, java_heap_GB=1, conservative=False)
        h2o.verify_cloud_size()
        h2o.tear_down_cloud()
        # give the OS a moment to release the ports before any rebuild
        time.sleep(5)
def tearDownClass(cls):
    """Shut down the H2O cloud after all tests in this class have run."""
    # wait while I inspect things
    # time.sleep(1500)
    h2o.tear_down_cloud()
def tearDownClass(cls):
    """Shut down the H2O cloud after all tests in this class have run."""
    ### h2o.sleep(800)
    h2o.tear_down_cloud()
def test_parse_airline_multi_hdfs_many(self):
    """Import the airlines_multi folder once from HDFS, then fire off one
    noPoll parse per trial (one csv per trial), waiting for all jobs at the
    end before tearing the cloud down.

    NOTE(review): RANDOM_UDP_DROP, NAME_NODE, VERSION and TRIAL_MAX are
    module-level constants defined elsewhere in this file.
    """
    h2o.beta_features = True # default
    csvFilename = "hex_10"
    csvFilePattern = '*' # all files in the folder
    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
                base_port=55930, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        else:
            h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap,
                random_udp_drop=RANDOM_UDP_DROP, base_port=55600, disable_assertion=True,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True
        timeoutSecs = 500
        importFolderPath = "datasets/airlines_multi"
        csvPathname = importFolderPath + "/" + csvFilePattern
        # import once; the per-trial parses below reuse the imported keys
        parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
        for trial in range(TRIAL_MAX):
            # each parse now just does one
            csvFilePattern = "*%s.csv" % trial
            # if we want multifile
            # csvFilePattern = "*"
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            # print "Don't wait for completion. Just load things up!"
            print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
            print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
            parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key,
                noPoll=True, delete_on_done=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_cmd.runStoreView()
        # we don't delete the hex key. it will start spilling? slow
        # wait for all the outstanding noPoll parses to finish
        h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def tearDownClass(cls):
    """Shut down the H2O cloud after all tests in this class have run."""
    ## print "sleeping 3600"
    # h2o.sleep(3600)
    h2o.tear_down_cloud()
def tearDownClass(cls):
    """Shut down the H2O cloud after all tests in this class have run."""
    # this is for safety after error, plus gets us the grep of stdout/stderr for errors
    h2o.tear_down_cloud()
def test_parse_nflx_loop_s3n_hdfs(self):
    """Import nflx gz files from s3n (or s3) and parse by pattern, optionally
    firing extra noPoll parses in parallel, then run GLM or GLMGrid and log
    timing. The cloud is rebuilt per list entry per heap size.

    NOTE(review): several latent oddities preserved as-is:
    - both branches of the USE_S3 URI construction build the identical string;
    - `foundKeys` only counts keys seen BEFORE the first pattern match (the
      loop breaks on match), so assertGreater checks that count — confirm
      intent;
    - the GLM column loop reuses `i`, shadowing the enumerate index used for
      the noPoll forward references.
    """
    DO_GLM = True
    DO_GLMGRID = False
    USE_S3 = False
    noPoll = False
    # successive assignments are manual toggles; only the last takes effect
    benchmarkLogging = ['jstack', 'iostats']
    benchmarkLogging = ['iostats']
    benchmarkLogging = []
    # typical size of the michal files
    avgMichalSize = 116561140
    avgSynSize = 4020000
    synSize = 183
    # (folder list, key pattern, dest-hex label, expected bytes, timeoutSecs)
    csvFilenameList = [
        (["manyfiles-nflx-gz"], "file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
        (["manyfiles-nflx-gz"], "file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
        (["manyfiles-nflx-gz"], "file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
        (["manyfiles-nflx-gz"], "file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
        # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use
        (["A-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize / 2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize / 2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200),
    ]
    print "Using the -.gz files from s3"
    # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
    # split out the pattern match and the filename used for the hex
    trialMax = 1
    pollTimeoutSecs = 180
    retryDelaySecs = 10
    # use i to forward reference in the list, so we can do multiple outstanding parses below
    for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        bucket = "home-0xdiag-datasets"
        ## for tryHeap in [54, 28]:
        h2oPerNode = 1
        # h1.4xlarge 60.5GB dram
        for tryHeap in [28]:
            if USE_S3:
                protocol = "s3"
            else:
                protocol = "s3n"
            print "\n", tryHeap, "GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
            # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
            # jea = "-Dh2o.find-ByteBuffer-leaks=true"
            h2o_hosts.build_cloud_with_hosts(h2oPerNode, java_heap_GB=tryHeap,
                # java_extra_args=jea,
                enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10,
                # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                # this is for our amazon ec hdfs
                # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                hdfs_name_node='10.78.14.235:9000',
                hdfs_version='0.20.2')
            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandbox_ignore_errors = True
            for trial in range(trialMax):
                # import a list of folders, one at a time (hdfs import can't take pattern match
                # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                # too slow
                for csvFolder in csvFolderList:
                    # NOTE(review): both branches build the same URI
                    if USE_S3:
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"
                    else:
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"
                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        importResult = h2o.nodes[0].import_s3(bucket)
                    else:
                        importResult = h2o.nodes[0].import_hdfs(URI)
                foundKeys = 0
                for s in importResult['succeeded']:
                    # just print the first tile
                    # if 'nflx' in key and 'file_1.dat.gz' in key:
                    if csvFilepattern in s['key']:
                        # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                        print "example file we'll use:", s['key']
                        break
                    else:
                        pass
                    foundKeys += 1
                ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                # error if none?
                self.assertGreater(foundKeys, 8, "Didn't see more than 8 files in s3n?")
                s3nKey = csvFilepattern
                key2 = csvFilename + "_" + str(trial) + ".hex"
                print "Loading", protocol, "key:", s3nKey, "to", key2
                start = time.time()
                parseKey = h2o.nodes[0].parse(s3nKey, key2,
                    timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                if noPoll:
                    # fire off up to two more parses (forward references) so
                    # three run in parallel
                    if (i + 1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                        s3nKey = URI + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse2Key = h2o.nodes[0].parse(s3nKey, key2,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                    if (i + 2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                        s3nKey = URI + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse3Key = h2o.nodes[0].parse(s3nKey, key2,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print s3nKey, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                y = 378
                if not noPoll:
                    x = h2o_glm.goodXFromColumnInfo(y,
                        key=parseKey['destination_key'], timeoutSecs=300)
                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM or DO_GLMGRID:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    # NOTE(review): this `i` shadows the enumerate index above
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19,
                            20, 424, 425, 426, 540, 541, y]:
                        x.remove(i)
                    x = ",".join(map(str, x))
                    if DO_GLM:
                        algo = 'GLM'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>',
                            'family': 'binomial', 'max_iter': 10, 'n_folds': 1,
                            'alpha': 0.2, 'lambda': 1e-5}
                        start = time.time()
                        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    else:
                        algo = 'GLMGrid'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>',
                            'family': 'binomial', 'max_iter': 10, 'n_folds': 1,
                            'beta_epsilon': 1e-4, 'lambda': '1e-4',
                            'alpha': '0,0.5', 'thresholds': '0.5'}
                        start = time.time()
                        glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                #**********************************************************************************
                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                    "Otherwise it would just parse the cached key."
                ### storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                # have to do the pattern match ourself, to figure out what keys to delete
                # we're deleting the keys in the initial import. We leave the keys we created
                # by the parse. We use unique dest keys for those, so no worries.
                # Leaving them is good because things fill up! (spill)
                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importResult)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)
def tearDownClass(cls):
    """Shut down the H2O cloud after all tests in this class have run."""
    # if we got here by time out exception waiting for a job, we should clear
    # all jobs, if we're leaving h2o cloud up, and going to run another test
    #h2o.cancelAllJobs()
    h2o.tear_down_cloud()