Example #1
 def test_B_slow_junit(self):
     h2o.tear_down_cloud()
     h2o.build_cloud(node_count=2)
     # we don't have the port or ip configuration here
     # that util/h2o.py does? Keep this in synch with spawn_h2o there.
     # also don't have --nosigar here?
     (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
             'java',
             '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
             '-Dh2o.arg.name='+h2o.cloud_name(),
             '-Dh2o.arg.ip='+h2o.get_ip_address(),
             '-ea', '-jar', h2o.find_file('target/h2o.jar'),
             '-mainClass', 'org.junit.runner.JUnitCore',
             # The tests
             'water.ConcurrentKeyTest',
             'hex.MinorityClassTest'
             ])
     # getting UDP receiver stack traces if we shut down quickly after Junit
     # may need to wait a little bit before shutdown?
     time.sleep(3)
     rc = ps.wait(None)
     out = file(stdout).read()
     err = file(stderr).read()
     if rc is None:
         ps.terminate()
         raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
     elif rc != 0:
         raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
 def test_D_no_mc_snd(self):
     print "\nwith flatfile, with multicast disabled"
     allAcceptIptables()
     multicastBlockSendIptables()
     showIptables()
     h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
     h2o.tear_down_cloud()
 def test_C_no_mc_rcv(self):
     print "\nwith flatfile, with multicast disabled"
     allAcceptIptables()
     multicastDropReceiveIptables()
     showIptables()
     h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
     h2o.tear_down_cloud()
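The JUnit runner in test_B_slow_junit above raises straight out of the return-code check, so the freshly built cloud is left running when the run times out or fails. A minimal sketch, assuming the same h2o test-harness helpers these examples import, of wrapping the wait and the check in try/finally so tear_down_cloud() always runs (the same pattern Example #5 below uses); run_junit_with_teardown is a hypothetical name:

    import time
    import h2o  # assumed: the same test-harness helper module used by these examples

    def run_junit_with_teardown(test_classes):
        h2o.build_cloud(node_count=2)
        try:
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Dh2o.arg.ice_root=' + h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name=' + h2o.cloud_name(),
                    '-Dh2o.arg.ip=' + h2o.get_ip_address(),
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    ] + list(test_classes))
            # give the UDP receivers a moment before checking, as in Example #1
            time.sleep(3)
            rc = ps.wait(None)
            out = file(stdout).read()
            err = file(stderr).read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        finally:
            # runs even when the checks above raise, so the cloud never leaks
            h2o.tear_down_cloud()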
Example #4
    def test_Cloud(self):
        base_port = 54300
        ports_per_node = 2
        for trials in range(0,5):
            for tryNodes in range(3,6):
                sys.stdout.write('.')
                sys.stdout.flush()

                start = time.time()
                # start by cleaning sandbox (in build_cloud). 
                # so nosetest works which doesn't do unit_main

                # done in build_cloud now
                ### h2o.write_flatfile(node_count=tryNodes, base_port=base_port)
                h2o.build_cloud(node_count=tryNodes, java_heap_GB=1,
                    timeoutSecs=30, retryDelaySecs=2, base_port=base_port, use_flatfile=True)
                print "loop %d: Build cloud of %d in %d s" % (trials, tryNodes, (time.time() - start)) 

                for i in range(2):
                    print "nodes report size: %s consensus: %s expected: %d." % h2o.verify_cloud_size()

                h2o.tear_down_cloud()
                # with so many jvms, wait for sticky ports to be freed up..slow os stuff?
                # changed, to increment the base_port, to avoid reuse immediately
                time.sleep(1)
                base_port += ports_per_node * tryNodes
Example #5
    def test_A_all_junit(self):
        try:
            h2o.build_cloud(node_count=2, java_heap_GB=3)

            # we don't have the port or ip configuration here
            # that util/h2o.py does? Keep this in synch with spawn_h2o there.
            # also don't have --nosigar here?
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Xms3G',
                    '-Xmx3G',
                    '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name='+h2o.cloud_name(),
                    '-Dh2o.arg.ip='+h2o.get_ip_address(),
                    '-Dh2o.arg.port=54666',
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    # The all test suite
                    'water.suites.AllTestsSuite'
                   ])

            rc = ps.wait(None)
            out = file(stdout).read()
            err = file(stderr).read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))

        finally:
            h2o.tear_down_cloud()
Example #6
    def testCloud(self):
        base_port = 54300
        ports_per_node = 2

        print "\nTest was written because seeing a bigger cloud than we want sometimes"
        print "You'll see the problem in the cloud in the browser"
        print "\nWorks if real ip address used. fails with 127.0.0.1 (intermittent)"
        print "Builds cloud with 3, the extra being a non-127.0.0.1 node (the real ip)"
        print "Eventually it goes away, around 1 minute?"
        for trial in range(20):
            for tryNodes in range(2,3):
                sys.stdout.write('.')
                sys.stdout.flush()

                start = time.time()
                ### this works
                ### h2o.build_cloud(use_this_ip_addr="192.168.0.37",
                # this intermittently fails
                h2o.build_cloud(use_this_ip_addr="127.0.0.1", 
                    node_count=tryNodes, base_port=base_port, java_heap_GB=1,
                    timeoutSecs=15, retryDelaySecs=2)
                print "trial #%d: Build cloud of %d in %d secs" % (trial, tryNodes, (time.time() - start)) 

                h2o.verify_cloud_size()
                h2o.tear_down_cloud()

                # increment the base_port to avoid sticky ports when we do another
                # we only use two ports now?
                base_port += ports_per_node * tryNodes
    def test_import_covtype_parse_loop(self):
        csvFilename = "covtype.data"
        importFolderPath = "/home/0xdiag/datasets/standard"
        trialMax = 2
        localhost = h2o.decide_if_localhost()
        for tryHeap in [4, 3, 2, 1]:
            print "\n", tryHeap, "GB heap, 1 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap)
            else:
                h2o_hosts.build_cloud_with_hosts(
                    node_count=1, java_heap_GB=tryHeap)

            for trial in range(trialMax):
                # import each time, because h2o deletes source file after parse
                h2i.setupImportFolder(None, importFolderPath)
                key2 = csvFilename + "_" + str(trial) + ".hex"
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=20)
            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(2)
Example #8
    def test_import_billion_rows_parse_loop(self):
        print "Apparently we can't handle 1B rows .gzed"
        csvFilename = "billion_rows.csv.gz"
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        trialMax = 3
        for tryHeap in [4,16]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \
                "then loop parsing 'billion_rows.csv' to unique keys"
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap)
            timeoutSecs=800
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, 
                    timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60)
                elapsed = time.time() - start
                print "Trial #", trial, "completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again. ", \
                      "Otherwise it would just parse the cached key."
                storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)

            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(5)
Example #9
    def testAll(self):
        try:
            h2o.build_cloud(node_count=2)

            # we don't have the port or ip configuration here
            # that util/h2o.py does? Keep this in synch with spawn_h2o there.
            # also don't have --nosigar here?
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name=pytest-'+getpass.getuser(),
                    '-Dh2o.arg.ip='+h2o.get_ip_address(),
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    # The tests
                    'water.parser.ParserTest',
                    ])

            rc = ps.wait(None)
            out = file(stdout).read()
            err = file(stderr).read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))

        finally:
            h2o.tear_down_cloud()
    def test_parse_nflx_loop_hdfs_fvec(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, use_hdfs=True, hdfs_name_node='mr-0x6', hdfs_version='cdh4')

            timeoutSecs = 500
            importFolderPath = "datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
    def test_parse_covtype20x_loop(self):
        csvFilename = "covtype20x.data"
        importFolderPath = "/home/0xdiag/datasets"
        trialMax = 2
        for tryJvms in [1,2,3,4]:
            for tryHeap in [1,3]:
                print "\n", tryHeap,"GB heap,", tryJvms, "jvm per host, import folder,", \
                    "then loop parsing 'covtype20x.data' to unique keys"
                h2o_hosts.build_cloud_with_hosts(node_count=tryJvms, java_heap_GB=tryHeap)
                timeoutSecs=300
                for trial in range(trialMax):
                    # since we delete the key, we have to re-import every iteration, to get it again
                    h2i.setupImportFolder(None, importFolderPath)

                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    start = time.time()
                    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, 
                        timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60)
                    elapsed = time.time() - start
                    print "Trial #", trial, "completed in", elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # h2o removes key after parse now
                    ## print "Removing", parseKey['source_key']
                    ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key'])
                    ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

                # sticky ports?
                h2o.tear_down_cloud()
                time.sleep(tryJvms * 5)
    def test_parse_covtype20x_loop_s3n_hdfs(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "standard"
        csvFilename = "covtype20x.data"
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 500
        trialMax = 3
        for tryHeap in [4,12]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \
                "then parse 'covtype20x.data'"
            h2o.init(java_heap_GB=tryHeap)
            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandboxIgnoreErrors = True

            for trial in range(trialMax):
                hex_key = csvFilename + ".hex"
                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start
                print "parse result:", parseResult['destination_key']
                print "Trial #", trial, "completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                removeKeyResult = h2o.nodes[0].remove_key(key=hex_key)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
    def test_import_covtype_parse_3jvm_fvec(self):
        h2o.beta_features = True
        csvFilename = "covtype.data"
        importFolderPath = "standard"
        trialMax = 2
        for tryHeap in [1]:
            print "\n", tryHeap,"GB heap, 3 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=3, java_heap_GB=tryHeap)
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=3, java_heap_GB=tryHeap)

            for trial in range(trialMax):
                # import each time, because h2o deletes source file after parse
                csvPathname = importFolderPath + "/" + csvFilename
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20)
            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(5)

        # print "Waiting 60 secs for TIME_WAIT sockets to go away"
        # time.sleep(60)
        time.sleep(2)
    def test_parse_airline_multi_hdfs(self):
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname, schema='hdfs', 
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #15
 def tearDownClass(cls):
     if h2o.clone_cloud_json is None:
         if ModelManagementTestCase.tear_down_cloud:
             h2o.tear_down_cloud()
     else:
         h2o.check_sandbox_for_errors(sandboxIgnoreErrors=False, python_test_name="test_model_management")
 def test_E_no_mc_snd_no_mc_rcv(self):
     print "\nwith flatfile, with multicast disabled"
     allAcceptIptables()
     multicastDropReceiveIptables()
     multicastBlockSendIptables()
     showIptables()
     h2o.init(nodes_per_host, use_flatfile=True)
     h2o.tear_down_cloud()
    def test_import_nflx_parse_loop(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandbox_ignore_errors = True

            timeoutSecs = 500
            importFolderPath = "/datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                importHdfsResult = h2i.setupImportHdfs(path=importFolderPath)
                hdfsFullList = importHdfsResult['succeeded']
                for k in hdfsFullList:
                    key = k['key']
                    # just print the first tile
                    if 'nflx' in key and 'file_1.dat.gz' in key: 
                        # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                        print "example file we'll use:", key

                ### print "hdfsFullList:", h2o.dump_json(hdfsFullList)
                # error if none? 
                self.assertGreater(len(hdfsFullList),8,"Didn't see more than 8 files in hdfs?")

                key2 = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                time.sleep(5)
                print "Loading from hdfs:", importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilePattern, path=importFolderPath,
                    key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print csvFilePattern, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"
    
                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
                
                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #19
def check_cloud_and_setup_next():
    h2b.browseTheCloud()
    h2o.verify_cloud_size()
    h2o.check_sandbox_for_errors()
    print "Tearing down cloud of size", len(h2o.nodes)
    h2o.tear_down_cloud()
    h2o.clean_sandbox()
    # wait to make sure no sticky ports or anything os-related
    # so let's expand the delay if larger number of jvms
    # 1 second per node seems good
    h2o.verboseprint("Waiting", node_count, "seconds to avoid OS sticky port problem")
    time.sleep(node_count)
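check_cloud_and_setup_next() above relies on a module-level node_count. A minimal sketch, assuming the same h2o and h2b helpers, of a parameterized variant that takes the node count explicitly; the one-second-per-node wait follows the comment above:

    import time
    import h2o
    import h2o_browse as h2b  # assumed alias; the example only shows the short name h2b

    def check_cloud_and_setup_next(node_count):
        h2b.browseTheCloud()
        h2o.verify_cloud_size()
        h2o.check_sandbox_for_errors()
        print "Tearing down cloud of size", len(h2o.nodes)
        h2o.tear_down_cloud()
        h2o.clean_sandbox()
        # roughly 1 second per node to let the OS release sticky ports
        h2o.verboseprint("Waiting", node_count, "seconds to avoid OS sticky port problem")
        time.sleep(node_count)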
Example #20
    def testAll(self):
        try:
            h2o.build_cloud(node_count=2)

            # we don't have the port or ip configuration here
            # that util/h2o.py does? Keep this in synch with spawn_h2o there.
            # also don't have --nosigar here?
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name='+h2o.cloud_name(),
                    '-Dh2o.arg.ip='+h2o.get_ip_address(),
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    # The tests
                    #'hex.GLMGridTest',
                    'hex.HistogramTest',
                    'hex.GLMTest',
                    'hex.KMeansTest',
                    'hex.MinorityClassTest',
                    'hex.rf.RandomForestTest',
                    'hex.rf.RFPredDomainTest',
                    'water.AtomicTest',
                    'water.AutoSerialTest',
                    'water.BitCmpTest',
                    #'water.ConcurrentKeyTest.java',
                    'water.KeyToString',
                    'water.KVTest',
                    #'water.KVSpeedTest',
                    'water.api.RStringTest',
                    'water.exec.ExprTest',
                    'water.exec.RBigDataTest',
                    'water.parser.DatasetCornerCasesTest',
                    'water.parser.ParseCompressedAndXLSTest',
                    'water.parser.ParseFolderTest',
                    'water.parser.ParseProgressTest',
                    'water.parser.ParserTest',
                    'water.parser.RReaderTest',
                    'water.score.ScorePmmlTest',
                    'water.score.ScoreTest'
                    ])

            rc = ps.wait(None)
            out = file(stdout).read()
            err = file(stderr).read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))

        finally:
            h2o.tear_down_cloud()
    def test_import_covtype20x_parse_loop(self):
        csvFilename = "covtype20x.data"
        importFolderPath = "/home/0xdiag/datasets/standard"
        trialMax = 3
        for tryHeap in [4,12]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \
                "then parse 'covtype20x.data'"
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                # this is for our amazon ec hdfs
                # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                hdfs_name_node='10.78.14.235:9000',
                hdfs_version='0.20.2')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandbox_ignore_errors = True

            timeoutSecs = 500
            URI = "s3n://home-0xdiag-datasets"
            s3nKey = URI + "/" + csvFilename
            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                # s3n URI thru HDFS is not typical.
                importHDFSResult = h2o.nodes[0].import_hdfs(URI)
                s3nFullList = importHDFSResult['succeeded']
                ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                # error if none? 
                self.assertGreater(len(s3nFullList),8,"Didn't see more than 8 files in s3n?")

                key2 = csvFilename + "_" + str(trial) + ".hex"
                print "Loading s3n key: ", s3nKey, 'thru HDFS'
                start = time.time()
                parseKey = h2o.nodes[0].parse(s3nKey, key2,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print s3nKey, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Trial #", trial, "completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                      "Otherwise it would just parse the cached key."
                storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                print "Removing", s3nKey
                removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)
                ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #22
    def testCloud(self):
        ports_per_node = 2
        for tryNodes in range(2,8):
            sys.stdout.write('.')
            sys.stdout.flush()

            start = time.time()
            h2o.init(use_this_ip_addr="127.0.0.1", node_count=tryNodes, timeoutSecs=30, retryDelaySecs=2, java_heap_GB=1)
            print "Build cloud of %d in %d secs" % (tryNodes, (time.time() - start)) 

            h2o.verboseprint(h2o.nodes)
            h2o.verify_cloud_size()
            h2o.tear_down_cloud(h2o.nodes)
Example #23
    def test_Cloud(self):
        # FIX! weird timeout H2O exceptions with >8? maybe shouldn't
        # don't know if we care
        ports_per_node = 2
        for tryNodes in range(2,17):
            h2o.verboseprint("Trying cloud of", tryNodes)
            sys.stdout.write('.')
            sys.stdout.flush()

            start = time.time()
            h2o.build_cloud(tryNodes, retryDelaySecs=2, timeoutSecs=max(30,10*tryNodes), java_heap_GB=1)
            print "Built cloud of %d in %d s" % (tryNodes, (time.time() - start)) 
            h2o.verify_cloud_size()
            h2o.tear_down_cloud()
    def test_Nuke(self):
        h2o.build_cloud(1)
        # wait 10 seconds for zombies to latch on?
        print "Waiting 10 secs for unknown number of possible zombies"
        time.sleep(10)

        c = h2o.nodes[0].get_cloud()
        cloudSize = c['cloud_size']
        print "This node thought there was", cloudSize, "nodes in its cloud"

        # FIX! I added shutdown_all to LocalH2O so maybe this is now redundant
        h2o.nodes[0].shutdown_all()

        # this doesn't send a graceful shutdown? but should be tolerant of missing process?
        h2o.tear_down_cloud()
    def test_import_covtype_parse_2jvm_fvec(self):
        csvFilename = "covtype.data"
        importFolderPath = "standard"
        trialMax = 2
        for tryHeap in [1]:
            print "\n", tryHeap,"GB heap, 2 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            h2o.init(2, java_heap_GB=tryHeap)

            for trial in range(trialMax):
                # import each time, because h2o deletes source file after parse
                csvPathname = importFolderPath + "/" + csvFilename
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20)
            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(3)
    def test_F_no_mc_loop(self):
        print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
        allAcceptIptables()
        multicastDropReceiveIptables()
        showIptables()

        for x in range(1,5):
            h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
            h2o_cmd.runRFOnly(parseResult=parseResult, trees=50, timeoutSecs=10)
            h2o.tear_down_cloud()
            h2o.verboseprint("Waiting", nodes_per_host,
                "seconds to avoid OS sticky port problem")
            time.sleep(nodes_per_host)
            print "Trial", x
            sys.stdout.write('.')
            sys.stdout.flush()
    def test_F_no_mc_loop(self):
        print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
        allAcceptIptables()
        multicastDropReceiveIptables()
        showIptables()
        csvPathname = h2o.find_file('smalldata/poker/poker1000')

        for x in range(1,5):
            h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
            h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
            h2o.tear_down_cloud()
            h2o.verboseprint("Waiting", nodes_per_host,
                "seconds to avoid OS sticky port problem")
            time.sleep(nodes_per_host)
            print "Trial", x
            sys.stdout.write('.')
            sys.stdout.flush()
    def test_parse_airline_multi_hdfs(self):
        h2o.beta_features = True
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                # why is 55609 already in use?? 
                h2o_hosts.build_cloud_with_hosts(sandbox_ignore_errors=True, force_tcp=True, java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55604, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname, schema='hdfs', 
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #29
    def test_Cloud(self):
        # FIX! weird timeout H2O exceptions with >8? maybe shouldn't
        # don't know if we care
        base_port = 54500
        ports_per_node = 2
        tryNodes = 5
        for trial in range(10):
            h2o.verboseprint("Trying cloud of", tryNodes)
            sys.stdout.write('.')
            sys.stdout.flush()

            start = time.time()
            h2o_hosts.build_cloud_with_hosts(tryNodes, base_port=base_port, 
                retryDelaySecs=2, timeoutSecs=max(30,10*tryNodes), java_heap_GB=1)
            print "trial #%d: Build cloud of %d in %d secs" % (trial, tryNodes, (time.time() - start))

            h2o.verify_cloud_size()
            h2o.tear_down_cloud()
Example #30
    def testCloud(self):
        baseport = 54300
        ports_per_node = 3
        for tryNodes in range(2,8):
            sys.stdout.write('.')
            sys.stdout.flush()

            start = time.time()
            h2o.build_cloud(use_this_ip_addr="127.0.0.1", base_port=baseport, node_count=tryNodes, timeoutSecs=30)
            print "Build cloud of %d in %d secs" % (tryNodes, (time.time() - start)) 

            h2o.verboseprint(h2o.nodes)
            h2o.verify_cloud_size()
            h2o.tear_down_cloud(h2o.nodes)

            # increment the base_port to avoid sticky ports when we do another
            # we only use two ports now?
            baseport += ports_per_node * tryNodes
Example #31
    def test_import_nflx_parse_loop(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                use_hdfs=True,
                hdfs_name_node='192.168.1.176',
                hdfs_version='cdh3u5')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandbox_ignore_errors = True
            URI = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/datasets/manyfiles-nflx-gz"
            hdfsKey = URI + "/" + csvFilepattern

            timeoutSecs = 500
            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                importHdfsResult = h2o.nodes[0].import_hdfs(URI)
                hdfsFullList = importHdfsResult['succeeded']
                for k in hdfsFullList:
                    key = k['key']
                    # just print the first tile
                    if 'nflx' in key and 'file_1.dat.gz' in key: 
                        # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                        print "example file we'll use:", key

                ### print "hdfsFullList:", h2o.dump_json(hdfsFullList)
                # error if none? 
                self.assertGreater(len(hdfsFullList),8,"Didn't see more than 8 files in hdfs?")

                key2 = csvFilename + "_" + str(trial) + ".hex"
                print "Loading hdfs key: ", hdfsKey
                start = time.time()
                parseKey = h2o.nodes[0].parse(hdfsKey, key2,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print hdfsKey, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Deleting key in H2O so we get it from hdfs", \
                      "Otherwise it would just parse the cached key."

                storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # "key": "hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                # have to do the pattern match ourself, to figure out what keys to delete
                # we're deleting the keys in the initial import. We leave the keys we created
                # by the parse. We use unique dest keys for those, so no worries.
                # Leaving them is good because things fill up! (spill)
                for k in hdfsFullList:
                    deleteKey = k['key']
                    if csvFilename in deleteKey and not ".hex" in deleteKey:
                        print "Removing", deleteKey
                        removeKeyResult = h2o.nodes[0].remove_key(key=deleteKey)
                        ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #32
 def test_B(self):
     print "\nwith flatfile, Build allowing anything"
     allAcceptIptables()
     showIptables()
     h2o.init(nodes_per_host, use_flatfile=True)
     h2o.tear_down_cloud()
Example #33
 def tearDownClass(cls):
     h2o.tear_down_cloud()
Example #34
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000
        avgMichalSize = 116561140
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize = 183
        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800),
                ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz",
                 100 * avgMichalSize, 1800),
                ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz",
                 200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
            ]

        if 1 == 1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600),
                # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600),
                # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
                #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz",
                 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz",
                 120 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz",
                 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz",
                 160 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                # for now, take too long on 2x100GB heap on 164
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz",
                 50 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
                # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ]
        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/standard'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz",
                 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz",
                 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz",
                 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz",
                 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz",
                 "file_100.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz",
                 "file_200.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz",
                 "file_300.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz",
                 100 * avgMichalSize, 1200),
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
        jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
        jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
              ' -Dcom.sun.management.jmxremote.authenticate=false' + \
              ' -Dcom.sun.management.jmxremote.ssl=false'  + \
              ' -Dcom.sun.management.jmxremote' + \
              ' -Dcom.sun.management.jmxremote.local.only=false'
        jea = ' -Dlog.printAll=true'

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(
                    2,
                    java_heap_GB=tryHeap,
                    base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(
                    base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)
                importFullList = importFolderResult['files']
                importFailList = importFolderResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                    importFailList)
                # creates csvFilename.hex from file in importFolder dir

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilepattern,
                    importFolderPath,
                    key2=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        parseKey = h2i.parseImportFolderFile(
                            None,
                            csvFilepattern,
                            importFolderPath,
                            key2=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i + 2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        parseKey = h2i.parseImportFolderFile(
                            None,
                            csvFilepattern,
                            importFolderPath,
                            key2=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB,
                        csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response'][
                    'time']
                print "Parse result['destination_key']:", parseKey[
                    'destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.columnInfoFromInspect(
                        parseKey['destination_key'],
                        exceptionOnMissingValues=False)

                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [
                            3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19,
                            20, 424, 425, 426, 540, 541, 378
                    ]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x,
                        'y': 378,
                        'case': 15,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                             timeoutSecs=timeoutSecs,
                                             **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB,
                        csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
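                # delete the imported csv source keys so the next trial has to re-import them from the folder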
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
Example #35
0
 def tearDownClass(cls):
     h2o.tear_down_cloud()
     allAcceptIptables()
     showIptables()
Example #36
0
    # column Cancelled 21 type: int
    # column CancellationCode 22 type: enum enum_domain_size: 5 num_missing_values: 38955823
    # column Diverted 23 type: int
    # column CarrierDelay 24 type: int num_missing_values: 123534969
    # column WeatherDelay 25 type: int num_missing_values: 123534969
    # column NASDelay 26 type: int num_missing_values: 123534969
    # column SecurityDelay 27 type: int num_missing_values: 123534969
    # column LateAircraftDelay 28 type: int num_missing_values: 123534969
    # column IsArrDelayed 29 type: enum enum_domain_size: 2
    # column IsDepDelayed 30 type: enum enum_domain_size: 2

    # run allstate
    run_glms(files['allstate'],[{'y':'Claim_Amount','lambda':l,'alpha':a,'family':'gaussian','n_folds':1} 
                # for l in (1e-4,1e-5) 
                # for a in (1.0,0.5,0.0)])
                for l in [1e-4]
                for a in [0.5]])
    # was:
    # x = '0,1,2,3,4,5,6,7,8,9,12,16,17,18'
    x = '0,1,2,3,5,7,8,9,16,17'
    # run airlines
    run_glms(files['airlines'],[{'y':'IsArrDelayed','x':x,'lambda':l,'alpha':a,'family':'gaussian','n_folds':1,'case':1}
                # for l in (0.035,0.025,1e-2,5e-3,1e-3,5e-4,1e-4,5e-5,1e-5,1e-8)
                # for a in (1.0,0.5,0.0)])
                for l in [1e-4]
                for a in [0.5]])

    h2o.tear_down_cloud()


Example #37
0
    def tearDownClass(cls):
        if not h2o.browse_disable:
            # time.sleep(500000)
            pass

        h2o.tear_down_cloud(h2o.nodes)
 def test_A(self):
     print "\nno flatfile, Build allowing anything"
     allAcceptIptables()
     showIptables()
     h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=False)
     h2o.tear_down_cloud()
 def tearDownClass(cls):
     # the node state is gone when we tear down the cloud, so pass the ignore here also.
     h2o.tear_down_cloud(sandbox_ignore_errors=True)
Example #40
0
 def tearDownClass(cls):
     ### time.sleep(3600)
     h2o.tear_down_cloud()
Example #41
0
    def test_benchmark_import(self):
        covtype200xSize = 15033863400

        csvFilenameList = [
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
        ]

        trialMax = 1
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            h2o.init(2, java_heap_GB=tryHeap, enable_benchmark_log=True)

            for trial in range(trialMax):
                csvPathname = importFolderPath + "/" + csvFilepattern

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(
                    bucket=bucket,
                    path=csvPathname,
                    schema='local',
                    hex_key=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
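                    # note: totalBytes2 / totalBytes3 are never set in this test, so this
                    # branch would fail if noPoll were flipped to True (it is False above)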
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
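                    # rough sanity check: covtype200x is ~15,034 MB, so a parse that used the
                    # whole 700 s timeout would still come in around 21 MB/sec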
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print "Parse result['destination_key']:", parseResult[
                    'destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.check_enums_from_inspect(parseResult)

                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern

                #**********************************************************************************
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(54)  # don't include the output column
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x,
                        'y': 54,
                        'case': 1,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult,
                                         timeoutSecs=timeoutSecs,
                                         **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                h2o_cmd.checkKeyDistribution()
                h2o.tear_down_cloud()

                sys.stdout.write('.')
                sys.stdout.flush()
    def test_parse_10k_files(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn.csv.gz"
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10
        maxFilenum = 10000
        for filenum in range(maxFilenum):
            rowData = rand_rowData()
            filePrefix = "%04d" % filenum
            csvPathname = SYNDATASETS_DIR + '/' + filePrefix + "_" + csvFilename
            write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData)

        # every generated file has the same number of rows, so the size of the last
        # file written is a reasonable stand-in for the average file size
        avgFileSize = os.path.getsize(csvPathname)

        importFolderPath = SYNDATASETS_DIR
        print "\nimportFolderPath:", importFolderPath
        csvFilenameList = [
            # try one thousand files first
            ("*[1][0-9][0-9][0-9]_syn.csv.gz", "syn_all.1000.csv", maxFilenum * avgFileSize, 1200),
            # try two thousand
            ("*[1-2][0-9][0-9][0-9]_syn.csv.gz", "syn_all.2000.csv", maxFilenum * avgFileSize, 1200),
            ]
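        # note: maxFilenum * avgFileSize counts all 10,000 generated files, so these byte totals
        # (and the MB/sec numbers derived from them) overstate what each glob pattern actually parses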

        trialMax = 1
        tryHeap = 4
        DO_GLM = True
        noPoll = False
        benchmarkLogging = ['cpu','disk', 'iostats'] # , 'jstack'
        benchmarkLogging = ['cpu','disk']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            h2o.init(3,java_heap_GB=tryHeap, enable_benchmark_log=True)
            ### h2b.browseTheCloud()

            # don't let the config json redirect import folder to s3 or s3n, because
            # we're writing to the syn_datasets locally. (just have to worry about node 0's copy of this state)
            print "This test creates files in syn_datasets for import folder\n" + \
                "so h2o and python need to be same machine"
            h2o.nodes[0].redirect_import_folder_to_s3_path = False
            h2o.nodes[0].redirect_import_folder_to_s3n_path = False

            for trial in range(trialMax):
                # nice to have the list of what got imported, so we delete "just that" down below
                # doing this just so we can see what we import
                (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")

                importFullList = importResult['files']
                print "importFullList:", importFullList
                importFailList = importResult['fails']
                print "importFailList:", importFailList
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                start = time.time()
                parseResult = h2i.import_parse(path=importFolderPath+"/*",
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
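                    # note: totalBytes2 / totalBytes3 are never set in this test, so this
                    # branch would fail if noPoll were flipped to True (it is False above)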
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    h2o_cmd.get_columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
                        
                print "\n" + csvFilepattern

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    GLMkwargs = {'y': 0, 'case': 1, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

                h2o.tear_down_cloud()
                if not h2o.localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Example #43
0
 def tearDownClass(cls):
     pool.close()
     # pool.join()
     h2o.tear_down_cloud()
Example #44
0
 def test_C_build_cloud_relaxed_1(self):
     for trials in range(1):
         h2o.init(1, java_heap_GB=1, conservative=False)
         h2o.verify_cloud_size()
         h2o.tear_down_cloud()
         time.sleep(5)
Example #45
0
 def tearDownClass(cls):
     # wait while I inspect things
     # time.sleep(1500)
     h2o.tear_down_cloud()
Example #46
0
 def tearDownClass(cls):
     ### h2o.sleep(800)
     h2o.tear_down_cloud()
    def test_parse_airline_multi_hdfs_many(self):

        h2o.beta_features = True
        # default
        csvFilename = "hex_10"
        csvFilePattern = '*'  # all files in the folder
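        # (csvFilePattern is re-set per trial below so each parse handles a single file)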

        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap,
                                random_udp_drop=RANDOM_UDP_DROP,
                                base_port=55930,
                                use_hdfs=True,
                                hdfs_name_node=NAME_NODE,
                                hdfs_version=VERSION)
            else:
                h2o_hosts.build_cloud_with_hosts(
                    java_heap_GB=tryHeap,
                    random_udp_drop=RANDOM_UDP_DROP,
                    base_port=55600,
                    disable_assertion=True,
                    use_hdfs=True,
                    hdfs_name_node=NAME_NODE,
                    hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname,
                                          schema='hdfs',
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"

                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"

                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern,
                                             hex_key=hex_key,
                                             noPoll=True,
                                             delete_on_done=0,
                                             timeoutSecs=timeoutSecs,
                                             retryDelaySecs=10,
                                             pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
 def tearDownClass(cls):
     ## print "sleeping 3600"
     # h2o.sleep(3600)
     h2o.tear_down_cloud()
Example #49
0
 def tearDownClass(cls):
     # this is for safety after error, plus gets us the grep of stdout/stderr for errors
     h2o.tear_down_cloud()
    def test_parse_nflx_loop_s3n_hdfs(self):
        DO_GLM = True
        DO_GLMGRID = False
        USE_S3 = False
        noPoll = False
        benchmarkLogging = ['jstack', 'iostats']
        benchmarkLogging = ['iostats']
        benchmarkLogging = []
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        synSize = 183

        csvFilenameList = [
            (["manyfiles-nflx-gz"], "file_1[0-9][0-9].dat.gz",
             "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[1-2][0-5][0-9].dat.gz",
             "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[1-2][0-6][0-9].dat.gz",
             "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[1-2][0-7][0-9].dat.gz",
             "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[1-2][0-8][0-9].dat.gz",
             "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[12][0-9][0-9].dat.gz",
             "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz",
             "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz",
             "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_[123][0-9][0-9].dat.gz",
             "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_1.dat.gz", "file_1.dat.gz",
             1 * avgMichalSize, 300),
            (["manyfiles-nflx-gz"], "file_[2][0-9].dat.gz", "file_10.dat.gz",
             10 * avgMichalSize, 700),
            (["manyfiles-nflx-gz"], "file_[34][0-9].dat.gz", "file_20.dat.gz",
             20 * avgMichalSize, 900),
            (["manyfiles-nflx-gz"], "file_[5-9][0-9].dat.gz",
             "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_1[0-4][0-9].dat.gz",
             "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_1[0-9][0-9].dat.gz",
             "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "file_2[0-9][0-9].dat.gz",
             "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use
            (["A-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz",
             "file_A_200_x55.dat.gz", 200 * (avgMichalSize / 2), 7200),
            (["A-800-manyfiles-nflx-gz",
              "B-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz",
             "file_A_400_x55.dat.gz", 400 * (avgMichalSize / 2), 7200),
            ([
                "A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz",
                "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"
            ], "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz",
             800 * (avgMichalSize / 2), 7200),
        ]

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        pollTimeoutSecs = 180
        retryDelaySecs = 10
        # use i to forward reference in the list, so we can do multiple outstanding parses below
        for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):

            bucket = "home-0xdiag-datasets"
            ## for tryHeap in [54, 28]:
            h2oPerNode = 1
            # h1.4xlarge 60.5GB dram
            for tryHeap in [28]:
                if USE_S3:
                    protocol = "s3"
                else:
                    protocol = "s3n"
                print "\n", tryHeap, "GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"

                # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
                # jea = "-Dh2o.find-ByteBuffer-leaks=true"
                h2o_hosts.build_cloud_with_hosts(
                    h2oPerNode,
                    java_heap_GB=tryHeap,
                    # java_extra_args=jea,
                    enable_benchmark_log=True,
                    timeoutSecs=120,
                    retryDelaySecs=10,
                    # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                    # this is for our amazon ec hdfs
                    # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                    hdfs_name_node='10.78.14.235:9000',
                    hdfs_version='0.20.2')

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandbox_ignore_errors = True

                for trial in range(trialMax):
                    # import a list of folders, one at a time (hdfs import can't take pattern match
                    # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                    # too slow
                    for csvFolder in csvFolderList:
                        # USE_S3 only changes the protocol string (s3 vs s3n),
                        # so the URI is built the same way in either case
                        URI = protocol + "://" + bucket + "/" + csvFolder + "/"

                        # since we delete the key, we have to re-import every iteration, to get it again
                        # s3n URI thru HDFS is not typical.
                        if USE_S3:
                            importResult = h2o.nodes[0].import_s3(bucket)
                        else:
                            importResult = h2o.nodes[0].import_hdfs(URI)

                        foundKeys = 0
                        for s in importResult['succeeded']:
                            # just print the first file
                            # if 'nflx' in key and 'file_1.dat.gz' in key:
                            if csvFilepattern in s['key']:
                                # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                                print "example file we'll use:", s['key']
                                break
                            else:
                                pass
                            foundKeys += 1

                        ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                        # error if none?
                        self.assertGreater(
                            foundKeys, 8,
                            "Didn't see more than 8 files in s3n?")

                    s3nKey = csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    start = time.time()
                    parseKey = h2o.nodes[0].parse(
                        s3nKey,
                        key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    if noPoll:
                        if (i + 1) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes2,
                             timeoutSecs) = csvFilenameList[i + 1]
                            s3nKey = URI + csvFilepattern
                            key2 = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", s3nKey, "to", key2
                            parse2Key = h2o.nodes[0].parse(
                                s3nKey,
                                key2,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                        if (i + 2) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes3,
                             timeoutSecs) = csvFilenameList[i + 2]
                            s3nKey = URI + csvFilepattern
                            key2 = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", s3nKey, "to", key2
                            parse3Key = h2o.nodes[0].parse(
                                s3nKey,
                                key2,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print s3nKey, 'parse time:', parseKey['response']['time']
                    print "parse result:", parseKey['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(
                            pattern=csvFilename,
                            timeoutSecs=timeoutSecs,
                            benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes / 1e6) / elapsed
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, csvFilepattern,
                            csvFilename, fileMBS, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    # column 378 is the response column in the nflx data
                    y = 378
                    if not noPoll:
                        x = h2o_glm.goodXFromColumnInfo(
                            y,
                            key=parseKey['destination_key'],
                            timeoutSecs=300)

                    #**********************************************************************************
                    # Do GLM too
                    # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                    if DO_GLM or DO_GLMGRID:
                        # these are all the columns that are enums in the dataset...too many for GLM!
                        x = range(542)  # don't include the output column
                        # remove the output too! (378)
                        for i in [
                                3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18,
                                19, 20, 424, 425, 426, 540, 541, y
                        ]:
                            x.remove(i)
                        x = ",".join(map(str, x))

                        if DO_GLM:
                            algo = 'GLM'
                            GLMkwargs = {
                                'x': x,
                                'y': y,
                                'case': 15,
                                'case_mode': '>',
                                'family': 'binomial',
                                'max_iter': 10,
                                'n_folds': 1,
                                'alpha': 0.2,
                                'lambda': 1e-5
                            }
                            start = time.time()
                            glm = h2o_cmd.runGLMOnly(
                                parseKey=parseKey,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging,
                                **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLM(self, glm, None,
                                                   **GLMkwargs)

                        else:
                            algo = 'GLMGrid'
                            GLMkwargs = {
                                'x': x,
                                'y': y,
                                'case': 15,
                                'case_mode': '>',
                                'family': 'binomial',
                                'max_iter': 10,
                                'n_folds': 1,
                                'beta_epsilon': 1e-4,
                                'lambda': '1e-4',
                                'alpha': '0,0.5',
                                'thresholds': '0.5'
                            }
                            start = time.time()
                            glm = h2o_cmd.runGLMGridOnly(
                                parseKey=parseKey,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging,
                                **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLMGrid(self, glm, None,
                                                       **GLMkwargs)

                        h2o.check_sandbox_for_errors()
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, algo, csvFilepattern,
                            csvFilename, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    #**********************************************************************************
                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    ### storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    h2o_cmd.checkKeyDistribution()
                    h2o_cmd.deleteCsvKey(csvFilename, importResult)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
Example #51
0
 def tearDownClass(cls):
     # if we got here by time out exception waiting for a job, we should clear
     # all jobs, if we're leaving h2o cloud up, and going to run another test
     #h2o.cancelAllJobs() 
     h2o.tear_down_cloud()