Beispiel #1
0
    def test_parse_cust(self):
        # run as user 0xcustomer to get access (with .json config and ssh key file specified)
        importFolderPath = '/mnt/0xcustomer-datasets'
        pollTimeoutSecs = 120
        retryDelaySecs = 30
        timeoutSecs = 300
        
        (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
        importFileList = importResult['files']
        importFailList = importResult['fails']
        importKeyList = importResult['keys']
        importDelList = importResult['dels']

        if len(importDelList)!=0:
            raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))

        if len(importFileList)<MINFILES:
            raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))

        if len(importKeyList)<MINFILES:
            raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))

        if len(importFailList)!=0:
            raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))


        # only parse files with .csv or .tsv in their name (no dirs like that?)
        goodKeyList = [key for key in importKeyList if ('.csv' in key  or '.tsv' in key)]
        trial = 0
        # just do 1?
        for i, importKey in enumerate(random.sample(goodKeyList,3)):
            print "importKey:", importKey
            trial +=1

            start = time.time() 
            # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
            # force header=0..should mean headers get treated as NAs
            parseResult = h2i.parse_only(pattern=importKey, header=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            origKey = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=origKey)
            h2o_cmd.infoFromInspect(inspect, origKey)

            execExpr = 'newKey = '+origKey+'[1,1]'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            newParseKey = {'destination_key': 'newKey'}

            h2o_cmd.checkKeyDistribution()
            h2o.nodes[0].remove_key(key=origKey)
            # a key isn't created for a scalar
            # h2o.nodes[0].remove_key(key='newKey')
        
        self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
Beispiel #2
0
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 

if __name__ == '__main__':
    h2o.unit_main()
Beispiel #3
0
    def test_parse_nflx_loop_s3n_hdfs(self):
        DO_GLM = True
        DO_GLMGRID = False
        USE_S3 = False
        noPoll = False
        benchmarkLogging = ['jstack','iostats']
        benchmarkLogging = ['iostats']
        benchmarkLogging = []
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        synSize = 183

        csvFilenameList = [
            (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
            (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
            (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use    
            (["A-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
            (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
            (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
        ]

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        pollTimeoutSecs = 180
        retryDelaySecs = 10
        # use i to forward reference in the list, so we can do multiple outstanding parses below
        for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):

            bucket = "home-0xdiag-datasets"
            ## for tryHeap in [54, 28]:
            h2oPerNode = 1
            # h1.4xlarge 60.5GB dram
            for tryHeap in [28]:
                if USE_S3:
                    protocol = "s3"
                else:
                    protocol = "s3n"
                print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
                
                # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
                # jea = "-Dh2o.find-ByteBuffer-leaks=true"
                h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10)
                # java_extra_args=jea,

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandboxIgnoreErrors = True

                for trial in range(trialMax):
                    # import a list of folders, one at a time (hdfs import can't take pattern match
                    # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                    # too slow
                    for csvFolder in csvFolderList:
                        # since we delete the key, we have to re-import every iteration, to get it again
                        # s3n URI thru HDFS is not typical.
                        if USE_S3:
                            (importResult, importPattern) = h2i.import_only(
                                bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3')
                        else:
                            (importResult, importPattern) = h2i.import_only(
                                bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs')

                        foundKeys = 0
                        for s in importResult['succeeded']:
                            # just print the first tile
                            # if 'nflx' in key and 'file_1.dat.gz' in key: 
                            if csvFilepattern in s['key']:
                                # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                                print "example file we'll use:", s['key']
                                break
                            else:
                                pass
                            foundKeys += 1

                        ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                        # error if none? 
                        self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?")

                    src_key = csvFilepattern
                    hex_key = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", src_key, "to", hex_key
                    start = time.time()
                    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                        timeoutSecs=timeoutSecs, 
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    if noPoll:
                        if (i+1) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                            src_key = csvFilepattern
                            hex_key = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", src_key, "to", hex_key
                            parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                        if (i+2) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                            src_key = URI + csvFilepattern
                            hex_key = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", src_key, "to", hex_key
                            parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilepattern,
                                timeoutSecs=timeoutSecs, 
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print "parse result:", parseResult['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(pattern=csvFilename, 
                            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes/1e6)/elapsed
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    y = 378
                    if not noPoll:
                        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)


                    #**********************************************************************************
                    # Do GLM too
                    # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                    if DO_GLM or DO_GLMGRID:
                        # these are all the columns that are enums in the dataset...too many for GLM!
                        x = range(542) # don't include the output column
                        # remove the output too! (378)
                        for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                            x.remove(i)
                        x = ",".join(map(str,x))

                        if DO_GLM:
                            algo = 'GLM'
                            GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                            start = time.time()
                            glm = h2o_cmd.runGLM(parseResult=parseResult, 
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)

                        else:
                            algo = 'GLMGrid'
                            GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                                'lambda': '1e-4',
                                'alpha': '0,0.5',
                                'thresholds': '0.5'
                                }
                            start = time.time()
                            glm = h2o_cmd.runGLMGrid(parseResult=parseResult,
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                        h2o.check_sandbox_for_errors()
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    #**********************************************************************************
                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    ### storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    h2o_cmd.checkKeyDistribution()
                    h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
Beispiel #4
0
    def sub_c3_fvec_long(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140
        bucket = "home-0xdiag-datasets"
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = "manyfiles-nflx-gz"
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes) == 1:
            csvFilenameList = [("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600)]
        else:
            csvFilenameList = [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 1800),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ["cpu", "disk", "network"]
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket,
                path=csvPathname,
                schema="local",
                hex_key=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging,
            )
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "Parse result['destination_key']:", parseResult["destination_key"]
            h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]:
                    x.remove(i)
                    ignore_x.append(i)
                x.remove(378)

                # add one since we are no longer 0 based offset
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    "ignored_cols": ignore_x,
                    "response": "C379",
                    "max_iter": 4,
                    "n_folds": 1,
                    "family": "binomial",
                    "alpha": 0.2,
                    "lambda": 1e-5,
                }

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult["destination_key"]
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ("C379", "C379", 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                aHack = {"destination_key": "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    ignore_x = []
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                        x.remove(i)
                        ignore_x.append(i)
                    x.remove(378)

                    # add one since we are no longer 0 based offset
                    x = ",".join(map(lambda x: "C" + str(x+1), x))
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    execExpr="A.hex=%s" % parseResult['destination_key']
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)

                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)

                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Beispiel #6
0
    def sub_c2_rel_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140 
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes)==1:
            csvFilenameList= [
                ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600),
            ]
        else:
            csvFilenameList= [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    ignore_x = []
                    # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]:
                        x.remove(i)
                        ignore_x.append(i)

                    # increment by one, because we are no long zero offset!
                    x = ",".join(map(lambda x: "C" + str(x+1), x))
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'family': 'binomial',
                        'x': x,
                        'y': 'C379', 
                        'case': 15, 
                        'case_mode': '>',
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Beispiel #7
0
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            # double import still causing problems?
            # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            # importFullList = importResult['files']
            # importFailList = importResult['fails']
            # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename +
                                     " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs,
                                           benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541
                ]:
                    x.remove(i)
                    ignore_x.append(i)

                # plus 1 because we are no longer 0 offset
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'family': 'binomial',
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'family': 'binomial',
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Beispiel #8
0
    def test_parse_10k_files(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn.csv.gz"
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10
        maxFilenum = 10000
        for filenum in range(maxFilenum):
            rowData = rand_rowData()
            filePrefix = "%04d" % filenum
            csvPathname = SYNDATASETS_DIR + '/' + filePrefix + "_" + csvFilename
            write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData)

        avgFileSize = os.path.getsize(csvPathname)

        importFolderPath = SYNDATASETS_DIR
        print "\nimportFolderPath:", importFolderPath
        csvFilenameList = [
            # try one thousand files first
            ("*[1][0-9][0-9][0-9]_syn.csv.gz", "syn_all.1000.csv",
             maxFilenum * avgFileSize, 1200),
            # try two thousand
            ("*[1-2][0-9][0-9][0-9]_syn.csv.gz", "syn_all.2000.csv",
             maxFilenum * avgFileSize, 1200),
        ]

        trialMax = 1
        base_port = 54321
        tryHeap = 4
        DO_GLM = True
        noPoll = False
        benchmarkLogging = ['cpu', 'disk', 'iostats']  # , 'jstack'
        benchmarkLogging = ['cpu', 'disk']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(3,
                                java_heap_GB=tryHeap,
                                base_port=base_port,
                                enable_benchmark_log=True)
            else:
                h2o_hosts.build_cloud_with_hosts(1,
                                                 java_heap_GB=tryHeap,
                                                 base_port=base_port,
                                                 enable_benchmark_log=True)
            ### h2b.browseTheCloud()

            # don't let the config json redirect import folder to s3 or s3n, because
            # we're writing to the syn_datasets locally. (just have to worry about node 0's copy of this state)
            print "This test creates files in syn_datasets for import folder\n" + \
                "so h2o and python need to be same machine"
            h2o.nodes[0].redirect_import_folder_to_s3_path = False
            h2o.nodes[0].redirect_import_folder_to_s3n_path = False

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)
                importFullList = importFolderResult['files']
                print "importFullList:", importFullList
                importFailList = importFolderResult['fails']
                print "importFailList:", importFailList
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                    importFailList)

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilepattern,
                    importFolderPath,
                    key2=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response'][
                    'time']
                print "Parse result['destination_key']:", parseKey[
                    'destination_key']

                # BUG here?
                if not noPoll:
                    h2o_cmd.get_columnInfoFromInspect(
                        parseKey['destination_key'],
                        exceptionOnMissingValues=True)

                print "\n" + csvFilepattern

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    GLMkwargs = {
                        'y': 0,
                        'case': 1,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                             timeoutSecs=timeoutSecs,
                                             **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)

                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
Beispiel #9
0
    def test_parse_10k_files(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn.csv.gz"
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10
        maxFilenum = 10000
        for filenum in range(maxFilenum):
            rowData = rand_rowData()
            filePrefix = "%04d" % filenum
            csvPathname = SYNDATASETS_DIR + '/' + filePrefix + "_" + csvFilename
            write_syn_dataset_gz(csvPathname, totalRows, headerData, rowData)

        avgFileSize = os.path.getsize(csvPathname)

        importFolderPath = SYNDATASETS_DIR
        print "\nimportFolderPath:", importFolderPath
        csvFilenameList = [
            # try one thousand files first
            ("*[1][0-9][0-9][0-9]_syn.csv.gz", "syn_all.1000.csv", maxFilenum * avgFileSize, 1200),
            # try two thousand
            ("*[1-2][0-9][0-9][0-9]_syn.csv.gz", "syn_all.2000.csv", maxFilenum * avgFileSize, 1200),
            ]

        trialMax = 1
        base_port = 54321
        tryHeap = 4
        DO_GLM = True
        noPoll = False
        benchmarkLogging = ['cpu','disk', 'iostats'] # , 'jstack'
        benchmarkLogging = ['cpu','disk']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(3,java_heap_GB=tryHeap, base_port=base_port,
                    enable_benchmark_log=True)
            else:
                h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, 
                    enable_benchmark_log=True)
            ### h2b.browseTheCloud()

            # don't let the config json redirect import folder to s3 or s3n, because
            # we're writing to the syn_datasets locally. (just have to worry about node 0's copy of this state)
            print "This test creates files in syn_datasets for import folder\n" + \
                "so h2o and python need to be same machine"
            h2o.nodes[0].redirect_import_folder_to_s3_path = False
            h2o.nodes[0].redirect_import_folder_to_s3n_path = False

            for trial in range(trialMax):
                # nice to have the list of what got imported, so we delete "just that" down below
                # doing this just so we can see what we import
                (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")

                importFullList = importResult['files']
                print "importFullList:", importFullList
                importFailList = importResult['fails']
                print "importFailList:", importFailList
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                start = time.time()
                parseResult = h2i.import_parse(path=importFolderPath+"/*",
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    h2o_cmd.get_columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
                        
                print "\n" + csvFilepattern

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    GLMkwargs = {'y': 0, 'case': 1, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Beispiel #10
0
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000 
        avgMichalSize = 116561140 
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize =  183
        if 1==1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),

                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),

                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),

                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),

                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),

                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),

                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk' 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, 
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(None, importFolderPath)
                importFullList = importFolderResult['files']
                importFailList = importFolderResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
                # creates csvFilename.hex from file in importFolder dir 

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                    key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.check_enums_from_inspect(parseKey)
                        
                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                        x.remove(i)
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000 
        avgMichalSize = 116561140 
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize =  183
        if 1==1:
            # importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            # importFolderPathFull = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            # importFolderPath = 'more1_1200_link'
            importFolderPath = 'manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            # this pattern from browser correctly does 100 files, 1M rowsj
            # source_key=*/home/0xdiag/datasets/manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz
            csvFilenameAll = [
                ("file_1.dat.gz", "file_1_A.dat.gz", 1 * avgMichalSize, 3600),
                ("file_[1-2].dat.gz", "file_2_A.dat.gz", 2 * avgMichalSize, 3600),
                ("file_[1-4].dat.gz", "file_4_A.dat.gz", 4 * avgMichalSize, 3600),
            ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk' 'network']
        pollTimeoutSecs = 180
        retryDelaySecs = 10

        localhost = h2o.decide_if_localhost()
        if localhost:
            tryHeap = 4
            h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True)
        else:
            tryHeap = 28
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port, 
                enable_benchmark_log=True)

        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2
            h2o.beta_features = True

            for trial in range(trialMax):
                # (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")

                if DO_IMPORT_CHECK:
                    for i in range(2):
                        csvPathname = importFolderPath + "/" + csvFilepattern
                        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', 
                                path=csvPathname, schema='local', timeoutSecs=timeoutSecs)

                        importFullList = importResult['files']
                        importFailList = importResult['fails']
                        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
                        # creates csvFilename.hex from file in importFolder dir 

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                csvPathname = importFolderPath + "/" + csvFilepattern
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(path=csvPathname,
                            hex_key=csvFilename + ".hex", 
                            timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                    if (i+2) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        parseResult = h2i.import_parse(path=csvPathname,
                            hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)


                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    pass
                    # We should be able to see the parse result?
                    # h2o_cmd.check_enums_from_inspect(parseResult)
                        
                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                # h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRF(trees=1,depth=25,parseResult=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                        x.remove(i)
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
                h2o.nodes[0].remove_all_keys()

                ### time.sleep(3600)

                ### h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Beispiel #12
0
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)
                h2o_cmd.checkKeyDistribution()

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'B.hex=A.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'C.hex=B.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'D.hex=C.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()
Beispiel #13
0
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = "home-0xdiag-datasets"
        importFolderPath = "manyfiles-nflx"
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ["cpu", "disk", "network"]
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
                importFullList = importResult["files"]
                importFailList = importResult["fails"]
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket,
                path=csvPathname,
                schema="local",
                hex_key="A.hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging,
            )
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "Parse result['destination_key']:", parseResult["destination_key"]
            h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # remove the output too! (378)
                ignore_x = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    "ignored_cols": ignore_x,
                    "response": "C379",
                    "max_iter": 10,
                    "n_folds": 1,
                    "family": "binomial",
                    "alpha": 0.2,
                    "lambda": 1e-5,
                }

                # convert to binomial
                # execExpr="A.hex=%s" % parseResult['destination_key']
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = "A.hex[,378+1]=(A.hex[,378+1]>15)"
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                aHack = {"destination_key": "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Beispiel #14
0
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)
                h2o_cmd.checkKeyDistribution()

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'B.hex=A.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'C.hex=B.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'D.hex=C.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()
Beispiel #15
0
    def test_benchmark_import(self):
        covtype200xSize = 15033863400

        csvFilenameList = [
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            ]

        trialMax = 1
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        benchmarkLogging = ['cpu', 'disk' 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10
        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port,
                    enable_benchmark_log=True)
            else:
                h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap/2, base_port=base_port, 
                    enable_benchmark_log=True)

            for trial in range(trialMax):
                csvPathname = "/home/0xdiag/datasets/standard/" + csvFilepattern

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                    key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.check_enums_from_inspect(parseKey)
                        
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern

                #**********************************************************************************
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(54) # don't include the output column
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 54, 'case': 1, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                h2o_cmd.checkKeyDistribution()
                h2o.tear_down_cloud()

                sys.stdout.write('.')
                sys.stdout.flush() 
    def test_parse_manyfiles_fvec(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000 
        avgMichalSize = 116561140 
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize =  183
        if 1==1:
            # importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            # importFolderPathFull = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            # importFolderPath = 'more1_1200_link'
            importFolderPath = 'manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            # this pattern from browser correctly does 100 files, 1M rowsj
            # source_key=*/home/0xdiag/datasets/manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz
            csvFilenameAll = [
                ("file_1.dat.gz", "file_1_A.dat.gz", 1 * avgMichalSize, 3600),
                ("file_[1-2].dat.gz", "file_2_A.dat.gz", 2 * avgMichalSize, 3600),
                ("file_[1-4].dat.gz", "file_4_A.dat.gz", 4 * avgMichalSize, 3600),
            ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk' 'network']
        pollTimeoutSecs = 180
        retryDelaySecs = 10

        tryHeap = 6
        h2o.init(2, java_heap_GB=tryHeap, enable_benchmark_log=True)

        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?

            for trial in range(trialMax):
                # (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")

                if DO_IMPORT_CHECK:
                    for i in range(2):
                        csvPathname = importFolderPath + "/" + csvFilepattern
                        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', 
                                path=csvPathname, schema='local', timeoutSecs=timeoutSecs)

                        importFullList = importResult['files']
                        importFailList = importResult['fails']
                        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
                        # creates csvFilename.hex from file in importFolder dir 

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                csvPathname = importFolderPath + "/" + csvFilepattern
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(path=csvPathname,
                            hex_key=csvFilename + ".hex", 
                            timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                    if (i+2) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        parseResult = h2i.import_parse(path=csvPathname,
                            hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)


                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    pass
                    # We should be able to see the parse result?
                    # h2o_cmd.check_enums_from_inspect(parseResult)
                        
                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                # h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRF(trees=1,depth=25,parseResult=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                        x.remove(i)
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
                h2o.nodes[0].remove_all_keys()

                ### time.sleep(3600)

                ### h2o.tear_down_cloud()
                if not h2o.localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Beispiel #17
0
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                # double import still causing problems?
                # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                # importFullList = importResult['files']
                # importFailList = importResult['fails']
                # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # remove the output too! (378)
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'family': 'binomial',
                        'response': 'C379', 
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    # convert to binomial
                    execExpr="A.hex=%s" % parseResult['destination_key']
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Beispiel #18
0
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # output 378 can't be in this
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 10, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    # execExpr="A.hex=%s" % parseResult['destination_key']
                    # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000 
        avgMichalSize = 116561140 
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize =  183
        if 1==0:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), 
                ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), 
                ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), 
                # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
            ]

        if 1==1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), 
                # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), 
                # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), 
                # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), 
                #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), 
                # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),

                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),

                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),

                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),

                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),

                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),

                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                # for now, take too long on 2x100GB heap on 164
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        if 1==0:
            importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600),
            ]

        if 1==0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
                # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ]
        if 1==0: 
            importFolderPath = '/home/0xdiag/datasets/standard'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),

                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200),
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
                ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
        jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
        jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
              ' -Dcom.sun.management.jmxremote.authenticate=false' + \
              ' -Dcom.sun.management.jmxremote.ssl=false'  + \
              ' -Dcom.sun.management.jmxremote' + \
              ' -Dcom.sun.management.jmxremote.local.only=false'
        jea = ' -Dlog.printAll=true'


        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(base_port=base_port, 
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(None, importFolderPath)
                importFullList = importFolderResult['files']
                importFailList = importFolderResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
                # creates csvFilename.hex from file in importFolder dir 

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                    key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False)

                        
                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                        x.remove(i)
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Beispiel #20
0
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
                h2o.nodes[0].remove_all_keys()

                ### time.sleep(3600)

                ### h2o.tear_down_cloud()
                if not h2o.localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 

if __name__ == '__main__':
    h2o.unit_main()