def completionHack(jobKey, modelKey):
    h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
    print "FIX! how do we get the GLM result"
    # hack it!
    params = {'job_key': jobKey, 'destination_key': modelKey}
    a = h2o.nodes[0].completion_redirect(jsonRequest="2/GLMProgressPage2.json", params=params)
    print "GLM result from completion_redirect:", h2o.dump_json(a)
Example #2
    def test_rf_big1_nopoll(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(1):
            start = time.time()
            kwargs = {}
            # FIX! what model keys do these get?
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model"+str(jobDispatch),\
                timeoutSecs=300, noPoll=True, **kwargs)
            rfViewInitial.append(rfView)
            print "rf job dispatch end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=30,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)

        # we saved the initial responses above
        # if we poll again now, the jobs should be done, and it's better to get the result that
        # way rather than via inspect (to match what simpleCheckRFView expects)
        for rfView in rfViewInitial:
            print "Checking completed job, with no polling:", rfView
            a = h2o.nodes[0].poll_url(rfView['response'], noPoll=True)
            h2o_rf.simpleCheckRFView(None, a)
Example #3
    def test_RF_poker100(self):
        MISSING_RESPONSE = False
        DO_MODEL_INSPECT = False
        trees = ",".join(map(str,range(10,50,2)))
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append( (job_key, model_key) )

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key  in jobs:

            gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
            print "speedrf grid result for %s:", h2o.dump_json(gridResult)

            print "speedrf grid result errors:", gridResult['prediction_errors']
            for i,j in enumerate(gridResult['jobs']):
                if DO_MODEL_INSPECT:
                    print "\nspeedrf result %s:" % i, h2o.dump_json(h2o_cmd.runInspect(key=j['destination_key']))
                else:
                    # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    print "model:", h2o.dump_json(model)
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        
        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, 
            hex_key=hex_key, timeoutSecs=30, schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
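            # OVERWRITE_RF_MODEL is presumably a module-level flag set at the top of this test file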
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            rfView = h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
                timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)
            rfViewInitial.append(rfView)
            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial responses above
        # if we poll again now, the jobs should be done, and it's better to get the results that
        # way rather than via inspect
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            ntree = rfView['ntree']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
            if first is None: # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
    def test_GLM2grid_covtype_many(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'response': y,
            'family': 'gaussian',
            'n_folds': 2,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "glmResult:", h2o.dump_json(glmResult)
            # assuming it doesn't complete right away, this is the first response
            # it differs for the last response
            job_key = glmResult['job_key']
            grid_key = glmResult['destination_key']
            jobs.append( (job_key, grid_key) )
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
                src_key=src_key, hex_key=hex_key, 
                timeoutSecs=10, noPoll=True, doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        # 2/GLMGridView.html?grid_key=asd
        # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
        # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
        for job_key, grid_key in jobs:
            gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
            h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs
Example #6
    def test_GLM_big1_nopoll(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        glmInitial = []
        # dispatch multiple jobs back to back
        start = time.time()
        for jobDispatch in range(40):
            kwargs = {'x': x, 'y': y, 'n_folds': 1}
            # FIX! what model keys do these get?
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, noPoll=True, **kwargs)
            glmInitial.append(glm)
            print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        timeoutSecs = 200
        h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=timeoutSecs, retryDelaySecs=10)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # we saved the initial responses above
        # if we poll again now, the jobs should be done, and it's better to get the result that
        # way rather than via inspect (to match what simpleCheckGLM expects)
        for glm in glmInitial:
            print "Checking completed job, with no polling:", glm
            a = h2o.nodes[0].poll_url(glm['response'], noPoll=True)
            h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
Example #7
    def test_sequential_diff_dest(self):
        csvPathname = 'poker/poker-hand-testing.data'
        for trials in range(30):
            hex_key = csvPathname + "_" + str(trials)
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=120, noPoll=False, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
    def test_GLM_big1_nopoll(self):
        csvPathname = 'hhp_107_01.data.gz'
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

        glmInitial = []
        # dispatch multiple jobs back to back
        start = time.time()
        for jobDispatch in range(10):
            kwargs = {'x': x, 'y': y, 'n_folds': 1}
            # FIX! what model keys do these get?
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
            glmInitial.append(glm)
            print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        timeoutSecs = 200
        h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # we saved the initial responses above
        # if we poll again now, the jobs should be done, and it's better to get the result that
        # way rather than via inspect (to match what simpleCheckGLM expects)
        for glm in glmInitial:
            print "Checking completed job, with no polling using initial response:", h2o.dump_json(glm)
        
            a = h2o.nodes[0].poll_url(glm, noPoll=True)
            h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
Example #9
    def test_parse_airline_multi_hdfs_many(self):

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*'  # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap,
                     random_udp_drop=RANDOM_UDP_DROP,
                     use_hdfs=True,
                     hdfs_name_node=NAME_NODE,
                     hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname,
                                          schema='hdfs',
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"

                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"

                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern,
                                             hex_key=hex_key,
                                             noPoll=True,
                                             delete_on_done=0,
                                             timeoutSecs=timeoutSecs,
                                             retryDelaySecs=10,
                                             pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #10
    def test_RF_poker100(self):
        MISSING_RESPONSE = True
        trees = ",".join(map(str, range(1, 4)))
        trees = "1,2"
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              response='C11',
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append((job_key, model_key))

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key in jobs:
            gridResult = h2o.nodes[0].speedrf_grid_view(
                job_key=job_key, destination_key=model_key)
            # h2o_rf.showRFGridResults(GBMResult, 15)

            print "speedrf grid result for %s:", model_key, h2o.dump_json(
                gridResult)
Example #11
    def test_parse_unlock(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_training.csv.gz", 600),
            ("mnist_testing.csv.gz", 600),
        ]

        trial = 0
        allDelta = []
        for (csvFilename, timeoutSecs) in csvFilelist:
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # can't import the dir again while the src file is being parsed
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=importFolderPath + "/" + csvFilename,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                intermediateResults=DO_INTERMEDIATE_RESULTS,
                noPoll=True)
            trial += 1

        # can't unlock while jobs are running
        # Session WARN: java.lang.UnsupportedOperationException: Cannot unlock all keys since locking jobs are still running.
        h2j.pollWaitJobs()

        h2o.n0.unlock()
Example #12
    def test_B_kmeans_benign(self):
        h2o.beta_features = True
        csvPathname = "logreg"
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            params = {'k': 3, 
                      'initialization': 'Furthest', 
                      'ignored_cols' : None, 
                      'destination_key': 'benign_k.hex',
                      'max_iter': 50,
                      'seed': 265211114317615310,
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #13
    def test_GLM_big1_nopoll(self):
        csvPathname = 'hhp_107_01.data.gz'
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

        glmInitial = []
        # dispatch multiple jobs back to back
        start = time.time()
        for jobDispatch in range(10):
            kwargs = {'x': x, 'y': y, 'n_folds': 1}
            # FIX! what model keys do these get?
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
            glmInitial.append(glm)
            print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        timeoutSecs = 200
        h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # we saved the initial responses above
        # if we poll again now, the jobs should be done, and it's better to get the result that
        # way rather than via inspect (to match what simpleCheckGLM expects)
        for glm in glmInitial:
            print "Checking completed job, with no polling using initial response:", h2o.dump_json(glm)
        
            a = h2o.nodes[0].poll_url(glm, noPoll=True)
            h2o_glm.simpleCheckGLM(self, a, 'C58', **kwargs)
Example #15
    def test_C_kmeans_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            params = {'k': 3, 
                     'initialization': 'Furthest', 
                     'ignored_cols': "ID",
                     'destination_key': 'prostate_k.hex',
                     'max_iter': 100,
                     'seed': 265211114317615310
                    }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_GBM_parseTrain(self):
        bucket = 'home-0xdiag-datasets'
       
        files = [('standard', 'covtype.data', 'covtype.hex', 1800, 54)
                ]
                  
        for importFolderPath,csvFilename,trainKey,timeoutSecs,response in files:
            # PARSE train****************************************
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename,
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            params = { 
                'destination_key': "GBMKEY",
                'learn_rate':.1,
                'ntrees':1,
                'max_depth':1,
                'min_rows':1,
                'response':response
            }   
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            # noPoll=True: runGBM returns immediately; pollWaitJobs below waits for the GBM to finish
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
            #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
            #    "%f pct. of timeout" % (GBMResult['python_%timeout'])
            GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            print GBMView['gbm_model']['errs']
    def test_parse_unlock(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_training.csv.gz", 600),
            ("mnist_testing.csv.gz", 600),
        ]

        trial = 0
        allDelta = []
        for (csvFilename,  timeoutSecs) in csvFilelist:
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # can't import the dir again while the src file is being parsed
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', 
                path=importFolderPath+"/"+csvFilename,
                schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, 
                intermediateResults=DO_INTERMEDIATE_RESULTS,
                noPoll=True)
            trial += 1


        # can't unlock while jobs are running
        # Session WARN: java.lang.UnsupportedOperationException: Cannot unlock all keys since locking jobs are still running.
        h2j.pollWaitJobs()

        h2o.n0.unlock()
Example #18
    def test_GLM2grid_covtype_many(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=20)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'response': y,
            'family': 'gaussian',
            'n_folds': 2,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "glmResult:", h2o.dump_json(glmResult)
            # assuming it doesn't complete right away, this is the first response
            # it differs for the last response
            job_key = glmResult['job_key']
            grid_key = glmResult['destination_key']
            jobs.append( (job_key, grid_key) )
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
                src_key=src_key, hex_key=hex_key, 
                timeoutSecs=10, noPoll=True, doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        # 2/GLMGridView.html?grid_key=asd
        # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
        # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
        for job_key, grid_key in jobs:
            gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
            h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs
Example #19
    def test_overlap_same_dest_nopoll(self):
        for num_trials in range(30):
            csvPathname = 'poker/poker-hand-testing.data'
            src_key = csvPathname
            hex_key = csvPathname + '.hex'
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
    def test_GBMGrid_basic_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': '4,100',
            'max_depth': 8,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)

            statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            # shouldn't need this?
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        # FIX! after gbm grid, have to get the model keys from the json?
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1==0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errrs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"
    
                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
                
                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
    def test_GBM_parseTrain(self):
        #folderpath, filename, keyname, timeout
        bucket = 'home-0xdiag-datasets'
        
        files = [('mnist', 'mnist_training.csv.gz', 'mnistsmalltrain.hex',1800,0)
                ]

        grid = [[1,10,100,1000], [0.0,0.01,0.001,0.0001,1], [1,2], [1,10,100]] 
        grid = list(itertools.product(*grid))
        grid = random.sample(grid, 10) #don't do all 120, but get a random sample
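        # note: itertools, random, csv, and os are presumably imported at the top of this test file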
        for importFolderPath,csvFilename,trainKey,timeoutSecs,response in files:
            # PARSE train****************************************
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']
            csv_header = ('nJVMs','java_heap_GB', 'dataset', 'ntrees', 'max_depth', 'learn_rate', 'min_rows','trainTime')
            for ntree, learn_rate, max_depth, min_rows in grid:
                if not os.path.exists('gbm_grid.csv'):
                    output = open('gbm_grid.csv', 'w')
                    output.write(','.join(csv_header)+'\n')
                else:
                    output = open('gbm_grid.csv', 'a')

                csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                        dialect='excel', extrasaction='ignore',delimiter=',')
                java_heap_GB = h2o.nodes[0].java_heap_GB

                params = {
                 'destination_key': 'GBMKEY',
                 'learn_rate': learn_rate,
                 'ntrees':ntree,
                 'max_depth':max_depth,
                 'min_rows':min_rows,
                 'response':response
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True
                # noPoll=True: runGBM returns immediately; pollWaitJobs below waits for the GBM to finish
                start = time.time()
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult,noPoll=True,timeoutSecs=timeoutSecs,**kwargs)
                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=3600,pollTimeoutSecs=3600)
                #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
                #    "%f pct. of timeout" % (GBMResult['python_%timeout'])
                #print GBMResult
                GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                print GBMView['gbm_model']['errs']
                elapsed = time.time() - start
                row = {'nJVMs':len(h2o.nodes),'java_heap_GB':java_heap_GB,'dataset':'mnist_training.csv.gz',
                       'learn_rate':learn_rate,'ntrees':ntree,'max_depth':max_depth,
                       'min_rows':min_rows, 'trainTime':elapsed}
                print row
                csvWrt.writerow(row)
                output.close()
    def test_GBMGrid_basic_many(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': '.1,.2',
            'ntrees': '8,10',
            'max_depth': '8,9',
            'min_rows': '1,2',
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 1,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
    
        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0
        # for more in range(8):
        # fast
        # for more in range(9):
        for i in range(5):
            kwargs = params.copy()
            kwargs['min_rows'] = '1,2,3'
            if DO_FROM_TO_STEP:
                kwargs['max_depth'] = '5:10:1'
            else:
                kwargs['max_depth'] = '5,6,10'

            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            # print "GBMResult:", h2o.dump_json(GBMResult)
            job_key = GBMResult['job_key']
            model_key = GBMResult['destination_key']
            jobs.append( (job_key, model_key) )
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        for job_key, model_key  in jobs:
            GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key)
            h2o_gbm.showGBMGridResults(GBMResult, 15)

        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs
Example #26
    def test_GBM_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        colNames = [
            'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL',
            'GLEASON'
        ]

        modelKey = 'GBM_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'validation': parseResult['destination_key'],
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 20,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                        noPoll=True,
                                        **kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None,
                              timeoutSecs=300,
                              pollTimeoutSecs=10,
                              retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1][
                '_arr']  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(
                gbmTrainView['gbm_model']['errs'])
    def test_c9b_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            # passes 5, fails 15
            # for depth in [5,15,25,40]:
            for depth in [5,5,5,5,5]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: runGBM returns immediately; pollWaitJobs below waits for the GBM to finish
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs)

                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
    def test_rf_covtype_fvec(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
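            # paramDict is presumably a module-level dict of RF parameters (it must include 'ntree', used below)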
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(jobDispatch)
            
            # don't poll for fvec 
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
            elapsed = time.time() - start
            print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            print h2o.dump_json(rfResult)
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = key2
            rfView['model_key'] = kwargs['model_key']
            rfView['ntree'] = kwargs['ntree']
            rfViewInitial.append(rfView)

            print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)


        # we saved the initial responses above
        # if we poll again now, the jobs should be done, and it's better to get the results that
        # way rather than via inspect
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntree = rfView['ntree']
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
Example #29
    def test_GLM_prostate(self):
        h2o.beta_features=True
        importFolderPath = "logreg"
        csvFilename = 'prostate.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', hex_key=hex_key, 
             timeoutSecs=180, noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print inspect
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        x         = 'ID'
        y         = 'CAPSULE'
        family    = 'binomial'
        alpha     = '0.5'
        lambda_   = '1E-4'
        nfolds    = '5' # fails
        nfolds    = '0'
        case_mode = '='
        case_val  = '1'
        f         = 'prostate'
        modelKey  = 'GLM(' + f + ')'

        kwargs = {       'response'          : y,
                         'ignored_cols'       : x,
                         'family'             : family,
                         'lambda'             : lambda_,
                         'alpha'              : alpha,
                         'n_folds'            : nfolds, # passes if 0, fails otherwise
                         #'case_mode'          : case_mode,
                         #'case_val'           : case_val, 
                         'destination_key'    : modelKey,
                 }


        BUG1 = True

        timeoutSecs = 60
        
        start = time.time()
        glmFirstResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=0.25, pollTimeoutSecs=180, noPoll=BUG1, **kwargs)

        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        print "FIX! how do we get the GLM result"
        # hack it!
        job_key = glmFirstResult['job_key']

        # is the job finishing before polling would say it's done?
        params = {'job_key': job_key, 'destination_key': modelKey}
        a = h2o.nodes[0].completion_redirect(jsonRequest="2/GLMProgressPage2.json", params=params)
        print "GLM result from completion_redirect:", h2o.dump_json(a)
Example #30
    def test_GBMGrid_basic_many(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': '.1,.2',
            'ntrees': '8,10',
            'max_depth': '8,9',
            'min_rows': '1,2',
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 1,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
    
        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0
        # for more in range(8):
        # fast
        # for more in range(9):
        for i in range(50 if DO_FAIL_CASE else 10):
            kwargs = params.copy()
            kwargs['min_rows'] = '1,2,3'
            if DO_FROM_TO_STEP:
                kwargs['max_depth'] = '5:10:1'
            else:
                kwargs['max_depth'] = '5,6,10'

            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            # print "GBMResult:", h2o.dump_json(GBMResult)
            job_key = GBMResult['job_key']
            model_key = GBMResult['destination_key']
            jobs.append( (job_key, model_key) )
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        for job_key, model_key  in jobs:
            GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key)
            h2o_gbm.showGBMGridResults(GBMResult, 15)

        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs
    def test_B_kmeans_benign(self):
        h2o.beta_features = True # fvec
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, 
            timeoutSecs=180, noPoll=not DO_POLL, doSummary=False)

        if not DO_POLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            parseResult['destination_key'] = hex_key
        
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([10.5, 2.8, 40.3, 0.0, 12.0, 0.8, 1.6, 21.1, 11.4, 0.7, 2.9, 206.2, 36.7, 1.5], 15, 0) ,
            ([23.72897196261682, 2.3271028037383177, 44.81308411214953, 0.34579439252336447, 13.093457943925234, 1.4579439252336448, 1.3177570093457944, 24.16129367150993, 13.317757009345794, 0.5071931108136043, 2.6604011393039024, 121.6822429906542, 40.13084112149533, 1.691588785046729], 110, 0) ,
            ([29.2625, 2.7, 48.5125, 0.1625, 12.0625, 1.0375, 1.4875, 23.023665714263917, 12.6875, 0.5073033705353737, 3.090870788693428, 160.95, 43.3, 1.65], 71, 0) ,
            ([38.333333333333336, 2.3333333333333335, 52.666666666666664, 0.0, 14.333333333333334, 2.3333333333333335, 1.6666666666666667, 25.85955047607422, 12.0, 0.5056179761886597, 3.2846442063649497, 261.6666666666667, 43.0, 1.0], 4, 0) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # loop, to see if we get same centers

        if DO_IGNORE:
            kwargs = {'k': 4, 'ignored_cols': 'STR', 'destination_key': 'benign_k.hex', 'seed': 265211114317615310, 'max_iter': 50}
        else:
            kwargs = {'k': 4, 'ignored_cols': None, 'destination_key': 'benign_k.hex', 'seed': 265211114317615310, 'max_iter': 50}

        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs)

        if not DO_POLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            # hack: the destination_key should already be in the response, as it is with VA
            kmeans['destination_key'] = 'benign_k.hex'
        ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
        modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex')
        h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
        model = modelView['model']
        clusters = model['clusters']
        cluster_variances = model['cluster_variances']
        error = model['error']
        print "cluster_variances:", cluster_variances
        print "error:", error

        # make this fvec legal?
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
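The allowedDelta tuple above is read as per-field multipliers on the expected values; the real comparison is done by h2o_kmeans.compareResultsToExpected, so the following is only a rough sketch of that multiplier arithmetic, with a hypothetical helper name.

def center_within_delta(actualCenter, expectedCenter, deltaMult=0.01):
    # each coordinate may differ from its expected value by at most deltaMult * |expected|
    # (a real check would also special-case expected values of 0.0)
    for a, e in zip(actualCenter, expectedCenter):
        if abs(a - e) > deltaMult * abs(e):
            return False
    return True

# e.g. a 1% rule on the first two coordinates of a center:
# center_within_delta([10.6, 2.81], [10.5, 2.8], 0.01)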
    def test_parse_with_cancel(self):
        mustWait = 10
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameList = [
            ("standard", "covtype.data", 54),
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            ("standard", "covtype20x.data", 54),
            ("manyfiles-nflx-gz", "file_[100-109].dat.gz", 378),
            ]

        # just loop on the same file. If remnants exist and are locked, we will blow up? 
        # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            hex_key = csvFilename + ".hex"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
                timeoutSecs=500, noPoll=True, doSummary=False)
            job_key = parseResult['job_key']

            # give it a little time to start
            time.sleep(3)
            h2o.nodes[0].jobs_cancel(key=job_key)

            # now wait until the job cancels, and we're idle
            h2o_jobs.pollWaitJobs(timeoutSecs=30)
            elapsed = time.time() - start
            print "Cancelled parse completed in", elapsed, "seconds."

            h2o.check_sandbox_for_errors()
            # get a list of keys from StoreView. view=100 is fine; there shouldn't be many, since we put the file
            # rather than importing a folder, though there may be a lot if the whole "standard" folder was imported.
            # find the ones whose names match csvFilename and inspect them. There might be none.
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
            keys = storeViewResult['keys']
            for k in keys:
                keyName = k['key']
                print "kevin:", keyName
                if csvFilename in keyName:
                    h2o_cmd.runInspect(key=keyName)
                    h2o.check_sandbox_for_errors()

            # This will tell h2o to delete using the key name from the import file, whatever pattern matches to csvFilename
            # we shouldn't have to do this..the import/parse should be able to overwrite without deleting.
            # h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

            # If you cancel a parse, then for about 5 seconds after the cancel request gets its JSON response
            # you aren't allowed to reparse the same file, import a directory containing that file,
            # or create the key name the cancelled parse would have used.
            print "Waiting", mustWait, "seconds before next reparse-cancel."
            time.sleep(mustWait)
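A sketch of the cancel-and-wait sequence this test exercises, assuming the behavior described in the comments above (a cancelled parse leaves its keys unusable for a few seconds after the cancel's JSON response). The helper name and settleSecs parameter are illustrative, not harness code.

import time
import h2o, h2o_jobs
import h2o_import as h2i   # module name assumed; the tests above use the alias h2i

def cancel_parse_and_settle(bucket, csvPathname, hex_key, settleSecs=10):
    # start the parse without polling, then ask the cloud to cancel that job
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key,
        timeoutSecs=500, noPoll=True, doSummary=False)
    time.sleep(3)  # give the job a moment to appear in the jobs list
    h2o.nodes[0].jobs_cancel(key=parseResult['job_key'])
    # wait for the cancel to complete, then wait out the lock window before reusing the keys
    h2o_jobs.pollWaitJobs(timeoutSecs=30)
    h2o.check_sandbox_for_errors()
    time.sleep(settleSecs)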
    def test_GBMGrid_basic_benign(self):
        h2o.beta_features = True
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]
        modelKey = 'GBMGrid_benign'

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'STR',
            'learn_rate': '.1,.2',
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': 'FNDX',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            # pattern=None waits for all outstanding jobs
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)

        if 1==0:
            # FIX! get model?
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Exemple #34
def parseImportFolderFile(node=None, csvFilename=None, path=None, key2=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=1, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, **kwargs):
    if not node: node = h2o.nodes[0]

    if not csvFilename: raise Exception('parseImportFolderFile: No csvFilename')

    # We like the short parse key2 name. 
    # We don't drop anything from csvFilename, unlike H2O default
    if key2 is None:
        # don't rely on h2o default key name
        myKey2 = csvFilename + '.hex'
    else:
        myKey2 = key2

    print "Waiting for the slow parse of the file:", csvFilename

    # a little hack to redirect import folder tests to an s3 folder
    if node.redirect_import_folder_to_s3_path:
        # why no leading / for s3 key here. only one / after s3:/ ?
        path = re.sub('/home/0xdiag/datasets', 'home-0xdiag-datasets', path)
        parseKey = parseImportS3File(node, csvFilename, path, myKey2,
            timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
            benchmarkLogging, noPoll)
    elif node.redirect_import_folder_to_s3n_path: 
        path = re.sub('/home/0xdiag/datasets', '/home-0xdiag-datasets', path)
        parseKey = parseImportHdfsFile(node, csvFilename, path, myKey2, "s3n",
            timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
            benchmarkLogging, noPoll)
    else:
        if getpass.getuser()=='jenkins':
            print "Now: not doing Temp hack of /home/0xdiag/datasets/standard to /home/0xdiag/datasets"
            ### path = re.sub('/home/0xdiag/datasets/standard', '/home/0xdiag/datasets', path)
        importKey = "nfs:/" + path + "/" + csvFilename
        if h2o.beta_features:
            print "Temp hack to look at the jobs list for parse completion. No multiple outstanding parses"
            print "The parse result will be just from the first noPoll response. Parse is done as noPoll"

        parseKey = node.parse(importKey, myKey2, 
            timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
            benchmarkLogging, noPoll=noPoll or h2o.beta_features, **kwargs)

        if h2o.beta_features:
            print "Temp hack to look at the jobs list for parse completion. No multiple outstanding parses"
            print "The parse result will be just from the first noPoll response."
            print "\nWaiting on Parse job for ", importKey
            start = time.time()
            h2o_jobs.pollWaitJobs(pattern='arse', timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=5)
            print "Parse job end for ", importKey, 'took', time.time() - start, 'seconds'

        # a hack so we know what the source_key was, back at the caller
        parseKey['python_source_key'] = importKey
        print "\nParse result:", parseKey
    return parseKey
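A short usage sketch for parseImportFolderFile as defined above, assuming an import folder was set up first with h2i.setupImportFolder (the benchmark test later on this page does exactly that); the dataset path and filename here are just placeholders taken from that test.

import h2o_import as h2i   # module name assumed; the tests on this page use the alias h2i

importFolderPath = '/home/0xdiag/datasets/standard'
h2i.setupImportFolder(None, importFolderPath)
parseKey = h2i.parseImportFolderFile(None, 'covtype.data', importFolderPath,
    key2='covtype.data.hex', timeoutSecs=500, noPoll=False)
print "parsed destination_key:", parseKey['destination_key']
print "original source key:", parseKey['python_source_key']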
Exemple #35
    def test_sequential_same_dest_del(self):
        csvFilename = 'poker-hand-testing.data'
        csvPathname = 'poker/' + csvFilename
        for trials in range(30):
            src_key = csvPathname
            hex_key = csvPathname + '.hex'
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=False, doSummary=False)
            h2o.nodes[0].remove_key(src_key)
            h2o.nodes[0].remove_key(hex_key)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
    def test_GBMGrid_basic_benign(self):
        h2o.beta_features = True
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]
        modelKey = 'GBMGrid_benign'

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'STR',
            'learn_rate': '.1,.2',
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': 'FNDX',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            # pattern=None waits for all outstanding jobs
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)

        if 1==0:
            # FIX! get model?
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
    def test_KMeansGrid_params_rand2_fvec(self):
        if h2o.localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ("covtype.data", 800)
            ]
        else:
            csvFilenameList = [("covtype.data", 800)]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, timeoutSecs=2000, pollTimeoutSecs=60
            )
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + ".hex"
                params = {"k": "2,3", "destination_key": destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(
                    parseResult=parseResult,
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=2,
                    pollTimeoutSecs=60,
                    noPoll=True,
                    **kwargs
                )
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                    (elapsed / timeoutSecs) * 100
                )
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_C_kmeans_prostate(self):
        h2o.beta_features = True # fvec

        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        expected = [
            ([43.07058823529412, 0.36470588235294116, 67.70588235294117, 1.1058823529411765, 2.3529411764705883, 1.2117647058823529, 17.33529411764706, 14.201176470588232, 6.588235294117647], 103, 0) ,
            ([166.04347826086956, 0.4658385093167702, 66.09316770186335, 1.0807453416149069, 2.3043478260869565, 1.0807453416149069, 15.0632298136646, 16.211118012422357, 6.527950310559007], 136, 0) ,
            ([313.4029850746269, 0.35074626865671643, 64.91791044776119, 1.0820895522388059, 2.1791044776119404, 1.0746268656716418, 14.601492537313437, 16.35686567164179, 6.082089552238806], 141, 0) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        kwargs = {'k': 3, 'initialization': 'Furthest', 'destination_key': 'prostate_k.hex', 'max_iter': 50,
            # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
            'seed': 265211114317615310}

        # for fvec only?
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            # hack: the destination_key should already be in the response, as it is with VA
            kmeans['destination_key'] = 'prostate_k.hex'
        # FIX! how do I get the kmeans result?
        ### print "kmeans result:", h2o.dump_json(kmeans)
        # can't do this
        # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
        modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex')
        h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

        model = modelView['model']
        clusters = model['clusters']
        cluster_variances = model['cluster_variances']
        error = model['error']
        print "cluster_variances:", cluster_variances
        print "error:", error
        # variance of 0 might be legal with duplicated rows. wasn't able to remove the duplicate rows of NAs at 
        # bottom of benign.csv in ec2
        # for i,c in enumerate(cluster_variances):
        #    if c < 0.1:
        #        raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i))
        

        # make this fvec legal?
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
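The fixed seed in kwargs above is what makes the expected centers reproducible. A hedged sketch of verifying that two runs with identical parameters (same seed) land on the same centers, reusing the harness helpers the way this test does; the function name is illustrative.

import h2o_cmd, h2o_kmeans

def kmeans_centers_repeatable(self, parseResult, csvPathname, kwargs):
    # run K-Means twice with identical parameters, including 'seed'
    k1 = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=60, **kwargs)
    (centers1, _) = h2o_kmeans.bigCheckResults(self, k1, csvPathname, parseResult, 'd', **kwargs)
    k2 = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=60, **kwargs)
    (centers2, _) = h2o_kmeans.bigCheckResults(self, k2, csvPathname, parseResult, 'd', **kwargs)
    # with a fixed seed the centers should agree coordinate by coordinate (tiny float tolerance)
    for c1, c2 in zip(centers1, centers2):
        for a, b in zip(c1, c2):
            self.assertAlmostEqual(a, b, delta=1e-6 * max(1.0, abs(b)))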
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    "destination_key": "GBMKEY",
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": 10,
                    "max_depth": depth,
                    "min_rows": 10,
                    "response": response,
                    "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll -> False when GBM finished
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = statMean["num_cpus"]
                my_cpu_pct = statMean["my_cpu_%"]
                sys_cpu_pct = statMean["sys_cpu_%"]
                system_load = statMean["system_load"]
                # shouldn't need this?
                h2j.pollWaitJobs(
                    pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5
                )
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                # print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
    def test_KMeansGrid_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + '.hex'
                params = {'k': '2,3', 'destination_key': destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_B_kmeans_benign(self):
        importFolderPath = "standard"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, header=1, 
            timeoutSecs=180, noPoll=h2o.beta_features, doSummary=False)

        if h2o.beta_features:
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            parseResult['destination_key'] = hex_key
        
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for k in range(2, 6):
            kwargs = {'k': k, 'ignored_cols_by_name': None, 'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 10})
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=h2o.beta_features, **kwargs)

            if h2o.beta_features:
                h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
                # hack: the destination_key should already be in the response, as it is with VA
                kmeans['destination_key'] = 'benign_k.hex'
            ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
            modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex')
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
            model = modelView['model']
            clusters = model['clusters']
            cluster_variances = model['cluster_variances']
            error = model['error']
            print "cluster_variances:", cluster_variances
            print "error:", error
    def test_KMeansGrid_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            h2o.beta_features = True # no grid for VA
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + '.hex'
                params = {'k': 'c(2,3)', 'destination_key': destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()
        
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_overlap_diff_dest_stallN(self):
        noPoll = True
        num_trials = 0
        stallForNJobs = 25
        for i in range(2):
            for j in range(30):
                csvFilename = 'poker-hand-testing.data'
                csvPathname = 'poker/' + csvFilename
                src_key = csvFilename + "_" + str(i) + "_" + str(j)
                hex_key = csvFilename + "_" + str(num_trials) + '.hex'
                parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                    src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=noPoll,
                    doSummary=False)
                num_trials += 1
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, stallForNJobs=stallForNJobs)
Exemple #45
    def test_overlap_diff_dest_stallN(self):
        noPoll = True
        num_trials = 0
        stallForNJobs = 25
        for i in range(2):
            for j in range(30):
                csvFilename = 'poker-hand-testing.data'
                csvPathname = 'poker/' + csvFilename
                src_key = csvFilename + "_" + str(i) + "_" + str(j)
                hex_key = csvFilename + "_" + str(num_trials) + '.hex'
                parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                    src_key=src_key, hex_key=hex_key, timeoutSecs=120, noPoll=noPoll,
                    doSummary=False)
                num_trials += 1
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, stallForNJobs=stallForNJobs)
Exemple #46
    def test_GBM_parseTrain(self):
        bucket = 'home-0xdiag-datasets'

        files = [('standard', 'covtype200x.data', 'covtype.hex', 1800, 54),
                 ('mnist', 'mnist8m.csv', 'mnist8m.hex', 1800, 0),
                 ('manyfiles-nflx-gz', 'file_95.dat.gz', 'nflx.hex', 1800,
                  256),
                 ('standard', 'allyears2k.csv', 'allyears2k.hex', 1800,
                  'IsArrDelayed'),
                 ('standard', 'allyears.csv', 'allyears2k.hex', 1800,
                  'IsArrDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=importFolderPath + "/" +
                                           csvFilename,
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 1,
                'max_depth': 1,
                'min_rows': 1,
                'response': response
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            #noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                       noPoll=True,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY",
                             timeoutSecs=1800,
                             pollTimeoutSecs=1800)
            #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
            #    "%f pct. of timeout" % (GBMResult['python_%timeout'])
            GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            print GBMView['gbm_model']['errs']
    def test_C_kmeans_prostate(self):

        importFolderPath = "standard"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for k in range(2, 6):
            kwargs = {'k': k, 'initialization': 'Furthest', 'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 50})

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=h2o.beta_features, **kwargs)
            if h2o.beta_features:
                h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
                # hack: the destination_key should already be in the response, as it is with VA
                kmeans['destination_key'] = 'prostate_k.hex'
            # FIX! how do I get the kmeans result?
            ### print "kmeans result:", h2o.dump_json(kmeans)
            # can't do this
            # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
            modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex')
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

            model = modelView['model']
            clusters = model['clusters']
            cluster_variances = model['cluster_variances']
            error = model['error']
            print "cluster_variances:", cluster_variances
            print "error:", error
            for i,c in enumerate(cluster_variances):
                if c < 0.1:
                    raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i))
Exemple #48
    def test_small_parse_overlap_same_dest(self):
        noPoll = True
        timeoutSecs = 180
        num_trials = 0
        stallForNJobs = 100
        for i in range(50):
            for j in range(200):
                csvPathname = h2o.find_file('smalldata/poker')
                csvFilename = csvPathname + '/' + 'poker-hand-testing.data'
                key = csvFilename + "_" + str(i) + "_" + str(j)
                key2 = key + "_" + str(num_trials) + '.hex'
                parseKey = h2o_cmd.parseFile(csvPathname=csvFilename,
                    key=key, key2=key2, timeoutSecs=timeoutSecs, noPoll=noPoll,
                    doSummary=False)
                num_trials += 1
            h2o_jobs.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=5, stallForNJobs=stallForNJobs)
Exemple #49
    def test_PCA_UCIwine(self):
        csvFilename = "wine.data"
        timeoutSecs = 300
        trialStart = time.time()
        #parse
        trainKey = "wine.hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       schema='local',
                                       hex_key=trainKey,
                                       timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        #PCA params
        params = {
            'destination_key': "python_PCA_key",
            'tolerance': 0.0,
            'standardize': 1
        }

        kwargs = params.copy()
        h2o.beta_features = True
        #TODO(spencer): Hack around no polling FVEC
        PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
        start = time.time()
        h2o_cmd.runPCA(parseResult=parseResult,
                       timeoutSecs=timeoutSecs,
                       noPoll=True,
                       returnFast=False,
                       **kwargs)
        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                         pollTimeoutSecs=120,
                         retryDelaySecs=2)
        #time.sleep(100)
        elapsed = time.time() - start
        PCAResult['python_elapsed'] = elapsed
        PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
        print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
              "%f pct. of timeout" % (PCAResult['python_%timeout'])
        #check PCA results
        pcaView = h2o_cmd.runPCAView(modelKey="python_PCA_key")
        h2o_pca.simpleCheckPCA(self, pcaView)
        h2o_pca.resultsCheckPCA(self, pcaView)
Exemple #50
    def test_exec2_fast_locks_overlap(self):
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        lastHexKey = None
        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)

            # wait until iteration 2, when lastHexKey is available, so you can operate on that
            if lastHexKey:
                execExpr="%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            lastHexKey = hex_key

            # since we are using the same source file, and potentially re-uploading if AVOID_BUG
            # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to 
            # use it next iteration
            h2o_jobs.pollWaitJobs(timeoutSecs=10)
            
        # just show the jobs still going. Shouldn't be any
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Exemple #51
    def test_RF_poker100(self):
        MISSING_RESPONSE = False
        DO_MODEL_INSPECT = False
        trees = ",".join(map(str, range(10, 50, 2)))
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              response='C11',
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append((job_key, model_key))

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key in jobs:

            gridResult = h2o.nodes[0].speedrf_grid_view(
                job_key=job_key, destination_key=model_key)
            print "speedrf grid result for %s:", h2o.dump_json(gridResult)

            print "speedrf grid result errors:", gridResult[
                'prediction_errors']
            for i, j in enumerate(gridResult['jobs']):
                if DO_MODEL_INSPECT:
                    print "\nspeedrf result %s:" % i, h2o.dump_json(
                        h2o_cmd.runInspect(key=j['destination_key']))
                else:
                    # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    model = h2o.nodes[0].speedrf_view(
                        modelKey=j['destination_key'])
                    print "model:", h2o.dump_json(model)
Exemple #52
    def test_GLM2_big1_nopoll(self):
        h2o.beta_features = True
        csvPathname = 'hhp_107_01.data.gz'
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

        glmInitial = []
        # dispatch multiple jobs back to back
        start = time.time()
        for jobDispatch in range(5):
            kwargs = {'response': y, 'n_folds': 1, 'family': 'binomial'}
            # FIX! what model keys do these get?
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
            glmInitial.append(glm)
            print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            timeoutSecs = 200
        h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM is expected
        for g in glmInitial:
            print "Checking completed job, with no polling using initial response:"
            # this format is only in the first glm response (race?)
            modelKey = g['destination_key']
            glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

            cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr']
            print "cm:", cm
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000 
        avgMichalSize = 116561140 
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize =  183
        if 1==0:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), 
                ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), 
                ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), 
                # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
            ]

        if 1==1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), 
                # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), 
                # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), 
                # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), 
                #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), 
                # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),

                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),

                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),

                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),

                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),

                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),

                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                # for now, take too long on 2x100GB heap on 164
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        if 1==0:
            importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600),
            ]

        if 1==0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
                # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ]
        if 1==0: 
            importFolderPath = '/home/0xdiag/datasets/standard'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),

                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200),
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
                ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
        jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
        jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
              ' -Dcom.sun.management.jmxremote.authenticate=false' + \
              ' -Dcom.sun.management.jmxremote.ssl=false'  + \
              ' -Dcom.sun.management.jmxremote' + \
              ' -Dcom.sun.management.jmxremote.local.only=false'
        jea = ' -Dlog.printAll=true'


        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(base_port=base_port, 
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(None, importFolderPath)
                importFullList = importFolderResult['files']
                importFailList = importFolderResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
                # creates csvFilename.hex from file in importFolder dir 

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                    key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
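                    # worked example: a 2,000,000,000 byte (2 GB) parse finishing in 100 secs
                    # reports (2e9/1e6)/100 = 20.00 MB/sec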
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False)

                        
                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
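                # slice(key, startRow, length) is (presumably) the h2o-1 exec builtin that copies
                # 'length' rows starting at 'startRow', so key 'a' should end up with 200 rows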
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                        x.remove(i)
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
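                    # case/case_mode here (presumably) binarize the response for binomial GLM:
                    # rows where column 378 is > 15 become the positive case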
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Exemple #54
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvTrainPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=180,
                                            doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvTestPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200
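            # e.g. ntrees=30 allows 30 + 30*200 = 6030 secs of runtime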

            # do ten starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i == (TRIES - 1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs,
                                         noPoll=True,
                                         rfView=False,
                                         **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)

            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runRFView(None,
                                       model_key=model_key,
                                       timeoutSecs=60,
                                       retryDelaySecs=5,
                                       doSimpleCheck=False)
            print "rfView:", h2o.dump_json(rfView)

            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(
                len(classErrorPctList), 7,
                "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict"
            )
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Exemple #55
    def test_flashgordon(self):
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        csvFilenameList = [
            ("100.dat.gz", "dat_1", 1 * avgSynSize, 700),
            ("11[0-9].dat.gz", "dat_10", 10 * avgSynSize, 700),
            ("1[32][0-9].dat.gz", "dat_20", 20 * avgSynSize, 800),
            ("1[5-9][0-9].dat.gz", "dat_50", 50 * avgSynSize, 900),
            # ("1[0-9][0-9].dat.gz", "dat_100", 100 * avgSynSize, 1200),
        ]
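        # each tuple is (import pattern, stem used for the hex key, approx total bytes, timeoutSecs);
        # e.g. "1[5-9][0-9].dat.gz" matches 50 files, hence the 50 * avgSynSize size estimate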

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

        USE_S3 = False
        noPoll = True
        benchmarkLogging = ['cpu', 'disk']
        bucket = "home-0xdiag-datasets"
        if USE_S3:
            URI = "s3://flashgordon"
            protocol = "s3"
        else:
            URI = "s3n://flashgordon"
            protocol = "s3n/hdfs"

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # use i to forward reference in the list, so we can do multiple outstanding parses below
        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            ## for tryHeap in [54, 28]:
            for tryHeap in [54]:

                print "\n", tryHeap, "GB heap, 1 jvm per host, import", protocol, "then parse"
                h2o_hosts.build_cloud_with_hosts(
                    node_count=1,
                    java_heap_GB=tryHeap,
                    enable_benchmark_log=True,
                    timeoutSecs=120,
                    retryDelaySecs=10,
                    # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                    # this is for our amazon ec hdfs
                    # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                    hdfs_name_node='10.78.14.235:9000',
                    hdfs_version='0.20.2')

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandbox_ignore_errors = True

                for trial in range(trialMax):
                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        importResult = h2o.nodes[0].import_s3(bucket)
                    else:
                        importResult = h2o.nodes[0].import_hdfs(URI)

                    s3nFullList = importResult['succeeded']
                    for k in s3nFullList:
                        key = k['key']
                        # just print the first tile
                        # if 'nflx' in key and 'file_1.dat.gz' in key:
                        if csvFilepattern in key:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", key
                            break
                        else:
                            ### print key
                            pass

                    ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                    # error if none?
                    self.assertGreater(len(s3nFullList), 8,
                                       "Didn't see more than 8 files in s3n?")

                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    start = time.time()
                    parseKey = h2o.nodes[0].parse(
                        s3nKey,
                        key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=10,
                        pollTimeoutSecs=60,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    if noPoll:
                        # forward-reference the next two entries in csvFilenameList so three
                        # parses are outstanding at once (assumes i+2 is still a valid index)
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        s3nKey = URI + "/" + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse2Key = h2o.nodes[0].parse(
                            s3nKey,
                            key2,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=10,
                            pollTimeoutSecs=60,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        s3nKey = URI + "/" + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse3Key = h2o.nodes[0].parse(
                            s3nKey,
                            key2,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=10,
                            pollTimeoutSecs=60,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print s3nKey, 'parse time:', parseKey['response']['time']
                    print "parse result:", parseKey['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(
                            pattern=csvFilename,
                            timeoutSecs=timeoutSecs,
                            benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes / 1e6) / elapsed
                        print "\nMB/sec (before uncompress)", "%6.2f" % fileMBS
                        h2o.cloudPerfH2O.message(
                            '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'
                            .format(len(h2o.nodes), tryHeap, csvFilepattern,
                                    csvFilename, fileMBS, elapsed))

                    # BUG here?
                    if not noPoll:
                        # We should be able to see the parse result?
                        inspect = h2o_cmd.runInspect(
                            key=parseKey['destination_key'])

                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."

                    storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    for k in s3nFullList:
                        deleteKey = k['key']
                        if csvFilename in deleteKey and not ".hex" in deleteKey:
                            pass
                            # h2o removes key after parse now
                            ### print "Removing", deleteKey
                            ### removeKeyResult = h2o.nodes[0].remove_key(key=deleteKey)
                            ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                time.sleep(120)
Exemple #56
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees,
          depth, minrows, nbins, learnRate, response, row):
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug: pre = 'DEBUG'
        gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv, 'w')
            output.write(','.join(csv_header) + '\n')
        else:
            output = open(gbmbenchcsv, 'a')
        csvWrt = csv.DictWriter(output,
                                fieldnames=csv_header,
                                restval=None,
                                dialect='excel',
                                extrasaction='ignore',
                                delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + folderPath
            if (f in [
                    'AirlinesTrain1x', 'AllBedroomsTrain1x',
                    'AllBedroomsTrain10x', 'AllBedroomsTrain100x',
                    'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x'
            ]):
                csvPathname = importFolderPath + "/" + f + '.csv'
            else:
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           header=1,
                                           header_from_file=headerKey,
                                           separator=44,
                                           timeoutSecs=7200,
                                           retryDelaySecs=5,
                                           pollTimeoutSecs=7200)

            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime, " seconds."

            inspect_train = h2o.nodes[0].inspect(
                parseResult['destination_key'])
            inspect_test = h2o.nodes[0].inspect(testFilehex)

            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
            row.update({
                'h2o_build': build,
                'nMachines': nMachines,
                'nJVMs': len(h2o.nodes),
                'Xmx/JVM': java_heap_GB,
                'dataset': f,
                'nTrainRows': inspect_train['numRows'],
                'nTestRows': inspect_test['numRows'],
                'nCols': inspect_train['numCols'],
                'trainParseWallTime': parseWallTime,
                'classification': classification,
            })

            params = {
                'destination_key': 'GBM(' + f + ')',
                'response': response,
                'ignored_cols_by_name': ignored_cols,
                'classification': classification,
                'validation': testFilehex,
                'ntrees': ntrees,
                'max_depth': depth,
                'min_rows': minrows,
                'nbins': nbins,
                'learn_rate': learnRate,
            }

            kwargs = params.copy()
            gbmStart = time.time()
            #TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm = h2o_cmd.runGBM(parseResult=parseResult,
                                 noPoll=True,
                                 timeoutSecs=4800,
                                 **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200,
                                  pollTimeoutSecs=120,
                                  retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime = time.time() - gbmStart
            row.update({
                'gbmBuildTime': gbmTime,
            })
            #TODO(spencer): Add in gbm scoring
            #gbmScoreStart = time.time()
            #gbmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
            #scoreTime     = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
Exemple #57
    def test_rf_big1_overwrite_model_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        print "\n" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                # don't change the seed when we're overwriting the model; it should get
                # different results just from changing the tree count
                kwargs['ntrees'] = 1 + jobDispatch
            else:
                kwargs['ntrees'] = 1
                kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runRF(node=randomNode,
                          parseResult=parseResult,
                          destination_key=model_key,
                          timeoutSecs=300,
                          noPoll=True,
                          **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['_dataKey'] = hex_key
            rfView['_key'] = model_key

            print "rf job dispatch end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous ones as we go along (so we save the rf view results)
            h2o_jobs.pollWaitJobs(pattern='RF_model',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None,
                                             data_key,
                                             model_key,
                                             timeoutSecs=60,
                                             noPoll=False)
            if firstRfView is None:  # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       firstRfView,
                                       vice_versa=True,
                                       with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                    msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                        (len(df.difference), h2o.dump_json(df.difference)))
Exemple #58
    def test_GBM_manyfiles_multijob(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None,
                 'file_1.dat.gz', 'test.hex')
            ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'test.hex')
            ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 into something we can do binomial regression on!
            # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            # inc by 1 for R col
            # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate
            # only a problem if they share the dataset, do classification with integers.
            # change to factor here, to avoid the problem
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (trainKey)

            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            csvPathname = importFolderPath + "/" + testFilename
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 into something we can do binomial regression on!
            # plus 1 for R indexing
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            # add 1 for start-with-1
            ignored_cols_by_name = ",".join(
                map(lambda x: "C" + str(x + 1), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % 'C' + str(
                response + 1)

            ntrees = 10
            trial = 0
            # ignore 300 random cols (not the response)
            print "Kicking off multiple GBM jobs at once"
            # GBM train****************************************
            if DO_FAIL:
                cases = [5, 10, 20, 40]
            else:
                cases = [5, 10, 20]

            for max_depth in cases:
                trial += 1

                params = {
                    'response': "C" + str(response + 1),
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'validation': parseTestResult['destination_key'],
                    'ignored_cols_by_name': ignored_cols_by_name,
                    'grid_parallelism': 1,
                    'classification': 1 if DO_CLASSIFICATION else 0,
                }

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                # can take 4 times as long with 4 jobs?
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs * 4,
                                                destination_key=modelKey +
                                                "_" + str(trial),
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                              pollTimeoutSecs=timeoutSecs,
                                              retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                             pollTimeoutSecs=timeoutSecs)
Exemple #59
    def test_benchmark_import(self):
        covtype200xSize = 15033863400

        csvFilenameList = [
            ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
            ]

        trialMax = 1
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10
        for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port,
                    enable_benchmark_log=True)
            else:
                h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap/2, base_port=base_port, 
                    enable_benchmark_log=True)

            for trial in range(trialMax):
                csvPathname = "/home/0xdiag/datasets/standard/" + csvFilepattern

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                    key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # only one file is parsed in this test, so unlike the multi-file benchmark
                    # there are no totalBytes2/totalBytes3 to fold into the MB/sec accounting
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.check_enums_from_inspect(parseKey)
                        
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
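                # here origKey should be "covtype200x.data.hex", so execExpr becomes
                # 'a = slice(covtype200x.data.hex,1,200)'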
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern

                #**********************************************************************************
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(54) # don't include the output column
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 54, 'case': 1, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                h2o_cmd.checkKeyDistribution()
                h2o.tear_down_cloud()

                sys.stdout.write('.')
                sys.stdout.flush()