コード例 #1
0
    def test_NOPASS_create_frame_fail(self):
        """Parse poker1000, then run CreateFrame on the parsed key 20 times
        with a fixed degenerate config (1 row, 10% missing, no categoricals),
        optionally download/inspect, and summarize each result.

        NOPASS in the name suggests this documents a known failure mode --
        TODO confirm against the test runner's naming convention.
        """
        h2o.beta_features = True  # use the fvec (v2) API paths

        for trial in range(20):
            # degenerate CreateFrame config: a single all-real row with 10%
            # missing values and no integer/categorical columns
            kwargs = {'integer_range': None, 'missing_fraction': 0.1, 'cols': 10, 'response_factors': 1, 'seed': 1234, 'randomize': 1, 'categorical_fraction': 0, 'rows': 1, 'factors': 0, 'real_range': 0, 'value': None, 'integer_fraction': 0}

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex',
                schema='put', timeoutSecs=timeoutSecs)
            # CreateFrame targets the same key, replacing the parsed data
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            # summary doubles as a sanity check on the created frame
            rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)

            print "Trial #", trial, "completed"
コード例 #2
0
ファイル: h2o_glm.py プロジェクト: Jfeng3/h2o
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    """Sanity-check a GLM grid result and run simpleCheckGLM on its best model.

    Handles both API generations: with h2o.beta_features (fvec/v2) the grid
    exposes 'destination_keys' and glm_view is used; otherwise the older
    'destination_key'/'models' layout is walked. Returns simpleCheckGLM's
    result either way. Extra **kwargs are forwarded to simpleCheckGLM.
    """
# Example v2 grid result shape, for reference:
# "grid": {
#    "destination_keys": [
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_0", 
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_1", 
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_2"
#   ]
# }, 
    if h2o.beta_features:
        # first destination key is the best model
        destination_key = glmGridResult['grid']['destination_keys'][0]
        inspectGG = h2o.nodes[0].glm_view(destination_key)
        models = inspectGG['glm_model']['submodels']
        h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0]))
        g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs)
    else:
        destination_key = glmGridResult['destination_key']
        inspectGG = h2o_cmd.runInspect(None, destination_key)
        h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))
        models = glmGridResult['models']
        # touch the per-model fields so a missing key fails loudly here
        for m, model in enumerate(models):
            alpha = model['alpha']
            area_under_curve = model['area_under_curve']
            # FIX! should check max error?
            error_0 = model['error_0']
            error_1 = model['error_1']
            model_key = model['key']
            print "#%s GLM model key: %s" % (m, model_key)
            glm_lambda = model['lambda']

        # now indirect to the GLM result/model that's first in the list (best)
        inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key'])
        h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM))
        g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
    return g
コード例 #3
0
ファイル: test_storeview_import.py プロジェクト: 100star/h2o
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" )
                result = h2o.dump_json(storeViewResult)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
コード例 #4
0
ファイル: h2o_import.py プロジェクト: hihihippp/h2o
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, **kwargs):
    """Import files matching bucket/path via 'schema', then parse the matched
    pattern into hex_key, returning the parse result dict.

    Extra **kwargs are forwarded to both import_only and parse_only. When
    doSummary is true a SummaryPage is also run on the parsed key, purely for
    API coverage. Either way the sandbox is checked for errors so a parse
    failure is isolated to this call rather than the caller's next step.
    """

    ## if h2o.beta_features:
    ##     print "HACK: temporarily disabling Summary always in v2 import_parse"
    ##     doSummary = False

    if not node: node = h2o.nodes[0]

    # NOTE: positional pass-through -- argument order here must match
    # import_only's signature exactly
    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, doSummary, src_key, **kwargs)

    h2o.verboseprint("importPattern:", importPattern)
    h2o.verboseprint("importResult", h2o.dump_json(importResult))

    # same caveat: positional order must match parse_only's signature
    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, **kwargs)
    h2o.verboseprint("parseResult:", h2o.dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    if doSummary:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        h2o.check_sandbox_for_errors()
        node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        # for now, don't worry about error isolating summary 
    else:
        # isolate a parse from the next thing
        h2o.check_sandbox_for_errors()

    return parseResult
コード例 #5
0
    def test_rf_covtype_train_oobe_fvec(self):
        """Run rf_covtype_train_oobe on plain, shuffled, and sorted copies of
        covtype and assert that classification error is insensitive to row
        order (overall error within 0.5, class-1 error within 1.0).
        """
        h2o.beta_features = True  # use the fvec (v2) API paths
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False)
        (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1)
        # since we created a binomial output class..look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True)
        (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False)
        (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        # diff the raw RF view JSON between the plain and sorted runs, for the log
        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2))
        self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))

        self.assertAlmostEqual(ce1pct1, ce2pct1, delta=1.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1))
        self.assertAlmostEqual(ce1pct1, ce3pct1, delta=1.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
コード例 #6
0
    def test_exec2_operators(self):
        """Parse covtype, run the module-level initList then exprList through
        Exec2, and check each exprList expression leaves a one-column result
        with the expected row count.
        """
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        csvPathname = 'standard/covtype.data'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            # NOTE(review): resultKey from initList is ignored (None is passed)
            # -- presumably the expressions assign their own keys; confirm.
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
            print h2o.dump_json(resultExec)
            print 'exec end took', time.time() - start, 'seconds'

            # 'a.hex' is assumed to be produced by the expressions -- TODO confirm
            inspect = h2o_cmd.runInspect(key='a.hex')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            # fail fast if any node logged an error during this expression
            h2o.check_sandbox_for_errors()
コード例 #7
0
    def test_50_nongz_fvec(self):
        """Import (without parsing) non-gz'ed airlines .dat files matching a
        glob, report any per-file import failures, and list keys via StoreView.

        NOTE(review): the import_only/print block appears twice verbatim --
        possibly deliberate (re-import over existing keys); confirm before
        de-duplicating. csvFilename/totalBytes/timeoutSecs from the config
        tuples and the poll/retry settings are currently unused here.
        """
        h2o.beta_features = True  # use the fvec (v2) API paths
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        importFolderPath = 'airlines'  # overrides the line above
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)


            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            h2o_cmd.runStoreView(timeoutSecs=60)
コード例 #8
0
ファイル: h2o_glm.py プロジェクト: nadya1/h2o
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult["destination_key"]
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key, ":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    type = inspectGG["type"]
    if "unparsed" in type:
        print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
        print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG["response"]  # dict
    ### rows = inspectGG['rows']
    value_size_bytes = inspectGG["value_size_bytes"]

    model0 = glmGridResult["models"][0]
    alpha = model0["alpha"]
    area_under_curve = model0["area_under_curve"]
    error_0 = model0["error_0"]
    error_1 = model0["error_1"]
    key = model0["key"]
    print "best GLM model key:", key

    glm_lambda = model0["lambda"]

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, key)
    h2o.verboseprint("GLMGrid inspectGLM:", h2o.dump_json(inspectGLM))
    simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
コード例 #9
0
    def test_rf_big1_nopoll_fvec(self):
        """Dispatch several RF jobs back to back (optionally all overwriting
        one model key), wait for them, then re-view and JsonDiff the saved
        initial RF views against the first one.

        NOTE(review): rfViewInitial is never appended to, so the comparison
        loop at the bottom is currently a no-op -- the runRF result was
        probably meant to be collected into it; confirm intent.
        """
        h2o.beta_features = True  # use the fvec (v2) API paths
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"

        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, 
            hex_key=hex_key, timeoutSecs=30, schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)

            # FIX! what model keys do these get?
            # spread the dispatches over random nodes
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key, 
                timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)
            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM is expected
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            ntree = rfView['ntree']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
            if first is None: # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
コード例 #10
0
    def test_RF(self):
        h2o.beta_features = True
        paramsTrainRF = {"ntrees": 2, "max_depth": 300, "nbins": 200, "timeoutSecs": 600, "response": "C55"}

        paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}

        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
コード例 #11
0
    def test_rf_covtype_train_oobe_fvec(self):
        """Run rf_covtype_train_oobe (with an expected AUC) on plain, shuffled,
        and sorted copies of covtype and assert that classification error is
        insensitive to row order (overall within 0.5, class-1 within 7.0).
        """
        h2o.beta_features = True  # use the fvec (v2) API paths
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False, expectedAuc=0.95)
        (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
        # since we created a binomial output class..look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True, expectedAuc=0.95)
        (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False, expectedAuc=0.95)
        (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        # diff the raw RF view JSON between the plain and sorted runs, for the log
        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2))
        self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))


        # we're doing separate test/train splits..so we're going to get variance
        # really should not do test/train split and use all the data? if we're comparing sorted or not?
        # but need the splits to be sorted or not. I think I have those files
        self.assertAlmostEqual(ce1pct1, ce2pct1, delta=7.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1))
        self.assertAlmostEqual(ce1pct1, ce3pct1, delta=7.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
コード例 #12
0
    def test_exec2_quantile_na_scalar(self):
        """Run each exprList expression through Exec2, hit the Quantiles page
        on the 'nah' key, and check the 'abc' result has one column and the
        expected row count.

        NOTE(review): the 'nah' and 'abc' keys are presumably created by the
        module-level exprList expressions -- confirm.
        """
        h2o.beta_features = True  # use the fvec (v2) API paths

        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
            print 'exec end took', time.time() - start, 'seconds'
            print h2o.dump_json(resultExec)
            # do the quantiles page on the created nah key
            kwargs = {
                'column': 0,
                'quantile': 0.4,
                'multiple_pass': 2,
            }
            h2o.nodes[0].quantiles(source_key='nah', **kwargs)

            inspect = h2o_cmd.runInspect(key='abc')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            # fail fast if any node logged an error during this expression
            h2o.check_sandbox_for_errors()
コード例 #13
0
ファイル: test_speedrf_grid2.py プロジェクト: Brontai/h2o
    def test_RF_poker100(self):
        MISSING_RESPONSE = False
        DO_MODEL_INSPECT = False
        trees = ",".join(map(str,range(10,50,2)))
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append( (job_key, model_key) )

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key  in jobs:

            gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
            print "speedrf grid result for %s:", h2o.dump_json(gridResult)

            print "speedrf grid result errors:", gridResult['prediction_errors']
            for i,j in enumerate(gridResult['jobs']):
                if DO_MODEL_INSPECT:
                    print "\nspeedrf result %s:" % i, h2o.dump_json(h2o_cmd.runInspect(key=j['destination_key']))
                else:
                    # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    print "model:", h2o.dump_json(model)
コード例 #14
0
ファイル: test_store_view.py プロジェクト: 100star/h2o
    def test_A_store_view(self):
        # size of H2O store
        store_size = 0
        # import data to have more files in the system
        r = h2i.import_only(bucket='smalldata', path='iris/*')
        store_size += len(r[0]['files'])
        r = h2i.import_only(bucket='smalldata', path='covtype/*')
        store_size += len(r[0]['files'])

        # list all items
        r = h2o.nodes[0].store_view(view=store_size)
        self.assertEqual(store_size, len(r['keys']))

        # list over views including only 3 items
        items_per_page = 3                  # items per page
        pages = (store_size / items_per_page)    # number of pages
        if (store_size % items_per_page != 0): pages += 1
        offset = 0 # running offset
        cnt_items = 0  # counter of returned items
        for p in range(0,pages):
            r = h2o.nodes[0].store_view(offset=offset, view=items_per_page)
            print h2o.dump_json(r)
            cnt_items += len(r['keys']) 
            offset += items_per_page

        self.assertEqual(store_size, cnt_items)
コード例 #15
0
    def test_RF(self):
        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
コード例 #16
0
ファイル: h2o_glm.py プロジェクト: brennane/h2o
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    """Walk every model in a (pre-fvec) GLM grid result, printing each model
    key, then run simpleCheckGLM on the first (best) model's inspect result.

    Returns whatever simpleCheckGLM returns. Extra **kwargs are forwarded.
    """
    destination_key = glmGridResult['destination_key']
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    #type = inspectGG['type']
    #if 'unparsed' in type:
    #    print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
    #    print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG['response'] # dict
    ### rows = inspectGG['rows']
    #value_size_bytes = inspectGG['value_size_bytes']

    # FIX! does error_0/1 only exist for binomial?
    # touch the per-model fields so a missing key fails loudly here
    for m, model in enumerate(glmGridResult['models']):
        alpha = model['alpha']
        area_under_curve = model['area_under_curve']
        # FIX! should check max error?
        error_0 = model['error_0']
        error_1 = model['error_1']
        model_key = model['key']
        print "#%s GLM model key: %s" % (m, model_key)
        glm_lambda = model['lambda']

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key'])
    h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM))
    g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)

    return g
コード例 #17
0
def exec_list(exprList, lenNodes, csvFilename, key2):
        h2e.exec_zero_list(zeroList)
        # start with trial = 1 because trial-1 is used to point to Result0 which must be initted
        trial = 1
        while (trial < 100):
            for exprTemplate in exprList:
                # do each expression at a random node, to facilate key movement
                nodeX = random.randint(0,lenNodes-1)
                colX = random.randint(1,54)
                # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
                row = str(random.randint(1,400000))

                execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2)
                execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, 
                    resultKey="Result"+str(trial)+".hex", timeoutSecs=60)

                eri0 = execResultInspect[0]
                eri1 = execResultInspect[1]
                columns = eri0.pop('cols')
                columnsDict = columns[0]
                print "\nexecResult columns[0]:", h2o.dump_json(columnsDict)
                print "\nexecResult [0]:", h2o.dump_json(eri0)
                print "\nexecResult [1] :", h2o.dump_json(eri1)
                
                min = columnsDict["min"]
                h2o.verboseprint("min: ", min, "trial:", trial)
                ### self.assertEqual(float(min), float(trial),"what can we check here")

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                # slows things down to check every iteration, but good for isolation
                h2o.check_sandbox_for_errors()
                print "Trial #", trial, "completed\n"
                trial += 1
コード例 #18
0
ファイル: h2o_gbm.py プロジェクト: hardikk/h2o
def simpleCheckGBMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    """Sanity-check a GBM grid result by inspecting its best (first) model.

    glmGridResult    -- json response from a GBM grid run (param name kept
                        from the GLM version of this helper)
    colX             -- passed through to simpleCheckGBM
    allowFailWarning -- passed through to simpleCheckGBM
    kwargs           -- passed through to simpleCheckGBM
    """
    destination_key = glmGridResult['destination_key']
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    #type = inspectGG['type']
    #if 'unparsed' in type:
    #    print "Warning: GBM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
    #    print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG['response'] # dict
    ### rows = inspectGG['rows']
    #value_size_bytes = inspectGG['value_size_bytes']

    # NOTE(review): these fields look GLM-flavored (alpha/lambda/auc) for a
    # GBM grid -- confirm against the actual grid json. The reads at least
    # assert the keys exist, so don't remove them casually.
    model0 = glmGridResult['models'][0]
    alpha = model0['alpha']
    area_under_curve = model0['area_under_curve']
    error_0 = model0['error_0']
    error_1 = model0['error_1']
    model_key = model0['key']
    print "best GBM model key:", model_key

    glm_lambda = model0['lambda']

    # now indirect to the GBM result/model that's first in the list (best)
    inspectGBM = h2o_cmd.runInspect(None, model_key)
    h2o.verboseprint("GBMGrid inspectGBM:", h2o.dump_json(inspectGBM))
    simpleCheckGBM(self, inspectGBM, colX, allowFailWarning=allowFailWarning, **kwargs)
コード例 #19
0
    def test_parse_small_many(self):
        SEED = 6204672511291494176
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # can try the other two possibilities also
        eol = "\n"
        row = "a,b,c,d,e,f,g"

        # need unique key name for upload and for parse, each time
        # maybe just upload it once?
        timeoutSecs = 10
        node = h2o.nodes[0]

        # fail rate is one in 200?
        # need at least two rows (parser)
        for sizeTrial in range(10):
            size = random.randint(2,129)
            print "\nparsing with rows:", size
            csvFilename = "p" + "_" + str(size)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            writeRows(csvPathname,row,eol,size)
            key = csvFilename
            print h2o.dump_json(key)
            for trial in range(5):
                # data key is deleted after parse now, so have to put it again
                pkey = node.put_file(csvPathname, key=key, timeoutSecs=timeoutSecs)
                key2 = csvFilename + "_" + str(trial) + ".hex"
                # just parse
                node.parse(pkey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=0.00)
                sys.stdout.write('.')
                sys.stdout.flush()
コード例 #20
0
ファイル: test_exec2_fast_locks.py プロジェクト: 100star/h2o
    def test_exec2_fast_locks(self):
        """Stress src-key locking: import/parse/exec iris 99 times, quickly.

        With the module-level AVOID_BUG flag set, the src key is re-imported
        before every parse and deleted on parse completion; otherwise it is
        imported once up front and reused, which can hit src-key lock issues.
        """
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse)
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4  # 0-based col index; the exec expression below uses 1-based y+1

        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse)
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
            # binarize col y+1 in place: 1 where it equals 1, else 0
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
            
        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
コード例 #21
0
ファイル: test_put_parse4.py プロジェクト: askinss/h2o
    def test_put_parse4(self):
        """Put/parse iris twice and walk the entire summary_page json,
        touching every per-column field to verify the response shape."""
        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        for x in xrange (2):
            # csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
            csvPathname = h2o.find_file('smalldata/iris/iris_wheader.csv.gz')
            key = n.put_file(csvPathname)
            key2 = key + "_" + str(x) + ".hex"
            parseKey = n.parse(key, key2)

            summaryResult = n.summary_page(key2)
            # remove bin_names because it's too big (256?) and bins
            # just touch all the stuff returned
            summary = summaryResult['summary']
            print h2o.dump_json(summary)

            columnsList = summary['columns']
            for columns in columnsList:
                N = columns['N']
                name = columns['name']
                stype = columns['type']

                histogram = columns['histogram']
                bin_size = histogram['bin_size']
                bin_names = histogram['bin_names']
                bins = histogram['bins']
                # NOTE(review): reads 'bins' again -- probably meant
                # histogram['nbins']; confirm against the summary json schema
                nbins = histogram['bins']
                if 1==1:
                    print "\n\n************************"
                    print "name:", name
                    print "type:", stype
                    print "N:", N
                    print "bin_size:", bin_size
                    print "len(bin_names):", len(bin_names)
                    print "len(bins):", len(bins)
                    print "len(nbins):", len(nbins)

                # not done if enum
                if stype != "enum":
                    smax = columns['max']
                    smin = columns['min']
                    percentiles = columns['percentiles']
                    thresholds = percentiles['thresholds']
                    values = percentiles['values']
                    mean = columns['mean']
                    sigma = columns['sigma']
                    if 1==1:
                        print "len(max):", len(smax)
                        print "len(min):", len(smin)
                        print "len(thresholds):", len(thresholds)
                        print "len(values):", len(values)
                        print "mean:", mean
                        print "sigma:", sigma

            ### print 'Trial:', trial
            sys.stdout.write('.')
            sys.stdout.flush()
            trial += 1
コード例 #22
0
ファイル: test_rf_brutal2_fvec.py プロジェクト: Brontai/h2o
    def test_RF(self):
        """Train and score RF on two dataset variants, then JsonDiff the
        responses; more than 2 differences fails the test.

        trainDS1/scoreDS1/trainDS2/scoreDS2 and self.loadData are defined
        elsewhere in this file.
        """
        h2o.beta_features = True

        paramsTrainRF = {
            'seed': '1234567890',
            'ntrees': 1,
            'max_depth': 10,
            # 'sample_rate': 1.0,
            'sample_rate': 1.0, 
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C55',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # train1
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        # NOTE(review): "=========+" below looks like a typo for "=========="
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
コード例 #23
0
    def test_RF(self):
        """Train RF on 90% and 10% data splits, pin exact expected error
        rates for both, then JsonDiff the train/score responses.

        trainDS1/scoreDS1/trainDS2/scoreDS2 and self.loadData are defined
        elsewhere in this file.
        """
        h2o.beta_features = True
        paramsTrainRF = { 
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3, 
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        self.assertEqual(4.29, classification_error1)
        self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        # NOTE(review): identical expected errors/scores for the 90% and 10%
        # runs -- confirm that is really the intent and not a copy/paste
        self.assertEqual(4.29, classification_error2)
        self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        self.assertEqual(58101, totalScores2)

        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

      
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
コード例 #24
0
ファイル: test_parse_cust.py プロジェクト: 100star/h2o
    def test_parse_cust(self):
        """Import a customer dataset folder, then parse/inspect/exec a random
        sample of its csv/tsv keys, removing each parsed key afterwards.

        MINFILES and MINDONE are module-level thresholds.
        """
        # run as user 0xcustomer to get access (with .json config and ssh key file specified)
        importFolderPath = '/mnt/0xcustomer-datasets'
        pollTimeoutSecs = 120
        retryDelaySecs = 30
        timeoutSecs = 300
        
        (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
        importFileList = importResult['files']
        importFailList = importResult['fails']
        importKeyList = importResult['keys']
        importDelList = importResult['dels']

        if len(importDelList)!=0:
            raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))

        if len(importFileList)<MINFILES:
            raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))

        if len(importKeyList)<MINFILES:
            raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))

        if len(importFailList)!=0:
            raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))


        # only parse files with .csv or .tsv in their name (no dirs like that?)
        goodKeyList = [key for key in importKeyList if ('.csv' in key  or '.tsv' in key)]
        trial = 0
        # just do 1?
        for i, importKey in enumerate(random.sample(goodKeyList,3)):
            print "importKey:", importKey
            trial +=1

            start = time.time() 
            # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
            # force header=0..should mean headers get treated as NAs
            parseResult = h2i.parse_only(pattern=importKey, header=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            origKey = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=origKey)
            h2o_cmd.infoFromInspect(inspect, origKey)

            execExpr = 'newKey = '+origKey+'[1,1]'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            newParseKey = {'destination_key': 'newKey'}  # unused in this method

            h2o_cmd.checkKeyDistribution()
            h2o.nodes[0].remove_key(key=origKey)
            # a key isn't created for a scalar
            # h2o.nodes[0].remove_key(key='newKey')
        
        self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
コード例 #25
0
ファイル: h2o_exec.py プロジェクト: EPBaron/h2o
def checkScalarResult(resultExec, resultKey, allowEmptyResult=False):
    # make the common problems easier to debug
    h2o.verboseprint("checkScalarResult resultExec:", h2o.dump_json(resultExec))

    if "funstr" not in resultExec:
        emsg = "checkScalarResult: 'funstr' missing"
    if "result" not in resultExec:
        emsg = "checkScalarResult: 'result' missing"
    if "scalar" not in resultExec:
        emsg = "checkScalarResult: 'scalar' missing"
    if "num_cols" not in resultExec:
        emsg = "checkScalarResult: 'num_cols' missing"
    if "num_rows" not in resultExec:
        emsg = "checkScalarResult: 'num_rows' missing"
    elif "cols" not in resultExec:
        emsg = "checkScalarResult: 'cols' missing"
    else:
        emsg = None
        num_cols = resultExec["num_cols"]
        num_rows = resultExec["num_rows"]
        cols = resultExec["cols"]
        # print "cols:", h2o.dump_json(cols)

    if emsg:
        print "\nKey: '" + str(resultKey) + "' resultExec:\n", h2o.dump_json(resultExec)
        sys.stdout.flush()
        raise Exception("exec result (resultExec) missing what we expected. Look at json above. " + emsg)

    if (cols and (not num_rows or num_rows == 0)) and not allowEmptyResult:
        print "resultExec[0]:", h2o.dump_json(resultExec)
        raise Exception(
            "checkScalarResult says 'cols' exist in exec json response,"
            + " but num_rows: %s is 0 or None. Is that an expected 'empty' key state?" % num_rows
            + " Use 'allowEmptyResult if so."
        )

    # Cycle thru rows and extract all the meta-data into a dict?
    # assume "0" and "row" keys exist for each list entry in rows
    # FIX! the key for the value can be 0 or 1 or ?? (apparently col?) Should change H2O here

    # cols may not exist..if the result was just scalar?
    if not cols:
        # just return the scalar result then
        scalar = resultExec["scalar"]
        if scalar is None:
            raise Exception("both cols and scalar are null: %s %s" % (cols, scalar))
        checkForBadFP(scalar, json=resultExec)
        return scalar

    metaDict = cols[0]
    for key, value in metaDict.items():
        print "Inspect metaDict:", key, value

    min_value = metaDict["min"]
    stype = metaDict["type"]
    # if it's an enum col, it's okay for min to be NaN ..
    checkForBadFP(min_value, nanOkay=stype == "Enum", json=metaDict)
    return min_value
コード例 #26
0
    def test_json_browse_both_exec(self):
        lenNodes = len(h2o.nodes)
        csvPathname = 'standard/covtype.data'
        hex_key = 'c.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "\nParse key is:", parseResult['destination_key']

        ## h2b.browseTheCloud()
        # for trial in range(53):
        trial = 0
        while (trial < 100):
            for exprTemplate in exprList:
                trial = trial + 1
                n = trial
                colX = random.randint(1,54)
                row = random.randint(1,400000)

                execExpr = exprTemplate
                execExpr = re.sub('<col1>',str(colX),execExpr)
                execExpr = re.sub('<col2>',str(colX+1),execExpr)
                execExpr = re.sub('<n>',str(n),execExpr)
                execExpr = re.sub('<row>',str(row),execExpr)
                execExpr = re.sub('<keyX>',str(hex_key),execExpr)

                # pick a random node to execute it on
                randNode = random.randint(0,lenNodes-1)
                print "\nexecExpr:", execExpr, "on node", randNode

                start = time.time()
                resultExec = h2o_cmd.runExec(node=h2o.nodes[randNode], 
                    execExpr=execExpr, timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(resultExec))
                # print(h2o.dump_json(resultExec))

                # FIX! race conditions. If json is done, does that mean you can inspect it??
                # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
                if trial > 1:
                    inspectMe = random.choice(inspectList)
                    resultInspect = h2o.nodes[0].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[1].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[2].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
                # might be a bug?

                # WARNING! we can't browse the Exec url history, since that will 
                # cause the Exec to execute again thru the browser..i.e. it has side effects
                # just look at the last inspect, which should be the resultInspect!
                # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                h2b.browseJsonHistoryAsUrlLastMatch("Exec")
                h2o.check_sandbox_for_errors()
                print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
                print "Trial #", trial, "completed\n"
コード例 #27
0
    def test_GBMGrid_basic_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': '4,100',
            'max_depth': 8,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)

            statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            num_cpus = statMean['num_cpus'],
            my_cpu_pct = statMean['my_cpu_%'],
            sys_cpu_pct = statMean['sys_cpu_%'],
            system_load = statMean['system_load']

            # shouldn't need this?
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        # FIX! after gbm grid, have to get the model keys from the json?
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1==0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errrs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
コード例 #28
0
ファイル: test_exec2_xorsum2.py プロジェクト: ashty/h2o
    def test_exec2_xorsum(self):
        """Run sum/xorsum exec expressions on a synthetic dataset and compare
        results bitwise, as raw 64-bit patterns of the doubles.

        write_syn_dataset and exprList are defined elsewhere in this file.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
        tryList = [
            (10000, 1, 'r1', 0, 10, None),
        ]

        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin  # NOTE(review): unused below

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)  # NOTE(review): unused below
            print "Creating random", csvPathname
            (expectedUll, expectedFpSum)  = write_syn_dataset(csvPathname, 
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, 
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            
            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                start = time.time()
                (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, 
                    resultKey=None, timeoutSecs=300)
                print 'exec took', time.time() - start, 'seconds'
                print "execResult:", h2o.dump_json(execResult)
                print ""
                print "%30s" % "fpResult:", "%.15f" % fpResult
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult
                print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll
                # print "%30s" % "hex(bitResult):", hex(ullResult)
                ullResultList.append((ullResult, fpResult))

            h2o.check_sandbox_for_errors()

            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
            expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll)
            print "%30s" % "expectedUll (0.16x):", "0x%0.16x   %s" % (expectedUll, expectedUllAsDouble)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
コード例 #29
0
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'}
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
コード例 #30
0
    def test_rf_covtype_fvec(self):
        """Dispatch an RF job on covtype without polling, wait via the jobs
        API, then re-poll each saved RFView to completion.

        paramDict is defined elsewhere in this file.
        """
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(jobDispatch)
            
            # don't poll for fvec 
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
            elapsed = time.time() - start
            print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            print h2o.dump_json(rfResult)
            # FIX! are these already in there?
            # save enough info to re-poll the view after the job completes
            rfView = {}
            rfView['data_key'] = key2
            rfView['model_key'] = kwargs['model_key']
            rfView['ntree'] = kwargs['ntree']
            rfViewInitial.append(rfView)

            print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)


        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM is expected
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntree = rfView['ntree']
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
コード例 #31
0
    def test_GBM_cancel_model_reuse(self):
        """Stress GBM job cancel + destination-key reuse (PUB-361).

        Repeatedly launches batches of background GBM jobs that reuse the
        same 'GBMBad<j>' destination keys, polls the 'models' endpoint while
        they're in flight, then cancels everything and waits briefly before
        the next batch, looking for errors caused by reusing a model key
        after a cancel.
        """
        importFolderPath = 'standard'
        timeoutSecs = 500
        # each entry: (importFolderPath, csvFilename, response column index)
        csvFilenameAll = [
            # have to use col name for response?
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            # ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?"
            (importResult,
             importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                              path=csvPathname,
                                              schema='local',
                                              timeoutSecs=50)
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=csvPathname,
                schema='local',
                hex_key='c.hex',
                timeoutSecs=500,
                noPoll=False,
                doSummary=False
            )  # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            # print "\nparseResult", h2o.dump_json(parseResult)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error:
            # Only integer or enum/factor columns can be classified

            if DO_CLASSIFICATION:
                # need to flip the right col! (R wise)
                # threshold the response col into a binary 0/1 so GBM can classify it
                execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1,
                                                         response + 1)
                kwargs = {'str': execExpr}
                resultExec = h2o_cmd.runExec(**kwargs)

            # lets look at the response column now
            s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
            # x = range(542)
            # remove the output too! (378)
            ignoreIndex = [
                3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425,
                426, 540, 541, response
            ]
            # have to add 1 for col start with 1, now. plus the C
            xIgnore = ",".join(["C" + str(i + 1) for i in ignoreIndex])

            params = {
                'destination_key': None,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response + 1),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
            }

            kwargs = params.copy()
            timeoutSecs = 1800

            for i in range(5):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []
                for j in range(5):
                    # FIX! apparently we can't reuse a model key after a cancel
                    kwargs['destination_key'] = 'GBMBad' + str(j)
                    # rjson error in poll_url: Job was cancelled by user!
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                                    noPoll=True,
                                                    **kwargs)
                    jobids.append(GBMFirstResult['job_key'])
                    h2o.check_sandbox_for_errors()

                    # try ray's 'models' request to see if anything blows up
                    modelsParams = {
                        'key': None,
                        'find_compatible_frames': 0,
                        'score_frame': None
                    }
                    modelsResult = h2o.nodes[0].models(timeoutSecs=10,
                                                       **modelsParams)
                    print "modelsResult:", h2o.dump_json(modelsResult)

                # have to pass the job id
                # for j in jobids:
                #     h2o.nodes[0].jobs_cancel(key=j)

                h2o_jobs.cancelAllJobs()
                # PUB-361. going to wait after cancel before reusing keys
                time.sleep(3)
                # am I getting a subsequent parse job cancelled?
                h2o_jobs.showAllJobs()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename,
                                                   importResult=importResult)
コード例 #32
0
    def test_parse_libsvm(self):
        """Parse a set of libsvm-format datasets and verify round-tripping.

        For each dataset: parse, inspect, and check the target (col 0)
        min/max against expected values. Optionally download the parsed key
        back to csv, re-parse it under a new key, and compare the two
        inspects (row/col counts, missing-value lists, and -- when size
        checks are enabled -- row/value byte sizes).
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # just do the import folder once
        importFolderPath = "libsvm"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        # each entry: (csvFilename, hex_key, timeoutSecs, expectedCol0Min,
        #              expectedCol0Max, enableDownloadReparse, enableSizeChecks)
        csvFilenameList = [
            ("mnist_train.svm", "cM", 30, 0, 9.0, False, False),
            ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True),
            # multi-label target like 1,2,5 ..not sure what that means
            # ("tmc2007_train.svm",  "cJ", 30, 0, 21.0, False, False),
            # illegal non-ascending cols
            # ("syn_6_1000_10.svm",  "cK", 30, -36, 36, True, False),
            # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
            # fails csvDownload
            ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
            ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
            ("news20.svm", "cH", 30, 1, 20.0, False, False),
            ("connect4.svm", "cB", 30, -1, 1.0, False, False),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
            ("gisette_scale.svm", "cF", 30, -1, 1.0, False, False),
            ("mushrooms.svm", "cG", 30, 1, 2.0, False, False),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, hex_key, timeoutSecs, expectedCol0Min,
             expectedCol0Max, enableDownloadReparse,
             enableSizeChecks) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print csvPathname, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspectFirst = h2o_cmd.runInspect(None,
                                              parseResult['destination_key'],
                                              timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
            # look at the min/max for the target col (0) and compare to expected for the dataset

            imin = float(inspectFirst['cols'][0]['min'])
            # print h2o.dump_json(inspectFirst['cols'][0])
            imax = float(inspectFirst['cols'][0]['max'])

            # a falsy expected value (0 or None) skips the check
            if expectedCol0Min:
                self.assertEqual(
                    imin,
                    expectedCol0Min,
                    msg='col %s min %s is not equal to expected min %s' %
                    (0, imin, expectedCol0Min))
            if expectedCol0Max:
                h2o_util.assertApproxEqual(
                    imax,
                    expectedCol0Max,
                    tol=0.00000001,
                    msg='col %s max %s is not equal to expected max %s' %
                    (0, imax, expectedCol0Max))

            print "\nmin/max for col0:", imin, imax

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            if DO_SUMMARY:
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0,
                    key=parseResult['destination_key'],
                    timeoutSecs=300,
                    noPrint=True)
                summaryResult = h2o_cmd.runSummary(key=hex_key,
                                                   timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
                # capture the "A" side stats from the original parse,
                # to compare against the "B" side after download/re-parse
                missingValuesListA = h2o_cmd.infoFromInspect(
                    inspectFirst, csvPathname)
                num_colsA = inspectFirst['num_cols']
                num_rowsA = inspectFirst['num_rows']
                row_sizeA = inspectFirst['row_size']
                value_size_bytesA = inspectFirst['value_size_bytes']

                # do a little testing of saving the key as a csv
                csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
                print "Trying csvDownload of", csvDownloadPathname
                h2o.nodes[0].csv_download(src_key=hex_key,
                                          csvPathname=csvDownloadPathname)

                # remove the original parsed key. source was already removed by h2o
                # don't have to now. we use a new name for hex_keyB
                # h2o.nodes[0].remove_key(hex_key)
                start = time.time()
                hex_keyB = hex_key + "_B"
                parseResultB = h2o_cmd.parseResult = h2i.import_parse(
                    path=csvDownloadPathname, schema='put', hex_key=hex_keyB)
                print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                    csvFilename, 'took', time.time() - start, 'seconds'
                inspect = h2o_cmd.runInspect(key=hex_keyB)

                missingValuesListB = h2o_cmd.infoFromInspect(
                    inspect, csvPathname)
                num_colsB = inspect['num_cols']
                num_rowsB = inspect['num_rows']
                row_sizeB = inspect['row_size']
                value_size_bytesB = inspect['value_size_bytes']

                # diff the full json of the two inspects
                df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)

                for i, d in enumerate(df.difference):
                    # ignore mismatches in these
                    #  "variance"
                    #  "response.time"
                    #  "key"
                    if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d:
                        pass
                    else:
                        raise Exception(
                            "testing %s, found unexpected mismatch in df.difference[%d]: %s"
                            % (csvPathname, i, d))

                if DO_SIZE_CHECKS and enableSizeChecks:
                    # if we're allowed to do size checks. ccompare the full json response!
                    print "Comparing original inspect to the inspect after parsing the downloaded csv"
                    # vice_versa=True

                    # ignore the variance diffs. reals mismatch when they're not?
                    filtered = [
                        v for v in df.difference if not 'variance' in v
                    ]
                    self.assertLess(len(filtered), 3,
                        msg="Want < 3, not %d differences between the two rfView json responses. %s" % \
                            (len(filtered), h2o.dump_json(filtered)))

                    # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes
                    # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen
                    # make the check conditional based on the dataset
                    self.assertEqual(
                        row_sizeA, row_sizeB,
                        "row_size mismatches after re-parse of downloadCsv result %d %d"
                        % (row_sizeA, row_sizeB))
                    h2o_util.assertApproxEqual(
                        value_size_bytesA,
                        value_size_bytesB,
                        tol=0.00000001,
                        msg=
                        "value_size_bytes mismatches after re-parse of downloadCsv result %d %d"
                        % (value_size_bytesA, value_size_bytesB))

                print "missingValuesListA:", missingValuesListA
                print "missingValuesListB:", missingValuesListB
                self.assertEqual(
                    missingValuesListA, missingValuesListB,
                    "missingValuesList mismatches after re-parse of downloadCsv result"
                )
                self.assertEqual(
                    num_colsA, num_colsB,
                    "num_cols mismatches after re-parse of downloadCsv result %d %d"
                    % (num_colsA, num_colsB))
                self.assertEqual(
                    num_rowsA, num_rowsB,
                    "num_rows mismatches after re-parse of downloadCsv result %d %d"
                    % (num_rowsA, num_rowsB))

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
コード例 #33
0
ファイル: h2o_perf.py プロジェクト: ytham/h2o
    def log_iostats(self, initOnly=False):
        if ((self.snapshotTime - self.pollStats['lastJstackTime']) <
                self.IOSTATSINTERVAL):
            return

        DO_IO_RW = True
        DO_IOP = True
        DO_BLOCKED = False

        node = h2o.nodes[0]
        stats = node.iostatus()
        ### h2o.verboseprint("log_iostats:", h2o.dump_json(stats))
        histogram = stats['histogram']

        def log_window(k, w):
            ## in case the window disappears from h2o, print what's available with this line
            ## print k['window']
            if k['window'] == w:
                i_o = k['i_o']
                node = k['cloud_node_idx']
                if k['r_w'] == 'read':
                    r_w = 'rd'
                elif k['r_w'] == 'write':
                    r_w = 'wr'
                else:
                    r_w = k['r_w']

                for l, v in k.iteritems():
                    fmt = "iostats: window{:<2d} node {:d} {:<4s} {:s} {:s} MB/sec: {:6.2f}"
                    if 'peak' in l:
                        ## logging.critical(fmt.format(w, node, i_o, r_w, "peak", (v/1e6)))
                        pass
                    if 'effective' in l:
                        logging.critical(
                            fmt.format(w, node, i_o, r_w, "eff.", (v / 1e6)))
                return True
            else:
                return False  # not found

        if DO_IO_RW:
            print "\nlog_iotstats probing node:", str(node.addr) + ":" + str(
                node.port)
            found = False
            for k in histogram:
                ### print k
                found |= log_window(k, 60)
                ### log_window(30)
            if not found:
                print "iostats: desired window not found in histogram"
                # 1 5 60 300 available

        # we want to sort the results before we print them, so grouped by node
        if DO_IOP:
            iopList = []
            raw_iops = stats['raw_iops']
            ### print
            for k in raw_iops:
                ### print k
                node = k['node']
                i_o = k['i_o']
                r_w = k['r_w']
                size = k['size_bytes']
                blocked = k['blocked_ms']
                duration = k['duration_ms']
                if duration != 0:
                    blockedPct = "%.2f" % (100 * blocked / duration) + "%"
                else:
                    blockedPct = "no duration"
                iopMsg = "node: %s %s %s %d bytes. blocked: %s" % (
                    node, i_o, r_w, size, blockedPct)
                # FIX! don't dump for now
                iopList.append([node, iopMsg])

            iopList.sort(key=lambda iop: iop[0])  # sort by node
            totalSockets = len(iopList)
            # something wrong if 0?
            if totalSockets == 0:
                print "WARNING: is something wrong with this io stats response?"
                print h2o.dump_json(stats)

            logging.critical("iostats: " + "Total sockets: " +
                             str(totalSockets))
            if DO_BLOCKED:
                for i in iopList:
                    logging.critical("iostats:" + i[1])

        # don't save anything
        self.save(iostats=True)
コード例 #34
0
ファイル: test_ddply_plot.py プロジェクト: chouclee/h2o
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DO_KNOWN_FAIL:
            tryList = [
                (1000000, 5, 'cD', 0, 320, 30), 
            ]
        else:
            tryList = [
                (1000000, 5, 'cD', 0, 10, 30), 
                (1000000, 5, 'cD', 0, 20, 30), 
                (1000000, 5, 'cD', 0, 40, 30), 
                (1000000, 5, 'cD', 0, 50, 30), 
                (1000000, 5, 'cD', 0, 80, 30), 
                (1000000, 5, 'cD', 0, 160, 30), 
                # fails..don't do
                # (1000000, 5, 'cD', 0, 320, 30), 
                # (1000000, 5, 'cD', 0, 320, 30), 
                # starts to fail here. too many groups?
                # (1000000, 5, 'cD', 0, 640, 30), 
                # (1000000, 5, 'cD', 0, 1280, 30), 
                ]

        if DO_APPEND_KNOWN_FAIL2:
            tryList.append(
                (1000000, 5, 'cD', 0, 160, 30), 
            )
            tryList.append(
                (1000000, 5, 'cD', 0, 320, 30), 
            )
        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            if DO_KNOWN_FAIL:
                # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails
                # csvFilename = 'a1' # fails
                csvFilename = "syn_ddply_1Mx5_0_320.gz"
                bucket = "home-0xdiag-datasets"
                csvPathname = "standard/" + csvFilename
                minInt = 0
                maxInt = 320
            else:
                bucket = None
                csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                print "Creating random", csvPathname, "with range", (maxInt-minInt)+1
                write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

            for lll in range(1):
                # PARSE train****************************************
                hexKey = 'r.hex'
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)
                inspect = h2o_cmd.runInspect(key=hexKey)
                missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename)
                self.assertEqual(missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList)

                for resultKey, execExpr in initList:
                    h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

                #*****************************************************************************************
                # two columns. so worse case every combination of each possible value
                # only true if enough rows (more than the range?)
                maxExpectedGroups = ((maxInt - minInt) + 1) ** 2
                # do it twice..to get the optimal cached delay for time?
                execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                groups = execResult['num_rows']
                # this is a coarse comparision, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(groups, maxExpectedGroups,  rel=0.2, 
                    msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a1dump = h2o_cmd.runInspect(key="a1")
                print "a1", h2o.dump_json(a1dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1")
                self.assertEqual(missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial))

                #*****************************************************************************************

                execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                groups = execResult['num_rows']
                # this is a coarse comparision, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(groups, maxExpectedGroups,  rel=0.2, 
                    msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a2dump = h2o_cmd.runInspect(key="a2")
                print "a2", h2o.dump_json(a2dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2")
                self.assertEqual(missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial))

                #*****************************************************************************************
                # should be same answer in both cases
                execExpr = "sum(a1!=a2)==0"
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                execExpr = "s=c(0); s=(a1!=a2)"
                (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                print "execResult", h2o.dump_json(execResult)

                #*****************************************************************************************

                # should never have any NAs in this result
                sdump = h2o_cmd.runInspect(key="s")
                print "s", h2o.dump_json(sdump)
                self.assertEqual(result, 1, "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult)))

                # xList.append(ntrees)
                trial += 1
                # this is the biggest it might be ..depends on the random combinations
                # groups = ((maxInt - minInt) + 1) ** 2
                xList.append(groups)
                eList.append(ddplyElapsed)
                fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
コード例 #35
0
ファイル: test_iostatus.py プロジェクト: zed9/h2o
    def test_iostatus(self):
        """Ask every node for its iostatus and print bandwidth + iop stats.

        For each node: print peak/effective MB/sec for the 10-second window
        entries of the histogram, then print the raw per-socket iop list
        sorted by node, with each socket's blocked-time percentage.
        """
        # wait a bit first?
        time.sleep(5)
        # Ask each node for iostatus statistics
        for node in h2o.nodes:
            stats = node.iostatus()
            h2o.verboseprint(h2o.dump_json(stats))
            histogram = stats['histogram']
            # example histogram entry:
            # {
            # u'i_o': u'TCP',
            # u'peak_bytes_/_sec': 199690496.78920883,
            # u'effective_bytes_/_sec': 21850666.666666668,
            # u'r_w': u'write',
            # u'cloud_node_idx': 2,
            # u'window': 10
            # }
            print "\nProbing node:", str(node.addr) + ":" + str(node.port)
            for k in histogram:
                ### print k
                if k['window'] == 10:
                    i_o = k['i_o']
                    # NOTE(review): 'node' is rebound to the node index here,
                    # shadowing the outer h2o node object (not used again below)
                    node = k['cloud_node_idx']
                    r_w = k['r_w']

                    for l, v in k.iteritems():
                        fmt = "iostats: window10 node {:d} {:s} {:s} {:s} MB/sec: {:.2f}"
                        if 'peak' in l:
                            print fmt.format(node, i_o, r_w, "peak", (v / 1e6))
                        if 'effective' in l:
                            print fmt.format(node, i_o, r_w, "eff.", (v / 1e6))


# example raw_iops entry:
# {
# u'node': u'/192.168.0.37:54321',
# u'i_o': u'TCP',
# u'closeTime': '10:31:47:370',
# u'r_w': u'write',
# u'duration_ms': 4,
# u'blocked_ns': 463132,
# u'size_bytes': 65552
# }
# we want to sort the results before we print them, so grouped by node
            iopList = []
            raw_iops = stats['raw_iops']
            print
            for k in raw_iops:
                ### print k
                node = k['node']
                i_o = k['i_o']
                r_w = k['r_w']
                size = k['size_bytes']
                blocked = k['blocked_ns']
                # duration is reported in ms but blocked in ns; convert so the
                # ratio below is consistent (float, so no integer truncation)
                duration = k['duration_ms'] * 1e6  # convert to ns
                if duration != 0:
                    blockedPct = "%.2f" % (100 * blocked / duration) + "%"
                else:
                    blockedPct = "no duration"
                iopMsg = "node: %s %s %s %d bytes. blocked: %s" % (
                    node, i_o, r_w, size, blockedPct)
                iopList.append([node, iopMsg])

            iopList.sort(key=lambda iop: iop[0])  # sort by node
            totalSockets = len(iopList)
            # something wrong if 0?
            if totalSockets == 0:
                print "WARNING: is something wrong with this io stats response?"
                print h2o.dump_json(stats)

            print "iostats: Total sockets:", totalSockets
            for i in iopList:
                print "iostats:", i[1]
コード例 #36
0
def kmeans_doit(self,
                csvFilename,
                bucket,
                csvPathname,
                num_rows,
                timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket,
                                   path=csvPathname,
                                   schema='put',
                                   hex_key=csvFilename + ".hex",
                                   timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        'k': 1,
        'initialization': 'Furthest',
        'destination_key': 'KMeansModel.hex',
        'max_iter': 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers,
     tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname,
                                                   parseResult, 'd', **kwargs)

    expected = [([
        -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
        0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
        0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
        0.0021324000161308796, 0.00154
    ], num_rows, None)]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self,
                                        tupleResultList,
                                        expected,
                                        allowedDelta,
                                        trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    # inspect doesn't work
    # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'])
    # KMeansModel = inspect['KMeansModel']
    modelView = h2o.nodes[0].kmeans_model_view(model='KMeansModel.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
    model = modelView['model']
    clusters = model['centers']
    within_cluster_variances = model['within_cluster_variances']
    total_within_SS = model['total_within_SS']
    print "within_cluster_variances:", within_cluster_variances
    print "total_within_SS:", total_within_SS

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
コード例 #37
0
    def test_parse_bounds_csv_fvec(self):
        h2o.beta_features = True
        print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 50, 'cC', 300),
            (1000, 999, 'cC', 300),
            (1000, 1000, 'cA', 300),
            # (1000, 100000, 'cB', 300),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            synSumList = write_syn_dataset(csvPathname, rowCount, colCount,
                                           SEEDPERFILE)

            # PARSE**********************
            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           schema='put',
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT*******************
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            iCols = inspect['cols']
            iStats = []
            for stats in iCols:
                iName = stats['name']
                # just touching to make sure they are there
                iNaCnt = stats['naCnt']
                iMin = float(stats['min'])
                iMax = float(stats['max'])
                iMean = float(stats['mean'])
                iStats.append({
                    'name': iName,
                    'naCnt': iNaCnt,
                    'min': iMin,
                    'max': iMax,
                    'mean': iMean,
                })

            # SUMMARY********************************
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_ncols=colCount,
                                               timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(rowCount,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (rowCount, numRows))

            columnsList = summaryResult['summaries']
            self.assertEqual(
                colCount,
                len(columnsList),
                msg=
                "generated %s cols (including output).  summary has %s columns"
                % (colCount, len(columnsList)))

            c = 0
            for column in columnsList:
                # get info from the inspect col for comparison
                iMin = iStats[c]['min']
                iMax = iStats[c]['max']
                iMean = iStats[c]['mean']
                iNaCnt = iStats[c]['naCnt']
                c += 1

                colname = column['colname']
                stats = column['stats']
                stype = column['type']
                hstep = column['hstep']
                hbrk = column['hstep']
                hstart = column['hstart']

                smax = stats['maxs']
                smin = stats['mins']
                sd = stats['sd']
                smean = stats['mean']
                # no zeroes if enum, but we're not enum here
                zeros = stats['zeros']

                self.assertEqual(
                    iMin, smin[0],
                    "inspect min %s != summary min %s" % (iMin, smin))
                self.assertEqual(
                    iMax, smax[0],
                    "inspect max %s != summary max %s" % (iMax, smax))
                self.assertEqual(
                    iMean, smean,
                    "inspect mean %s != summary mean %s" % (iMean, smean))
                # no comparison for 'zeros'

                # now, also compare expected values
                if colname == "V1":
                    synNa = 0
                    # can reverse-engineer the # of zeroes, since data is always 1
                    synSum = synSumList[
                        1]  # could get the same sum for all ccols
                    synZeros = numRows - synSum
                    synSigma = 0.50
                    synMean = (synSum + 0.0) / numRows
                    synMin = [0.0, 1.0]
                    synMax = [1.0, 0.0]

                elif colname == "V2":
                    synSum = 0
                    synSigma = 0
                    synMean = 0
                    if DO_NAN:
                        synZeros = 0
                        synNa = numRows
                        synMin = []
                        synMax = []
                    else:
                        synZeros = numRows
                        synNa = 0
                        synMin = [0.0]
                        synMax = [0.0]

                # a single 1 in the last col
                elif colname == "V" + str(colCount -
                                          1):  # h2o puts a "V" prefix
                    synNa = 0
                    synSum = synSumList[colCount - 1]
                    synZeros = numRows - 1
                    # stddev.p
                    # http://office.microsoft.com/en-us/excel-help/stdev-p-function-HP010335772.aspx

                    synMean = 1.0 / numRows  # why does this need to be a 1 entry list
                    synSigma = math.sqrt(pow((synMean - 1), 2) / numRows)
                    print "last col with single 1. synSigma:", synSigma
                    synMin = [0.0, 1.0]
                    synMax = [1.0, 0.0]

                else:
                    synNa = 0
                    synSum = 0
                    synZeros = numRows
                    synSigma = 0.0
                    synMean = 0.0
                    synMin = [0.0]
                    synMax = [0.0]

                if DO_MEAN:
                    self.assertAlmostEqual(
                        float(smean),
                        synMean,
                        places=6,
                        msg='col %s mean %s is not equal to generated mean %s'
                        % (colname, smean, synMean))

                # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
                self.assertTrue(
                    smin >= synMin,
                    msg='col %s min %s is not >= generated min %s' %
                    (colname, smin, synMin))

                self.assertTrue(
                    smax <= synMax,
                    msg='col %s max %s is not <= generated max %s' %
                    (colname, smax, synMax))

                # reverse engineered the number of zeroes, knowing data was always 1 if present?
                if colname == "V65536" or colname == "V65537":
                    print "columns around possible zeros mismatch:", h2o.dump_json(
                        columns)

                self.assertEqual(
                    zeros,
                    synZeros,
                    msg='col %s zeros %s is not equal to generated zeros %s' %
                    (colname, zeros, synZeros))
コード例 #38
0
ファイル: h2o_cmd.py プロジェクト: vkuznet/h2o
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    """Walk an H2O Summary response, sanity-check the per-column stats, and
    optionally print them.

    Handles both response layouts: the fvec/beta 'summaries' list when
    h2o.beta_features is set, and the older VA 'summary'/'columns' layout
    otherwise. Numeric stats are screened for NaN/Infinity via
    h2o_exec.checkForBadFP, and empty min/max lists are flagged (printed, and
    in the VA min case raised) unless the column is entirely NAs.

    summaryResult -- parsed JSON dict from the Summary endpoint
    noPrint       -- suppress the per-column dump when True
    numCols       -- expected column count (only used by disabled 'if 1 == 0' checks)
    numRows       -- expected row count; when a column's NA count equals it,
                     the min/max/mean checks are skipped
    """
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")
    if h2o.beta_features:
        # fvec/beta layout: a flat list of per-column summary dicts
        # names = summaryResult['names']
        # means = summaryResult['means']
        summaries = summaryResult['summaries']

        # what if we didn't get the full # of cols in this summary view?
        # I guess the test should deal with that
        if 1 == 0 and numCols and (len(summaries) != numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" %
                            (numCols, len(summaries)))

        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            # NA count should always be a clean (non-NaN/Inf) number
            h2o_exec.checkForBadFP(
                nacnt,
                'nacnt for colname: %s stattype: %s' % (colname, stattype))

            if stattype == 'Enum':
                # enum columns only carry cardinality, no numeric stats
                cardinality = stats['cardinality']
                h2o_exec.checkForBadFP(
                    cardinality, 'cardinality for colname: %s stattype: %s' %
                    (colname, stattype))

            else:
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                # check for NaN/Infinity in some of these
                # apparently we can get NaN in the mean for a numerica col with all NA?
                h2o_exec.checkForBadFP(mean,
                                       'mean for colname: %s stattype: %s' %
                                       (colname, stattype),
                                       nanOkay=True,
                                       infOkay=True)
                h2o_exec.checkForBadFP(sd,
                                       'sd for colname: %s stattype %s' %
                                       (colname, stattype),
                                       nanOkay=True,
                                       infOkay=True)
                h2o_exec.checkForBadFP(
                    zeros,
                    'zeros for colname: %s stattype %s' % (colname, stattype))

                if numRows and (nacnt == numRows):
                    # all-NA column: min/max lists are legitimately empty
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (
                        colname, stattype)
                else:
                    if not mins:
                        print h2o.dump_json(column)
                        # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                        print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (
                            mins, stattype, colname, nacnt, numRows)
                    if not maxs:
                        # this is failing on maprfs best buy...why? (va only?)
                        print h2o.dump_json(column)
                        # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))
                        print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (
                            maxs, stattype, colname, nacnt, numRows)

            # histogram description fields (present for both Enum and numeric)
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            if not noPrint:
                print "\n\n************************"
                print "colname:", colname
                print "coltype:", coltype
                print "nacnt:", nacnt

                print "stattype:", stattype
                if stattype == 'Enum':
                    print "cardinality:", cardinality
                else:
                    print "mean:", mean
                    print "sd:", sd
                    print "zeros:", zeros
                    print "mins:", mins
                    print "maxs:", maxs
                    print "pct:", pct
                    print "pctile:", pctile

                # histogram stuff
                print "hstart:", hstart
                print "hstep:", hstep
                print "hbrk:", hbrk
                print "hcnt:", hcnt

    else:
        # older VA layout: summary -> columns, with histogram/percentiles nested
        summary = summaryResult['summary']
        columnList = summary['columns']
        # can't get the right number of columns in summary? have to ask for more cols (does va support >  1000)
        if 1 == 0 and numCols and (len(columnList) != numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" %
                            (numCols, len(columnList)))
        for column in columnList:
            N = column['N']
            # self.assertEqual(N, rowCount)
            name = column['name']
            stype = column['type']
            histogram = column['histogram']
            bin_size = histogram['bin_size']
            bin_names = histogram['bin_names']
            # if not noPrint:
            #     for b in bin_names:
            #        print "bin_name:", b

            bins = histogram['bins']
            # NOTE(review): this reads the same 'bins' key twice, so nbins is
            # always identical to bins — presumably histogram['nbins'] was
            # intended; confirm against the VA summary schema before changing
            nbins = histogram['bins']
            if not noPrint:
                print "\n\n************************"
                print "N:", N
                print "name:", name
                print "type:", stype
                print "bin_size:", bin_size
                print "len(bin_names):", len(bin_names), bin_names
                print "len(bins):", len(bins), bins
                print "len(nbins):", len(nbins), nbins

            # not done if enum
            if stype != "enum":
                zeros = column['zeros']
                na = column['na']
                maxs = column['max']
                mins = column['min']
                mean = column['mean']
                sigma = column['sigma']
                if not noPrint:
                    print "zeros:", zeros
                    print "na:", na
                    print "maxs:", maxs
                    print "mins:", mins
                    print "mean:", mean
                    print "sigma:", sigma

                if numRows and (na == numRows):
                    # all-NA column: skip the min/max sanity checks
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (
                        name, stype)
                else:
                    if not mins:
                        print h2o.dump_json(column)
                        raise Exception(
                            "Why is min[] empty for a %s col (%s) ? %s %s %s" %
                            (mins, stype, N, na, numRows))
                    if not maxs:
                        print h2o.dump_json(column)
                        # bestbuy dataset in maprfs is failing this ..for va only? not sure why. some nas?
                        print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (
                            maxs, stype, N, na, numRows)

                # sometimes we don't get percentiles? (if 0 or 1 bins?)
                if len(bins) >= 2:
                    percentiles = column['percentiles']
                    thresholds = percentiles['thresholds']
                    values = percentiles['values']

                    if not noPrint:
                        # h2o shows 5 of them, ordered
                        print "len(max):", len(maxs), maxs
                        print "len(min):", len(mins), mins
                        print "len(thresholds):", len(thresholds), thresholds
                        print "len(values):", len(values), values

                    # every percentile value must lie within [min, max]
                    for v in values:
                        # 0 is the most max or most min
                        if not v >= mins[0]:
                            m = "Percentile value %s should all be >= the min dataset value %s" % (
                                v, mins[0])
                            raise Exception(m)
                        if not v <= maxs[0]:
                            m = "Percentile value %s should all be <= the max dataset value %s" % (
                                v, maxs[0])
                            raise Exception(m)
コード例 #39
0
    def test_GBMGrid_basic_many(self):
        trainFilename = 'prostate.csv'
        train_key = 'prostate.hex'
        timeoutSecs = 300
        csvPathname = "logreg/" + trainFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=train_key,
                                       schema='put')

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': '[ID]',  # this has to have []
            'score_each_iteration': True,
            'response_column': 'CAPSULE',
            'do_classification': 1 if DO_CLASSIFICATION else 0,
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': '8, 10',
            'max_depth': '8, 9',
            'min_rows': '1, 2',
            'nbins': 40,
            'learn_rate': '.1, .2',
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            'variable_importance': False,
            # 'seed':
        }

        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0

        for i in range(5):
            modelKey = 'GBMGrid_prostate_%s', i
            bmResult = h2o.n0.build_model(algo='gbm',
                                          destination_key=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')
            print "GBMResult:", h2o.dump_json(bm)

            job_key = bm.job_key
            model_key = bm.destination_key
            jobs.append((job_key, model_key))
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start
        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs

        for job_key, model_key in jobs:
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame']  # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
コード例 #40
0
ファイル: h2o_jobs.py プロジェクト: segahm/h2o
def pollWaitJobs(pattern=None,
                 timeoutSecs=30,
                 pollTimeoutSecs=30,
                 retryDelaySecs=5,
                 benchmarkLogging=None,
                 stallForNJobs=-1):
    """Poll the cloud's job list until no job is still running, or (when
    stallForNJobs >= 0) until at most that many jobs remain in progress.

    pattern          -- substring; destination_keys containing it are collected
                        and returned
    timeoutSecs      -- raise if jobs are still busy after this much wall time
    pollTimeoutSecs  -- timeout for each individual jobs_admin request
    retryDelaySecs   -- sleep between polls (also the waitTime increment)
    benchmarkLogging -- optional list like ['cpu','disk','jstack'] to capture
                        perf logs while waiting
    stallForNJobs    -- -1 means wait for all jobs; otherwise stop waiting once
                        the in-progress count drops to this value
    Returns the list of matching destination keys seen on the final poll.
    """
    anyBusy = True
    waitTime = 0
    while (anyBusy):
        # reset each poll: patternKeys reflects only the most recent job list.
        # NOTE(review): the early 'break' in the stall pre-check below exits
        # before the job loop runs, so patternKeys is returned empty in that
        # path — confirm that is intended before relying on the return value
        # with stallForNJobs set.
        patternKeys = []
        # timeout checking has to move in here now! just count loops
        anyBusy = False
        a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs)
        ## print "jobs_admin():", h2o.dump_json(a)
        jobs = a['jobs']
        stall = -1
        if stallForNJobs != -1:
            # pre-count in-progress jobs; an empty end_time marks "running"
            stall = 0
            for j in jobs:
                stall += 1 if j['end_time'] == '' else 0
            if stall <= stallForNJobs: break
            print str(stall), " jobs in progress.", "Waiting to poll on ", str(
                stallForNJobs), " jobs."
        for j in jobs:
            ### h2o.verboseprint(j)
            # save the destination keys for any GLMModel in progress
            if pattern and pattern in j['destination_key']:
                patternKeys.append(j['destination_key'])

            if j['end_time'] == '':
                anyBusy = True
                h2o.verboseprint("waiting", waitTime, "secs, still not done - ",\
                    "destination_key:", j['destination_key'], \
                    "progress:",  j['progress'], \
                    "cancelled:", j['cancelled'],\
                    "end_time:",  j['end_time'])
            else:
                # a finished job lowers the in-progress count; once it drops
                # to the stall threshold we can stop treating the cloud as busy
                if stallForNJobs != -1:
                    stall -= 1
                    if stall <= stallForNJobs:
                        anyBusy = False
                        break
                    print str(
                        stall
                    ), " jobs in progress.", "Waiting to poll on ", str(
                        stallForNJobs), " jobs."
        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if (anyBusy and waitTime > timeoutSecs):
            print h2o.dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after",
                            timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)
    return patternKeys
コード例 #41
0
    def test_quantile_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            h2o.beta_features = True
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
コード例 #42
0
    def test_parse_bounds_libsvm (self):
        print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
        ## h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 100, 'cA', 300),
            (100000, 100, 'cB', 300),
            (100, 100000, 'cC', 300),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False)
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                row_size = inspect['row_size']
                value_size_bytes = inspect['value_size_bytes']
                print "\n" + csvPathname, \
                    "    num_rows:", "{:,}".format(num_rows), \
                    "    num_cols:", "{:,}".format(num_cols), \
                    "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                    "    row_size:", "{:,}".format(row_size)

                expectedRowSize = num_cols * 1 # plus output
                expectedValueSize = expectedRowSize * num_rows
                self.assertEqual(row_size, expectedRowSize,
                    msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                    (row_size, expectedRowSize))
                self.assertEqual(value_size_bytes, expectedValueSize,
                    msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                    (value_size_bytes, expectedValueSize))


                summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(colNumberMax+1, num_cols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))
                self.assertEqual(rowCount, num_rows, 
                    msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

                summary = summaryResult['summary']
                columnsList = summary['columns']
                self.assertEqual(colNumberMax+1, len(columnsList), 
                    msg="generated %s cols (including output).  summary has %s columns" % (colNumberMax+1, len(columnsList)))

                for columns in columnsList:
                    N = columns['N']
                    # self.assertEqual(N, rowCount)
                    name = columns['name']
                    stype = columns['type']

                    histogram = columns['histogram']
                    bin_size = histogram['bin_size']
                    bin_names = histogram['bin_names']
                    bins = histogram['bins']
                    nbins = histogram['bins']

                    # definitely not enums
                    zeros = columns['zeros']
                    na = columns['na']
                    smax = columns['max']
                    smin = columns['min']
                    mean = columns['mean']
                    sigma = columns['sigma']

                    # a single 1 in the last col
                    if name == "V" + str(colNumberMax): # h2o puts a "V" prefix
                        synZeros = num_rows - 1
                        synSigma = None # not sure..depends on the # rows somehow (0 count vs 1 count)
                        synMean = 1.0/num_rows # why does this need to be a 1 entry list
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    elif name == ("V1"):
                        # can reverse-engineer the # of zeroes, since data is always 1
                        synSum = synColSumDict[1] # could get the same sum for all ccols
                        synZeros = num_rows - synSum
                        synSigma = 0.50
                        synMean = (synSum + 0.0)/num_rows
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    else:
                        synZeros = num_rows
                        synSigma = 0.0
                        synMean = 0.0
                        synMin = [0.0]
                        synMax = [0.0]

                    # print zeros, synZeros
                    self.assertAlmostEqual(float(mean), synMean, places=6,
                        msg='col %s mean %s is not equal to generated mean %s' % (name, mean, 0))

                    # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
                    self.assertEqual(smin, synMin,
                        msg='col %s min %s is not equal to generated min %s' % (name, smin, synMin))

                    # reverse engineered the number of zeroes, knowing data was always 1 if present?
                    if name == "V65536" or name == "V65537":
                        print "columns around possible zeros mismatch:", h2o.dump_json(columns)

                    self.assertEqual(zeros, synZeros,
                        msg='col %s zeros %s is not equal to generated zeros count %s' % (name, zeros, synZeros))

                    self.assertEqual(stype, 'number',
                        msg='col %s type %s is not equal to %s' % (name, stype, 'number'))

                    # our random generation will have some variance for col 1. so just check to 2 places
                    if synSigma:
                        self.assertAlmostEqual(float(sigma), synSigma, delta=0.03,
                            msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma))

                    if CHECK_MAX:
                        self.assertEqual(smax, synMax,
                            msg='col %s max %s is not equal to generated max %s' % (name, smax, synMax))

                    self.assertEqual(0, na,
                        msg='col %s num_missing_values %d should be 0' % (name, na))
コード例 #43
0
    def test_hdfs_cdh5_fvec(self):
        h2o.beta_features = True
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            ("covtype.169x.data", 600),
            ("prostate_long_1G.csv", 600),
            ("airlines_all.csv", 900),
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        print h2o.dump_json(d)
        d = h2i.import_only(path="datasets/*", schema='hdfs', timeoutSecs=1000)
        print h2o.dump_json(d)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time(
            ) - start, 'secs'

            start = time.time()
            print "Saving", csvFilename, 'to HDFS'
            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            csvPathname = "tmp2/a%s.csv" % trial
            path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key,
                                      path=path,
                                      force=1,
                                      timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time(
            ) - start, 'secs'
            trial += 1

            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            d = h2i.import_only(path=csvPathname,
                                schema='hdfs',
                                timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time(
            ) - start, 'secs'
コード例 #44
0
    def test_kmeans_iris_fvec(self):
        csvFilename = 'iris.csv'
        csvPathname = 'iris/' + csvFilename

        print "\nStarting", csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key)

        k = 3
        ignored_cols = 'C5'
        for trial in range(3):
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            kwargs = {
                'ignored_cols': ignored_cols,  # ignore the output
                'k': k,
                'max_iter': 25,
                'initialization': 'Furthest',
                'destination_key': 'iris.hex',
                'seed': 0,
            }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            expected = [
                # if ignored_cols isn't used
                # ([5, 3.4, 1.46, 0.244, 0.0], 50, 15.24) ,
                # ([5.9, 2.76, 4.26, 1.33, 1.02], 51, 32.9) ,
                # ([6.6, 2.98, 5.57, 2.03, 2.0], 49, 39.15) ,
                ([
                    5.005999999999999, 3.4180000000000006, 1.464,
                    0.2439999999999999
                ], 50, 15.240400000000003),
                ([
                    5.901612903225807, 2.748387096774194, 4.393548387096775,
                    1.4338709677419357
                ], 62, 39.82096774193549),
                ([
                    6.8500000000000005, 3.073684210526315, 5.742105263157894,
                    2.0710526315789473
                ], 38, 23.87947368421053),
            ]

            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)

            gs = h2o.nodes[0].gap_statistic(source=hex_key,
                                            ignored_cols=ignored_cols,
                                            k_max=k + 1)
            print "gap_statistic:", h2o.dump_json(gs)

            k_best = gs['gap_model']['k_best']
            self.assertTrue(k_best != 0,
                            msg="k_best shouldn't be 0: %s" % k_best)
コード例 #45
0
    def test_c5_KMeans_sphere_h1m(self):
        """Benchmark KMeans (k=15) on a large synthetic 'spheres' dataset.

        Each of 6 trials re-parses the file (H2O deletes the source key),
        logs parse and KMeans throughput to the benchmark log, then compares
        the clusters found against the hardwired expected centers below.
        The initialization method rotates PlusPlus/Furthest/None by trial.
        """
        # a kludge
        h2o.setup_benchmark_log()

        # DO_REAL selects the larger 'real' variant of the dataset
        if DO_REAL:
            csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        else:
            csvFilename = 'syn_sphere_gen_h1m.csv'

        # total dataset bytes; used for the parse MB/sec calculation below
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        # each tuple: (center coordinates, expected cluster size, expected within-cluster error)
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        # successive reassignments kept as experiment history; only the
        # final (empty) value takes effect
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)

            # log parse throughput to the benchmark log
            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'], numRows=numRows, numCols=numCols, 
                timeoutSecs=300)
            h2o_cmd.infoFromSummary(summary)


            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'max_iter': 30,
                # 'normalize': 1,
                'normalize': 0, # temp try
                'initialization': 'Furthest',
                'destination_key': 'junk.hex', 
                # we get NaNs if whole col is NA
                'ignored_cols': 'C1',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                }

            # rotate the init method by trial number
            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            # allowError=True: log mismatches instead of failing hard
            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
コード例 #46
0
ファイル: h2o_cmd.py プロジェクト: yangls06/h2o
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    """Validate (and optionally print) a Summary2 API result.

    For every column summary: runs h2o_exec.checkForBadFP on nacnt and on
    the numeric stats (mean/sd/zeros), prints a diagnostic (instead of
    raising) when mins/maxs come back empty, and dumps per-column stats
    and histogram fields unless noPrint is True.

    summaryResult -- dict returned by the Summary2 API; must be truthy
    noPrint -- when True, skip the per-column printout
    numCols -- expected column count (the length check is currently disabled)
    numRows -- row count; a column whose nacnt equals it is treated as
               all-NA and its min/max checks are skipped
    Raises Exception if summaryResult is empty or checkForBadFP fails.
    """
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")

    summaries = summaryResult['summaries']
    # what if we didn't get the full # of cols in this summary view?
    # I guess the test should deal with that
    # (check deliberately disabled via the 1 == 0 guard)
    if 1 == 0 and numCols and (len(summaries) != numCols):
        raise Exception("Expected numCols: %s cols in summary. Got %s" %
                        (numCols, len(summaries)))

    for column in summaries:
        colname = column['colname']
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']
        # nacnt should never be NaN/Infinity
        h2o_exec.checkForBadFP(
            nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype))

        if stattype == 'Enum':
            # enum columns only carry a cardinality stat
            cardinality = stats['cardinality']
            h2o_exec.checkForBadFP(
                cardinality, 'cardinality for colname: %s stattype: %s' %
                (colname, stattype))

        else:
            # numeric column stats
            mean = stats['mean']
            sd = stats['sd']
            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            pctile = stats['pctile']

            # check for NaN/Infinity in some of these
            # apparently we can get NaN in the mean for a numeric col with all NA?
            # so mean/sd tolerate NaN/Inf; zeros does not
            h2o_exec.checkForBadFP(mean,
                                   'mean for colname: %s stattype: %s' %
                                   (colname, stattype),
                                   nanOkay=True,
                                   infOkay=True)
            h2o_exec.checkForBadFP(sd,
                                   'sd for colname: %s stattype %s' %
                                   (colname, stattype),
                                   nanOkay=True,
                                   infOkay=True)
            h2o_exec.checkForBadFP(
                zeros,
                'zeros for colname: %s stattype %s' % (colname, stattype))

            if numRows and (nacnt == numRows):
                print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (
                    colname, stattype)
            else:
                # empty mins/maxs is suspicious but only warned about, not fatal
                if not mins:
                    print h2o.dump_json(column)
                    # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                    print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (
                        mins, stattype, colname, nacnt, numRows)
                if not maxs:
                    # this is failing on maprfs best buy...why? (va only?)
                    print h2o.dump_json(column)
                    # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))
                    print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (
                        maxs, stattype, colname, nacnt, numRows)

        # histogram fields are present for all column types
        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        if not noPrint:
            print "\n\n************************"
            print "colname:", colname
            print "coltype:", coltype
            print "nacnt:", nacnt

            print "stattype:", stattype
            if stattype == 'Enum':
                print "cardinality:", cardinality
            else:
                print "mean:", mean
                print "sd:", sd
                print "zeros:", zeros
                print "mins:", mins
                print "maxs:", maxs
                print "pct:", pct
                print "pctile:", pctile

            # histogram stuff
            print "hstart:", hstart
            print "hstep:", hstep
            print "hbrk:", hbrk
            print "hcnt:", hcnt
コード例 #47
0
    def test_get_cloud(self):
        # Ask each node for jstack statistics. do it 100 times
        SLEEP_AFTER = False
        GET_CLOUD_ALL_NODES = True
        TRIALMAX = 25
        NODE = 1
        PRINT_GET_CLOUD = True
        eList = []
        xList = []
        sList = []
        for trial in range(TRIALMAX):
            print "Starting Trial", trial
            print "Just doing node[%s]" % NODE
            getCloudFirst = None
            for i,n in enumerate(h2o.nodes):
                if GET_CLOUD_ALL_NODES or i==NODE: # just track times on 0
                    # we just want the string
                    start = time.time()
                    getCloud = n.get_cloud()
                    elapsed = int(1000 * (time.time() - start)) # milliseconds
                    print "get_cloud completes to node", i, "in", "%s"  % elapsed, "millisecs"
                    getCloudString = json.dumps(getCloud)

                    if PRINT_GET_CLOUD:
                        print h2o.dump_json(getCloud)
                
                    h2o.verboseprint(json.dumps(getCloud,indent=2))

                    if i==NODE: # just track times on 0
                        sList.append(len(getCloudString))
                        xList.append(trial)
                        eList.append(elapsed)

                    if SLEEP_AFTER:
                        delay = 1
                        print "Sleeping for", delay, "sec"
                        time.sleep(delay)

        if h2o.python_username=='kevin':
            import pylab as plt
            if eList:
                print "xList", xList
                print "eList", eList
                print "sList", sList

                plt.figure()
                plt.plot (xList, eList)
                plt.xlabel('trial')
                plt.ylabel('get_cloud completion latency (millisecs)')
                plt.title('Back to Back get_cloud requests to node['+str(NODE)+']')
                plt.draw()

                plt.figure()
                plt.plot (xList, sList)
                plt.xlabel('trial')
                plt.ylabel('node['+str(NODE)+'] get_cloud response string length')
                plt.title('Back to Back get_cloud requests to node['+str(NODE)+']')
                plt.title('Back to Back get_cloud')
                plt.draw()

                plt.show()
コード例 #48
0
ファイル: h2o_glm.py プロジェクト: vkuznet/h2o
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors. 
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    if h2o.beta_features:
        GLMModel = glm['glm_model']
    else:
        GLMModel = glm['GLMModel']

    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" % h2o.dump_json(glm))
    

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    if h2o.beta_features:
        GLMParams = GLMModel['glm']
    else:
        GLMParams = GLMModel["GLMParams"]

    family = GLMParams["family"]

    if h2o.beta_features:
        # number of submodels = number of lambda
        # min of 2. lambda_max is first
        submodels = GLMModel['submodels']
        lambdas = GLMModel['lambdas']
        # since all our tests?? only use one lambda, the best_lamda_idx should = 1
        best_lambda_idx = GLMModel['best_lambda_idx']
        print "best_lambda_idx:", best_lambda_idx
        lambda_max = GLMModel['lambda_max']
        print "lambda_max:", lambda_max

        # currently lambda_max is not set by tomas. ..i.e.not valid
        if 1==0 and lambda_max <= lambdas[best_lambda_idx]:
            raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, lambdas[best_lambda_idx]))

        # submodels0 = submodels[0]
        # submodels1 = submodels[-1] # hackery to make it work when there's just one

        if (best_lambda_idx >= len(lambdas)) or (best_lambda_idx < 0):
            raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(lambdas)))

        if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
            raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels)))

        submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one
        iterations = submodels1['iteration']

    else:
        iterations = GLMModel['iterations']

    print "GLMModel/iterations:", iterations

            # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations  > maxExpectedIterations:
            raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )

    if h2o.beta_features:
        if 'validation' not in submodels1:
            raise Exception("Should be a 'validation' key in submodels1: %s" % h2o.dump_json(submodels1))
        validationsList = submodels1['validation']
        validations = validationsList
        
    else:
        # pop the first validation from the list
        if 'validations' not in GLMModel:
            raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel))
        validationsList = GLMModel['validations']
        # don't want to modify validationsList in case someone else looks at it
        validations = validationsList[0]

    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    # not checked in v2?
    if not h2o.beta_features:
        if not 'xval_models' in validations:
            if n_folds > 1:
                raise Exception("No cross validation models returned. Asked for "+n_folds)
        else:
            xval_models = validations['xval_models']
            if n_folds and n_folds > 1:
                if len(xval_models) != n_folds:
                    raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds)
            else:
                # should be default 10?
                if len(xval_models) != 10:
                    raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10")

    if h2o.beta_features:
        print "GLMModel/validations"        
        validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
        validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])        
        print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
        print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    else:
        print "GLMModel/validations"
        validations['err'] = h2o_util.cleanseInfNan(validations['err'])
        validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
        validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
        print "%15s %s" % ("err:\t", validations['err'])
        print "%15s %s" % ("nullDev:\t", validations['nullDev'])
        print "%15s %s" % ("resDev:\t", validations['resDev'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        if h2o.beta_features:
            best_threshold = validations['best_threshold']
            thresholds = validations['thresholds']
            print "%15s %s" % ("best_threshold:\t", best_threshold)

            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i,t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
                
            assert best_index!=None, "%s %s" % (best_threshold, thresholds)
            print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

            # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
            submodels = glm['glm_model']['submodels']
            cms = submodels[0]['validation']['_cms']
            assert best_index<len(cms), "%s %s" % (best_index, len(cms))
            # if we want 0.5..rounds to int
            # mid = len(cms)/2
            # cm = cms[mid]
            cm = cms[best_index]

            print "cm:", h2o.dump_json(cm['_arr'])
            predErr = cm['_predErr']
            classErr = cm['_classErr']
            # compare to predErr
            pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
            print "predErr:", predErr
            print "calculated pctWrong from cm:", pctWrong
            print "classErr:", classErr

            # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm['_arr'])
        else:
            print "%15s %s" % ("threshold:\t", validations['threshold'])


    if family=="poisson" or family=="gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    if not h2o.beta_features:
        if math.isnan(validations['err']):
            emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err'])
            raise Exception(emsg)

        if math.isnan(validations['resDev']):
            emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev'])
            raise Exception(emsg)

        # legal?
        if math.isnan(validations['nullDev']):
            pass

    # get a copy, so we don't destroy the original when we pop the intercept
    if h2o.beta_features:
        coefficients_names = GLMModel['coefficients_names']
        idxs = submodels1['idxs']
        column_names = coefficients_names

        # always check both normalized and normal coefficients
        norm_beta = submodels1['norm_beta']
        if norm_beta and len(column_names)!=len(norm_beta):
            print len(column_names), len(norm_beta)
            raise Exception("column_names and normalized_norm_beta from h2o json not same length. column_names: %s normalized_norm_beta: %s" % (column_names, norm_beta))

        beta = submodels1['beta']
        if len(column_names)!=len(beta):
            print len(column_names), len(beta)
            raise Exception("column_names and beta from h2o json not same length. column_names: %s beta: %s" % (column_names, beta))


        # test wants to use normalized?
        if doNormalized:
            beta_used = norm_beta
        else:
            beta_used = beta

        coefficients = {}
        # create a dictionary with name, beta (including intercept) just like v1

        for n,b in zip(column_names, beta_used):
            coefficients[n] = b

        print  "coefficients:", coefficients
        print  "beta:", beta
        print  "norm_beta:", norm_beta

        print "intercept demapping info:", \
            "column_names[-i]:", column_names[-1], \
            "idxs[-1]:", idxs[-1], \
            "coefficients_names[[idxs[-1]]:", coefficients_names[idxs[-1]], \
            "beta_used[-1]:", beta_used[-1], \
            "coefficients['Intercept']", coefficients['Intercept']

        # idxs has the order for non-zero coefficients, it's shorter than beta_used and column_names
        for i in idxs:
            if beta_used[i]==0.0:
                raise Exception("idxs shouldn't point to any 0 coefficients i: %s beta_used[i]:" (i, beta_used[i]))

        intercept = coefficients.pop('Intercept', None)

        # intercept demapping info: idxs[-1]: 54 coefficient_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
        # the last one shoudl be 'Intercept' ?
        column_names.pop()

    else:
        if doNormalized:
            coefficients = GLMModel['normalized_coefficients'].copy()
        else:
            coefficients = GLMModel['coefficients'].copy()
        column_names = GLMModel['column_names']
        # get the intercept out of there into it's own dictionary
        intercept = coefficients.pop('Intercept', None)
        print "First intercept:", intercept

    # have to skip the output col! get it from kwargs
    # better always be there!
    if h2o.beta_features:
        y = kwargs['response']
    else:
        y = kwargs['y']


    # the dict keys are column headers if they exist...how to order those? new: use the 'column_names'
    # from the response
    # Tomas created 'column_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'column_names' if dropped by GLMQuery before
    # he gets it? 
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint: 
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # column_names is input only now..same for header or no header, or expanded enums
    for c in column_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint: 
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(column_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept"
                ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients)>0):
        maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else: 
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero, 
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients)>1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26"
            ))

    if h2o.beta_features:
        print "submodels1, run_time (milliseconds):", submodels1['run_time']
    else:

        print "GLMModel model time (milliseconds):", GLMModel['model_time']
        print "GLMModel validation time (milliseconds):", validations['val_time']
        print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()

    return (warnings, cList, intercept)
コード例 #49
0
    def test_anomaly_uniform_w_NA(self):
        """Train a deep-learning autoencoder on synthetic uniform data (with NA
        injection, presumably done inside write_syn_dataset -- confirm there),
        then run the Anomaly endpoint on it and sanity-check the result shape.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # each tuple: (rowCount, colCount, hex_key, expectedMin, expectedMax)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, COLS, 'x.hex', 1, 20000),
            (ROWS, COLS, 'x.hex', -5000, 0),
            (ROWS, COLS, 'x.hex', -100000, 100000),
            (ROWS, COLS, 'x.hex', -1, 1),
            (ROWS, COLS, 'A.hex', 1, 100),
            (ROWS, COLS, 'A.hex', -99, 99),
            (ROWS, COLS, 'B.hex', 1, 10000),
            (ROWS, COLS, 'B.hex', -100, 100),
            (ROWS, COLS, 'C.hex', 1, 100000),
            (ROWS, COLS, 'C.hex', -101, 101),
        ]

        trial = 1
        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList:
            # per-file seed so each generated dataset is independent
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "numRows:", numRows, "numCols:", numCols

            # autoencoder: no classification, response is the last column
            model_key = "m.hex"
            kwargs = {
                'ignored_cols': None,
                'response': numCols - 1,
                'classification': 0,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117',
                'adaptive_rate': 0,
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                # 'loss'                         : 'CrossEntropy',
                'max_w2': 15,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 2.0,
                'destination_key': model_key,
                # 'validation'                   : None,
                'score_interval': 10000,
                'autoencoder': 1,
            }

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "neural net end. took", time.time() - start, "seconds"

            # score the training frame against the trained autoencoder
            kwargs = {
                'destination_key': "a.hex",
                'source': parseResult['destination_key'],
                'dl_autoencoder_model': model_key,
                'thresh': 1.0
            }

            anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs)
            inspect = h2o_cmd.runInspect(None, "a.hex")
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "anomaly: numRows:", numRows, "numCols:", numCols
            self.assertEqual(numCols, 1)
            # twice as many rows because of NA injection
            self.assertEqual(numRows, rowCount * (1 + NA_ROW_RATIO))

            # first col has the anomaly info. other cols are the same as orig data
            aSummary = h2o_cmd.runSummary(key='a.hex', cols=0)
            h2o_cmd.infoFromSummary(aSummary)

            print "anomaly:", h2o.dump_json(anomaly)
            trial += 1
            # clean up so the next trial starts from an empty store
            h2i.delete_keys_at_all_nodes()
コード例 #50
0
ファイル: h2o_import.py プロジェクト: jayfans3/h2o
def import_parse(node=None,
                 schema='local',
                 bucket=None,
                 path=None,
                 src_key=None,
                 hex_key=None,
                 timeoutSecs=30,
                 retryDelaySecs=0.5,
                 initialDelaySecs=0.5,
                 pollTimeoutSecs=180,
                 noise=None,
                 benchmarkLogging=None,
                 noPoll=False,
                 doSummary=True,
                 noPrint=True,
                 **kwargs):
    """Import a dataset into the cloud and parse it, returning the parse result.

    Defaults to h2o.nodes[0] when no node is given. When the parse is polled
    to completion (noPoll False) and doSummary is True, also runs Inspect and
    SummaryPage on the parsed key for extra API coverage.
    """
    if not node:
        node = h2o.nodes[0]

    importResult, importPattern = import_only(
        node, schema, bucket, path, timeoutSecs, retryDelaySecs,
        initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging,
        noPoll, doSummary, src_key, **kwargs)

    h2o.verboseprint("importPattern:", importPattern)
    h2o.verboseprint("importResult", h2o.dump_json(importResult))

    parseResult = parse_only(
        node, importPattern, hex_key, timeoutSecs, retryDelaySecs,
        initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging,
        noPoll, **kwargs)
    h2o.verboseprint("parseResult:", h2o.dump_json(parseResult))

    # SummaryPage only makes sense once the parse has actually completed,
    # i.e. not when noPoll is set.
    if doSummary and not noPoll:
        # error isolation: if the parse blew up, find its stack traces here
        # rather than having the next request blow up
        h2o.check_sandbox_for_errors()
        inspect = node.inspect(parseResult['destination_key'],
                               timeoutSecs=timeoutSecs)
        if h2o.beta_features:
            numRows = inspect['numRows']
            numCols = inspect['numCols']
        else:
            numRows = inspect['num_rows']
            numCols = inspect['num_cols']

        # pass numRows/numCols so summary can spot all-NA columns
        # (and ignore their min/max/mean/sigma)
        node.summary_page(parseResult['destination_key'],
                          timeoutSecs=timeoutSecs,
                          noPrint=noPrint,
                          numRows=numRows,
                          numCols=numCols)
        # for now, don't worry about error isolating summary
    else:
        # still isolate the parse from whatever runs next
        h2o.check_sandbox_for_errors()

    return parseResult
コード例 #51
0
ファイル: test_summary2_exp.py プロジェクト: zhuyuecai/h2o
    def test_summary2_exp(self):
        """Generate exponentially-distributed synthetic columns, run both the v1
        Summary and v2 Summary2 endpoints on them, and compare percentiles/extremes
        against expectations (and against a scipy-based quantile check at the end).

        NOTE(review): this toggles the global h2o.beta_features back and forth to
        select the v1 vs v2 API -- order of the calls matters.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # one lambda shared by all generated files in this run
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
            (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None,
                                                  None)),
            (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
            (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None,
                                           None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            # tolerance: half of one 5% bin of the expected range
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            # v1 summary first (beta_features off)
            h2o.beta_features = False
            summary1Result = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("Summary1 summary1Result:",
                             h2o.dump_json(summary1Result))
            percentiles1 = summary1Result['summary']['columns'][0][
                'percentiles']
            thresholds1 = percentiles1['thresholds']
            values1 = percentiles1['values']

            print "Summary1 thresholds", h2o_util.twoDecimals(thresholds1)
            print "Summary1 values", h2o_util.twoDecimals(values1)

            # then v2 summary (beta_features on)
            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:",
                             h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            # (expected tuple entries are None here, so these are effectively skipped)
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            # histogram info from summary2
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
コード例 #52
0
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60))
            elapsed = time.time() - start
            print "parse end on ", hex_key, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                'cols': None,
                'initialization': 'Furthest',
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \

# Allow running this test file directly; h2o.unit_main() presumably wraps the
# unittest runner with cloud setup/teardown -- confirm in the h2o module.
if __name__ == '__main__':
    h2o.unit_main()
コード例 #53
0
    def test_RF(self):
        paramsTrainRF = {
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3,
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1,
         totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        # self.assertEqual(4.29, classification_error1)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error1)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82],
                         classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2,
         totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        # self.assertEqual(4.29, classification_error2)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error1)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82],
                         classErrorPctList1)
        self.assertEqual(58101, totalScores2)

        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception(
                "Too many diffs in JsonDiff sorted vs non-sorted %s" %
                len(df.difference))
コード例 #54
0
ファイル: test_exec2_log_like_R.py プロジェクト: yangls06/h2o
    def test_exec2_log_like_R(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        # csvPathname = '1B/reals_100000x1000_15f.data'
        # csvPathname = '1B/reals_1000000x1000_15f.data'
        # csvPathname = '1B/reals_1000000x1_15f.data'
        # csvPathname = '1B/reals_1B_15f.data'
        # csvPathname = '1B/reals_100M_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=3000,
                                       retryDelaySecs=2,
                                       doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        xList = []
        eList = []
        fList = []
        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=300)
        for trial in range(300):
            for execExpr in exprList:
                # put the trial number into the temp for uniqueness
                execExpr = re.sub('Last.value', 'Last.value%s' % trial,
                                  execExpr)
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr,
                                                   resultKey=None,
                                                   timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'
                c = h2o.nodes[0].get_cloud()
                c = c['nodes']

                # print (h2o.dump_json(c))
                k = [i['num_keys'] for i in c]
                v = [i['value_size_bytes'] for i in c]

                print "keys: %s" % " ".join(map(str, k))
                print "value_size_bytes: %s" % " ".join(map(str, v))

                # print "result:", result
                if DO_ORIG:
                    if 'r1' in execExpr:
                        xList.append(trial)
                        eList.append(execTime)
                    if 'log' in execExpr:
                        fList.append(execTime)
                else:
                    xList.append(trial)
                    eList.append(execTime)
                    fList.append(execTime)

        h2o.check_sandbox_for_errors()
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            if DO_ORIG:
                eLabel = 'time: Last.value<trial>.4 = r1[,c(1)]'
                fLabel = 'time: Last.value<trial>.7 = log(Last.value<trial>.6)'
            else:
                eLabel = 'time: Last.value.3 = r2+1'
                fLabel = 'time: Last.value.3 = r2+1'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
コード例 #55
0
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        """Parse each non-gzipped manyfiles-nflx dataset, log parse throughput
        into the shared benchmark log, and optionally run a binomial GLM on the
        parsed frame.

        csvFilenameList: iterable of (csvFilepattern, csvFilename, totalBytes,
        timeoutSecs) tuples; totalBytes may be None to skip the MB/sec report.
        """
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        # optionally capture machine stats alongside each benchmark entry
        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        # poll/retry cadence shared by all parses below
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                # do an import-only pass first so any import failures are
                # visible before the parse obscures them
                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                # report parse throughput only when the total file size is known
                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # remove the output too! (378)
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    # execExpr="A.hex=%s" % parseResult['destination_key']
                    # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    # threshold column C379 (0-based index 378) into 0/1 so the
                    # binomial GLM has a two-level response
                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    # minimal stand-in for a parseResult dict; runGLM only needs
                    # the destination_key here
                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
コード例 #56
0
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"

            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt",
                         "w")
                result = h2o.dump_json(storeViewResult)
                f.close()
                lastStoreViewResult = storeViewResult

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
コード例 #57
0
ファイル: test_ddply_plot.py プロジェクト: yangls06/h2o
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (1000000, 5, 'cD', 0, 10, 30),
            (1000000, 5, 'cD', 0, 20, 30),
            (1000000, 5, 'cD', 0, 30, 30),
            (1000000, 5, 'cD', 0, 40, 30),
            (1000000, 5, 'cD', 0, 50, 30),
            (1000000, 5, 'cD', 0, 70, 30),
            (1000000, 5, 'cD', 0, 100, 30),
            (1000000, 5, 'cD', 0, 130, 30),
            (1000000, 5, 'cD', 0, 160, 30),
            # (1000000, 5, 'cD', 0, 320, 30),
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30),
            # (1000000, 5, 'cD', 0, 1280, 30),
        ]

        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt,
             timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt -
                                                                 minInt) + 1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt,
                              SEEDPERFILE)

            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey)

            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0],
                              execExpr,
                              resultKey=resultKey,
                              timeoutSecs=60)

            # do it twice..to get the optimal cached delay for time?
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=60)
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed

            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            groups = execResult['num_rows']
            maxExpectedGroups = ((maxInt - minInt) + 1)**2
            h2o_util.assertApproxEqual(
                groups,
                maxExpectedGroups,
                rel=0.2,
                msg="groups %s isn't close to expected amount %s" %
                (groups, maxExpectedGroups))

            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            # should be same answer in both cases
            execExpr = "d=sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            print "execResult", h2o.dump_json(execResult)
            self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
コード例 #58
0
    def test_summary2_uniform(self):
        """Generate uniform random single-column datasets over several ranges,
        run H2O Summary2 and Quantiles on each, and check min/max, the
        25/50/75th percentiles, and the histogram bin counts against what a
        uniform distribution should produce; finally cross-check the median
        (or .999 quantile) against a local sort-based computation.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,
             ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,
             ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0,
             ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
            (ROWS, 1, 'A.hex', 1.0, 100.0,
             ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,
             ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
            (ROWS, 1, 'B.hex', 1.0, 10000.0,
             ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
            (ROWS, 1, 'C.hex', 1.0, 100000.0,
             ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        # NOTE(review): timeoutSecs is set to 10 and then overwritten with 60
        # below; n and lenNodes are unused here
        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
                                                       colCount, expectedMin,
                                                       expectedMax,
                                                       SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxDelta = 1 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight?
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=column['colname'],
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2,
                                       interpolation_type=7)  # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       rel=.00001,
                                       msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       rel=.00001,
                                       msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            # pctile indices 3/5/7 correspond to the 25/50/75% thresholds above
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            # histogram: start, step, breaks, per-bin counts
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            # NOTE(review): e = numRows/len(hcnt) is computed for the message,
            # but the assert compares against rowCount/len(hcnt) — confirm
            # numRows == rowCount is always the intended invariant here
            for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                # NOTE(review): when not DO_MEDIAN, pctile[10] is the 0.99
                # threshold while quantile is .999 — confirm this mismatch is
                # intended (expectedPct has no .999 entry)
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                )

            # clean the KV store between datasets so keys don't accumulate
            h2o.nodes[0].remove_all_keys()
コード例 #59
0
    def test_GBM_regression_rand2(self):
        """Parse covtype train/test splits, then run several GBM regression
        trials with randomized parameters (from define_gbm_params()) and
        predict on the test set after each training run.
        """
        # NOTE(review): beta_features is off for the parses, then switched on
        # before the GBM train below — confirm the fvec toggle is intentional
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", trainKey

            # Parse (test)****************************************
            # NOTE(review): start is not reset here, so this elapsed includes
            # the train parse time as well — confirm that's acceptable
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", testKey

            paramsDict = define_gbm_params()
            for trial in range(3):
                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {
                    'response': 'C55', 
                    # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 
                    'ntrees': 2, 
                    'classification': 0,
                    'validation': testKey,
                    }
                # overlay a random selection from paramsDict onto the defaults
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                h2o.beta_features = True
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                print "gbmTrainView:", h2o.dump_json(gbmTrainView)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                # for regression, the cms are all null, so don't print

                # GBM test****************************************
                predictKey = 'Predict.hex'
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=testKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"
コード例 #60
0
    def test_hdfs_hdp2_1(self):
        """Import a list of files from HDFS, exercise the typeahead endpoint,
        parse each file, and (optionally, when DO_EXPORT is set) export the
        parsed frame back to HDFS and re-import/re-parse it.
        """
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        # Each entry is (filename under hdfs /datasets, per-file timeoutSecs).
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            # ("covtype.169x.data", 1200),
            ("prostate_long_1G.csv", 200),
            ("airlines_all.csv", 1200),
        ]

        # pick 8 randomly!
        # Disabled toggle: (1 == 0) is always False, so the full ordered
        # list is used; flip to (1 == 1) to sample 8 files instead.
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        # Prime the import path; result 'd' is unused here.
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename

            # Do a simple typeahead check on the directory
            # typeaheadResult 2: {
            #   "__meta": {
            #     "schema_name": "TypeaheadV2",
            #     "schema_type": "Iced",
            #     "schema_version": 2
            #   },
            #   "limit": 2,
            #   "matches": [
            #     "hdfs://172.16.2.186/datasets/15Mx2.2k.csv",
            #     "hdfs://172.16.2.186/datasets/1Mx2.2k.NAs.csv"
            #   ],
            #   "src": "hdfs://172.16.2.186/datasets/"
            # }

            typeaheadPath = "hdfs://" + h2o.nodes[
                0].hdfs_name_node + "/datasets/"
            # limit=2: expect exactly 2 matches back.
            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=2)
            # NOTE(review): 'dump_json' is called unqualified here but as
            # 'h2o.dump_json' later in this method — presumably a NameError
            # unless 'from h2o import dump_json' exists at file top; verify.
            print "typeaheadResult 2:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) == 2

            # limit=0 / None / -1: all treated as "no limit"; expect > 2 matches.
            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=0)
            print "typeaheadResult 0:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) > 2

            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=None)
            print "typeaheadResult 0:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) > 2

            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=-1)
            print "typeaheadResult -1:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) > 2

            # Import + parse the HDFS file into hex_key.
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time(
            ) - start, 'secs'
            # Wrap parse/inspect results in helper objects for easy access.
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            # Optional round-trip: export the parsed frame back to HDFS,
            # then re-import and re-parse it.
            if DO_EXPORT:
                start = time.time()
                print "Saving", csvFilename, 'to HDFS'
                print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
                print "Unique per-user to avoid permission issues"
                username = getpass.getuser()
                csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
                # reuse the file name to avoid running out of space
                # (the per-trial name above is immediately overwritten)
                csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files',
                                                   username)

                path = "hdfs://" + h2o.nodes[
                    0].hdfs_name_node + "/" + csvPathname
                h2o.nodes[0].export_files(src_key=hex_key,
                                          path=path,
                                          force=1,
                                          timeoutSecs=timeoutSecs)
                print "export_files of", hex_key, "to", path, "took", time.time(
                ) - start, 'secs'
                trial += 1

                print "Re-Loading", csvFilename, 'from HDFS'
                start = time.time()
                hex_key = "a2.hex"
                # brief pause so the exported file is visible to the importer
                time.sleep(2)
                d = h2i.import_only(path=csvPathname,
                                    schema='hdfs',
                                    timeoutSecs=1000)
                print h2o.dump_json(d)
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='hdfs',
                                               hex_key=hex_key,
                                               timeoutSecs=1000)
                print "hdfs re-parse of", csvPathname, "took", time.time(
                ) - start, 'secs'