Beispiel #1
0
def exec_expr(node,
              execExpr,
              resultKey="Result.hex",
              timeoutSecs=10,
              ignoreH2oError=False):
    """Run an Exec expression, then inspect and check the result keys.

    The expression is submitted on `node` through h2o_cmd.runExecOnly with
    the given `timeoutSecs` and `ignoreH2oError`. Afterwards the default
    "Result.hex" key and then `resultKey` are each inspected (offset=-1) and
    handed to checkScalarResult.

    Returns a 2-tuple: the inspect response for `resultKey`, and whatever
    checkScalarResult returned for that response.
    """
    began = time.time()
    execKwargs = {
        'expression': execExpr,
        'timeoutSecs': timeoutSecs,
        'ignoreH2oError': ignoreH2oError,
    }
    # FIX! Exec has an 'escape_nan' arg now; should we test it?
    # (escape_nan=0 was removed from this call on 5/14/13)
    execResponse = h2o_cmd.runExecOnly(node, **execKwargs)
    h2o.verboseprint(execResponse)
    elapsedSecs = time.time() - began
    h2o.verboseprint('exec took', elapsedSecs, 'seconds')

    # The default key is checked only for its side effects; nothing from it
    # is returned to the caller.
    h2o.verboseprint("\nfirst look at the default Result key")
    # offset=-1 is apparently what returns the metadata -- TODO confirm
    checkScalarResult(
        h2o_cmd.runInspect(None, "Result.hex", offset=-1), "Result.hex")

    h2o.verboseprint("\nNow look at the assigned " + resultKey + " key")
    assignedInspect = h2o_cmd.runInspect(None, resultKey, offset=-1)
    return assignedInspect, checkScalarResult(assignedInspect, resultKey)
Beispiel #2
0
def exec_expr(node, execExpr, resultKey="Result.hex", timeoutSecs=10):
    """Execute `execExpr` on `node` and verify the keys it produced.

    Calls h2o_cmd.runExecOnly with escape_nan=0, logs the response and the
    elapsed time through h2o.verboseprint, then inspects (offset=-1) first
    the default "Result.hex" key and then `resultKey`, passing each inspect
    response to checkScalarResult.

    Returns (inspect response for `resultKey`, checkScalarResult's return
    value for that response).
    """
    t0 = time.time()
    # FIX! Exec has an 'escape_nan' arg now; should we test it?
    execJson = h2o_cmd.runExecOnly(
        node,
        expression=execExpr,
        timeoutSecs=timeoutSecs,
        escape_nan=0,
    )
    h2o.verboseprint(execJson)
    h2o.verboseprint('exec took', time.time() - t0, 'seconds')

    h2o.verboseprint("\nfirst look at the default Result key")
    # offset=-1 is apparently what returns the metadata -- TODO confirm
    defaultKeyInspect = h2o_cmd.runInspect(None, "Result.hex", offset=-1)
    checkScalarResult(defaultKeyInspect, "Result.hex")

    h2o.verboseprint("\nNow look at the assigned " + resultKey + " key")
    resultKeyInspect = h2o_cmd.runInspect(None, resultKey, offset=-1)
    scalarValue = checkScalarResult(resultKeyInspect, resultKey)

    return resultKeyInspect, scalarValue
Beispiel #3
0
    def test_loop_random_exec_covtype(self):
        lenNodes = len(h2o.nodes)
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        key2 = 'c.hex'
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', key2,
                                     10)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        # for trial in range(53):
        trial = 0
        while (trial < 100):
            for exprTemplate in exprList:
                trial = trial + 1
                n = trial
                colX = random.randint(1, 54)
                row = random.randint(1, 400000)

                execExpr = exprTemplate
                execExpr = re.sub('<col1>', str(colX), execExpr)
                execExpr = re.sub('<col2>', str(colX + 1), execExpr)
                execExpr = re.sub('<n>', str(n), execExpr)
                execExpr = re.sub('<row>', str(row), execExpr)
                execExpr = re.sub('<keyX>', str(key2), execExpr)

                # pick a random node to execute it on
                randNode = random.randint(0, lenNodes - 1)
                print "\nexecExpr:", execExpr, "on node", randNode

                start = time.time()
                resultExec = h2o_cmd.runExecOnly(node=h2o.nodes[randNode],
                                                 expression=execExpr,
                                                 timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(resultExec))
                # print(h2o.dump_json(resultExec))

                # FIX! race conditions. If json is done, does that mean you can inspect it??
                # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
                if trial > 1:
                    inspectMe = random.choice(inspectList)
                    resultInspect = h2o.nodes[0].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[1].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[2].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
                # might be a bug?

                # WARNING! we can't browse the Exec url history, since that will
                # cause the Exec to execute again thru the browser..i.e. it has side effects
                # just look at the last inspect, which should be the resultInspect!
                # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                h2b.browseJsonHistoryAsUrlLastMatch("Exec")
                # url = "http://192.168.0.37:54321/Exec?Expr=Result3+%3D+c.hex%5B26%5D+%2B+Result1&Key=Result"
                # webbrowser.open_new_tab(url)

                # FIX! I suppose we have the problem of stdout/stderr not having flushed?
                # should hook in some way of flushing the remote node stdout/stderr
                h2o.check_sandbox_for_errors()
                print "exec end on ", "covtype.data", 'took', time.time(
                ) - start, 'seconds'
                print "Trial #", trial, "completed\n"
Beispiel #4
0
    def test_rf_covtype20x(self):
        importFolderPath = '/home/0xdiag/datasets/standard'

        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        csvFilenameTrain = 'covtype20x.data'
        key2 = 'covtype20x.data.A.hex'
        parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath, key2=key2, timeoutSecs=500)
        print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time']
        inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key'])
        dataKeyTrain = parseKeyTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        csvFilenameTest = 'covtype20x.data'
        key2 = 'covtype20x.data.B.hex'
        parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath, key2=key2, timeoutSecs=500)
        print csvFilenameTest, 'parse time:', parseKeyTest['response']['time']
        print "Parse result['destination_key']:", parseKeyTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key'])
        dataKeyTest = parseKeyTest['destination_key']
        dataKeyTest2 = 'covtype20x.data.C.hex'

        print "Parse end", dataKeyTest
        
        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        resultExec = h2o_cmd.runExecOnly(expression=execExpr, timeoutSecs=15)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
        params = {
            'ntree': 6, 
            'parallel': 1, 
            'out_of_bag_error_estimate': 0, 
            'no_confusion_matrix': 1,
            'model_key': 'RF_model'
        }

        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)

        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=500, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
    def test_loop_random_exec_covtype(self):
        lenNodes = len(h2o.nodes)
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        key2 = 'c.hex'
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', key2, 10)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        # for trial in range(53):
        trial = 0
        while (trial < 100):
            for exprTemplate in exprList:
                trial = trial + 1
                n = trial
                colX = random.randint(1,54)
                row = random.randint(1,400000)

                execExpr = exprTemplate
                execExpr = re.sub('<col1>',str(colX),execExpr)
                execExpr = re.sub('<col2>',str(colX+1),execExpr)
                execExpr = re.sub('<n>',str(n),execExpr)
                execExpr = re.sub('<row>',str(row),execExpr)
                execExpr = re.sub('<keyX>',str(key2),execExpr)

                # pick a random node to execute it on
                randNode = random.randint(0,lenNodes-1)
                print "\nexecExpr:", execExpr, "on node", randNode

                start = time.time()
                resultExec = h2o_cmd.runExecOnly(node=h2o.nodes[randNode], 
                    expression=execExpr, timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(resultExec))
                # print(h2o.dump_json(resultExec))

                # FIX! race conditions. If json is done, does that mean you can inspect it??
                # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
                if trial > 1:
                    inspectMe = random.choice(inspectList)
                    resultInspect = h2o.nodes[0].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[1].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[2].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
                # might be a bug?

                # WARNING! we can't browse the Exec url history, since that will 
                # cause the Exec to execute again thru the browser..i.e. it has side effects
                # just look at the last inspect, which should be the resultInspect!
                # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                h2b.browseJsonHistoryAsUrlLastMatch("Exec")
                # url = "http://192.168.0.37:54321/Exec?Expr=Result3+%3D+c.hex%5B26%5D+%2B+Result1&Key=Result"
                # webbrowser.open_new_tab(url)

                # FIX! I suppose we have the problem of stdout/stderr not having flushed?
                # should hook in some way of flushing the remote node stdout/stderr
                h2o.check_sandbox_for_errors()
                print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
                print "Trial #", trial, "completed\n"