Example #1
    def test_delete_all_keys(self):
        # FIX! should have some model keys in here too, from RF etc.
        importFolderPath = 'standard'
        timeoutSecs = 500

        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        for trial in range(3):
            for csvFilename in csvFilenameList:
                csvPathname = importFolderPath + "/" + csvFilename
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
                elapsed = time.time() - start
                print csvFilename, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
                print csvFilename, 'H2O reports parse time:', parseResult['response']['time']

                print "Parse result['destination_key']:", parseResult['destination_key']
                print "\n" + csvFilename

                print "Delete all keys"
                h2o.nodes[0].remove_all_keys()
                print "This shouldn't see any keys"
                h2i.delete_keys_at_all_nodes()

            print "\nTrial", trial, "completed\n"
Example #2
    def test_delete_all_keys(self):
        # FIX! should have some model keys in here too, from RF etc.
        importFolderPath = 'standard'
        timeoutSecs = 500

        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        for trial in range(2):
            for csvFilename in csvFilenameList:
                csvPathname = importFolderPath + "/" + csvFilename

                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                               path=csvPathname,
                                               timeoutSecs=500)
                pA = h2o_cmd.ParseObj(parseResult)
                iA = h2o_cmd.InspectObj(pA.parse_key)
                parse_key = pA.parse_key
                numRows = iA.numRows
                numCols = iA.numCols
                labelList = iA.labelList

                h2i.delete_keys_at_all_nodes()
                print "Delete all keys. Shouldn't be any more?"
                h2o.nodes[0].remove_all_keys()

            print "\nTrial", trial, "completed\n"
Example #3
    def test_delete_all_keys(self):
        h2o.beta_features = True
        # FIX! should have some model keys in here too, from RF etc.
        importFolderPath = 'standard'
        timeoutSecs = 500

        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        for trial in range(3):
            for csvFilename in csvFilenameList:
                csvPathname = importFolderPath + "/" + csvFilename
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                               path=csvPathname,
                                               timeoutSecs=500)
                elapsed = time.time() - start
                print csvFilename, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % (
                    (elapsed * 100) / timeoutSecs), "\n"

                print "Parse result['destination_key']:", parseResult[
                    'destination_key']
                print "\n" + csvFilename

                print "Delete all keys"
                h2o.nodes[0].remove_all_keys()
                print "This shouldn't see any keys"
                h2i.delete_keys_at_all_nodes()

            print "\nTrial", trial, "completed\n"
Example #4
    def test_from_import_fvec(self):
        csvFilenameAll = [
            ("covtype.data", 500),
            # ("covtype20x.data", 1000),
            ]

        for (csvFilename, timeoutSecs) in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
            # h2o_cmd.infoFromSummary(summaryResult)

            trees = 2
            start = time.time()
            rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
                trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, 
                    trees, classification_error, classErrorPctList, totalScores)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #5
    def test_delete_all_keys(self):
        # FIX! should have some model keys in here too, from RF etc.
        importFolderPath = 'standard'
        timeoutSecs = 500

        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        for trial in range(2):
            for csvFilename in csvFilenameList:
                csvPathname = importFolderPath + "/" + csvFilename

                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
                pA = h2o_cmd.ParseObj(parseResult)
                iA = h2o_cmd.InspectObj(pA.parse_key)
                parse_key = pA.parse_key
                numRows = iA.numRows
                numCols = iA.numCols
                labelList = iA.labelList

                h2i.delete_keys_at_all_nodes()
                print "Delete all keys. Shouldn't be any more?"
                h2o.nodes[0].remove_all_keys()

            print "\nTrial", trial, "completed\n"
Example #6
    def test_from_import(self):
        timeoutSecs = 500
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            ]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for csvFilename in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='put',
                hex_key=hex_key, timeoutSecs=500)
            if not h2o.beta_features:
                print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            if not h2o.beta_features:
                RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)

            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #7
    def test_parse_200k_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (10, 100000, 'cA', 200, 200),
            (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time(
            ) - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

            # if not h2o.browse_disable:
            #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #    time.sleep(5)
            h2i.delete_keys_at_all_nodes()
Example #8
    def test_parse_100k_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 1000, 'cA', 200, 200),
            (10, 2000, 'cA', 200, 200),
            (10, 4000, 'cA', 200, 200),
            (10, 8000, 'cA', 200, 200),
            (10, 9000, 'cA', 200, 200),
            (10, 10000, 'cA', 200, 200),
            (10, 100000, 'cA', 200, 200),
            # (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            # does it blow up if it sets columnNames?
            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, doSummary=False, columnNames=None, intermediateResults=DO_INTERMEDIATE_RESULTS)
            print "Parse:", csvFilename, "took", time.time() - start, "seconds"

        
            print "Skipping the row/cols check for now"
            if 1==1:
                start = time.time()
                inspect = h2o_cmd.runInspect(None, hex_key, timeoutSecs=timeoutSecs2)
                print "Inspect:", hex_key, "took", time.time() - start, "seconds"
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)

                # should match # of cols in header or ??
                self.assertEqual(numCols, colCount,
                    "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
                self.assertEqual(numRows, rowCount,
                    "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                    (numRows, rowCount))

            print "Skipping the delete keys for now"
            if 1==0:
                # if not h2o.browse_disable:
                #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                #    time.sleep(5)
                h2i.delete_keys_at_all_nodes()
Example #9
    def test_parse_100k_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 1000, 'cA', 200, 200),
            (10, 2000, 'cA', 200, 200),
            (10, 4000, 'cA', 200, 200),
            (10, 8000, 'cA', 200, 200),
            (10, 9000, 'cA', 200, 200),
            (10, 10000, 'cA', 200, 200),
            (10, 100000, 'cA', 200, 200),
            # (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(
                path=csvPathname,
                schema='local',
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                doSummary=False,
                columnNames=None,
                intermediateResults=DO_INTERMEDIATE_RESULTS)

            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            print "Skipping the delete keys for now"
            if 1 == 0:
                # if not h2o.browse_disable:
                #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                #    time.sleep(5)
                h2i.delete_keys_at_all_nodes()
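
The synthetic-parse tests above all call a write_syn_dataset() helper that each test file defines for itself; it isn't reproduced in these snippets. A minimal sketch of what such a generator could look like, assuming plain comma-separated random integers and no header row (the assertions above expect numRows to exclude any header):

    def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
        # sketch only: rowCount lines of colCount random 0/1 values, no header,
        # seeded per file so a failing dataset can be regenerated
        r = random.Random(SEED)
        dsf = open(csvPathname, "w+")
        for i in range(rowCount):
            rowData = [str(r.randint(0, 1)) for j in range(colCount)]
            dsf.write(",".join(rowData) + "\n")
        dsf.close()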
Example #10
    def tearDownClass(cls):
        # DON"T
        ### h2o.tear_down_cloud()

        # Instead: All tests should delete their keys..i.e. leave things clean for the next test
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #11
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b

        h2b.browseTheCloud()
        sleep(3600)

    if not nodeList:
        nodeList = h2o_nodes.nodes

    # this could fail too. Should this be set by -uc/--usecloud, or a command line argument?
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now

    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud

    # don't tear down with -ccj either
    # FIX! what about usecloud or cloud_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes.
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?

            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah, subtle: we might get exceptions issuing the shutdown; don't abort out
        # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
        # so we might? nodes are shutting down?
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
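
A note on calling the function above: tear_down_cloud() only sends Shutdown and process-kills when the cloud wasn't supplied externally (-uc/--usecloud or a cloned h2o-nodes.json), and force=True overrides that guard. A sketch of a typical call from a suite's final cleanup, not taken verbatim from these examples:

    # delete keys first, so teardown timing isn't dominated by key removal
    h2i.delete_keys_at_all_nodes(timeoutSecs=300)
    # nodeList=None falls back to h2o_nodes.nodes inside tear_down_cloud()
    tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False)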
Example #12
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA, hex_key=hex_key,
                delete_on_done=DELETE_ON_DONE, 
                # importParentDir=IMPORT_PARENT_DIR,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # convert to binomial
            if DO_EXEC:
                execExpr="A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

                # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

            if DO_DELETE_MYSELF:
                h2o_import.delete_keys_at_all_nodes()

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
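
The DO_EXEC branch above copies the parsed frame but leaves the actual binomial conversion commented out. Under the same h2e.exec_expr() API, the full conversion might look like this sketch (the column index 378 and threshold 15 come from the commented line, not verified):

    if DO_EXEC:
        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
        # threshold the response column so it becomes 0/1 (binomial)
        execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)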
Example #13
    def tearDownClass(cls):
        # DON"T
        ### h2o.tear_down_cloud()

        # Instead: All tests should delete their keys..i.e. leave things clean for the next test
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #14
    def test_c9b_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            # passes 5, fails 15
            # for depth in [5,15,25,40]:
            for depth in [5,5,5,5,5]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                start = time.time()
                print "Start time is: ", time.time()
                #noPoll -> False when GBM finished
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)

                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #15
    def test_c8_rf_airlines_hdfs(self):
        trainParseResult = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)
        
        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #16
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b
        h2b.browseTheCloud()
        sleep(3600)

    if not nodeList: nodeList = h2o_nodes.nodes

    # this could fail too. Should this be set by -uc/--usecloud, or a command line argument?
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now

    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud

    # don't tear down with -ccj either
    # FIX! what about usecloud or cloud_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes. 
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?

            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah, subtle: we might get exceptions issuing the shutdown; don't abort out
        # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
        # so we might? nodes are shutting down?
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
Example #17
    def tearDownClass(cls):
        print "tearDownClass"
        # DON"T
        ### h2o.tear_down_cloud()

        # this could fail too
        if h2o.nodes[0].delete_keys_at_teardown:
            start = time.time()
            h2i.delete_keys_at_all_nodes(timeoutSecs=300)
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #18
    def tearDownClass(cls):
        print "tearDownClass"
        # DON"T
        ### h2o.tear_down_cloud()

        # this could fail too
        if h2o.nodes[0].delete_keys_at_teardown:
            start = time.time()
            h2i.delete_keys_at_all_nodes(timeoutSecs=300)
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #19
    def test_parse_long_colnames(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 1000, 'cA', 200, 200),
            (10, 1100, 'cA', 200, 200),
            (10, 1200, 'cA', 200, 200),
            (10, 1300, 'cA', 200, 200),
            (10, 1400, 'cA', 200, 200),
            (10, 1500, 'cA', 200, 200),
            (10, 1600, 'cA', 200, 200),
            (10, 1700, 'cA', 200, 200),
            (10, 1800, 'cA', 200, 200),
            (10, 1900, 'cA', 200, 200),
            (10, 2000, 'cA', 200, 200),
            (10, 4000, 'cA', 200, 200),
            (10, 8000, 'cA', 200, 200),
            (10, 9000, 'cA', 200, 200),
            (10, 10000, 'cA', 200, 200),
            # (10, 100000, 'cA', 200, 200),
            # (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, doSummary=False, column_names=None, intermediateResults=DO_INTERMEDIATE_RESULTS)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            print "Skipping the delete keys for now"
            if 1==0:
                # if not h2o.browse_disable:
                #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                #    time.sleep(5)
                h2i.delete_keys_at_all_nodes()
Example #20
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    "destination_key": "GBMKEY",
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": 10,
                    "max_depth": depth,
                    "min_rows": 10,
                    "response": response,
                    "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll -> False when GBM finished
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = (statMean["num_cpus"],)
                my_cpu_pct = (statMean["my_cpu_%"],)
                sys_cpu_pct = (statMean["sys_cpu_%"],)
                system_load = statMean["system_load"]
                # shouldn't need this?
                h2j.pollWaitJobs(
                    pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5
                )
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                # print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #21
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()

        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=900,
                                       doSummary=False)
        parse_time = time.time() - start
        print "parse took {0} sec".format(parse_time)
        start = time.time()
        # noise=['JStack','cpu','disk'])
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = {
            'ntrees': trees,
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name':
            'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
        }
        kwargs = paramsTrainRF.copy()
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename,
            elapsed, trees, classification_error, classErrorPctList,
            totalScores)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
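
Note the two cleanup modes in play: the RF tests above pass pattern=hex_key so only the keys they created go away, while the teardown hooks delete everything. Side by side, for contrast:

    # delete just what this test created (hex_key was 'year2013.hex' above)
    h2i.delete_keys_at_all_nodes(pattern=hex_key)
    # or wipe the store completely, as the teardown hooks do
    h2i.delete_keys_at_all_nodes()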
Example #22
    def tearDownClass(cls):
        print "tearDownClass"
        # DON"T
        ### h2o.tear_down_cloud()

        # try to download the logs...may fail again!
        h2o.nodes[0].log_download()

        # this could fail too
        if h2o.nodes[0].delete_keys_at_teardown:
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #23
    def test_GBM_mnist_restart_many(self):
        importFolderPath = "mnist"
        csvFilename = "train.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        for trial in range(10):
            # PARSE train****************************************
            trainKey = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=importFolderPath + "/" +
                                           csvFilename,
                                           schema='put',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 10,
                'max_depth': 8,
                'min_rows': 1,
                'response': 784,  # this dataset has the response in the last col (0-9 to check)
                # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
            }

            kwargs = params.copy()
            h2o.beta_features = True
            timeoutSecs = 1800
            #noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                       noPoll=True,
                                       **kwargs)
            h2o.beta_features = False
            # if it fails, should happen within 8 secs
            time.sleep(8)
            h2j.cancelAllJobs()
            h2o.check_sandbox_for_errors()
            print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial

            if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
                h2i.delete_keys_at_all_nodes()
Example #24
    def test_parse_200k_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (10, 100000, 'cA', 200, 200),
            (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
            ]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

            # if not h2o.browse_disable:
            #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #    time.sleep(5)
            h2i.delete_keys_at_all_nodes()
Example #25
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()

        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start      = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', 
            path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False)
        parse_time = time.time() - start 
        print "parse took {0} sec".format(parse_time)
        start = time.time()
        # noise=['JStack','cpu','disk'])
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start 
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = { 
            'ntrees': trees, 
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
            }
        kwargs   = paramsTrainRF.copy()
        start      = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
                trees, classification_error, classErrorPctList, totalScores)
        print "\n"+l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #26
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I use 'start' here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4 * 3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600,
                         "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(
            h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!")
        h2p.blue_print(
            "Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs"
            % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime < maxTime):  # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #27
    def test_GBM_mnist_restart_many(self):
        importFolderPath = "mnist"
        csvFilename = "train.csv.gz"
        timeoutSecs=1800
        trialStart = time.time()

        for trial in range(10):
            # PARSE train****************************************
            trainKey = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename, schema='put',
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            params = { 
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 10,
                'max_depth': 8,
                'min_rows': 1,
                'response': 784, # this dataset has the response in the last col (0-9 to check)
                # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
                }

            kwargs = params.copy()
            h2o.beta_features = True
            timeoutSecs = 1800
            #noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            h2o.beta_features = False
            # if it fails, should happen within 8 secs
            time.sleep(8)
            h2j.cancelAllJobs()
            h2o.check_sandbox_for_errors()
            print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial

            if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
                h2i.delete_keys_at_all_nodes()
Example #28
File: cloud.py Project: 100star/h2o
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I use 'start' here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 
        h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #29
    def test_from_import(self):
        h2o.beta_features = True

        timeoutSecs = 500
        csvFilenameAll = ["covtype.data", "covtype20x.data"]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for csvFilename in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir
            hex_key = csvFilename + ".hex"
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path="standard/" + csvFilename,
                schema="local",
                hex_key=hex_key,
                timeoutSecs=500,
                doSummary=False,
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult["destination_key"])

            summaryResult = h2o_cmd.runSummary(key=parseResult["destination_key"])
            h2o_cmd.infoFromSummary(summaryResult)

            if not h2o.beta_features:
                RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)

            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #30
    def test_from_import(self):
        timeoutSecs = 500
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
        ]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for csvFilename in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path="standard/" + csvFilename,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=500)
            if not h2o.beta_features:
                print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            if not h2o.beta_features:
                RFview = h2o_cmd.runRF(trees=1,
                                       depth=25,
                                       parseResult=parseResult,
                                       timeoutSecs=timeoutSecs)

            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #31
    def test_c6_maprfs_fvec(self):
        h2o.beta_features = True
        print "\nLoad a list of files from maprfs, parse and do 1 RF tree"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # duplicate column header "A"
            # "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # h2o.nodes[0].use_maprfs = True
        # h2o.nodes[0].use_hdfs = False
        # h2o.nodes[0].hdfs_version = 'mapr3.0.1',
        # h2o.nodes[0].hdfs_name_node = '192.168.1.171:7222'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            (importResult, importPattern) = h2i.import_only(path=csvPathname, schema="maprfs", timeoutSecs=timeoutSecs)
            print "importResult:", h2o.dump_json(importResult)
            succeeded = importResult['files']
            fails = importResult['fails']

            if len(succeeded) < 1:
                raise Exception("Should have imported at least 1 key for %s" % csvPathname)

            # just do a search
            foundIt = None
            for f in succeeded:
                if csvPathname in f:
                    foundIt = f
                    break

            if not foundIt:
                raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname))

            totalBytes = 0

            print "Loading", csvFilename, 'from maprfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="maprfs", timeoutSecs=timeoutSecs, pollTimeoutSecs=360,
                doSummary=True, benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features)
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {
                    'ntrees': 1
                    }
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                    benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features, **kwargs)
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial "+str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            print "Deleting all keys, to make sure our parse times don't include spills"
            h2i.delete_keys_at_all_nodes()
            trial += 1
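
As an aside, the foundIt loop above (scan succeeded for the first imported key containing csvPathname) has a compact equivalent:

    # same search as the foundIt loop, as a generator expression
    foundIt = next((f for f in succeeded if csvPathname in f), None)
    if not foundIt:
        raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname))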
Example #32
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 2, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 100, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 1000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?
        
            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
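            # worked example: values [-1,0,1] span 2.0, so with 20 bins the
            # bin size is 0.1, half a bin is 0.05, and the 5% fp slop bumps
            # that to 0.0525 (before being zeroed below)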


            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
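            # interpolation_type=7 presumably matches R's quantile() type 7
            # (linear interpolation between the two closest order statistics)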
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")


            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the first or last (edge) bins
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                self.assertAlmostEqual(b, e, delta=.01*numRows,
                    msg="Bins not right. b: %s e: %s" % (b, e))
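            # worked example (hypothetical): numRows=10000 with 20 bins gives
            # e=500 expected rows per bin, checked to within .01*10000 = 100 rows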

            pt = twoDecimals(pctile)
            mx = twoDecimals(maxs)
            mn = twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if DO_TRY_SCIPY and colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                print scipyCol, pctile[10]
                generate_scipy_comparison(csvPathnameFull, col=scipyCol,
                     # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single)
                    h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult)



            h2i.delete_keys_at_all_nodes()
    def test_c5_KMeans_sphere_26GB_fvec(self):
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = "syn_sphere15_gen_26GB.csv"
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                (
                    [-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]
        else:
            expected = [
                (
                    [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]

        benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"]
        benchmarkLogging = ["cpu", "disk", "network", "iostats"]
        # IOStatus can hang?
        benchmarkLogging = ["cpu", "disk", "network"]
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="hdfs",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="local",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed
            )
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], timeoutSecs=300)
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summary = h2o_cmd.runSummary(
                key=parseResult["destination_key"], numRows=numRows, numCols=numCols, timeoutSecs=300
            )
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                "k": 15,
                "max_iter": 500,
                # 'normalize': 1,
                "normalize": 0,  # temp try
                "initialization": "Furthest",
                "destination_key": "junk.hex",
                # we get NaNs if whole col is NA
                "ignored_cols": "C1",
                # reuse the same seed, to get deterministic results
                "seed": 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs["initialization"] = "PlusPlus"
            elif (trial % 3) == 1:
                kwargs["initialization"] = "Furthest"
            else:
                kwargs["initialization"] = None
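            # so the init method just cycles with the trial number:
            # trial 0,3 -> PlusPlus; trial 1,4 -> Furthest; trial 2,5 -> None (h2o default)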

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format(
                len(h2o.nodes),
                h2o.nodes[0].java_heap_GB,
                "KMeans",
                "trial " + str(trial),
                csvFilename,
                elapsed,
                paramsString,
            )
            print l
            h2o.cloudPerfH2O.message(l)

            # this does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, "d", **kwargs
            )
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(
                self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial
            )

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult["model"]
            size = model["size"]
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (max_iter - 1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s"
                    % (trial, iterations, max_iter)
                )

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2)
                )

            # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]
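            # hypothetical example: if SCALE_SIZE is 7, this implements the
            # "divided by 7" adjustment mentioned in the comments above for
            # using the 180GB row counts with the 26GB dataset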

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s predicted cluster sizes: %s not the same as expected: %s" % (trial, size2, expectedSize)
                )

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500):  # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1, 64)  # random length headers
                headerName = ''.join(
                    [random.choice(allowedLetters) for _ in range(l)])
                # we keep trying if we already have that header name. Has to be unique.
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
        ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1

            DATA_HAS_HDR_ROW = random.randint(0, 1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0

            GZIP_DATA = random.randint(0, 1)
            GZIP_HEADER = random.randint(0, 1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # don't put a header in a data file with a different separator?
            # Hack: if both data and header files have a header row, the two files
            # should use the same separator; if they don't, make the header match the data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','

            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k, v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab. will autodetect space and comma
            if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)
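                # e.g. a tab separator gets passed as ord('\t') == 9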

            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0, 1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0, 1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "

            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData = SEP_CHAR_GEN.join(hfdList)

            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(
                    trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(
                    csvPathname,
                    rowCount,
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None),
                    rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT,
                    sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(
                        csvPathname, SYNDATASETS_DIR + "/not_used_data_" +
                        csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname

            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(
                trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(
                hdrPathname,
                dataRowsWithHeader,
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None),
                rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT,
                sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER:
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(
                    hdrPathname,
                    SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON'T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files).
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False instead?

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file']
                                       == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file is set, we're required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] = 1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] = 1
            else:
                kwargs['header'] = 0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
            else:
                pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern,
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header,
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of mismatch in gen/param?
            h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1
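            # net effect: if we generated no header row but forced header=1,
            # h2o consumes the first data row as a header (lose one);
            # the gain-one case is disabled above by forcing h2oGainsOneData = False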

            if 1 == 0:  # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {
                'sample': 100,
                'depth': 25,
                'ntree': 2,
                'ignore': ignoreForRf
            }
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
Example #35
    def test_GBM_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            # hex_key = csvFilename + "_" + str(trial) + ".hex"
            hex_key = "C" + str(trial)
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # GBM ****************************************
            if not DO_GBM:
                continue

            # make col 2 a binomial (negative numbers in src)
            col = 2
            execExpr = "%s[,%s] = (%s[,%s]>-7 ? 1 : 0)" % (hex_key, col, hex_key, col)
            resultExec = h2o_cmd.runExec(str=execExpr)
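            # e.g. a row with col 2 == -3 maps to 1 and col 2 == -9 maps to 0,
            # so col 2 becomes the 0/1 response used by the GBM below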

            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': col # should be binomial from above
                }

            kwargs = params.copy()
            h2o.beta_features = True
            timeoutSecs = 1800

            start = time.time()
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
            # wait for it to show up in jobs?
            time.sleep(2)
            # no pattern waits for all
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (GBMResult['python_%timeout'])

            print "\nGBMResult:", GBMResult
            # print "\nGBMResult:", h2o.dump_json(GBMResult)

            h2o.beta_features = False
            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_at_all_nodes()
Example #36
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                # double import still causing problems?
                # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                # importFullList = importResult['files']
                # importFailList = importResult['fails']
                # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # remove the output too! (378)
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))
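                    # e.g. the 0-based indices [3,4,5] become the 1-based h2o
                    # column names "C4,C5,C6"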

                    GLMkwargs = {
                        'ignored_cols': ignore_x,
                        'family': 'binomial',
                        'response': 'C379',
                        'max_iter': 4,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    # convert to binomial
                    execExpr="A.hex=%s" % parseResult['destination_key']
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
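                    # e.g. rows where C379 > 15 become 1 and the rest 0, giving
                    # the binomial response the GLM expects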
                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Example #37
    def test_c9_GLM_airlines_fvec(self):
        h2o.beta_features = True

        files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800,
                  'IsDepDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 30,
                'n_folds': 3,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
                'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if h2o.beta_features:
                modelKey = glm['glm_model']['_key']

                submodels = glm['glm_model']['submodels']
                # hackery to make it work when there's just one
                validation = submodels[-1]['validation']
                best_threshold = validation['best_threshold']
                thresholds = validation['thresholds']
                # have to look up the index for the cm, from the thresholds list
                best_index = None
                for i, t in enumerate(thresholds):
                    if t == best_threshold:
                        best_index = i
                        break
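                # equivalent lookup (assuming best_threshold is in the list):
                # best_index = thresholds.index(best_threshold)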
                cms = validation['_cms']
                cm = cms[best_index]
                pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
                # FIX! should look at prediction error/class error?
                # self.assertLess(pctWrong, 9, "Should see less than 9% error")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm['_arr'])

                # Score *******************************
                # this messes up if you use case_mode/case_value above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key=trainKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=trainKey,
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']
                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 40,"Should see less than 40% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # rowCount, colCount, hex_key, choices, expected
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(colCount):
                # walks across the columns triggering a summary on the col desired
                # runSummary returns a column object now. inspect and parse don't. They return json.
                # maybe eventually will make them return object? But I also pass expected stuff to them
                # should I pass expected to summary? no, more complex?
                co = h2o_cmd.runSummary(key=hex_key, column=i)
                print co.label, co.type, co.missing_count, co.domain, sum(
                    co.histogram_bins)

                print "\nComparing column %s to expected" % i
                self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \
                    (i, expectedNaCnt[i], co.missing_count))
                self.assertEqual(rowCount - expectedNaCnt[i],
                                 sum(co.histogram_bins))

            h2p.green_print("\nDone with trial", trial)
            trial += 1

            h2i.delete_keys_at_all_nodes()
Example #39
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # rowCount, colCount, hex_key, choices, expected
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # check each column's summary against the expected values

            for i in range(colCount):
                column = summaryResult['summaries'][i]

                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                self.assertEqual(
                    nacnt, expectedNaCnt[i],
                    "Column %s Expected %s. nacnt %s incorrect" %
                    (i, expectedNaCnt[i], nacnt))

                stats = column['stats']
                stattype = stats['type']
                self.assertEqual(stattype, 'Enum')

                # FIX! we should compare mean and sd to expected?
                cardinality = stats['cardinality']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                self.assertEqual(hbrk, [expected[0], expected[1]])

                hcnt = column['hcnt']

                hcntTotal = hcnt[0] + hcnt[1]
                self.assertEqual(hcntTotal, rowCount - expectedNaCnt[i])
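                # the two enum levels plus the injected NAs should account for every row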

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="numRows %s should be %s" %
                                 (numRows, rowCount))

            trial += 1

            h2i.delete_keys_at_all_nodes()
Example #40
    def test_c6_hdfs(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # has duplicated col name
            # "hhp2.os.noisy.0_1.data",
            # "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]
        csvFilenameAll = [
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # update: look in the runner*sh ..the find_cloud.py has args now to get it right
        # we have two cdh's though. I guess we're going to use whatever got setup
        # h2o.nodes[0].use_maprfs = False
        # h2o.nodes[0].use_hdfs = True
        # h2o.nodes[0].hdfs_version = 'hdp2.1'
        # h2o.nodes[0].hdfs_name_node = '172.16.2.186'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            print "Loading", csvFilename, 'from hdfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", timeoutSecs=timeoutSecs,
                doSummary=True, benchmarkLogging=benchmarkLogging)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {
                    'ntree': 1
                    }
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                    benchmarkLogging=benchmarkLogging, **kwargs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial "+str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            if 1==0:
                print "Deleting all keys, to make sure our parse times don't include spills"
                h2i.delete_keys_at_all_nodes()

            trial += 1
Example #41
    def test_anomaly_uniform_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # rowCount, colCount, hex_key, expectedMin, expectedMax
            (ROWS, COLS, 'x.hex', 1, 20000),
            (ROWS, COLS, 'x.hex', -5000, 0),
            (ROWS, COLS, 'x.hex', -100000, 100000),
            (ROWS, COLS, 'x.hex', -1, 1),
            (ROWS, COLS, 'A.hex', 1, 100),
            (ROWS, COLS, 'A.hex', -99, 99),
            (ROWS, COLS, 'B.hex', 1, 10000),
            (ROWS, COLS, 'B.hex', -100, 100),
            (ROWS, COLS, 'C.hex', 1, 100000),
            (ROWS, COLS, 'C.hex', -101, 101),
        ]

        trial = 1
        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "numRows:", numRows, "numCols:", numCols

            model_key = "m.hex"
            kwargs = {
                'ignored_cols': None,
                'response': numCols - 1,
                'classification': 0,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117',
                'adaptive_rate': 0,
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                # 'loss'                         : 'CrossEntropy',
                'max_w2': 15,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 2.0,
                'destination_key': model_key,
                # 'validation'                   : None,
                'score_interval': 10000,
                'autoencoder': 1,
            }

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "neural net end. took", time.time() - start, "seconds"

            kwargs = {
                'destination_key': "a.hex",
                'source': parseResult['destination_key'],
                'dl_autoencoder_model': model_key,
                'thresh': 1.0
            }

            anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs)
            inspect = h2o_cmd.runInspect(None, "a.hex")
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "anomaly: numRows:", numRows, "numCols:", numCols
            self.assertEqual(numCols, 1)
            # NA injection adds rowCount * NA_ROW_RATIO extra rows
            self.assertEqual(numRows, rowCount * (1 + NA_ROW_RATIO))

            # first col has the anomaly info. other cols are the same as orig data
            aSummary = h2o_cmd.runSummary(key='a.hex', cols=0)
            h2o_cmd.infoFromSummary(aSummary)

            print "anomaly:", h2o.dump_json(anomaly)
            trial += 1
            h2i.delete_keys_at_all_nodes()
Example #42
    def test_c9_GLM_rc_fvec(self):
        h2o.beta_features = True

        files = [
                 ('c16', '140k_train_anonymised.csv', 'rc.hex', 1800,  None)
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            # avoid printing the coefficient names in jenkins output
            # the last col is the response, so we use a number to point to it below
            parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path=csvPathname, schema='local', hex_key=trainKey, 
                header=0, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            response = numCols-1

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 0,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if h2o.beta_features:
                modelKey = glm['glm_model']['_key']

                submodels = glm['glm_model']['submodels']
                # hackery to make it work when there's just one
                validation = submodels[-1]['validation']
                best_threshold = validation['best_threshold']
                thresholds = validation['thresholds']
                # have to look up the index for the cm, from the thresholds list
                best_index = None
                for i,t in enumerate(thresholds):
                    if t == best_threshold:
                        best_index = i
                        break
                cms = validation['_cms']
                cm = cms[best_index]
                pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
                # FIX! should look at prediction error/class error?
                # self.assertLess(pctWrong, 9, "Should see less than 9% error")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm['_arr'])

                # Score *******************************
                # this messes up if you use case_mode/case_value above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key=trainKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=trainKey,
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']
                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 40,"Should see less than 40% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)


        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #43
    def test_c5_KMeans_sphere_67MB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
        totalBytes = 67306997
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # clear out all NAs (walk across cols)..clear to 0
            # temp
            ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
            ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'max_iter': 10,
                'normalize': 1,
                'initialization': 'Furthest',
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
                }

            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
            h2i.delete_keys_at_all_nodes()
Example #44
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500): # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1,64) # random length headers
                headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
                # we keep trying if we already have that header name. Has to be unique.
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
            ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1
            
            DATA_HAS_HDR_ROW = random.randint(0,1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0
            
            GZIP_DATA = random.randint(0,1)
            GZIP_HEADER = random.randint(0,1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # Hack: if both the data and header files have a header, they should
            # use the same separator; if they don't, make the header match the data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','

            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k,v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab. will autodetect space and comma
            if SEP_CHAR_GEN==" "  or SEP_CHAR_GEN==",": 
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)
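                # (parse seems to want the separator as its ordinal byte value here)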
            
            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0,1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0,1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "


            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER 

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what 
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            
            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData   = SEP_CHAR_GEN.join(hfdList)

        
            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, 
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname


            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, 
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER: 
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created 
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] =  1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] =  1
            else:
                kwargs['header'] =  0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*'
            else:
                pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            print "parseResult['destination_key']: " + parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header, 
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch in gen/param?
            h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW
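            # (header=1 with no actual header row makes h2o eat the first data row as labels)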
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
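            # disabled for now; force the gain adjustment off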
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1
                
            if 1==0: # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
Example #45
    def test_KMeans_sphere15_180GB(self):
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = "syn_sphere15_2711545732row_6col_180GB_from_7x.csv"
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        # removed 0's from first col because we set "ignored_cols"
        expected = [
            (
                [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                248846122,
                1308149283316.2988,
            ),
            (
                [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                276924291,
                1800760152555.98,
            ),
            (
                [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                235089554,
                375419158808.3253,
            ),
            (
                [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                166180630,
                525423632323.6474,
            ),
            (
                [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                167234179,
                1845362026223.1094,
            ),
            (
                [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                195420925,
                197941282992.43475,
            ),
            (
                [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                214401768,
                11868360232.658035,
            ),
            (
                [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                258853406,
                598863991074.3276,
            ),
            (
                [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                190979054,
                1505088759456.314,
            ),
            (
                [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                87794427,
                1124697008162.3955,
            ),
            (
                [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                78226988,
                1151439441529.0215,
            ),
            (
                [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                167273589,
                693036940951.0249,
            ),
            (
                [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                148426180,
                35942838893.32379,
            ),
            (
                [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                157533313,
                88431531357.62982,
            ),
            (
                [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                118361306,
                1111537045743.7646,
            ),
        ]

        benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"]
        benchmarkLogging = ["cpu", "disk", "network", "iostats"]
        # IOStatus can hang?
        benchmarkLogging = ["cpu", "disk", "network"]
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="hdfs",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs
                )
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="local",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs
                )

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed
            )
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                "max_iter": 30,
                "k": 15,
                "initialization": "Furthest",
                "cols": None,
                "destination_key": "junk.hex",
                # reuse the same seed, to get deterministic results
                "seed": 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs["initialization"] = "PlusPlus"
            elif (trial % 3) == 1:
                kwargs["initialization"] = "Furthest"
            else:
                kwargs["initialization"] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )

            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format(
                len(h2o.nodes),
                h2o.nodes[0].java_heap_GB,
                "KMeans",
                "trial " + str(trial),
                csvFilename,
                elapsed,
                paramsString,
            )
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, "d", **kwargs
            )
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(
                self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial
            )
            h2i.delete_keys_at_all_nodes()
Example #46
    def test_c5_KMeans_sphere_26GB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = 'syn_sphere15_gen_26GB.csv'
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
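        # FIX! totalBytes looks copied from the 180GB file; the MB/sec numbers may be off for 26GB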
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                ([
                    -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                    31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                    -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    19.00092954923767, -10.999565572612255, 90.00028669073289,
                    1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                    30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                    16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                    -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                    11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                    -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    147.00394564757505, 122.98729664236723, 311.0047920137008,
                    2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]
        else:
            expected = [
                ([
                    0.0, -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    0.0, 5.0, 2.0, 340.0, 1817.995920197288,
                    33970406.992053084, 31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    0.0, 10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    0.0, 12.0, 3.0, 168.0, -4066.995950679284,
                    41077063.00269915, -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    0.0, 19.00092954923767, -10.999565572612255,
                    90.00028669073289, 1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    0.0, 20.0, 0.0, 141.0, -3263.0030236302937,
                    6163210.990273981, 30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    0.0, 21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    0.0, 39.0, 3.0, 470.0, -3337.9880599007597,
                    28768057.98852736, 16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    0.0, 40.0, 1.0, 145.0, 950.9990795199593,
                    14602680.991458317, -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    0.0, 42.0, 4.0, 479.0, -3678.0033024834297,
                    8209673.001421165, 11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    0.0, 48.0, 4.0, 71.0, -951.0035145455234,
                    49882273.00063991, -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    0.0, 147.00394564757505, 122.98729664236723,
                    311.0047920137008, 2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='hdfs',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='local',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'],
                                         timeoutSecs=300)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
                                         numRows=numRows,
                                         numCols=numCols,
                                         timeoutSecs=300)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'max_iter': 500,
                # 'normalize': 1,
                'normalize': 0,  # temp try
                'initialization': 'Furthest',
                'destination_key': 'junk.hex',
                # we get NaNs if whole col is NA
                'ignored_cols': 'C1',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             benchmarkLogging=benchmarkLogging,
                                             **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            # this does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self,
                                               tupleResultList,
                                               expected,
                                               trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=False,
                                                allowRowError=True,
                                                trial=trial)

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult['model']
            size = model['size']
            size2 = [t[1] for t in tupleResultList]
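            # second element of each tuple is the predicted size of that cluster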

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (max_iter - 1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s"
                    % (trial, iterations, max_iter))

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't compare size and size2 directly, because size2 is sorted by
            # center order and we don't know how to reorder size the same way.
            # sorting both at least gives some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2))

            # our expected result is sorted by cluster center order, but the sizes come from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]
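            # assumption: SCALE_SIZE is a module-level constant, the factor the
            # expected full-size cluster counts were scaled down by for this run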

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s predicted cluster sizes: %s not the same as expected: %s"
                    % (trial, size2, expectedSize))

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
Example #47
0
    def test_G_StoreView(self):
        h2i.delete_keys_at_all_nodes(timeoutSecs=30)
Example #48
0
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800,
                  'IsDepDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name':
                        'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: runGBM returns immediately; we poll for completion below
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                           noPoll=True,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=30,
                                                  retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern="GBMKEY",
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs,
                                 retryDelaySecs=5)
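                # the async pattern above, spelled out: runGBM(noPoll=True) returns
                # right away with a job handle; pollStatsWhileBusy() samples machine
                # stats until the cloud goes idle; pollWaitJobs() is a belt-and-braces
                # wait for any job matching "GBMKEY"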
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #49
0
    def test_GBM_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([
                0.0, -113.00566692375459, -89.99595447985321,
                -455.9970643424373, 4732.0, 49791778.0, 36800.0
            ], 248846122, 1308149283316.2988),
            ([
                0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                25654042.00592703, 28304.0
            ], 276924291, 1800760152555.98),
            ([
                0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                31319.99486705394
            ], 235089554, 375419158808.3253),
            ([
                0.0, 10.0, -72.00113070337981, -171.0198611715457,
                4430.00952228909, 37007399.0, 29894.0
            ], 166180630, 525423632323.6474),
            ([
                0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                22865824.99639042, 5335.0
            ], 167234179, 1845362026223.1094),
            ([
                0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                -47537.998050740985
            ], 195420925, 197941282992.43475),
            ([
                0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289,
                1928.0, 39967190.0, 27202.0
            ], 214401768, 11868360232.658035),
            ([
                0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                30712.99115201907
            ], 258853406, 598863991074.3276),
            ([
                0.0, 21.0, 114.01584574295777, 242.99690338815898,
                1674.0029079209912, 33089556.0, 36415.0
            ], 190979054, 1505088759456.314),
            ([
                0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                -48473733.04122273, 47343.0
            ], 87794427, 1124697008162.3955),
            ([
                0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                16716.003410920028
            ], 78226988, 1151439441529.0215),
            ([
                0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                -14930.007919032574
            ], 167273589, 693036940951.0249),
            ([
                0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                11767.998552236539
            ], 148426180, 35942838893.32379),
            ([
                0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                -23336.998167498707
            ], 157533313, 88431531357.62982),
            ([
                0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008,
                2320.0, 46602185.0, 11212.0
            ], 118361306, 1111537045743.7646),
        ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []
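        # note: only the last assignment above takes effect; the fuller
        # benchmarkLogging lists are kept as alternatives to switch back to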

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            # hex_key = csvFilename + "_" + str(trial) + ".hex"
            hex_key = "C" + str(trial)
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60,
                                           retryDelaySecs=2,
                                           benchmarkLogging=benchmarkLogging,
                                           **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # GBM ****************************************
            if not DO_GBM:
                continue

            # make col 2 a binomial (negative numbers in src)
            col = 2
            execExpr = "%s[,%s] = (%s[,%s] > -7 ? 1 : 0)" % (hex_key, col,
                                                             hex_key, col)
            resultExec = h2o_cmd.runExec(str=execExpr)
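            # the Exec2 expression is R-like: e.g. "C0[,2] = (C0[,2] > -7 ? 1 : 0)"
            # overwrites column 2 of the frame in place with a 0/1 indicator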

            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': col  # should be binomial from above
            }

            kwargs = params.copy()
            timeoutSecs = 1800

            start = time.time()
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                       noPoll=True,
                                       **kwargs)
            # wait for it to show up in jobs?
            time.sleep(2)
            # no pattern waits for all
            h2o_jobs.pollWaitJobs(pattern=None,
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (
                GBMResult['python_%timeout'])

            print "\nGBMResult:", GBMResult
            # print "\nGBMResult:", h2o.dump_json(GBMResult)

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_at_all_nodes()
Example #50
0
    def test_summary2_uniform_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (rowCount, colCount, hex_key, expectedMin, expectedMax,
            #  (colname, min, 25th, 50th, 75th, max))
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
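            # worked example: for (min,max) = (1,20000) the bin size is
            # 19999/20 ~= 1000, half of that ~= 500, plus 5% slop -> ~525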

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
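            # the pctile indexes used below line up with expectedPct:
            # pctile[3] -> 0.25, pctile[5] -> 0.5 (median), pctile[7] -> 0.75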

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, 
                msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount))


            # don't check the first or last bin
            # we sometimes get a messed up histogram for all-NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                
                e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b, e, delta=2*e,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
Example #51
0
    def test_summary2_uniform_w_NA(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (rowCount, colCount, hex_key, expectedMin, expectedMax,
            #  (colname, min, 25th, 50th, 75th, max))
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0,
                                          20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                          -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                       1.00)),
            (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                        100.0)),
            (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                          7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                           100.00)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                           75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28,
                                           100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               max_qbins=MAX_QBINS,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1 + NA_ROW_RATIO) * rowCount,
                             numRows,
                             msg="numRows %s should be %s" %
                             (numRows, (1 + NA_ROW_RATIO) * rowCount))

            # don't check the first or last bin
            # we sometimes get a messed up histogram for all-NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?

                # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                e = rowCount / len(hcnt)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b,
                                       e,
                                       delta=2 * e,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
Example #52
0
    def test_c6_hdfs_fvec(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # has duplicated col name
            # "hhp2.os.noisy.0_1.data",
            # "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]
        csvFilenameAll = [
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # we have two cdh's though. I guess we'll use whatever got set up
        # h2o.nodes[0].use_maprfs = False
        # h2o.nodes[0].use_hdfs = True
        # h2o.nodes[0].hdfs_version = 'cdh4'
        # h2o.nodes[0].hdfs_name_node = '172.16.2.176'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order runs easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            print "Loading", csvFilename, 'from hdfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema="hdfs",
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False,
                                           benchmarkLogging=benchmarkLogging)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {'ntrees': 1}
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult,
                                       timeoutSecs=2000,
                                       benchmarkLogging=benchmarkLogging,
                                       **kwargs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                    (elapsed / timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF",
                    "trial " + str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            if 1 == 0:
                print "Deleting all keys, to make sure our parse times don't include spills"
                h2i.delete_keys_at_all_nodes()

            trial += 1
Example #53
0
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                importFullList = importResult['files']
                importFailList = importResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key="A.hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # output 378 can't be in this
                ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'response': 'C379',
                    'max_iter': 10,
                    'n_folds': 1,
                    'family': 'binomial',
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # convert to binomial
                # execExpr="A.hex=%s" % parseResult['destination_key']
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #54
0
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            # double import still causing problems?
            # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            # importFullList = importResult['files']
            # importFailList = importResult['fails']
            # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename +
                                     " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs,
                                           benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541
                ]:
                    x.remove(i)
                    ignore_x.append(i)

                # plus 1 because we are no longer 0 offset
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'family': 'binomial',
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #55
0
    def test_rand_inspect(self):
        ### h2b.browseTheCloud()
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = csvFilename + ".hex"
        print "\n" + csvPathname

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=10)
        destination_key = parseResult['destination_key']
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", destination_key

        def inspect_and_check(nodeX,
                              destination_key,
                              offset,
                              view,
                              inspectOld=None):
            inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX],
                                            destination_key,
                                            offset=offset,
                                            view=view)
            if h2o.beta_features:
                pass
                # print "Inspect2:", h2o.dump_json(inspectNew)
            else:
                pass
                # print "Inspect:", h2o.dump_json(inspectNew)

            # FIX! get min/max/mean/variance for a col too?
            constantNames = [
                ('num_cols', 'numCols'),
                ('num_rows', 'numRows'),
                ('value_size_bytes', 'byteSize'),
                ('cols', 'cols'),
            ]

            colNames = [
                ('num_missing_values', 'naCnt'),
            ]

            for (i, j) in constantNames:
                # check the fields, even if you don't have a previous one to compare to
                if h2o.beta_features:
                    # hack in extra info for now, from the new names to old names
                    if not j in inspectNew:
                        raise Exception(
                            "Can't find %s, Inspect2 result should have it?" %
                            j)
                    inspectNew[i] = inspectNew[j]

                # don't compare if cols
                if inspectOld and i != 'cols':
                    if h2o.beta_features and i == 'value_size_bytes':  # Inspect2 should be smaller
                        self.assertGreater(inspectOld[i], inspectNew[i])

                    else:
                        # for cols it will just compare length?
                        self.assertEqual(inspectOld[i], inspectNew[i])

                if i == 'cols':
                    for (m, n) in colNames:
                        if h2o.beta_features:
                            if not n in inspectNew[i][0]:
                                print h2o.dump_json(inspectNew[i][0])
                                raise Exception(
                                    "Can't find %s, Inspect2 result['cols'][0] should have it?"
                                    % n)
                            inspectNew[i][0][m] = inspectNew[i][0][n]
                        # just compare 0
                        if inspectOld is not None:
                            self.assertEqual(inspectOld[i][0][m],
                                             inspectNew[i][0][m])

            return inspectNew

        # going to use this to compare against future. num_rows/num_cols should always
        # be the same, regardless of the view. just a coarse sanity check
        origInspect = inspect_and_check(0, destination_key, 0, 1, None)
        h2o.verboseprint(h2o.dump_json(origInspect))
        origStoreViewResult = h2o_cmd.runStoreView(offset=0,
                                                   view=1024,
                                                   timeoutSecs=60)

        num_rows = origInspect['num_rows']
        num_cols = origInspect['num_cols']

        lenNodes = len(h2o.nodes)
        for trial in range(10):
            h2p.green_print("\nTrial", trial)
            # we want to use the boundary conditions, so have two levels of random choices
            offset = good_choices(num_rows)
            view = good_choices(num_cols)
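            # good_choices() is assumed to be a module-level helper that biases
            # its random picks toward boundary values (e.g. 0, 1, max-1, max)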
            # randomize the node used
            nodeX = random.randint(0, lenNodes - 1)
            print "nodeX:", nodeX, "offset:", offset, "view:", view
            h2o.beta_features = False
            inspect_and_check(nodeX, destination_key, offset, view,
                              origInspect)
            print "trying Inspect2 by flipping h2o.nodes[0].beta_features"
            h2o.beta_features = True
            # delay between the two inspects...bug around not getting autoframe in storeview?
            time.sleep(1)
            inspect_and_check(nodeX, destination_key, offset, view,
                              origInspect)
            h2o.beta_features = False

            # an fvec frame should have been created in the storeView
            time.sleep(1)

            # loop looking for the autoframe to show up
            # o = len(origStoreViewResult['keys'])
            o = h2i.count_keys_at_all_nodes()
            retry = 0
            okay = False
            while retry == 0 or not okay:
                newStoreViewResult = h2o_cmd.runStoreView(offset=0,
                                                          view=1024,
                                                          timeoutSecs=60)
                ## p = len(newStoreViewResult['keys'])
                p = h2i.count_keys_at_all_nodes()
                print "number of keys in the two StoreViews, o:", o, "p:", p
                ## print "newStoreViewResult:", h2o.dump_json(newStoreViewResult)
                oOkay = {1, 2, 3, 4, 5, 6, 7, 8}
                pOkay = {1, 2, 3, 4, 5}
                print o, oOkay, p, pOkay
                if (o in oOkay) and (p in pOkay):
                    print "Good"
                    okay = True
                else:
                    print "Unexpected o,p after autoframe, looking at total keys in system: %s %s" % (
                        o, p)

                if retry == 10:
                    raise Exception(
                        "StoreView didn't get autoframe, after %s retries" %
                        retry)
                ## h2b.browseJsonHistoryAsUrlLastMatch("StoreView")

                # so it gets recreated??
                deleted = h2i.delete_keys_at_all_nodes(pattern='autoframe')
                # The autoframe key may not show up!!
                if INVISIBLE_AUTOFRAME:
                    # can be 0 or 1
                    if not (deleted == 0 or deleted == 1):
                        msg = "Should have deleted a total of 0 or 1 keys, looking at all nodes. Did %s" % deleted
                        raise Exception(msg)
                else:
                    # should be exactly 1
                    if not (deleted == 1):
                        msg = "Should have deleted a total of 1 key, looking at all nodes. Did %s" % deleted
                        raise Exception(msg)

                time.sleep(1)
                retry += 1
Example #56
0
    def test_G_StoreView(self):
        h2i.delete_keys_at_all_nodes(timeoutSecs=30)
Example #57
0
    def test_c6_maprfs_fvec(self):
        h2o.beta_features = True
        print "\nLoad a list of files from maprfs, parse and do 1 RF tree"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # duplicate column header "A"
            # "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # h2o.nodes[0].use_maprfs = True
        # h2o.nodes[0].use_hdfs = False
        # h2o.nodes[0].hdfs_version = 'mapr3.0.1',
        # h2o.nodes[0].hdfs_name_node = '172.16.2.171:7222'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order runs easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            (importResult,
             importPattern) = h2i.import_only(path=csvPathname,
                                              schema="maprfs",
                                              timeoutSecs=timeoutSecs)
            print "importResult:", h2o.dump_json(importResult)
            succeeded = importResult['files']
            fails = importResult['fails']

            if len(succeeded) < 1:
                raise Exception("Should have imported at least 1 key for %s" %
                                csvPathname)

            # just do a search
            foundIt = None
            for f in succeeded:
                if csvPathname in f:
                    foundIt = f
                    break

            if not foundIt:
                raise Exception(
                    "Should have found %s in the imported keys for %s" %
                    (importPattern, csvPathname))

            totalBytes = 0

            print "Loading", csvFilename, 'from maprfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema="maprfs",
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=360,
                                           doSummary=True,
                                           benchmarkLogging=benchmarkLogging,
                                           noPoll=h2o.beta_features)
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {'ntrees': 1}
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult,
                                       timeoutSecs=2000,
                                       benchmarkLogging=benchmarkLogging,
                                       noPoll=h2o.beta_features,
                                       **kwargs)
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                    (elapsed / timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF",
                    "trial " + str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            print "Deleting all keys, to make sure our parse times don't include spills"
            h2i.delete_keys_at_all_nodes()
            trial += 1