Example No. 1
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=200,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
Example No. 2
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
Example No. 3
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameList = [
            "airlines_88_08_100lines.csv",
        ]

        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets',
                                               timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,
                                       parseKey=parseKey,
                                       timeoutSecs=2000)
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ["covtype.data"]
        else:
            csvFilenameList = [
                "covtype200x.data",
                "covtype200x.data",
                "covtype.data",
                "covtype.data",
                "covtype20x.data",
                "covtype20x.data",
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = "/home/0xdiag/datasets/standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm["GLMModel"]
            coefficients = GLMModel["coefficients"]
            validationsList = GLMModel["validations"]
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write(".")
            sys.stdout.flush()
Example No. 5
    def test_parse_1m_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(10, 65000, "cH", 30)]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            print "Summary should work with 65k"
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["num_cols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount),
            )
            self.assertEqual(
                inspect["num_rows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["num_rows"], rowCount),
            )

            # we should obey max_column_display
            column_limits = [25, 25000, 50000]
            for column_limit in column_limits:
                inspect = h2o_cmd.runInspect(
                    None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
                )
                self.assertEqual(
                    len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit)
                )
                for r in range(0, len(inspect["rows"])):
                    # NB: +1 below because each row includes a row header row: #{row}
                    self.assertEqual(
                        len(inspect["rows"][r]),
                        column_limit + 1,
                        "inspect data rows obeys max_column_display = " + str(column_limit),
                    )
    def test_hosts_with_a_browser(self):
        h2b.browseTheCloud()

        # hang for many hours, so you can play with the browser
        # FIX! should be able to do something that waits till the browser is quit?
        if not h2o.browse_disable:
            time.sleep(500000)
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(3)
     else:
         h2o_hosts.build_cloud_with_hosts()
     h2b.browseTheCloud()
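
The setUpClass snippets on this page only build the cloud and open the browser. As a point of reference, here is a minimal, hedged sketch of the unittest harness they normally sit in; the h2o, h2o_hosts, and h2o_browse imports come straight from the examples, while the test method and the unit_main entry point are assumed boilerplate from the same framework.

import time
import unittest
import h2o, h2o_hosts
import h2o_browse as h2b

class Basic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # build a local or remote cloud, then pop a browser on it
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(3)
        else:
            h2o_hosts.build_cloud_with_hosts()
        h2b.browseTheCloud()

    def test_browse(self):
        # keep the cloud (and browser) up briefly, as the examples below do
        if not h2o.browse_disable:
            time.sleep(10)

if __name__ == '__main__':
    h2o.unit_main()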
Example No. 8
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameList = [
            "airlines_88_08_100lines.csv",
        ]

        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
Example No. 9
    def test_hosts_with_a_browser(self):
        h2b.browseTheCloud()

        # hang for many hours, so you can play with the browser
        # FIX! should be able to do something that waits till the browser is quit?
        if not h2o.browse_disable:
            time.sleep(500000)
Example No. 10
    def test_elapsed_time(self):
        h2b.browseTheCloud()

        print "The reported time should increment for each node, on every node."

        for n in range(NODE_NUM):
            c = h2o.nodes[n].get_cloud()
            self.assertEqual(c['cloud_healthy'], True)
            # the node order doesn't match our node order
            
        # start with elapsed_time history = 0
        etime = [ 0 for i in range(NODE_NUM)]

        # loop checking elapsed time increments
        def check_and_update_etime():
            for n in range(NODE_NUM):
                c = h2o.nodes[n].get_cloud()
                for i in range(NODE_NUM):
                    t = c['nodes'][i]['elapsed_time']
                    # don't reuse n here: it would shadow the outer loop variable
                    name = c['nodes'][i]['name']
                    h = c['nodes'][i]['node_healthy']
                    print "Current elapsed_time: %s for %s" % (t, name)
                    if t < etime[i]:
                        # elapsed_time should never go backwards
                        self.fail("Current elapsed_time: %s at %s is not > its last polled elapsed_time %s" % (t, name, etime[i]))

                    etime[i] = t
                    self.assertEqual(h, True)

        for j in range(10):
            time.sleep(2)
            check_and_update_etime()
Example No. 11
    def test_B_putfile_files(self):
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameList = [
            ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, csvPathname, trees) in csvFilenameList:
            parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, timeoutSecs=500, schema='put')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            # We should be able to see the parse result?
            inspect2 = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example No. 12
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameList = [
            "airlines_88_08_100lines.csv",
        ]

        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        for csvFilename in csvFilenameList:
            csvPathname = "datasets/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           timeoutSecs=1000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "parse result:", parseResult['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRF(trees=1,
                                   parseResult=parseResult,
                                   timeoutSecs=2000)
    def test_exec_import_hosts_bigfiles(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 4000

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        # Update: need unique key names apparently. can't overwrite prior parse output key?
        # replicating lines means they'll get reparsed. good! (but give new key names)

        csvFilenameList = [
            ("covtype.data", "c"),
            ("covtype20x.data", "c20"),
            ("covtype200x.data", "c200"),
            ("billion_rows.csv.gz", "b"),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            exec_list(exprList, lenNodes, csvFilename, key2)
    def test_exec_import_hosts_bigfiles(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 4000

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        # Update: need unique key names apparently. can't overwrite prior parse output key?
        # replicating lines means they'll get reparsed. good! (but give new key names)

        csvFilenameList = [
            ("covtype.data", "c"),
            ("covtype20x.data", "c20"),
            ("covtype200x.data", "c200"),
            ("billion_rows.csv.gz", "b"),
            ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, 
                csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            exec_list(exprList, lenNodes, csvFilename, key2)
Example No. 15
    def test_RF_poker_311M(self):
        # since we'll be waiting, pop a browser
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)

        csvFilename = 'new-poker-hand.full.311M.txt.gz'
        for trials in range(2):
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=5,
                                       depth=5,
                                       parseKey=parseKey,
                                       timeoutSecs=600,
                                       retryDelaySecs=10.0)
            print "RF end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
Example No. 16
 def test_1(self):
     h2b.browseTheCloud()
     csvFilename = "airlines_all.csv"
     csvPathname = 'airlines/airlines_all.csv'
     h2o.beta_features = True
     hex_key = csvFilename + ".hex"
     start = time.time()
     timeoutSecs = 1200
     # airlines_hex = h2i.import_parse(bucket='/home/0xdiag/datasets', path=csvPathname, schema='local', hex_key=hex_key,
     #             timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60, doSummary=False)
     # print "fv.parse done in ",(time.time()-start)
     # kwargs = {
     #     'ignored_cols':'DepTime,ArrTime,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
     #     'standardize': 1,
     #     'classification': 1,
     #     'response': 'IsDepDelayed',
     #     'family': 'binomial',
     #     'n_folds': 0,
     #     'max_iter': 50,
     #     'beta_epsilon': 1e-4,
     #     'lambda':1e-5
     # }
     # results = []
     # for i in range(5):
     #     start = time.time()
     #     glm = h2o_cmd.runGLM(parseResult=airlines_hex, timeoutSecs=timeoutSecs, **kwargs)
     #     auc = glm['glm_model']['submodels'][0]['validation']['auc']
     #     results.append('glm2(%d) done in %d,auc=%f' %(i,(time.time()-start),auc))
     # for s in results:
     #     print s
     while 1:
         time.sleep(500000)
         print '.'
Example No. 17
    def test_elapsed_time(self):
        h2b.browseTheCloud()

        print "The reported time should increment for each node, on every node."

        for n in range(NODE_NUM):
            c = h2o.nodes[n].get_cloud()
            self.assertEqual(c['cloud_healthy'], True)
            # the node order doesn't match our node order

        # start with elapsed_time history = 0
        etime = [0 for i in range(NODE_NUM)]

        # loop checking elapsed time increments
        def check_and_update_etime():
            for n in range(NODE_NUM):
                c = h2o.nodes[n].get_cloud()
                for i in range(NODE_NUM):
                    t = c['nodes'][i]['elapsed_time']
                    # don't reuse n here: it would shadow the outer loop variable
                    name = c['nodes'][i]['name']
                    h = c['nodes'][i]['node_healthy']
                    print "Current elapsed_time: %s for %s" % (t, name)
                    if t < etime[i]:
                        # elapsed_time should never go backwards
                        self.fail("Current elapsed_time: %s at %s is not > its last polled elapsed_time %s" % (t, name, etime[i]))

                    etime[i] = t
                    self.assertEqual(h, True)

        for j in range(10):
            time.sleep(2)
            check_and_update_etime()
Example No. 18
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse 
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = csvFilename

            # creates csvFilename and csvFilename.hex  keys
            parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example No. 19
 def test_1(self):
     h2b.browseTheCloud()
     csvFilename = "airlines_all.csv"
     csvPathname='airlines/airlines_all.csv'
     hex_key = csvFilename + ".hex"
     start = time.time()
     timeoutSecs=1200
     # airlines_hex = h2i.import_parse(bucket='/home/0xdiag/datasets', path=csvPathname, schema='local', hex_key=hex_key, 
     #             timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60, doSummary=False)
     # print "fv.parse done in ",(time.time()-start)
     # kwargs = {
     #     'ignored_cols':'DepTime,ArrTime,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
     #     'standardize': 1,
     #     'classification': 1,
     #     'response': 'IsDepDelayed',
     #     'family': 'binomial',
     #     'n_folds': 0,
     #     'max_iter': 50,
     #     'beta_epsilon': 1e-4,
     #     'lambda':1e-5
     # }
     # results = []
     # for i in range(5):
     #     start = time.time()
     #     glm = h2o_cmd.runGLM(parseResult=airlines_hex, timeoutSecs=timeoutSecs, **kwargs)
     #     auc = glm['glm_model']['submodels'][0]['validation']['auc']
     #     results.append('glm2(%d) done in %d,auc=%f' %(i,(time.time()-start),auc))
     # for s in results:
     #     print s
     while 1:
       time.sleep(500000)
       print '.'
Example No. 20
    def test_parse_200k_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (10, 100000, 'cA', 200, 200),
            (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time(
            ) - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

            # if not h2o.browse_disable:
            #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #    time.sleep(5)
            h2i.delete_keys_at_all_nodes()
Example No. 21
    def test_dead_node_status(self):
        # view logs using each node
        h2b.browseTheCloud()

        for h in h2o.nodes:
            h.log_view()

        # terminate node 1
        h2o.nodes[1].terminate_self_only()
        # remember which is [1] so we can check cloud state correctly
        badPort = "/" + str(h2o.nodes[1].http_addr) + ":" + str(h2o.nodes[1].port)

        nodeList = h2o.nodes[:] # copy
        del nodeList[1] # 1 is dead now
        print "We probably need some status to interrogate to understand a node is in red state?"
        print "And I probably need to wait 60 secs to get to red state"
        time.sleep(120)
        # h2o.verify_cloud_size(nodeList, verbose=True, ignoreHealth=True)
        # time.sleep(5)
        # h2o.verify_cloud_size(nodeList, verbose=True, ignoreHealth=True)
        # time.sleep(5)
        # h2o.verify_cloud_size(nodeList, verbose=True, ignoreHealth=True)

        # just check that 'node_healthy' goes 'false' on that node
        # and 'cloud_healthy' goes false
        
        # everyone should see the same stuff (0 and 2, 1 won't respond)
        for n in (0,2):
            c = h2o.nodes[n].get_cloud()
            # the node order doesn't match our node order
            for i in range(3):
                expected = c['nodes'][i]['name'] != badPort
                self.assertEqual(c['nodes'][i]['node_healthy'], expected)

            self.assertEqual(c['cloud_healthy'], False, msg="node %s shouldn't think the cloud is healthy: %s" % (n, c['cloud_healthy']))
Example No. 22
    def test_rf_kddcup_1999(self):
        # since we'll be waiting, pop a browser
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        csvFilename = 'kddcup_1999.data.gz'

        print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\
            "compared to running with the parameters specified and matching the browser RF query defaults. " +\
            "Also run the param for full scoring vs OOBE scoring."

        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             timeoutSecs=300)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        for trials in range(4):
            print "\n" + csvFilename, "Trial #", trials
            start = time.time()

            kwargs = {
                'response_variable': 'classifier',
                'ntree': 200,
                'gini': 1,
                'class_weights': None,
                'stratify': 0,
                # 'features': None,
                'features': 7,
                'ignore': None,
                'sample': 67,
                'bin_limit': 1024,
                'depth': 2147483647,
                'seed': 784834182943470027,
                'parallel': 1,
                'exclusive_split_limit': None,
            }

            if trials == 0:
                kwargs = {}
            elif trials == 1:
                kwargs['out_of_bag_error_estimate'] = None
            elif trials == 2:
                kwargs['out_of_bag_error_estimate'] = 0
            elif trials == 3:
                kwargs['out_of_bag_error_estimate'] = 1

            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=50,
                                       parseKey=parseKey,
                                       timeoutSecs=300,
                                       retryDelaySecs=1.0,
                                       **kwargs)
            print "RF end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
Example No. 23
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            # these can't RF ..output classes not integer?
            # "bestbuy_test.csv",
            # "bestbuy_train.csv",
            "covtype.data",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            # "prostate_2g.csv",
            # "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            # "poker_c1s1_testing_refresh.csv",
            # "3G_poker_shuffle",
            # "billion_rows.csv.gz",
            # "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
Example No. 24
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(3)
     else:
         h2o_hosts.build_cloud_with_hosts()
     h2b.browseTheCloud()
Example No. 25
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse 
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = csvFilename

            # creates csvFilename and csvFilename.hex  keys
            parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRF(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example No. 26
    def test_from_import_fvec(self):

        print "Sets h2o.beat_features like -bf at command line"
        print "this will redirect import and parse to the 2 variants"
        h2o.beta_features = True  # this will redirect import and parse to the 2 variants

        importFolderPath = '/home/0xdiag/datasets/standard'
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=500)
            if not h2o.beta_features:
                print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'],
                                         timeoutSecs=30)

            if not h2o.beta_features:
                RFview = h2o_cmd.runRFOnly(trees=1,
                                           depth=25,
                                           parseKey=parseKey,
                                           timeoutSecs=timeoutSecs)

            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

            # just to make sure we test this
            # FIX! currently the importFolderResult is empty for fvec
            if 1 == 0:
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)

            sys.stdout.write('.')
            sys.stdout.flush()
Example No. 27
    def test_sum_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        #    ("covtype20x.data", "cD", 50, 20),
        #    ("covtype200x.data", "cE", 50, 200),
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype.data", "cB", 5, 1),
            ("covtype.data", "cC", 5, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                key2,
                minCol=0,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Example No. 28
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(2, java_heap_GB=10, use_flatfile=True)
     else:
         import h2o_hosts
         h2o_hosts.build_cloud_with_hosts()
     h2b.browseTheCloud()
Example No. 29
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(2,java_heap_GB=10,use_flatfile=True)
     else:
         import h2o_hosts
         h2o_hosts.build_cloud_with_hosts()
     h2b.browseTheCloud()
Example No. 30
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if localhost:
         h2o.build_cloud(1)
     else:
         h2o_hosts.build_cloud_with_hosts(1)
     h2b.browseTheCloud()
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(2, java_heap_GB=7)
     else:
         h2o_hosts.build_cloud_with_hosts(java_heap_GB=10)
     h2b.browseTheCloud()
Example No. 32
    def test_GLM2_tnc3_10(self):
        h2o.beta_features = True
        csvFilename = 'tnc3_10.csv'
        print "\n" + csvFilename
        hex_key = "tnc3.hex"
        h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseResult
            # in any case, the destination_key in parseResult was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example No. 33
    def test_tnc3_ignore(self):
        csvFilename = 'tnc3_10.csv'
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
        print "Parse result['Key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseKey
            # in any case, the destination_key in parseKey was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)

            # different when n_folds (cross-validation) is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)

            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example No. 35
    def test_GLM_hdfs_YearPredictionMSD(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            h2i.setupImportHdfs()
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs)

            # different when n_folds (cross-validation) is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example No. 36
    def setUpClass(cls):
        # fails with 3
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(3, java_heap_GB=4, use_flatfile=True)
        else:
            h2o_hosts.build_cloud_with_hosts()

        h2b.browseTheCloud()
Example No. 37
    def test_B_importFolder_files(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            # quick test first
            "covtype.data", 
            # then the real thing
            "billion_rows.csv.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, pollTimeoutSecs=60)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25

            # RF seems to get memory allocation errors on single machine (16GB dram)
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=5,parseKey=parseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            # now some GLM
            kwargs = {'x': 0, 'y': 1, 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1}
            # one coefficient is checked a little more
            colX = 0

            # L2 
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

            sys.stdout.write('\n.')
            sys.stdout.flush() 
Example No. 38
 def setUpClass(cls):
     global SEED, localhost
     SEED = h2o.setup_random_seed()
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(2,java_heap_GB=4,use_flatfile=True)
     else:
         h2o_hosts.build_cloud_with_hosts()
     h2b.browseTheCloud()
    def test_parse_500_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 500, 'cA', 1800, 1800),
        ]

        h2b.browseTheCloud()
        for (rowCount, colCount, orig_hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # create sym links
            multifile = 1000
            # there is already one file. assume it's the "0" case
            for p in range(1, multifile):
                csvPathnameLink = csvPathname + "_" + str(p)
                os.symlink(csvFilename, csvPathnameLink)

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            for trial in range(10):
                hex_key = orig_hex_key + str(trial)
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname + "*",
                                               schema='local',
                                               hex_key=hex_key,
                                               delete_on_done=1,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
                print "Parse:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"

                start = time.time()
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             timeoutSecs=timeoutSecs2)
                print "Inspect:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"
                h2o_cmd.infoFromInspect(inspect, csvPathname)
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(inspect['numRows']), \
                    "    numCols:", "{:,}".format(inspect['numCols'])

                # should match # of cols in header or ??
                self.assertEqual(
                    inspect['numCols'], colCount,
                    "parse created result with the wrong number of cols %s %s"
                    % (inspect['numCols'], colCount))
                self.assertEqual(inspect['numRows'], rowCount * multifile,
                    "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                    (inspect['numRows'], rowCount * multifile))
Example No. 40
 def setUpClass(cls):
     # fails with 3
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(1)
     else:
         h2o_hosts.build_cloud_with_hosts(1)
     h2b.browseTheCloud()
Example No. 41
    def setUpClass(cls):
        # fails with 3
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(3,java_heap_GB=4,use_flatfile=True)
        else:
            h2o_hosts.build_cloud_with_hosts()

        h2b.browseTheCloud()
Example No. 42
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b

        h2b.browseTheCloud()
        sleep(3600)

    if not nodeList:
        nodeList = h2o_nodes.nodes

    # this could fail too. Should this be controlled by -uc/--usecloud or another command line argument?
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now

    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown" behavior from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud

    # don't tear down with -ccj either
    # FIX! what about usecloud or cloud_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes.
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?

            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah, subtle: we might get exceptions issuing the shutdown; don't abort
        # out of trying the process kills if we get any shutdown exception
        # (remember we go to all nodes), since the nodes may already be shutting down
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
Example no. 43
    def setUpClass(cls):
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(2, java_heap_GB=4, use_flatfile=True)
        else:
            h2o_hosts.build_cloud_with_hosts()
        h2b.browseTheCloud()
Example no. 44
    def test_sum_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            csvFilenameAll = [
                ("covtype.data", "cA", 5,  1),
                ("covtype.data", "cB", 5,  1),
                ("covtype.data", "cC", 5,  1),
            ]
        else:
            csvFilenameAll = [
                ("covtype.data", "cA", 5,  1),
                ("covtype20x.data", "cD", 50, 20),
                ("covtype200x.data", "cE", 50, 200),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
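        # the first dataset's column results become the baseline ("good"); later
        # (replicated) results are divided by resultMult before comparing to it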
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\n*************"
            print "colResultList", colResultList
            print "*************"

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
Example no. 45
    def test_from_import(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'

        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3 (UDP.java) fail issue.
            # constrain depth to 25
            RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=parseKey,
                timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example no. 47
    def test_cols_enum_multi_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
        tryList = [
            (300, 100, 'cA', 60, '*x[2-5]*'),
            (310, 200, 'cB', 60, '*x[1,3-5]*'),
            (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
            (330, 400, 'cD', 60, '*x[1-3-5]*'),
            (340, 500, 'cE', 60, '*x[1-4]*'),
        ]

        h2b.browseTheCloud()
        cnum = 0
        # create them all first
        for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            for fileN in range(FILENUM):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount, SEED,
                                  translateList)
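
        # all FILENUM files per shape now exist; the second pass imports the whole
        # dir and parses everything that survives the exclude pattern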

        for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            h2o.nodes[0].import_files(SYNDATASETS_DIR)
            # pattern match all, then use exclude
            parseKey = h2o.nodes[0].parse('*',
                                          key2=key2,
                                          exclude=excludePattern,
                                          header=1,
                                          timeoutSecs=timeoutSecs)
            print "parseKey['destination_key']: " + parseKey['destination_key']
            print 'parse time:', parseKey['response']['time']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # FIX! h2o strips one of the headers, but treats all the other files with headers as data
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            print "\n" + parseKey['destination_key'] + ":", \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            # all should have rowCount rows (due to the excludePattern)
            self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \
                (num_rows, rowCount, FILENUM)))
Example no. 48
    def setUpClass(cls):
        # fails with 3
        global local_host
        local_host = 'hosts' not in os.getcwd()
        if local_host:
            h2o.build_cloud(3, java_heap_GB=4, use_flatfile=True)
        else:
            h2o_hosts.build_cloud_with_hosts()

        h2b.browseTheCloud()
Example no. 49
    def setUpClass(cls):
        global localhost
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(node_count=1)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=1)
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        h2b.browseTheCloud()
Example no. 51
    def test_rf_predict_fvec(self):
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        h2o.beta_features = True
        trees = 6
        timeoutSecs = 20
        hex_key = "iris2.csv.hex"
        parseResult = h2i.import_parse(bucket="smalldata", path="iris/iris2.csv", schema="put", hex_key=hex_key)
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, destination_key="iris_rf_model", timeoutSecs=timeoutSecs)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key. Inspect/Summary result"

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(
            model_key="iris_rf_model", data_key=hex_key, prediction="predict.hex"
        )
        print "generate_predictions end on ", hex_key, " took", time.time() - start, "seconds"
        print "predict:", h2o.dump_json(predict)
        csvPredictPathname = SYNDATASETS_DIR + "/" + "iris2.predict.csv"
        h2o.nodes[0].csv_download(src_key="predict.hex", csvPathname=csvPredictPathname)
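        # round-trip the predictions to a local csv (presumably a sanity check on
        # csv_download; the downloaded file isn't re-read below)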

        inspect = h2o_cmd.runInspect(key="predict.hex")
        print "inspect:", h2o.dump_json(inspect)

        # print h2o.dump_json(predict)
        # no min/max any more with enums?

        expectedCols = {
            # "max": 2.0,
            # "mean": 1.0,
            # "min": 0.0,
            "naCnt": 0,
            # "name": 0,
            # Enum or real?
            # "type": "Real",
        }

        predictCols = inspect["cols"][0]
        diffKeys = [k for k in expectedCols if predictCols[k] != expectedCols[k]]
        for k in diffKeys:
            raise Exception(
                "Checking H2O summary results, wrong %s: %s, should be: %s" % (k, predictCols[k], expectedCols[k])
            )

        expected = {
            "numRows": 150,
            "numCols": 4,
            # "byteSize": 2843,
        }

        diffKeys = [k for k in expected if inspect[k] != expected[k]]
        print "diffKeys", diffKeys
        for k in diffKeys:
            raise Exception("%s : %s != %s" % (k, inspect[k], expected[k]))
Example no. 53
    def test_GLM_catdata_hosts(self):
        # these are still in /home/kevin/scikit/datasets/logreg
        # FIX! just two for now..
        csvFilenameList = [
            "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz"
        ]

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # save the first, for all comparisons, to avoid slow drift with each iteration
        validations1 = {}
        for csvFilename in csvFilenameList:
            csvPathname = h2o.find_file('smalldata/' + csvFilename)
            # I use this if I want the larger set in my local dir
            # csvPathname = h2o.find_file('/home/kevin/scikit/datasets/logreg/' + csvFilename)

            print "\n" + csvPathname

            start = time.time()
            # FIX! why can't I include 0 here? it keeps getting "unable to solve" if 0 is included
            # 0 by itself is okay?
            kwargs = {
                'y': 7,
                'x': '1,2,3,4,5,6',
                'family': "binomial",
                'n_folds': 3,
                'lambda': 1e-4
            }
            timeoutSecs = 200
            glm = h2o_cmd.runGLM(csvPathname=csvPathname,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 6, **kwargs)

            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            validationsList = glm['GLMModel']['validations']
            print validationsList
            validations = validationsList[0]

            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations,
                                          validations1)
            else:
                validations1 = copy.deepcopy(validations)

            sys.stdout.write('.')
            sys.stdout.flush()
Example no. 54
    def test_hdfs_multi_bad_csv(self):
        print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"
        # pop open a browser on the cloud
        h2b.browseTheCloud()
        # defaults to /datasets
        h2i.setupImportHdfs()
        parseKey = h2o.nodes[0].parse('*airlines_all*csv', key2='random_csv.hex',
            exclude=None, header=None, timeoutSecs=600)
        print "*csv* regex to hdfs /datasets", 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']
        sys.stdout.flush()
Example no. 55
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                hex_key,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Example no. 56
    def test_multi_with_a_browser(self):
        h2b.browseTheCloud()
        # csvPathname = '../smalldata/poker/poker1000'
        # h2o_cmd.runRF(trees=10000, timeoutSecs=300, csvPathname=csvPathname)
        # h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        # browseJsonHistoryAsUrl()

        # hang for many hours, so you can play with the browser
        # FIX! should be able to do something that waits till the browser is quit?
        if not h2o.browse_disable:
            time.sleep(500000)
Example no. 57
def check_cloud_and_setup_next():
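    # verify the current cloud and its sandbox, tear everything down, then wait
    # about a second per node so sticky OS ports don't break the next build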
    h2b.browseTheCloud()
    h2o.verify_cloud_size()
    h2o.check_sandbox_for_errors()
    print "Tearing down cloud of size", len(h2o.nodes)
    h2o.tear_down_cloud()
    h2o.clean_sandbox()
    # wait to make sure no sticky ports or anything os-related
    # so let's expand the delay if larger number of jvms
    # 1 second per node seems good
    h2o.verboseprint("Waiting", node_count, "seconds to avoid OS sticky port problem")
    time.sleep(node_count)
Example no. 59
    def test_exec_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cB", 15),
                ("covtype20x.data", "cD", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            cnum += 1
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            # we use factor in this test...so timeout has to be bigger!
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    key2,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=maxTrials,
                                    timeoutSecs=timeoutSecs)
Example no. 60
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()

        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=900,
                                       doSummary=False)
        parse_time = time.time() - start
        print "parse took {0} sec".format(parse_time)

        start = time.time()
        # noise=['JStack','cpu','disk']
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = {
            'ntrees': trees,
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name':
            'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
        }
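        # the ignored columns are presumably the delay/outcome fields that would
        # leak the response into training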
        kwargs = paramsTrainRF.copy()
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename,
            elapsed, trees, classification_error, classErrorPctList,
            totalScores)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)